2 based on test code from Rusty
/* Wall-clock interval timer used to time the two dedup strategies.
 * Shared static state: not reentrant and not thread-safe. */
static struct timeval tp1, tp2;

/* Record the start of a timed interval. */
static void start_timer(void)
{
	gettimeofday(&tp1, NULL);
}

/* Return seconds elapsed (as a double) since start_timer() was called. */
static double end_timer(void)
{
	gettimeofday(&tp2, NULL);
	return (tp2.tv_sec + (tp2.tv_usec * 1.0e-6)) -
	       (tp1.tv_sec + (tp1.tv_usec * 1.0e-6));
}
/* Set bit idx in the bitmap bits[].  Returns true if it was already set.
 * Caller must ensure bits[] holds at least idx + 1 bits. */
static bool set_bit(unsigned char bits[], unsigned int idx)
{
	unsigned char *byte = &bits[idx / CHAR_BIT];
	unsigned char mask = 1 << (idx % CHAR_BIT);

	if (*byte & mask)
		return true;
	*byte |= mask;
	return false;
}
/* We use the excellent Jenkins lookup3 hash; this is based on hashlittle2().
 * See: http://burtleburtle.net/bob/c/lookup3.c (public domain).
 */

/*
 * Best guess at whether this machine is big-endian or little-endian.
 * Getting this wrong only costs speed (the fast aligned-read paths below
 * are skipped); the byte-at-a-time fallback works, and produces the same
 * hash values, on any machine.
 */
#if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \
     __BYTE_ORDER == __LITTLE_ENDIAN) || \
    (defined(i386) || defined(__i386__) || defined(__i486__) || \
     defined(__i586__) || defined(__i686__) || defined(vax) || defined(MIPSEL))
# define HASH_LITTLE_ENDIAN 1
# define HASH_BIG_ENDIAN 0
#elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \
       __BYTE_ORDER == __BIG_ENDIAN) || \
      (defined(sparc) || defined(POWERPC) || defined(mc68000) || defined(sel))
# define HASH_LITTLE_ENDIAN 0
# define HASH_BIG_ENDIAN 1
#else
# define HASH_LITTLE_ENDIAN 0
# define HASH_BIG_ENDIAN 0
#endif

/* Rotate the 32-bit value x left by k bits; k must be in 1..31
 * (a count of 0 or 32 would make one of the shifts undefined). */
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))

/*
 * mix -- mix 3 32-bit values reversibly.
 *
 * This is reversible, so any information in (a,b,c) before mix() is
 * still in (a,b,c) after mix().  The rotate constants were chosen with
 * Jenkins' avalanche tester (http://burtleburtle.net/bob/hash/avalanche.html)
 * so that repeated rounds spread input differences across all of (a,b,c).
 * mix() alone does not achieve full avalanche; final() below finishes
 * the job for the last block.
 */
#define mix(a,b,c) \
{ \
  a -= c;  a ^= rot(c, 4);  c += b; \
  b -= a;  b ^= rot(a, 6);  a += c; \
  c -= b;  c ^= rot(b, 8);  b += a; \
  a -= c;  a ^= rot(c,16);  c += b; \
  b -= a;  b ^= rot(a,19);  a += c; \
  c -= b;  c ^= rot(b, 4);  b += a; \
}

/*
 * final -- final mixing of 3 32-bit values (a,b,c) into c.
 *
 * Pairs of (a,b,c) values differing in only a few bits will usually
 * produce values of c that look totally different.
 */
#define final(a,b,c) \
{ \
  c ^= b; c -= rot(b,14); \
  a ^= c; a -= rot(c,11); \
  b ^= a; b -= rot(a,25); \
  c ^= b; c -= rot(b,16); \
  a ^= c; a -= rot(c,4);  \
  b ^= a; b -= rot(a,14); \
  c ^= b; c -= rot(b,24); \
}

/*
 * hashlittle2: return 2 32-bit hash values.
 *
 * This is identical to hashlittle(), except it returns two 32-bit hash
 * values instead of just one.  This is good enough for hash table
 * lookup with 2^^64 buckets, or if you want a second hash if you're not
 * happy with the first, or if you want a probably-unique 64-bit ID for
 * the key.  *pc is better mixed than *pb, so use *pc first.  If you want
 * a 64-bit value do something like "*pc + (((uint64_t)*pb)<<32)".
 */
static void hashlittle2(
  const void *key,       /* the key to hash */
  size_t      length,    /* length of the key in bytes */
  uint32_t   *pc,        /* IN: primary initval, OUT: primary hash */
  uint32_t   *pb)        /* IN: secondary initval, OUT: secondary hash */
{
  uint32_t a,b,c;                                          /* internal state */
  union { const void *ptr; size_t i; } u;     /* needed for Mac Powerbook G4 */

  /* Set up the internal state */
  a = b = c = 0xdeadbeef + ((uint32_t)length) + *pc;
  c += *pb;

  u.ptr = key;
  if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
    const uint32_t *k = (const uint32_t *)key;         /* read 32-bit chunks */

    /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
    while (length > 12)
    {
      a += k[0];
      b += k[1];
      c += k[2];
      mix(a,b,c);
      length -= 12;
      k += 3;
    }

    /*----------------------------- handle the last (probably partial) block */
    /*
     * "k[2]&0xffffff" actually reads beyond the end of the string, but
     * then masks off the part it's not allowed to read.  Because the
     * string is aligned, the masked-off tail is in the same word as the
     * rest of the string.  Every machine with memory protection I've seen
     * does it on word boundaries, so is OK with this.  But VALGRIND will
     * still catch it and complain.  The masking trick does make the hash
     * noticably faster for short strings (like English words).
     */
#ifndef VALGRIND
    switch(length)
    {
    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
    case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;
    case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;
    case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;
    case 8 : b+=k[1]; a+=k[0]; break;
    case 7 : b+=k[1]&0xffffff; a+=k[0]; break;
    case 6 : b+=k[1]&0xffff; a+=k[0]; break;
    case 5 : b+=k[1]&0xff; a+=k[0]; break;
    case 4 : a+=k[0]; break;
    case 3 : a+=k[0]&0xffffff; break;
    case 2 : a+=k[0]&0xffff; break;
    case 1 : a+=k[0]&0xff; break;
    case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
    }
#else /* make valgrind happy */
    {
      const uint8_t *k8 = (const uint8_t *)k;
      switch(length)
      {
      case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
      case 11: c+=((uint32_t)k8[10])<<16;  /* fall through */
      case 10: c+=((uint32_t)k8[9])<<8;    /* fall through */
      case 9 : c+=k8[8];                   /* fall through */
      case 8 : b+=k[1]; a+=k[0]; break;
      case 7 : b+=((uint32_t)k8[6])<<16;   /* fall through */
      case 6 : b+=((uint32_t)k8[5])<<8;    /* fall through */
      case 5 : b+=k8[4];                   /* fall through */
      case 4 : a+=k[0]; break;
      case 3 : a+=((uint32_t)k8[2])<<16;   /* fall through */
      case 2 : a+=((uint32_t)k8[1])<<8;    /* fall through */
      case 1 : a+=k8[0]; break;
      case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
      }
    }
#endif /* !valgrind */

  } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
    const uint16_t *k = (const uint16_t *)key;         /* read 16-bit chunks */
    const uint8_t  *k8;

    /*--------------- all but last block: aligned reads and different mixing */
    while (length > 12)
    {
      a += k[0] + (((uint32_t)k[1])<<16);
      b += k[2] + (((uint32_t)k[3])<<16);
      c += k[4] + (((uint32_t)k[5])<<16);
      mix(a,b,c);
      length -= 12;
      k += 6;
    }

    /*----------------------------- handle the last (probably partial) block */
    k8 = (const uint8_t *)k;
    switch(length)
    {
    case 12: c+=k[4]+(((uint32_t)k[5])<<16);
             b+=k[2]+(((uint32_t)k[3])<<16);
             a+=k[0]+(((uint32_t)k[1])<<16);
             break;
    case 11: c+=((uint32_t)k8[10])<<16;    /* fall through */
    case 10: c+=k[4];
             b+=k[2]+(((uint32_t)k[3])<<16);
             a+=k[0]+(((uint32_t)k[1])<<16);
             break;
    case 9 : c+=k8[8];                     /* fall through */
    case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
             a+=k[0]+(((uint32_t)k[1])<<16);
             break;
    case 7 : b+=((uint32_t)k8[6])<<16;     /* fall through */
    case 6 : b+=k[2];
             a+=k[0]+(((uint32_t)k[1])<<16);
             break;
    case 5 : b+=k8[4];                     /* fall through */
    case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
             break;
    case 3 : a+=((uint32_t)k8[2])<<16;     /* fall through */
    case 2 : a+=k[0];
             break;
    case 1 : a+=k8[0];
             break;
    case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
    }

  } else {                        /* need to read the key one byte at a time */
    const uint8_t *k = (const uint8_t *)key;

    /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
    while (length > 12)
    {
      a += k[0];
      a += ((uint32_t)k[1])<<8;
      a += ((uint32_t)k[2])<<16;
      a += ((uint32_t)k[3])<<24;
      b += k[4];
      b += ((uint32_t)k[5])<<8;
      b += ((uint32_t)k[6])<<16;
      b += ((uint32_t)k[7])<<24;
      c += k[8];
      c += ((uint32_t)k[9])<<8;
      c += ((uint32_t)k[10])<<16;
      c += ((uint32_t)k[11])<<24;
      mix(a,b,c);
      length -= 12;
      k += 12;
    }

    /*-------------------------------- last block: affect all 32 bits of (c) */
    switch(length)                   /* all the case statements fall through */
    {
    case 12: c+=((uint32_t)k[11])<<24;
    case 11: c+=((uint32_t)k[10])<<16;
    case 10: c+=((uint32_t)k[9])<<8;
    case 9 : c+=k[8];
    case 8 : b+=((uint32_t)k[7])<<24;
    case 7 : b+=((uint32_t)k[6])<<16;
    case 6 : b+=((uint32_t)k[5])<<8;
    case 5 : b+=k[4];
    case 4 : a+=((uint32_t)k[3])<<24;
    case 3 : a+=((uint32_t)k[2])<<16;
    case 2 : a+=((uint32_t)k[1])<<8;
    case 1 : a+=k[0];
             break;
    case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
    }
  }

  final(a,b,c);
  *pc=c; *pb=b;
}
/* Return 1 if str exactly matches one of the first `length` entries of
 * list[], 0 otherwise.  Linear scan: O(length) strcmp calls. */
static unsigned string_in_list(const char *str, char **list, int length)
{
	int i;

	for (i = 0; i < length; i++) {
		if (strcmp(str, list[i]) == 0)
			return 1;
	}
	return 0;
}
/* Remove duplicate strings from list[] in place, using a Bloom filter to
 * avoid most O(n) rescans.  Returns the new element count, or -1 on
 * allocation failure.  Order of first occurrences is preserved. */
static int remove_duplicates(char **list, unsigned count)
{
	unsigned int num_bits, hash_mask, num_hashes;
	unsigned int i, j;
	unsigned char *hash;
	unsigned output_count;

	/* Empty input: nothing to do (also avoids division by zero below). */
	if (count == 0)
		return 0;

	/* See http://en.wikipedia.org/wiki/Bloom_filter:
	 * num_bits = - (num_entries * ln(probability)) / (ln(2)^2).
	 *
	 * Assume probability = one in a million.
	 * ln(p) = -13.8155106.
	 * ln(2)^2 = 0.480453014
	 *
	 * => num_bits = num_entries * 28.7551751
	 * (NOTE: count * 29 can overflow unsigned int for very large counts.)
	 */
	num_bits = count * 29;

	/* Expand num_bits to the next power of 2, so we can mask with
	 * hash_mask instead of taking a modulus. */
	for (i = 1; i < num_bits; i *= 2);
	num_bits = i;
	hash_mask = num_bits - 1;

	/* One bit per filter position: num_bits / CHAR_BIT bytes, zeroed. */
	hash = calloc(num_bits / CHAR_BIT, 1);
	if (hash == NULL)
		return -1;

	/* Now, ideal number of hashes = ln(2) * num_bits / num_entries,
	 * and ln(2) approximates 9/13. */
	num_hashes = (9 * num_bits) / (13 * count);

	/* We generate two at a time anyway, and the bits_set == num_hashes
	 * test below relies on num_hashes being even. */
	if (num_hashes & 1)
		num_hashes++;

	printf("num_hashes=%u\n", num_hashes);

	output_count = 0;
	for (i = 0; i < count; i++) {
		unsigned int len = strlen(list[i]), bits_set = 0;
		uint32_t h1 = len, h2 = 0;

		/* Each hashlittle2() call yields two filter positions, and
		 * feeds h1/h2 back in as initvals for the next round. */
		for (j = 0; j < num_hashes; j += 2) {
			hashlittle2(list[i], len, &h1, &h2);
			bits_set += set_bit(hash, h1 & hash_mask);
			bits_set += set_bit(hash, h2 & hash_mask);
		}
		if (bits_set == num_hashes) {
			/* might be a collision - check the slow way */
			if (string_in_list(list[i], list, i) != 0) {
				/* Genuine duplicate: drop it. */
				continue;
			}
		}
		if (output_count != i)
			list[output_count] = list[i];
		output_count++;
	}
	free(hash);
	return output_count;
}
/* qsort() comparator for an array of char *: orders by the pointed-to
 * strings, with strcmp() semantics. */
static int string_cmp(const void *p1, const void *p2)
{
	const char *const *lhs = p1;
	const char *const *rhs = p2;

	return strcmp(*lhs, *rhs);
}
420 static int remove_duplicates_qsort(char **list, unsigned count)
424 qsort(list, count, sizeof(char *), string_cmp);
427 for (i = 1; i < count; i++) {
428 if (strcmp(list[i-1], list[i]) == 0) {
431 if (output_count != i) {
432 list[output_count] = list[i];
/* Return a freshly malloc'd shallow copy of list[0..count-1] (the string
 * pointers are shared, not duplicated).  Returns NULL on allocation
 * failure; caller frees the returned array. */
char **copy_list(char **list, int count)
{
	char **ret = malloc(count * sizeof(char *));

	if (ret != NULL)
		memcpy(ret, list, count * sizeof(char *));
	return ret;
}
/* Return a malloc'd NUL-terminated random string of `len` characters
 * drawn from "ABCDEF" (uses random(), so seedable via srandom()).
 * Returns NULL on allocation failure or negative len; caller frees. */
static char *randstring(int len)
{
	static const char clist[] = "ABCDEF";
	const size_t nchars = sizeof(clist) - 1;  /* hoisted: no strlen per char */
	char *ret;
	int i;

	if (len < 0)
		return NULL;
	ret = malloc(len + 1);
	if (ret == NULL)
		return NULL;
	for (i = 0; i < len; i++)
		ret[i] = clist[random() % nchars];
	ret[len] = '\0';
	return ret;
}
458 int main(int argc, char *argv[])
460 int count, nondup, i;
461 char **orig, **list1, **list2;
462 int output_count1, output_count2, str_len;
466 printf("Usage: <count> <pctdup> <strlen>\n");
472 count = strtoul(argv[1], NULL, 0);
473 pctdup = strtod(argv[2], NULL);
474 str_len = strtoul(argv[3], NULL, 0);
476 orig = malloc(sizeof(char *) * (count+1));
477 nondup = ((100 - pctdup)/100.0)*count;
478 if (nondup == 0) nondup = 1;
480 for (i=0; i<nondup; i++) {
481 orig[i] = randstring(str_len);
483 for (; i<count; i++) {
484 orig[i] = orig[random() % nondup];
487 list1 = copy_list(orig, count);
489 output_count1 = remove_duplicates(list1, count);
490 if (output_count1 == -1) {
493 printf("Removed %d duplicates from %d strings took %f usec (hashing)\n",
494 count - output_count1, count, end_timer() * 1.0e6);
495 qsort(list1, output_count1, sizeof(char *), string_cmp);
497 list2 = copy_list(orig, count);
499 output_count2 = remove_duplicates_qsort(list2, count);
500 if (output_count2 == -1) {
503 printf("Removed %d duplicates from %d strings took %f usec (qsort)\n",
504 count - output_count2, count, end_timer() * 1.0e6);
506 if (output_count1 != output_count2 ||
507 memcmp(list1, list2, output_count1*sizeof(char *)) != 0) {
508 printf("output mismatch!\n");