stringdup.c

   1 /*
   2   based on test code from Rusty
   3  */
   4 #include <stdlib.h>
   5 #include <err.h>
   6 #include <limits.h>
   7 #include <stdbool.h>
   8 #include <stdint.h>
   9 #include <string.h>
  10 #include <stdio.h>
  11 #include <unistd.h>
  12 #include <sys/stat.h>
  13 #include <fcntl.h>
  14
  15 #include <sys/time.h>
  16 #include <time.h>
  17
  18 static struct timeval tp1,tp2;
  19
  20 static void start_timer()
  21 {
  22         gettimeofday(&tp1,NULL);
  23 }
  24
  25 static double end_timer()
  26 {
  27         gettimeofday(&tp2,NULL);
  28         return (tp2.tv_sec + (tp2.tv_usec*1.0e-6)) -
  29                 (tp1.tv_sec + (tp1.tv_usec*1.0e-6));
  30 }
  31
  32 /* Returns true if it was already set. */
  33 static bool set_bit(unsigned char bits[], unsigned int idx)
  34 {
  35         if (bits[idx / CHAR_BIT] & (1 << (idx % CHAR_BIT)))
  36                 return true;
  37         bits[idx / CHAR_BIT] |= (1 << (idx % CHAR_BIT));
  38         return false;
  39 }
  40
  41 /* We use the excellent Jenkins lookup3 hash; this is based on hash_word2.
  42  * See: http://burtleburtle.net/bob/c/lookup3.c
  43  */
  44 /*
  45  * My best guess at if you are big-endian or little-endian.  This may
  46  * need adjustment.
  47  */
  48 #if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \
  49      __BYTE_ORDER == __LITTLE_ENDIAN) || \
  50     (defined(i386) || defined(__i386__) || defined(__i486__) || \
  51      defined(__i586__) || defined(__i686__) || defined(vax) || defined(MIPSEL))
  52 # define HASH_LITTLE_ENDIAN 1
  53 # define HASH_BIG_ENDIAN 0
  54 #elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \
  55        __BYTE_ORDER == __BIG_ENDIAN) || \
  56       (defined(sparc) || defined(POWERPC) || defined(mc68000) || defined(sel))
  57 # define HASH_LITTLE_ENDIAN 0
  58 # define HASH_BIG_ENDIAN 1
  59 #else
  60 # define HASH_LITTLE_ENDIAN 0
  61 # define HASH_BIG_ENDIAN 0
  62 #endif
  63
  64 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
  65
  66 /*
  67 -------------------------------------------------------------------------------
  68 mix -- mix 3 32-bit values reversibly.
  69
  70 This is reversible, so any information in (a,b,c) before mix() is
  71 still in (a,b,c) after mix().
  72
  73 If four pairs of (a,b,c) inputs are run through mix(), or through
  74 mix() in reverse, there are at least 32 bits of the output that
  75 are sometimes the same for one pair and different for another pair.
  76 This was tested for:
  77 * pairs that differed by one bit, by two bits, in any combination
  78   of top bits of (a,b,c), or in any combination of bottom bits of
  79   (a,b,c).
  80 * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
  81   the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
  82   is commonly produced by subtraction) look like a single 1-bit
  83   difference.
  84 * the base values were pseudorandom, all zero but one bit set, or
  85   all zero plus a counter that starts at zero.
  86
  87 Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
  88 satisfy this are
  89     4  6  8 16 19  4
  90     9 15  3 18 27 15
  91    14  9  3  7 17  3
  92 Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
  93 for "differ" defined as + with a one-bit base and a two-bit delta.  I
  94 used http://burtleburtle.net/bob/hash/avalanche.html to choose
  95 the operations, constants, and arrangements of the variables.
  96
  97 This does not achieve avalanche.  There are input bits of (a,b,c)
  98 that fail to affect some output bits of (a,b,c), especially of a.  The
  99 most thoroughly mixed value is c, but it doesn't really even achieve
 100 avalanche in c.
 101
 102 This allows some parallelism.  Read-after-writes are good at doubling
 103 the number of bits affected, so the goal of mixing pulls in the opposite
 104 direction as the goal of parallelism.  I did what I could.  Rotates
 105 seem to cost as much as shifts on every machine I could lay my hands
 106 on, and rotates are much kinder to the top and bottom bits, so I used
 107 rotates.
 108 -------------------------------------------------------------------------------
 109 */
 110 #define mix(a,b,c) \
 111 { \
 112   a -= c;  a ^= rot(c, 4);  c += b; \
 113   b -= a;  b ^= rot(a, 6);  a += c; \
 114   c -= b;  c ^= rot(b, 8);  b += a; \
 115   a -= c;  a ^= rot(c,16);  c += b; \
 116   b -= a;  b ^= rot(a,19);  a += c; \
 117   c -= b;  c ^= rot(b, 4);  b += a; \
 118 }
 119
 120 /*
 121 -------------------------------------------------------------------------------
 122 final -- final mixing of 3 32-bit values (a,b,c) into c
 123
 124 Pairs of (a,b,c) values differing in only a few bits will usually
 125 produce values of c that look totally different.  This was tested for
 126 * pairs that differed by one bit, by two bits, in any combination
 127   of top bits of (a,b,c), or in any combination of bottom bits of
 128   (a,b,c).
 129 * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
 130   the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
 131   is commonly produced by subtraction) look like a single 1-bit
 132   difference.
 133 * the base values were pseudorandom, all zero but one bit set, or
 134   all zero plus a counter that starts at zero.
 135
 136 These constants passed:
 137  14 11 25 16 4 14 24
 138  12 14 25 16 4 14 24
 139 and these came close:
 140   4  8 15 26 3 22 24
 141  10  8 15 26 3 22 24
 142  11  8 15 26 3 22 24
 143 -------------------------------------------------------------------------------
 144 */
 145 #define final(a,b,c) \
 146 { \
 147   c ^= b; c -= rot(b,14); \
 148   a ^= c; a -= rot(c,11); \
 149   b ^= a; b -= rot(a,25); \
 150   c ^= b; c -= rot(b,16); \
 151   a ^= c; a -= rot(c,4);  \
 152   b ^= a; b -= rot(a,14); \
 153   c ^= b; c -= rot(b,24); \
 154 }
 155
 156 /*
 157  * hashlittle2: return 2 32-bit hash values
 158  *
 159  * This is identical to hashlittle(), except it returns two 32-bit hash
 160  * values instead of just one.  This is good enough for hash table
 161  * lookup with 2^^64 buckets, or if you want a second hash if you're not
 162  * happy with the first, or if you want a probably-unique 64-bit ID for
 163  * the key.  *pc is better mixed than *pb, so use *pc first.  If you want
 164  * a 64-bit value do something like "*pc + (((uint64_t)*pb)<<32)".
 165  */
 166 static void hashlittle2(
 167   const void *key,       /* the key to hash */
 168   size_t      length,    /* length of the key */
 169   uint32_t   *pc,        /* IN: primary initval, OUT: primary hash */
 170   uint32_t   *pb)        /* IN: secondary initval, OUT: secondary hash */
 171 {
 172   uint32_t a,b,c;                                          /* internal state */
 173   union { const void *ptr; size_t i; } u;     /* needed for Mac Powerbook G4 */
 174
 175   /* Set up the internal state */
 176   a = b = c = 0xdeadbeef + ((uint32_t)length) + *pc;
 177   c += *pb;
 178
 179   u.ptr = key;
 180   if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
 181     const uint32_t *k = (const uint32_t *)key;         /* read 32-bit chunks */
 182 #ifdef VALGRIND
 183     const uint8_t  *k8;
 184 #endif
 185
 186     /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
 187     while (length > 12)
 188     {
 189       a += k[0];
 190       b += k[1];
 191       c += k[2];
 192       mix(a,b,c);
 193       length -= 12;
 194       k += 3;
 195     }
 196
 197     /*----------------------------- handle the last (probably partial) block */
 198     /*
 199      * "k[2]&0xffffff" actually reads beyond the end of the string, but
 200      * then masks off the part it's not allowed to read.  Because the
 201      * string is aligned, the masked-off tail is in the same word as the
 202      * rest of the string.  Every machine with memory protection I've seen
 203      * does it on word boundaries, so is OK with this.  But VALGRIND will
 204      * still catch it and complain.  The masking trick does make the hash
 205      * noticably faster for short strings (like English words).
 206      */
 207 #ifndef VALGRIND
 208
 209     switch(length)
 210     {
 211     case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
 212     case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;
 213     case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;
 214     case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;
 215     case 8 : b+=k[1]; a+=k[0]; break;
 216     case 7 : b+=k[1]&0xffffff; a+=k[0]; break;
 217     case 6 : b+=k[1]&0xffff; a+=k[0]; break;
 218     case 5 : b+=k[1]&0xff; a+=k[0]; break;
 219     case 4 : a+=k[0]; break;
 220     case 3 : a+=k[0]&0xffffff; break;
 221     case 2 : a+=k[0]&0xffff; break;
 222     case 1 : a+=k[0]&0xff; break;
 223     case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
 224     }
 225
 226 #else /* make valgrind happy */
 227
 228     k8 = (const uint8_t *)k;
 229     switch(length)
 230     {
 231     case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
 232     case 11: c+=((uint32_t)k8[10])<<16;  /* fall through */
 233     case 10: c+=((uint32_t)k8[9])<<8;    /* fall through */
 234     case 9 : c+=k8[8];                   /* fall through */
 235     case 8 : b+=k[1]; a+=k[0]; break;
 236     case 7 : b+=((uint32_t)k8[6])<<16;   /* fall through */
 237     case 6 : b+=((uint32_t)k8[5])<<8;    /* fall through */
 238     case 5 : b+=k8[4];                   /* fall through */
 239     case 4 : a+=k[0]; break;
 240     case 3 : a+=((uint32_t)k8[2])<<16;   /* fall through */
 241     case 2 : a+=((uint32_t)k8[1])<<8;    /* fall through */
 242     case 1 : a+=k8[0]; break;
 243     case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
 244     }
 245
 246 #endif /* !valgrind */
 247
 248   } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
 249     const uint16_t *k = (const uint16_t *)key;         /* read 16-bit chunks */
 250     const uint8_t  *k8;
 251
 252     /*--------------- all but last block: aligned reads and different mixing */
 253     while (length > 12)
 254     {
 255       a += k[0] + (((uint32_t)k[1])<<16);
 256       b += k[2] + (((uint32_t)k[3])<<16);
 257       c += k[4] + (((uint32_t)k[5])<<16);
 258       mix(a,b,c);
 259       length -= 12;
 260       k += 6;
 261     }
 262
 263     /*----------------------------- handle the last (probably partial) block */
 264     k8 = (const uint8_t *)k;
 265     switch(length)
 266     {
 267     case 12: c+=k[4]+(((uint32_t)k[5])<<16);
 268              b+=k[2]+(((uint32_t)k[3])<<16);
 269              a+=k[0]+(((uint32_t)k[1])<<16);
 270              break;
 271     case 11: c+=((uint32_t)k8[10])<<16;     /* fall through */
 272     case 10: c+=k[4];
 273              b+=k[2]+(((uint32_t)k[3])<<16);
 274              a+=k[0]+(((uint32_t)k[1])<<16);
 275              break;
 276     case 9 : c+=k8[8];                      /* fall through */
 277     case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
 278              a+=k[0]+(((uint32_t)k[1])<<16);
 279              break;
 280     case 7 : b+=((uint32_t)k8[6])<<16;      /* fall through */
 281     case 6 : b+=k[2];
 282              a+=k[0]+(((uint32_t)k[1])<<16);
 283              break;
 284     case 5 : b+=k8[4];                      /* fall through */
 285     case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
 286              break;
 287     case 3 : a+=((uint32_t)k8[2])<<16;      /* fall through */
 288     case 2 : a+=k[0];
 289              break;
 290     case 1 : a+=k8[0];
 291              break;
 292     case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
 293     }
 294
 295   } else {                        /* need to read the key one byte at a time */
 296     const uint8_t *k = (const uint8_t *)key;
 297
 298     /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
 299     while (length > 12)
 300     {
 301       a += k[0];
 302       a += ((uint32_t)k[1])<<8;
 303       a += ((uint32_t)k[2])<<16;
 304       a += ((uint32_t)k[3])<<24;
 305       b += k[4];
 306       b += ((uint32_t)k[5])<<8;
 307       b += ((uint32_t)k[6])<<16;
 308       b += ((uint32_t)k[7])<<24;
 309       c += k[8];
 310       c += ((uint32_t)k[9])<<8;
 311       c += ((uint32_t)k[10])<<16;
 312       c += ((uint32_t)k[11])<<24;
 313       mix(a,b,c);
 314       length -= 12;
 315       k += 12;
 316     }
 317
 318     /*-------------------------------- last block: affect all 32 bits of (c) */
 319     switch(length)                   /* all the case statements fall through */
 320     {
 321     case 12: c+=((uint32_t)k[11])<<24;
 322     case 11: c+=((uint32_t)k[10])<<16;
 323     case 10: c+=((uint32_t)k[9])<<8;
 324     case 9 : c+=k[8];
 325     case 8 : b+=((uint32_t)k[7])<<24;
 326     case 7 : b+=((uint32_t)k[6])<<16;
 327     case 6 : b+=((uint32_t)k[5])<<8;
 328     case 5 : b+=k[4];
 329     case 4 : a+=((uint32_t)k[3])<<24;
 330     case 3 : a+=((uint32_t)k[2])<<16;
 331     case 2 : a+=((uint32_t)k[1])<<8;
 332     case 1 : a+=k[0];
 333              break;
 334     case 0 : *pc=c; *pb=b; return;  /* zero length strings require no mixing */
 335     }
 336   }
 337
 338   final(a,b,c);
 339   *pc=c; *pb=b;
 340 }
 341
 342 static unsigned string_in_list(const char *str, char **list, int length)
 343 {
 344         int i;
 345         for (i=0; i<length; i++) {
 346                 if (strcmp(str, list[i]) == 0) return 1;
 347         }
 348         return 0;
 349 }
 350
 351 static int remove_duplicates(char **list, unsigned count)
 352 {
 353         unsigned int num_bits, hash_mask, num_hashes;
 354         unsigned int i, j;
 355         unsigned char *hash;
 356         unsigned output_count;
 357
 358         /* See http://en.wikipedia.org/wiki/Bloom_filter:
 359          * num_bits = - (num_entries * ln(probability)) / (ln(2)^2).
 360          *
 361          * Assume probability = one in a million.
 362          * ln(p) = -13.8155106.
 363          * ln(2)^2 = 0.480453014
 364          *
 365          * => num_bits = num_entries * 28.7551751
 366          */
 367         num_bits = count * 29;
 368
 369         /* Expand num_bits to the next power of 2. */
 370         for (i = 1; i < num_bits; i *= 2);
 371         num_bits = i;
 372         hash_mask = i - 1;
 373
 374         hash = calloc(num_bits, 32 / CHAR_BIT);
 375         if (!hash) {
 376                 return -1;
 377         }
 378
 379
 380         /* Now, ideal number of hashes = ln(2) * num_bits / num_entries,
 381          * and ln(2) approximates 9/13. */
 382         num_hashes = (9 * num_bits) / (13 * count);
 383         if (num_hashes == 0)
 384                 num_hashes++;
 385         /* We generate two at a time anyway. */
 386         if (num_hashes % 2)
 387                 num_hashes++;
 388
 389         printf("num_hashes=%d\n", num_hashes);
 390
 391         output_count = 0;
 392         for (i = 0; i < count; i++) {
 393                 unsigned int len = strlen(list[i]), bits_set = 0;
 394                 uint32_t h1 = len, h2 = 0;
 395                 for (j = 0; j < num_hashes; j += 2) {
 396                         hashlittle2(list[i], len, &h1, &h2);
 397                         bits_set += set_bit(hash, h1 & hash_mask);
 398                         bits_set += set_bit(hash, h2 & hash_mask);
 399                 }
 400                 if (bits_set == num_hashes) {
 401                         /* might be a collision - check the slow way */
 402                         if (string_in_list(list[i], list, i) != 0) {
 403                                 continue;
 404                         }
 405                 }
 406                 if (output_count != i) {
 407                         list[output_count] = list[i];
 408                 }
 409                 output_count++;
 410         }
 411         free(hash);
 412         return output_count;
 413 }
 414
 415 static int string_cmp(const void *p1, const void *p2)
 416 {
 417         return strcmp(* (char * const *) p1, * (char * const *) p2);
 418 }
 419
 420 static int remove_duplicates_qsort(char **list, unsigned count)
 421 {
 422         int output_count, i;
 423
 424         qsort(list, count, sizeof(char *), string_cmp);
 425
 426         output_count = 1;
 427         for (i = 1; i < count; i++) {
 428                 if (strcmp(list[i-1], list[i]) == 0) {
 429                         continue;
 430                 }
 431                 if (output_count != i) {
 432                         list[output_count] = list[i];
 433                 }
 434                 output_count++;
 435         }
 436         return output_count;
 437 }
 438
 439 char **copy_list(char **list, int count)
 440 {
 441         char **ret = malloc(count*sizeof(char *));
 442         memcpy(ret, list, count*sizeof(char *));
 443         return ret;
 444 }
 445
 446 static char *randstring(int len)
 447 {
 448         const char *clist = "ABCDEF";
 449         char *ret = malloc(len+1);
 450         int i;
 451         for (i=0; i<len; i++) {
 452                 ret[i] = clist[random() % strlen(clist)];
 453         }
 454         ret[i] = 0;
 455         return ret;
 456 }
 457
 458 int main(int argc, char *argv[])
 459 {
 460         int count, nondup, i;
 461         char **orig, **list1, **list2;
 462         int output_count1, output_count2, str_len;
 463         double pctdup;
 464
 465         if (argc < 4) {
 466                 printf("Usage: <count> <pctdup> <strlen>\n");
 467                 exit(1);
 468         }
 469
 470         setlinebuf(stdout);
 471
 472         count = strtoul(argv[1], NULL, 0);
 473         pctdup = strtod(argv[2], NULL);
 474         str_len = strtoul(argv[3], NULL, 0);
 475
 476         orig = malloc(sizeof(char *) * (count+1));
 477         nondup = ((100 - pctdup)/100.0)*count;
 478         if (nondup == 0) nondup = 1;
 479
 480         for (i=0; i<nondup; i++) {
 481                 orig[i] = randstring(str_len);
 482         }
 483         for (; i<count; i++) {
 484                 orig[i] = orig[random() % nondup];
 485         }
 486
 487         list1 = copy_list(orig, count);
 488         start_timer();
 489         output_count1 = remove_duplicates(list1, count);
 490         if (output_count1 == -1) {
 491                 exit(1);
 492         }
 493         printf("Removed %d duplicates from %d strings took %f usec (hashing)\n",
 494                count - output_count1, count, end_timer() * 1.0e6);
 495         qsort(list1, output_count1, sizeof(char *), string_cmp);
 496
 497         list2 = copy_list(orig, count);
 498         start_timer();
 499         output_count2 = remove_duplicates_qsort(list2, count);
 500         if (output_count2 == -1) {
 501                 exit(1);
 502         }
 503         printf("Removed %d duplicates from %d strings took %f usec (qsort)\n",
 504                count - output_count2, count, end_timer() * 1.0e6);
 505
 506         if (output_count1 != output_count2 ||
 507             memcmp(list1, list2, output_count1*sizeof(char *)) != 0) {
 508                 printf("output mismatch!\n");
 509                 exit(1);
 510         }
 511
 512         return 0;
 513 }