Preparing for release of 3.0.9

[rsync.git] / match.c
diff --git a/match.c b/match.c

index 01e91173e51e7f766a4a290d32c330ed0af4aee6..154099d121eb344c32596d19a9185d3d9b566736 100644 (file)
--- a/match.c
+++ b/match.c
@@ -3,7 +3,7 @@
   *
   * Copyright (C) 1996 Andrew Tridgell
   * Copyright (C) 1996 Paul Mackerras
- * Copyright (C) 2003-2007 Wayne Davison
+ * Copyright (C) 2003-2009 Wayne Davison
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -39,48 +39,58 @@ static int total_matches;
  
  extern struct stats stats;
  
-#define TABLESIZE (1<<16)
+#define TRADITIONAL_TABLESIZE (1<<16)
  
+static uint32 tablesize;
  static int32 *hash_table;
  
  #define SUM2HASH2(s1,s2) (((s1) + (s2)) & 0xFFFF)
  #define SUM2HASH(sum) SUM2HASH2((sum)&0xFFFF,(sum)>>16)
  
-static int32 build_hash_table(struct sum_struct *s, int32 start)
-{
-       int32 i, end = s->count;
+#define BIG_SUM2HASH(sum) ((sum)%tablesize)
  
-       if (!hash_table) {
-               hash_table = new_array(int32, TABLESIZE);
+static void build_hash_table(struct sum_struct *s)
+{
+       static uint32 alloc_size;
+       int32 i;
+
+       /* Dynamically calculate the hash table size so that the hash load
+        * for big files is about 80%.  A number greater than the traditional
+        * size must be odd or s2 will not be able to span the entire set. */
+       tablesize = (uint32)(s->count/8) * 10 + 11;
+       if (tablesize < TRADITIONAL_TABLESIZE)
+               tablesize = TRADITIONAL_TABLESIZE;
+       if (tablesize > alloc_size || tablesize < alloc_size - 16*1024) {
+               if (hash_table)
+                       free(hash_table);
+               hash_table = new_array(int32, tablesize);
                 if (!hash_table)
                         out_of_memory("build_hash_table");
+               alloc_size = tablesize;
         }
  
-       memset(hash_table, 0xFF, TABLESIZE * sizeof hash_table[0]);
-
-       if (end - start > TABLESIZE*8/10)
-               end = start + TABLESIZE*8/10;
-
-       for (i = start; i < end; i++) {
-               uint32 t = SUM2HASH(s->sums[i].sum1);
-               s->sums[i].chain = hash_table[t];
-               hash_table[t] = i;
-       }
+       memset(hash_table, 0xFF, tablesize * sizeof hash_table[0]);
  
-       if (verbose > 2) {
-               rprintf(FINFO, "built hash table for entries %ld - %ld\n",
-                       (long)start, (long)end - 1);
+       if (tablesize == TRADITIONAL_TABLESIZE) {
+               for (i = 0; i < s->count; i++) {
+                       uint32 t = SUM2HASH(s->sums[i].sum1);
+                       s->sums[i].chain = hash_table[t];
+                       hash_table[t] = i;
+               }
+       } else {
+               for (i = 0; i < s->count; i++) {
+                       uint32 t = BIG_SUM2HASH(s->sums[i].sum1);
+                       s->sums[i].chain = hash_table[t];
+                       hash_table[t] = i;
+               }
         }
-
-       return end;
  }
  
  
  static OFF_T last_match;
  
  
-/**
- * Transmit a literal and/or match token.
+/* Transmit a literal and/or match token.
   *
   * This delightfully-named function is called either when we find a
   * match and need to transmit all the unmatched data leading up to it,
@@ -88,9 +98,9 @@ static OFF_T last_match;
   * transmit it.  As a result of this second case, it is called even if
   * we have not matched at all!
   *
- * @param i If >0, the number of a matched token.  If 0, indicates we
- * have only literal data.
- **/
+ * If i >= 0, it is the number of a matched token.  If i < 0, we have
+ * only literal data: -1 sends a 0-token-int too, while -2 sends only
+ * the literal data, w/o any token-int. */
  static void matched(int f, struct sum_struct *s, struct map_struct *buf,
                     OFF_T offset, int32 i)
  {
@@ -130,8 +140,8 @@ static void matched(int f, struct sum_struct *s, struct map_struct *buf,
  static void hash_search(int f,struct sum_struct *s,
                         struct map_struct *buf, OFF_T len)
  {
-       OFF_T offset, end, reset = 0;
-       int32 k, want_i, backup, sum_pos = 0;
+       OFF_T offset, aligned_offset, end;
+       int32 k, want_i, aligned_i, backup;
         char sum2[SUM_LENGTH];
         uint32 s1, s2, sum;
         int more;
@@ -156,7 +166,7 @@ static void hash_search(int f,struct sum_struct *s,
         if (verbose > 3)
                 rprintf(FINFO, "sum=%.8x k=%ld\n", sum, (long)k);
  
-       offset = 0;
+       offset = aligned_offset = aligned_i = 0;
  
         end = len + 1 - s->sums[s->count-1].len;
  
@@ -169,21 +179,21 @@ static void hash_search(int f,struct sum_struct *s,
                 int done_csum2 = 0;
                 int32 i;
  
-               if (offset >= reset) {
-                       sum_pos = build_hash_table(s, sum_pos);
-                       reset = sum_pos * s->blength;
-               }
-
                 if (verbose > 4) {
                         rprintf(FINFO, "offset=%.0f sum=%04x%04x\n",
                                 (double)offset, s2 & 0xFFFF, s1 & 0xFFFF);
                 }
  
-               i = hash_table[SUM2HASH2(s1,s2)];
-               if (i < 0)
-                       goto null_hash;
+               if (tablesize == TRADITIONAL_TABLESIZE) {
+                       if ((i = hash_table[SUM2HASH2(s1,s2)]) < 0)
+                               goto null_hash;
+                       sum = (s1 & 0xffff) | (s2 << 16);
+               } else {
+                       sum = (s1 & 0xffff) | (s2 << 16);
+                       if ((i = hash_table[BIG_SUM2HASH(sum)]) < 0)
+                               goto null_hash;
+               }
  
-               sum = (s1 & 0xffff) | (s2 << 16);
                 hash_hits++;
                 do {
                         int32 l;
@@ -221,27 +231,28 @@ static void hash_search(int f,struct sum_struct *s,
  
                         /* When updating in-place, the best possible match is
                          * one with an identical offset, so we prefer that over
-                        * the following want_i optimization. */
+                        * the adjacent want_i optimization. */
                         if (updating_basis_file) {
-                               int32 i2;
-                               for (i2 = i; i2 >= 0; i2 = s->sums[i2].chain) {
-                                       if (s->sums[i2].offset != offset)
-                                               continue;
-                                       if (i2 != i) {
-                                               if (sum != s->sums[i2].sum1)
-                                                       break;
-                                               if (memcmp(sum2, s->sums[i2].sum2,
-                                                          s->s2length) != 0)
-                                                       break;
-                                               i = i2;
+                               /* All the generator's chunks start at blength boundaries. */
+                               while (aligned_offset < offset) {
+                                       aligned_offset += s->blength;
+                                       aligned_i++;
+                               }
+                               if (offset == aligned_offset && aligned_i < s->count) {
+                                       if (i != aligned_i) {
+                                               if (sum != s->sums[aligned_i].sum1
+                                                || l != s->sums[aligned_i].len
+                                                || memcmp(sum2, s->sums[aligned_i].sum2, s->s2length) != 0)
+                                                       goto check_want_i;
+                                               i = aligned_i;
                                         }
-                                       /* This chunk was at the same offset on
-                                        * both the sender and the receiver. */
+                                       /* This identical chunk is in the same spot in the old and new file. */
                                         s->sums[i].flags |= SUMFLG_SAME_OFFSET;
-                                       goto set_want_i;
+                                       want_i = i;
                                 }
                         }
  
+                 check_want_i:
                         /* we've found a match, but now check to see
                          * if want_i can hint at a better match. */
                         if (i != want_i && want_i < s->count
@@ -253,7 +264,6 @@ static void hash_search(int f,struct sum_struct *s,
                                  * will be happy */
                                 i = want_i;
                         }
-                   set_want_i:
                         want_i = i + 1;
  
                         matched(f,s,buf,offset,i);
@@ -351,6 +361,11 @@ void match_sums(int f, struct sum_struct *s, struct map_struct *buf, OFF_T len)
         }
  
         if (len > 0 && s->count > 0) {
+               build_hash_table(s);
+
+               if (verbose > 2)
+                       rprintf(FINFO,"built hash table\n");
+
                 hash_search(f, s, buf, len);
  
                 if (verbose > 2)