Remove bypassed checksums in --inplace to improve speed.
author Wayne Davison <wayned@samba.org>
Sat, 3 Aug 2013 16:44:13 +0000 (09:44 -0700)
committer Wayne Davison <wayned@samba.org>
Sat, 3 Aug 2013 16:59:38 +0000 (09:59 -0700)
When we encounter a checksum that refers to a part of an --inplace file that
has already been overwritten without SUMFLG_SAME_OFFSET getting set, we now
remove that checksum from the hash chain.  This speeds up the transfer of
files that have a lot of identical checksum blocks (e.g. sequences of zeros)
that we can't use because they never get marked as being at the same offset.
Patch provided by Michael Chapman.
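
For reference, here is a minimal standalone sketch of the chain-pruning
pattern the patch applies in hash_search().  This is not rsync code: the
struct entry, prune_bucket(), and the "usable" field are illustrative
stand-ins for the sum table, the chain walk, and the SUMFLG_SAME_OFFSET
test.  Entries live in an array, each hash bucket holds the index of its
first entry, each entry's chain field holds the next index (-1 terminates),
and a trailing "prev" pointer lets an unusable entry be unlinked with a
single store as the chain is walked.

	#include <stdio.h>

	#define CHAIN_END (-1)

	struct entry {
		long offset;	/* where this block sits in the basis file */
		int  usable;	/* stand-in for the SUMFLG_SAME_OFFSET test */
		int  chain;	/* index of next entry in the bucket, or -1 */
	};

	/* Walk one hash bucket and unlink every entry that lies before the
	 * current offset and is not flagged as usable.  "prev" always points
	 * at the link that led us to entry i, so unlinking is one store. */
	static void prune_bucket(int *bucket_head, struct entry *tab, long cur_offset)
	{
		int *prev = bucket_head;
		int i = *bucket_head;

		while (i != CHAIN_END) {
			int next = tab[i].chain;
			if (tab[i].offset < cur_offset && !tab[i].usable)
				*prev = next;		/* bypass entry i */
			else
				prev = &tab[i].chain;	/* keep it, advance prev */
			i = next;
		}
	}

	int main(void)
	{
		struct entry tab[] = {
			{ 0, 0, 1 },		/* stale, unusable -> pruned */
			{ 8, 1, 2 },		/* stale but usable -> kept  */
			{ 16, 0, 3 },		/* at/after offset -> kept   */
			{ 4, 0, CHAIN_END },	/* stale, unusable -> pruned */
		};
		int head = 0;

		prune_bucket(&head, tab, 16);

		for (int i = head; i != CHAIN_END; i = tab[i].chain)
			printf("kept entry %d (offset %ld)\n", i, tab[i].offset);
		return 0;
	}

The patch uses the same idea, except that "prev" starts out pointing at the
hash_table[] slot and then tracks the previous sum's chain field, so a
bypassed checksum never has to be skipped again on later passes over the
same bucket.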

NEWS
match.c

diff --git a/NEWS b/NEWS
index 040ac2d756776d7e3d1584a6647ea06238d300bd..eec631d373dd9fe957919a591f268a3549267fa7 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -154,6 +154,9 @@ Changes since 3.0.9:
       file for one way to package the resulting files.  (Suggestions for
       how to make this even easier to install & use are welcomed.)
 
+    - Improved the speed of some --inplace updates when there are lots of
+      identical checksum blocks that end up being unusable.
+
     - Added the --outbuf=N|L|B option for choosing the output buffering.
 
     - Repeating the --fuzzy option now causes the code to look for fuzzy matches
diff --git a/match.c b/match.c
index bafab9f35292445c9fb20ccea789013eef375a87..a8bd1f304f44983925c1c8b5b1664b0c7fbf8192 100644 (file)
--- a/match.c
+++ b/match.c
@@ -178,7 +178,8 @@ static void hash_search(int f,struct sum_struct *s,
 
        do {
                int done_csum2 = 0;
-               int32 i;
+               uint32 hash_entry;
+               int32 i, *prev;
 
                if (DEBUG_GTE(DELTASUM, 4)) {
                        rprintf(FINFO, "offset=%s sum=%04x%04x\n",
@@ -186,19 +187,32 @@ static void hash_search(int f,struct sum_struct *s,
                }
 
                if (tablesize == TRADITIONAL_TABLESIZE) {
-                       if ((i = hash_table[SUM2HASH2(s1,s2)]) < 0)
+                       hash_entry = SUM2HASH2(s1,s2);
+                       if ((i = hash_table[hash_entry]) < 0)
                                goto null_hash;
                        sum = (s1 & 0xffff) | (s2 << 16);
                } else {
                        sum = (s1 & 0xffff) | (s2 << 16);
-                       if ((i = hash_table[BIG_SUM2HASH(sum)]) < 0)
+                       hash_entry = BIG_SUM2HASH(sum);
+                       if ((i = hash_table[hash_entry]) < 0)
                                goto null_hash;
                }
+               prev = &hash_table[hash_entry];
 
                hash_hits++;
                do {
                        int32 l;
 
+                       /* When updating in-place, the chunk's offset must be
+                        * either >= our offset or identical data at that offset.
+                        * Remove any bypassed entries that we can never use. */
+                       if (updating_basis_file && s->sums[i].offset < offset
+                           && !(s->sums[i].flags & SUMFLG_SAME_OFFSET)) {
+                               *prev = s->sums[i].chain;
+                               continue;
+                       }
+                       prev = &s->sums[i].chain;
+
                        if (sum != s->sums[i].sum1)
                                continue;
 
@@ -207,12 +221,6 @@ static void hash_search(int f,struct sum_struct *s,
                        if (l != s->sums[i].len)
                                continue;
 
-                       /* in-place: ensure chunk's offset is either >= our
-                        * offset or that the data didn't move. */
-                       if (updating_basis_file && s->sums[i].offset < offset
-                           && !(s->sums[i].flags & SUMFLG_SAME_OFFSET))
-                               continue;
-
                        if (DEBUG_GTE(DELTASUM, 3)) {
                                rprintf(FINFO,
                                        "potential match at %s i=%ld sum=%08x\n",