1 This patch makes rsync look try to find a basis file for a file that
2 doesn't already have one.
4 Be sure to run "make proto" before "make".
6 --- orig/flist.c 2005-02-13 21:17:16
7 +++ flist.c 2005-02-13 09:49:22
8 @@ -330,7 +330,7 @@ void send_file_entry(struct file_struct
9 char fname[MAXPATHLEN];
17 @@ -975,7 +975,8 @@ void send_file_name(int f, struct file_l
18 struct file_struct *file;
19 char fbuf[MAXPATHLEN];
21 - if (!(file = make_file(fname, flist, ALL_FILTERS)))
22 + file = make_file(fname, flist, f == -2 ? SERVER_FILTERS : ALL_FILTERS);
26 maybe_emit_filelist_progress(flist);
27 @@ -1315,7 +1316,7 @@ struct file_list *recv_file_list(int f)
29 clean_flist(flist, relative_paths, 1);
33 /* Now send the uid/gid list. This was introduced in
34 * protocol version 15 */
35 recv_uid_list(f, flist);
36 @@ -1715,6 +1716,25 @@ static int is_backup_file(char *fn)
37 return k > 0 && strcmp(fn+k, backup_suffix) == 0;
40 +struct file_list *get_dirlist(const char *dirname, int ignore_excludes)
42 + struct file_list *dirlist;
43 + char dirbuf[MAXPATHLEN];
45 + int save_recurse = recurse;
47 + dlen = strlcpy(dirbuf, dirname, MAXPATHLEN);
48 + if (dlen >= MAXPATHLEN)
51 + dirlist = flist_new(WITHOUT_HLINK, "get_dirlist");
53 + send_directory(ignore_excludes ? -2 : -1, dirlist, dirbuf, dlen);
54 + recurse = save_recurse;
60 /* This function is used to implement per-directory deletion, and
61 * is used by all the --delete-WHEN options. Note that the fbuf
62 --- orig/generator.c 2005-02-13 05:50:28
63 +++ generator.c 2005-02-13 21:47:28
64 @@ -47,6 +47,7 @@ extern int size_only;
65 extern OFF_T max_size;
66 extern int io_timeout;
67 extern int protocol_version;
68 +extern int fuzzy_basis;
69 extern int always_checksum;
70 extern char *partial_dir;
71 extern char *basis_dir[];
72 @@ -227,6 +228,59 @@ static void generate_and_send_sums(int f
76 +/* Try to find a filename in the same dir as "fname" with a similar name. */
77 +static int find_fuzzy(struct file_struct *file, struct file_list *dirlist)
79 + int fname_len, fname_suf_len;
80 + const char *fname_suf, *fname = file->basename;
81 + uint32 lowest_dist = 0x7FFFFFFF;
82 + int j, lowest_j = -1;
84 + fname_len = strlen(fname);
85 + fname_suf = find_filename_suffix(fname, fname_len, &fname_suf_len);
87 + for (j = 0; j < dirlist->count; j++) {
88 + struct file_struct *fp = dirlist->files[j];
89 + const char *suf, *name;
93 + if (!S_ISREG(fp->mode) || !fp->length
94 + || fp->flags & FLAG_NO_FUZZY)
97 + name = fp->basename;
99 + if (fp->length == file->length
100 + && fp->modtime == file->modtime) {
103 + "fuzzy size/modtime match for %s\n",
109 + len = strlen(name);
110 + suf = find_filename_suffix(name, len, &suf_len);
112 + dist = fuzzy_distance(name, len, fname, fname_len);
113 + /* Add some extra weight to how well the suffixes match. */
114 + dist += fuzzy_distance(suf, suf_len, fname_suf, fname_suf_len)
117 + rprintf(FINFO, "fuzzy distance for %s = %d.%05d\n",
118 + name, (int)(dist>>16), (int)(dist&0xFFFF));
120 + if (dist <= lowest_dist) {
121 + lowest_dist = dist;
130 /* Acts on flist->file's ndx'th item, whose name is fname. If a directory,
131 * make sure it exists, and has the right permissions/timestamp info. For
132 @@ -241,6 +295,8 @@ static void recv_generator(char *fname,
133 int f_out, int f_out_name)
135 static int missing_below = -1;
136 + static char *fuzzy_dirname = NULL;
137 + static struct file_list *fuzzy_dirlist = NULL;
138 int fd = -1, f_copy = -1;
139 STRUCT_STAT st, partial_st;
140 struct file_struct *back_file = NULL;
141 @@ -275,6 +331,16 @@ static void recv_generator(char *fname,
145 + if (fuzzy_basis && S_ISREG(file->mode)) {
146 + char *dn = file->dirname ? file->dirname : ".";
147 + if (fuzzy_dirname != dn) {
149 + flist_free(fuzzy_dirlist);
150 + fuzzy_dirname = dn;
151 + fuzzy_dirlist = get_dirlist(fuzzy_dirname, 1);
155 statret = link_stat(fname, &st,
156 keep_dirlinks && S_ISDIR(file->mode));
158 @@ -492,6 +558,24 @@ static void recv_generator(char *fname,
162 + if (statret == -1 && fuzzy_basis && dry_run <= 1) {
163 + int j = find_fuzzy(file, fuzzy_dirlist);
165 + struct file_struct *fp = fuzzy_dirlist->files[j];
166 + f_name_to(fp, fnamecmpbuf);
168 + rprintf(FINFO, "fuzzy basis selected for %s: %s\n",
169 + safe_fname(fname), safe_fname(fnamecmpbuf));
171 + st.st_mode = fp->mode;
172 + st.st_size = fp->length;
173 + st.st_mtime = fp->modtime;
175 + fnamecmp = fnamecmpbuf;
176 + fnamecmp_type = FNAMECMP_FUZZY;
181 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
183 @@ -520,6 +604,8 @@ static void recv_generator(char *fname,
185 if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH)
187 + else if (fnamecmp_type == FNAMECMP_FUZZY)
189 else if (unchanged_file(fnamecmp, file, &st)) {
190 if (fnamecmp_type == FNAMECMP_FNAME)
191 set_perms(fname, file, &st, PERMS_REPORT);
192 @@ -540,6 +626,11 @@ prepare_to_open:
196 + if (fuzzy_basis && fnamecmp_type == FNAMECMP_FNAME) {
197 + int j = flist_find(fuzzy_dirlist, file);
198 + if (j >= 0) /* don't use updating file as future fuzzy basis */
199 + fuzzy_dirlist->files[j]->flags |= FLAG_NO_FUZZY;
203 fd = do_open(fnamecmp, O_RDONLY, 0);
204 @@ -594,8 +685,24 @@ notify_others:
205 write_int(f_out, ndx);
206 if (protocol_version >= 29 && inplace && !read_batch)
207 write_byte(f_out, fnamecmp_type);
208 - if (f_out_name >= 0)
209 + if (f_out_name >= 0) {
210 write_byte(f_out_name, fnamecmp_type);
211 + if (fnamecmp_type == FNAMECMP_FUZZY) {
212 + uchar lenbuf[3], *lb = lenbuf;
213 + int len = strlen(fnamecmpbuf);
215 +#if MAXPATHLEN > 0x7FFF
216 + *lb++ = len / 0x10000 + 0x80;
217 + *lb++ = len / 0x100;
219 + *lb++ = len / 0x100 + 0x80;
223 + write_buf(f_out_name, lenbuf, lb - lenbuf + 1);
224 + write_buf(f_out_name, fnamecmpbuf, len);
228 if (dry_run || read_batch)
230 --- orig/main.c 2005-02-07 20:41:56
231 +++ main.c 2005-01-14 18:33:15
232 @@ -44,6 +44,7 @@ extern int keep_dirlinks;
233 extern int preserve_hard_links;
234 extern int protocol_version;
236 +extern int fuzzy_basis;
237 extern int relative_paths;
238 extern int rsync_port;
239 extern int whole_file;
240 @@ -488,7 +489,8 @@ static int do_recv(int f_in,int f_out,st
243 int error_pipe[2], name_pipe[2];
244 - BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run;
245 + BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis)
248 /* The receiving side mustn't obey this, or an existing symlink that
249 * points to an identical file won't be replaced by the referent. */
250 --- orig/options.c 2005-02-13 05:50:28
251 +++ options.c 2005-02-13 21:41:41
252 @@ -89,6 +89,7 @@ int copy_unsafe_links = 0;
254 int daemon_bwlimit = 0;
256 +int fuzzy_basis = 0;
257 size_t bwlimit_writemax = 0;
258 int only_existing = 0;
259 int opt_ignore_existing = 0;
260 @@ -302,6 +303,7 @@ void usage(enum logcode F)
261 rprintf(F," --size-only skip files that match in size\n");
262 rprintf(F," --modify-window=NUM compare mod-times with reduced accuracy\n");
263 rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n");
264 + rprintf(F," -y, --fuzzy find similar file for basis if no dest file\n");
265 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
266 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
267 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
268 @@ -411,6 +413,7 @@ static struct poptOption long_options[]
269 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
270 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
271 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
272 + {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
273 /* TODO: Should this take an optional int giving the compression level? */
274 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
275 {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
276 @@ -1382,6 +1385,9 @@ void server_options(char **args,int *arg
277 if (!implied_dirs && !am_sender)
278 args[ac++] = "--no-implied-dirs";
280 + if (fuzzy_basis && am_sender)
281 + args[ac++] = "--fuzzy";
286 --- orig/receiver.c 2005-02-11 10:53:14
287 +++ receiver.c 2005-01-15 21:21:02
288 @@ -257,6 +257,27 @@ static int receive_data(int f_in, char *
292 +static void read_gen_name(int fd, char *buf)
294 + int len = read_byte(fd);
296 +#if MAXPATHLEN > 32767
298 + read_buf(fd, (char *)lenbuf, 2);
299 + len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1];
301 + len = (len & ~0x80) * 0x100 + read_byte(fd);
304 + if (len >= MAXPATHLEN) {
305 + rprintf(FERROR, "bogus data on generator name pipe\n");
306 + exit_cleanup(RERR_PROTOCOL);
309 + read_sbuf(fd, buf, len);
313 static void discard_receive_data(int f_in, OFF_T length)
315 receive_data(f_in, NULL, -1, 0, NULL, -1, length);
316 @@ -396,6 +417,10 @@ int recv_files(int f_in, struct file_lis
317 case FNAMECMP_BACKUP:
318 fnamecmp = get_backup_name(fname);
320 + case FNAMECMP_FUZZY:
321 + read_gen_name(f_in_name, fnamecmpbuf);
322 + fnamecmp = fnamecmpbuf;
325 if (j >= basis_dir_cnt) {
327 --- orig/rsync.h 2005-02-12 19:54:27
328 +++ rsync.h 2005-02-13 21:19:16
330 #define FLAG_TOP_DIR (1<<0)
331 #define FLAG_HLINK_EOL (1<<1) /* generator only */
332 #define FLAG_MOUNT_POINT (1<<2) /* sender only */
333 +#define FLAG_NO_FUZZY (1<<2) /* generator only */
334 #define FLAG_DEL_HERE (1<<3) /* receiver/generator */
336 /* update this if you make incompatible changes */
338 #define FNAMECMP_FNAME 0x80
339 #define FNAMECMP_PARTIAL_DIR 0x81
340 #define FNAMECMP_BACKUP 0x82
341 +#define FNAMECMP_FUZZY 0x83
343 /* For calling delete_file() */
344 #define DEL_DIR (1<<0)
345 --- orig/rsync.yo 2005-02-13 21:51:10
346 +++ rsync.yo 2005-02-13 21:41:52
347 @@ -351,6 +351,7 @@ to the detailed description below for a
348 --size-only skip files that match in size
349 --modify-window=NUM compare mod-times with reduced accuracy
350 -T, --temp-dir=DIR create temporary files in directory DIR
351 + -y, --fuzzy find similar file for basis if no dest file
352 --compare-dest=DIR also compare received files relative to DIR
353 --copy-dest=DIR ... and include copies of unchanged files
354 --link-dest=DIR hardlink to files in DIR when unchanged
355 @@ -909,6 +910,16 @@ scratch directory when creating temporar
356 transferred on the receiving side. The default behavior is to create
357 the temporary files in the receiving directory.
359 +dit(bf(-y, --fuzzy)) This option tells rsync that it should look for a
360 +basis file for any destination file that is missing. The current algorithm
361 +looks in the same directory as the destination file for either a file that
362 +has an identical size and modified-time, or a similarly-named file. If
363 +found, rsync uses the fuzzy basis file to try to speed up the transfer.
365 +Note that the use of the bf(--delete) option might get rid of any potential
366 +fuzzy-match files, so either use bf(--delete-after) or specify some
367 +filename exclusions if you need to prevent this.
369 dit(bf(--compare-dest=DIR)) This option instructs rsync to use em(DIR) on
370 the destination machine as an additional hierarchy to compare destination
371 files against doing transfers (if the files are missing in the destination
372 --- orig/util.c 2005-02-11 10:53:15
373 +++ util.c 2005-02-13 09:44:25
374 @@ -1224,3 +1224,110 @@ void *_realloc_array(void *ptr, unsigned
375 return malloc(size * num);
376 return realloc(ptr, size * num);
379 +/* Take a filename and filename length and return the most significant
380 + * filename suffix we can find. This ignores suffixes such as "~",
381 + * ".bak", ".orig", ".~1~", etc. */
382 +const char *find_filename_suffix(const char *fn, int fn_len, int *len_ptr)
384 + const char *suf, *s;
388 + /* One or more dots at the start aren't a suffix. */
389 + while (fn_len && *fn == '.') fn++, fn_len--;
391 + /* Ignore the ~ in a "foo~" filename. */
392 + if (fn_len > 1 && fn[fn_len-1] == '~')
393 + fn_len--, had_tilde = True;
397 + /* Assume we don't find an suffix. */
401 + /* Find the last significant suffix. */
402 + for (s = fn + fn_len; fn_len > 1; ) {
403 + while (*--s != '.' && s != fn) {}
406 + s_len = fn_len - (s - fn);
409 + if (strcmp(s+1, "bak") == 0
410 + || strcmp(s+1, "old") == 0)
412 + } else if (s_len == 4) {
413 + if (strcmp(s+1, "orig") == 0)
415 + } else if (s_len > 2 && had_tilde
416 + && s[1] == '~' && isdigit(s[2]))
422 + /* Determine if the suffix is all digits. */
423 + for (s++, s_len--; s_len > 0; s++, s_len--) {
427 + /* An all-digit suffix may not be that signficant. */
434 +/* This is an implementation of the Levenshtein distance algorithm. It
435 + * was implemented to avoid needing a two-dimensional matrix (to save
436 + * memory). It was also tweaked to try to factor in the ASCII distance
437 + * between changed characters as a minor distance quantity. The normal
438 + * Levenshtein units of distance (each signifying a single change between
439 + * the two strings) are defined as a "UNIT". */
441 +#define UNIT (1 << 16)
443 +uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2)
445 + uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc;
449 + if (!len1 || !len2) {
454 + for (i1 = 0, cost = 0; i1 < len1; i1++)
456 + return (int32)len1 * UNIT + cost;
459 + for (i2 = 0; i2 < len2; i2++)
460 + a[i2] = (i2+1) * UNIT;
462 + for (i1 = 0; i1 < len1; i1++) {
464 + above = (i1+1) * UNIT;
465 + for (i2 = 0; i2 < len2; i2++) {
467 + if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) {
469 + cost = UNIT - cost;
471 + cost = UNIT + cost;
473 + diag_inc = diag + cost;
474 + left_inc = left + UNIT + *((uchar*)s1+i1);
475 + above_inc = above + UNIT + *((uchar*)s2+i2);
476 + a[i2] = above = left < above
477 + ? (left_inc < diag_inc ? left_inc : diag_inc)
478 + : (above_inc < diag_inc ? above_inc : diag_inc);