1 This latest version has most of the TODO-list items solved. The one
2 remaining issue is that we really need to handle all the files in a dir
3 before we move on to the sub-directories, so this patch needs the sorting
4 algorithm to change to put all the subdirs at the end of the list of a
7 Be sure to run "make proto" before "make".
9 --- orig/flist.c 2005-02-12 19:54:27
10 +++ flist.c 2005-02-13 09:49:22
11 @@ -330,7 +330,7 @@ void send_file_entry(struct file_struct
12 char fname[MAXPATHLEN];
20 @@ -975,7 +975,8 @@ void send_file_name(int f, struct file_l
21 struct file_struct *file;
22 char fbuf[MAXPATHLEN];
24 - if (!(file = make_file(fname, flist, ALL_FILTERS)))
25 + file = make_file(fname, flist, f == -2 ? SERVER_FILTERS : ALL_FILTERS);
29 maybe_emit_filelist_progress(flist);
30 @@ -1311,7 +1312,7 @@ struct file_list *recv_file_list(int f)
32 clean_flist(flist, relative_paths, 1);
36 /* Now send the uid/gid list. This was introduced in
37 * protocol version 15 */
38 recv_uid_list(f, flist);
39 @@ -1650,6 +1651,25 @@ static int is_backup_file(char *fn)
40 return k > 0 && strcmp(fn+k, backup_suffix) == 0;
43 +struct file_list *get_dirlist(const char *dirname, int ignore_excludes)
45 + struct file_list *dirlist;
46 + char dirbuf[MAXPATHLEN];
48 + int save_recurse = recurse;
50 + dlen = strlcpy(dirbuf, dirname, MAXPATHLEN);
51 + if (dlen >= MAXPATHLEN)
54 + dirlist = flist_new(WITHOUT_HLINK, "get_dirlist");
56 + send_directory(ignore_excludes ? -2 : -1, dirlist, dirbuf, dlen);
57 + recurse = save_recurse;
63 /* This function is used to implement per-directory deletion, and
64 * is used by all the --delete-WHEN options. Note that the fbuf
65 --- orig/generator.c 2005-02-13 05:50:28
66 +++ generator.c 2005-02-13 10:01:48
67 @@ -47,6 +47,7 @@ extern int size_only;
68 extern OFF_T max_size;
69 extern int io_timeout;
70 extern int protocol_version;
71 +extern int fuzzy_basis;
72 extern int always_checksum;
73 extern char *partial_dir;
74 extern char *basis_dir[];
75 @@ -227,6 +228,47 @@ static void generate_and_send_sums(int f
79 +/* Try to find a filename in the same dir as "fname" with a similar name. */
80 +static int find_fuzzy(struct file_struct *file, struct file_list *dirlist)
82 + int fname_len, fname_suf_len;
83 + const char *fname_suf, *fname = file->basename;
84 + uint32 lowest_dist = 0x7FFFFFFF;
85 + int j, lowest_j = -1;
87 + fname_len = strlen(fname);
88 + fname_suf = find_filename_suffix(fname, fname_len, &fname_suf_len);
90 + for (j = 0; j < dirlist->count; j++) {
91 + struct file_struct *fp = dirlist->files[j];
92 + const char *suf, *name;
96 + if (!S_ISREG(fp->mode) || !fp->length)
99 + name = fp->basename;
100 + len = strlen(name);
101 + suf = find_filename_suffix(name, len, &suf_len);
103 + dist = fuzzy_distance(name, len, fname, fname_len);
104 + /* Add some extra weight to how well the suffixes match. */
105 + dist += fuzzy_distance(suf, suf_len, fname_suf, fname_suf_len)
108 + rprintf(FINFO, "fuzzy distance for %s = %d (%d)\n",
109 + name, (int)(dist>>16), (int)(dist&0xFFFF));
111 + if (dist <= lowest_dist) {
112 + lowest_dist = dist;
121 /* Acts on flist->file's ndx'th item, whose name is fname. If a directory,
122 * make sure it exists, and has the right permissions/timestamp info. For
123 @@ -241,6 +283,8 @@ static void recv_generator(char *fname,
124 int f_out, int f_out_name)
126 static int missing_below = -1;
127 + static char *fuzzy_dirname = NULL;
128 + static struct file_list *fuzzy_dirlist = NULL;
129 int fd = -1, f_copy = -1;
130 STRUCT_STAT st, partial_st;
131 struct file_struct *back_file = NULL;
132 @@ -275,6 +319,16 @@ static void recv_generator(char *fname,
136 + if (fuzzy_basis && S_ISREG(file->mode)) {
137 + char *dn = file->dirname ? file->dirname : ".";
138 + if (fuzzy_dirname != dn) {
140 + flist_free(fuzzy_dirlist);
141 + fuzzy_dirname = dn;
142 + fuzzy_dirlist = get_dirlist(fuzzy_dirname, 1);
146 statret = link_stat(fname, &st,
147 keep_dirlinks && S_ISDIR(file->mode));
149 @@ -492,6 +546,24 @@ static void recv_generator(char *fname,
153 + if (statret == -1 && fuzzy_basis && dry_run <= 1) {
154 + int j = find_fuzzy(file, fuzzy_dirlist);
156 + struct file_struct *fp = fuzzy_dirlist->files[j];
157 + f_name_to(fp, fnamecmpbuf);
159 + rprintf(FINFO, "fuzzy match for %s: %s\n",
160 + safe_fname(fname), safe_fname(fnamecmpbuf));
162 + st.st_mode = fp->mode;
163 + st.st_size = fp->length;
164 + st.st_mtime = fp->modtime;
166 + fnamecmp = fnamecmpbuf;
167 + fnamecmp_type = FNAMECMP_FUZZY;
172 if (preserve_hard_links && hard_link_check(file, HL_SKIP))
174 @@ -520,6 +592,8 @@ static void recv_generator(char *fname,
176 if (!compare_dest && fnamecmp_type <= FNAMECMP_BASIS_DIR_HIGH)
178 + else if (fnamecmp_type == FNAMECMP_FUZZY)
180 else if (unchanged_file(fnamecmp, file, &st)) {
181 if (fnamecmp_type == FNAMECMP_FNAME)
182 set_perms(fname, file, &st, PERMS_REPORT);
183 @@ -540,6 +614,11 @@ prepare_to_open:
187 + if (fuzzy_basis && fnamecmp_type == FNAMECMP_FNAME) {
188 + int j = flist_find(fuzzy_dirlist, file);
189 + if (j >= 0) /* don't use an updating file as fuzzy basis */
190 + fuzzy_dirlist->files[j]->length = 0;
194 fd = do_open(fnamecmp, O_RDONLY, 0);
195 @@ -594,8 +673,24 @@ notify_others:
196 write_int(f_out, ndx);
197 if (protocol_version >= 29 && inplace && !read_batch)
198 write_byte(f_out, fnamecmp_type);
199 - if (f_out_name >= 0)
200 + if (f_out_name >= 0) {
201 write_byte(f_out_name, fnamecmp_type);
202 + if (fnamecmp_type == FNAMECMP_FUZZY) {
203 + uchar lenbuf[3], *lb = lenbuf;
204 + int len = strlen(fnamecmpbuf);
206 +#if MAXPATHLEN > 0x7FFF
207 + *lb++ = len / 0x10000 + 0x80;
208 + *lb++ = len / 0x100;
210 + *lb++ = len / 0x100 + 0x80;
214 + write_buf(f_out_name, lenbuf, lb - lenbuf + 1);
215 + write_buf(f_out_name, fnamecmpbuf, len);
219 if (dry_run || read_batch)
221 --- orig/main.c 2005-02-07 20:41:56
222 +++ main.c 2005-01-14 18:33:15
223 @@ -44,6 +44,7 @@ extern int keep_dirlinks;
224 extern int preserve_hard_links;
225 extern int protocol_version;
227 +extern int fuzzy_basis;
228 extern int relative_paths;
229 extern int rsync_port;
230 extern int whole_file;
231 @@ -488,7 +489,8 @@ static int do_recv(int f_in,int f_out,st
234 int error_pipe[2], name_pipe[2];
235 - BOOL need_name_pipe = (basis_dir[0] || partial_dir) && !dry_run;
236 + BOOL need_name_pipe = (basis_dir[0] || partial_dir || fuzzy_basis)
239 /* The receiving side mustn't obey this, or an existing symlink that
240 * points to an identical file won't be replaced by the referent. */
241 --- orig/options.c 2005-02-13 05:50:28
242 +++ options.c 2005-02-13 06:56:06
243 @@ -89,6 +89,7 @@ int copy_unsafe_links = 0;
245 int daemon_bwlimit = 0;
247 +int fuzzy_basis = 0;
248 size_t bwlimit_writemax = 0;
249 int only_existing = 0;
250 int opt_ignore_existing = 0;
251 @@ -302,6 +303,7 @@ void usage(enum logcode F)
252 rprintf(F," --size-only skip files that match in size\n");
253 rprintf(F," --modify-window=NUM compare mod-times with reduced accuracy\n");
254 rprintf(F," -T, --temp-dir=DIR create temporary files in directory DIR\n");
255 + rprintf(F," --fuzzy find similar file for basis when no dest file\n");
256 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
257 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
258 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
259 @@ -411,6 +413,7 @@ static struct poptOption long_options[]
260 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
261 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
262 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
263 + {"fuzzy", 0, POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
264 /* TODO: Should this take an optional int giving the compression level? */
265 {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
266 {"stats", 0, POPT_ARG_NONE, &do_stats, 0, 0, 0 },
267 @@ -1382,6 +1385,9 @@ void server_options(char **args,int *arg
268 if (!implied_dirs && !am_sender)
269 args[ac++] = "--no-implied-dirs";
271 + if (fuzzy_basis && am_sender)
272 + args[ac++] = "--fuzzy";
277 --- orig/receiver.c 2005-02-11 10:53:14
278 +++ receiver.c 2005-01-15 21:21:02
279 @@ -257,6 +257,27 @@ static int receive_data(int f_in, char *
283 +static void read_gen_name(int fd, char *buf)
285 + int len = read_byte(fd);
287 +#if MAXPATHLEN > 32767
289 + read_buf(fd, (char *)lenbuf, 2);
290 + len = (len & ~0x80) * 0x10000 + lenbuf[0] * 0x100 + lenbuf[1];
292 + len = (len & ~0x80) * 0x100 + read_byte(fd);
295 + if (len >= MAXPATHLEN) {
296 + rprintf(FERROR, "bogus data on generator name pipe\n");
297 + exit_cleanup(RERR_PROTOCOL);
300 + read_sbuf(fd, buf, len);
304 static void discard_receive_data(int f_in, OFF_T length)
306 receive_data(f_in, NULL, -1, 0, NULL, -1, length);
307 @@ -396,6 +417,10 @@ int recv_files(int f_in, struct file_lis
308 case FNAMECMP_BACKUP:
309 fnamecmp = get_backup_name(fname);
311 + case FNAMECMP_FUZZY:
312 + read_gen_name(f_in_name, fnamecmpbuf);
313 + fnamecmp = fnamecmpbuf;
316 if (j >= basis_dir_cnt) {
318 --- orig/rsync.h 2005-02-12 19:54:27
319 +++ rsync.h 2005-01-19 18:36:47
321 #define FNAMECMP_FNAME 0x80
322 #define FNAMECMP_PARTIAL_DIR 0x81
323 #define FNAMECMP_BACKUP 0x82
324 +#define FNAMECMP_FUZZY 0x83
326 /* For calling delete_file() */
327 #define DEL_DIR (1<<0)
328 --- orig/rsync.yo 2005-02-13 05:50:28
329 +++ rsync.yo 2005-02-13 06:56:46
330 @@ -351,6 +351,7 @@ to the detailed description below for a
331 --size-only skip files that match in size
332 --modify-window=NUM compare mod-times with reduced accuracy
333 -T --temp-dir=DIR create temporary files in directory DIR
334 + --fuzzy find similar file for basis when no dest
335 --compare-dest=DIR also compare received files relative to DIR
336 --copy-dest=DIR ... and include copies of unchanged files
337 --link-dest=DIR hardlink to files in DIR when unchanged
338 @@ -909,6 +910,14 @@ scratch directory when creating temporar
339 transferred on the receiving side. The default behavior is to create
340 the temporary files in the receiving directory.
342 +dit(bf(--fuzzy)) This option tells rsync that it should look around for a
343 +basis file for any destination file that is missing. The current algorithm
344 +looks for a similarly-named file in the same directory as the destination
345 +file, and, if found, uses that to try to speed up the transfer. Note that
346 +the use of the --delete option might get rid of any potential fuzzy-match
347 +files, so either use --delete-after or filename exclusions if you need to
350 dit(bf(--compare-dest=DIR)) This option instructs rsync to use em(DIR) on
351 the destination machine as an additional hierarchy to compare destination
352 files against doing transfers (if the files are missing in the destination
353 --- orig/util.c 2005-02-11 10:53:15
354 +++ util.c 2005-02-13 09:44:25
355 @@ -1224,3 +1224,110 @@ void *_realloc_array(void *ptr, unsigned
356 return malloc(size * num);
357 return realloc(ptr, size * num);
360 +/* Take a filename and filename length and return the most significant
361 + * filename suffix we can find. This ignores suffixes such as "~",
362 + * ".bak", ".orig", ".~1~", etc. */
363 +const char *find_filename_suffix(const char *fn, int fn_len, int *len_ptr)
365 + const char *suf, *s;
369 + /* One or more dots at the start aren't a suffix. */
370 + while (fn_len && *fn == '.') fn++, fn_len--;
372 + /* Ignore the ~ in a "foo~" filename. */
373 + if (fn_len > 1 && fn[fn_len-1] == '~')
374 + fn_len--, had_tilde = True;
378 + /* Assume we don't find an suffix. */
382 + /* Find the last significant suffix. */
383 + for (s = fn + fn_len; fn_len > 1; ) {
384 + while (*--s != '.' && s != fn) {}
387 + s_len = fn_len - (s - fn);
390 + if (strcmp(s+1, "bak") == 0
391 + || strcmp(s+1, "old") == 0)
393 + } else if (s_len == 4) {
394 + if (strcmp(s+1, "orig") == 0)
396 + } else if (s_len > 2 && had_tilde
397 + && s[1] == '~' && isdigit(s[2]))
403 + /* Determine if the suffix is all digits. */
404 + for (s++, s_len--; s_len > 0; s++, s_len--) {
408 + /* An all-digit suffix may not be that signficant. */
415 +/* This is an implementation of the Levenshtein distance algorithm. It
416 + * was implemented to avoid needing a two-dimensional matrix (to save
417 + * memory). It was also tweaked to try to factor in the ASCII distance
418 + * between changed characters as a minor distance quantity. The normal
419 + * Levenshtein units of distance (each signifying a single change between
420 + * the two strings) are defined as a "UNIT". */
422 +#define UNIT (1 << 16)
424 +uint32 fuzzy_distance(const char *s1, int len1, const char *s2, int len2)
426 + uint32 a[MAXPATHLEN], diag, above, left, diag_inc, above_inc, left_inc;
430 + if (!len1 || !len2) {
435 + for (i1 = 0, cost = 0; i1 < len1; i1++)
437 + return (int32)len1 * UNIT + cost;
440 + for (i2 = 0; i2 < len2; i2++)
441 + a[i2] = (i2+1) * UNIT;
443 + for (i1 = 0; i1 < len1; i1++) {
445 + above = (i1+1) * UNIT;
446 + for (i2 = 0; i2 < len2; i2++) {
448 + if ((cost = *((uchar*)s1+i1) - *((uchar*)s2+i2)) != 0) {
450 + cost = UNIT - cost;
452 + cost = UNIT + cost;
454 + diag_inc = diag + cost;
455 + left_inc = left + UNIT + *((uchar*)s1+i1);
456 + above_inc = above + UNIT + *((uchar*)s2+i2);
457 + a[i2] = above = left < above
458 + ? (left_inc < diag_inc ? left_inc : diag_inc)
459 + : (above_inc < diag_inc ? above_inc : diag_inc);