Updated to apply cleanly.
[rsync-patches.git] / link-by-hash.diff
1 After applying this patch, run these commands for a successful build:
2
3     ./prepare-source
4     ./configure                      (optional if already run)
5     make
6
7 Jason M. Felice writes:
8
9 This patch adds the --link-by-hash=DIR option, which hard links received
10 files in a link farm arranged by MD4 file hash.  The result is that the system
11 will only store one copy of the unique contents of each file, regardless of
12 the file's name.
13
14
15 --- old/Makefile.in
16 +++ new/Makefile.in
17 @@ -34,7 +34,7 @@ OBJS1=rsync.o generator.o receiver.o cle
18         main.o checksum.o match.o syscall.o log.o backup.o
19  OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
20         fileio.o batch.o clientname.o chmod.o
21 -OBJS3=progress.o pipe.o
22 +OBJS3=progress.o pipe.o hashlink.o
23  DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
24  popt_OBJS=popt/findme.o  popt/popt.o  popt/poptconfig.o \
25         popt/popthelp.o popt/poptparse.o
26 --- old/hashlink.c
27 +++ new/hashlink.c
28 @@ -0,0 +1,340 @@
29 +/*
30 +   Copyright (C) Cronosys, LLC 2004
31 +
32 +   This program is free software; you can redistribute it and/or modify
33 +   it under the terms of the GNU General Public License as published by
34 +   the Free Software Foundation; either version 2 of the License, or
35 +   (at your option) any later version.
36 +
37 +   This program is distributed in the hope that it will be useful,
38 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
39 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
40 +   GNU General Public License for more details.
41 +
42 +   You should have received a copy of the GNU General Public License
43 +   along with this program; if not, write to the Free Software
44 +   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
45 +*/
46 +
47 +/* This file contains code used by the --link-by-hash option. */
48 +
49 +#include "rsync.h"
50 +
51 +extern char *link_by_hash_dir;
52 +
53 +#if HAVE_LINK
54 +
55 +char* make_hash_name(struct file_struct *file)
56 +{
57 +       /* Build the farm path "DIR/xxxxxxxx/xxxxxxxxxxxxxxxxxxxxxxxx" for the
58 +        * 16-byte sum in file->u.sum: the first 4 bytes (8 hex digits) name
59 +        * the subdirectory, the remaining 12 bytes (24 hex digits) the entry.
60 +        * DIR is link_by_hash_dir.  Returns a malloc'd string the caller
61 +        * must free, or NULL if the allocation fails. */
62 +       static const char hexdigits[] = "0123456789abcdef";
63 +       /* 8 + 1 ('/') + 24 + 1 (NUL) = 34 bytes; a 33-byte buffer let the
64 +        * terminator be written one past the end. */
65 +       char hash[34], *dst = hash;
66 +       unsigned char *src = (unsigned char*)file->u.sum;
67 +       int i;
68 +
69 +       for (i = 0; i < 16; i++, src++) {
70 +               if (i == 4)
71 +                       *dst++ = '/';
72 +               *dst++ = hexdigits[*src >> 4];
73 +               *dst++ = hexdigits[*src & 0x0f];
74 +       }
75 +       *dst = 0;
76 +       /* asprintf() leaves dst unspecified on failure; report NULL. */
77 +       if (asprintf(&dst,"%s/%s",link_by_hash_dir,hash) < 0)
78 +               return NULL;
79 +       return dst;
80 +}
81 +
82 +
83 +void kill_hashfile(struct hashfile_struct *hashfile)
84 +{
85 +       if (hashfile != NULL) { /* a NULL node is accepted and ignored */
86 +               free(hashfile->name);
87 +               close(hashfile->fd);
88 +               free(hashfile);
89 +       }
90 +}
91 +
92 +
93 +void kill_hashfiles(struct hashfile_struct *hashfiles)
94 +{
95 +       struct hashfile_struct *node = hashfiles, *following;
96 +       /* The list is circular: release nodes until we are back at the head. */
97 +       if (node == NULL)
98 +               return;
99 +       do {
100 +               following = node->next;
101 +               kill_hashfile(node);
102 +       } while ((node = following) != hashfiles);
103 +}
104 +
105 +
106 +struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
107 +{
108 +       /* Collect the entries of directory hashname whose size equals size,
109 +        * open each one, and return them as a circular list (NULL if none
110 +        * or if the directory cannot be opened).  *fnbr is set to the largest
111 +        * numeric entry name seen.  hashname is never freed here. */
112 +       DIR *d;
113 +       struct dirent *di;
114 +       struct hashfile_struct *hashfiles = NULL, *hashfile;
115 +       STRUCT_STAT st;
116 +       long this_fnbr;
117 +
118 +       *fnbr = 0;
119 +       if ((d = opendir(hashname)) == NULL) {
120 +               rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
121 +               return NULL;
122 +       }
123 +       while ((di = readdir(d)) != NULL) {
124 +               if (!strcmp(di->d_name,".") || !strcmp(di->d_name,".."))
125 +                       continue;
126 +               /* Track the largest fnbr in case a new file must be stored. */
127 +               this_fnbr = atol(di->d_name);
128 +               if (this_fnbr > *fnbr)
129 +                       *fnbr = this_fnbr;
130 +               if ((hashfile = new_array(struct hashfile_struct, 1)) == NULL)
131 +                       break;
132 +               /* Not opened yet: keep kill_hashfile() from closing garbage. */
133 +               hashfile->fd = -1;
134 +               if (asprintf(&hashfile->name,"%s/%s",hashname,di->d_name) < 0) {
135 +                       free(hashfile);
136 +                       break;
137 +               }
138 +               if (do_stat(hashfile->name,&st) == -1) {
139 +                       rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
140 +                       kill_hashfile(hashfile);
141 +                       continue;
142 +               }
143 +               if (st.st_size != size) {
144 +                       kill_hashfile(hashfile);
145 +                       continue;
146 +               }
147 +               hashfile->nlink = st.st_nlink;
148 +               hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
149 +               if (hashfile->fd == -1) {
150 +                       rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
151 +                       kill_hashfile(hashfile);
152 +                       continue;
153 +               }
154 +               if (hashfiles == NULL)
155 +                       hashfiles = hashfile->next = hashfile->prev = hashfile;
156 +               else {
157 +                       hashfile->next = hashfiles;
158 +                       hashfile->prev = hashfiles->prev;
159 +                       hashfile->next->prev = hashfile;
160 +                       hashfile->prev->next = hashfile;
161 +               }
162 +       }
163 +       closedir(d);
164 +       return hashfiles;
165 +}
166 +
167 +
168 +struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
169 +{
170 +       /* Compare the contents of fd against every candidate in the circular
171 +        * list `files`, reading all of them in lock-step.  Candidates that
172 +        * differ are unlinked and freed.  Returns the surviving candidate
173 +        * with the lowest link count (still open; the caller releases it
174 +        * with kill_hashfile()), or NULL if none matches or fd cannot be
175 +        * read.  Every other node of `files` is freed before returning. */
176 +       int amt, hamt, remaining, todo;
177 +       char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
178 +       struct hashfile_struct *iter, *next, *best;
179 +       uint32 nlink;
180 +
181 +       if (!files)
182 +               return NULL;
183 +
184 +       /* Count the candidates so each pass is bounded by a counter rather
185 +        * than by comparing against a head node that may have been freed. */
186 +       remaining = 1;
187 +       for (iter = files->next; iter != files; iter = iter->next)
188 +               remaining++;
189 +
190 +       while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
191 +               /* Every candidate alive at the start of the pass must read
192 +                * and compare this chunk, the list tail included; skipping
193 +                * one would let it survive without ever being checked. */
194 +               for (iter = files, todo = remaining; todo > 0; todo--) {
195 +                       next = iter->next;
196 +                       hamt = read(iter->fd, cmpbuffer, BUFSIZ);
197 +                       if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
198 +                               /* Mismatch, or a short or failed read. */
199 +                               if (--remaining == 0) {
200 +                                       /* There are no matches. */
201 +                                       kill_hashfile(iter);
202 +                                       return NULL;
203 +                               }
204 +                               /* Unlink before freeing; move the head on. */
205 +                               iter->next->prev = iter->prev;
206 +                               iter->prev->next = iter->next;
207 +                               if (iter == files)
208 +                                       files = next;
209 +                               kill_hashfile(iter);
210 +                       }
211 +                       iter = next;
212 +               }
213 +       }
214 +
215 +       if (amt == -1) {
216 +               rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
217 +               kill_hashfiles(files);
218 +               return NULL;
219 +       }
220 +
221 +       /* If we only have one file left, use it. */
222 +       if (files == files->next) {
223 +               return files;
224 +       }
225 +
226 +       /* All files which remain in the list are identical and should have
227 +        * the same size.  We pick the one with the lowest link count (we
228 +        * may have rolled over because we hit the maximum link count for
229 +        * the filesystem). */
230 +       best = iter = files;
231 +       nlink = iter->nlink;
232 +       do {
233 +               if (iter->nlink < nlink) {
234 +                       nlink = iter->nlink;
235 +                       best = iter;
236 +               }
237 +               iter = iter->next;
238 +       } while (iter != files);
239 +
240 +       /* Detach the winner so kill_hashfiles() frees only the others. */
241 +       best->next->prev = best->prev;
242 +       best->prev->next = best->next;
243 +       if (files == best)
244 +               files = files->next;
245 +       kill_hashfiles(files);
246 +       return best;
247 +}
248 +
249 +
250 +int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
251 +{
252 +       /* Put the received temp file fnametmp in place as fname and tie it into
253 +        * the hash farm: link fname to an identical farm entry if there is one,
254 +        * otherwise add fname to the farm as a new entry.  Returns the result
255 +        * of the last rename/link call, or -1 if fnametmp cannot be read. */
256 +       STRUCT_STAT st;
257 +       char *hashname, *linkname;
258 +       int first = 0, rc;
259 +       long last_fnbr;
260 +
261 +       /* Checked first so the early return no longer leaks hashname. */
262 +       if (file->length == 0)
263 +               return robust_rename(fnametmp,fname,NULL,0644);
264 +       hashname = make_hash_name(file);
265 +
266 +       if (do_stat(hashname, &st) == -1) {
267 +               char *dirname;
268 +               /* Directory does not exist. */
269 +               dirname = strdup(hashname);
270 +               *strrchr(dirname,'/') = 0;
271 +               if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
272 +                       rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
273 +                       free(hashname);
274 +                       free(dirname);
275 +                       return robust_rename(fnametmp,fname,NULL,0644);
276 +               }
277 +               free(dirname);
278 +
279 +               if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
280 +                       rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
281 +                       free(hashname);
282 +                       return robust_rename(fnametmp,fname,NULL,0644);
283 +               }
284 +
285 +               first = 1;
286 +               asprintf(&linkname,"%s/0",hashname);
287 +               rprintf(FINFO, "(1) linkname = %s\n", linkname);
288 +       } else {
289 +               struct hashfile_struct *hashfiles, *hashfile;
290 +               if (do_stat(fnametmp,&st) == -1) {
291 +                       rsyserr(FERROR, errno, "stat failed: %s", fnametmp);
292 +                       free(hashname);
293 +                       return -1;
294 +               }
295 +               hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
296 +
297 +               if (hashfiles == NULL) {
298 +                       /* "0" may be taken by an entry of another size. */
299 +                       first = 1;
300 +                       asprintf(&linkname,"%s/%ld",hashname,last_fnbr + 1);
301 +                       rprintf(FINFO, "(2) linkname = %s\n", linkname);
302 +               } else {
303 +                       int fd;
304 +                       /* Search for one identical to us. */
305 +                       if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
306 +                               rsyserr(FERROR, errno, "open failed: %s", fnametmp);
307 +                               kill_hashfiles(hashfiles);
308 +                               free(hashname);
309 +                               return -1;
310 +                       }
311 +                       hashfile = compare_hashfiles(fd, hashfiles);
312 +                       hashfiles = NULL;
313 +                       close(fd);
314 +
315 +                       if (hashfile) {
316 +                               first = 0;
317 +                               linkname = strdup(hashfile->name);
318 +                               rprintf(FINFO, "(3) linkname = %s\n", linkname);
319 +                               kill_hashfile(hashfile);
320 +                       } else {
321 +                               first = 1;
322 +                               asprintf(&linkname,"%s/%ld",hashname,last_fnbr + 1);
323 +                               rprintf(FINFO, "(4) linkname = %s\n", linkname);
324 +                       }
325 +               }
326 +       }
327 +
328 +       if (!first) {
329 +               rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
330 +                               linkname, full_fname(fname));
331 +               robust_unlink(fname);
332 +               rc = do_link(linkname, fname);
333 +               if (rc == -1) {
334 +                       if (errno == EMLINK) {
335 +                               first = 1;
336 +                               free(linkname);
337 +                               asprintf(&linkname,"%s/%ld",hashname,last_fnbr + 1);
338 +                               rprintf(FINFO, "(5) linkname = %s\n", linkname);
339 +                               rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
340 +                       } else {
341 +                               rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
342 +                                       linkname, full_fname(fname));
343 +                               rc = robust_rename(fnametmp,fname,NULL,0644);
344 +                       }
345 +               } else {
346 +                       do_unlink(fnametmp);
347 +               }
348 +       }
349 +
350 +       if (first) {
351 +               rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
352 +                               full_fname(fname),linkname);
353 +               rc = robust_rename(fnametmp,fname,NULL,0644);
354 +               if (rc != 0) {
355 +                       rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
356 +                               full_fname(fnametmp), full_fname(fname));
357 +               } else if ((rc = do_link(fname,linkname)) != 0) {
358 +                       rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
359 +                               full_fname(fname), linkname);
360 +               }
361 +       }
362 +
363 +       free(linkname);
364 +       free(hashname);
365 +       return rc;
366 +}
367 +
368 +#endif
369 --- old/options.c
370 +++ new/options.c
371 @@ -145,6 +145,7 @@ char *backup_suffix = NULL;
372  char *tmpdir = NULL;
373  char *partial_dir = NULL;
374  char *basis_dir[MAX_BASIS_DIRS+1];
375 +char *link_by_hash_dir = NULL;
376  char *config_file = NULL;
377  char *shell_cmd = NULL;
378  char *log_format = NULL;
379 @@ -338,6 +339,7 @@ void usage(enum logcode F)
380    rprintf(F,"     --compare-dest=DIR      also compare destination files relative to DIR\n");
381    rprintf(F,"     --copy-dest=DIR         ... and include copies of unchanged files\n");
382    rprintf(F,"     --link-dest=DIR         hardlink to files in DIR when unchanged\n");
383 +  rprintf(F,"     --link-by-hash=DIR      create hardlinks by hash into DIR\n");
384    rprintf(F," -z, --compress              compress file data during the transfer\n");
385    rprintf(F,"     --compress-level=NUM    explicitly set compression level\n");
386    rprintf(F," -C, --cvs-exclude           auto-ignore files the same way CVS does\n");
387 @@ -385,7 +387,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OP
388        OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP,
389        OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
390        OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
391 -      OPT_NO_D,
392 +      OPT_NO_D, OPT_LINK_BY_HASH,
393        OPT_SERVER, OPT_REFUSED_BASE = 9000};
394  
395  static struct poptOption long_options[] = {
396 @@ -480,6 +482,7 @@ static struct poptOption long_options[] 
397    {"compare-dest",     0,  POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
398    {"copy-dest",        0,  POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
399    {"link-dest",        0,  POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
400 +  {"link-by-hash",     0,  POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
401    {"fuzzy",           'y', POPT_ARG_NONE,   &fuzzy_basis, 0, 0, 0 },
402    {"compress",        'z', POPT_ARG_NONE,   0, 'z', 0, 0 },
403    {"compress-level",   0,  POPT_ARG_INT,    &def_compress_level, 'z', 0, 0 },
404 @@ -1060,6 +1063,21 @@ int parse_arguments(int *argc, const cha
405                         usage(FINFO);
406                         exit_cleanup(0);
407  
408 +                case OPT_LINK_BY_HASH:
409 +#if HAVE_LINK
410 +                       arg = poptGetOptArg(pc);
411 +                       if (sanitize_paths)
412 +                               arg = sanitize_path(NULL, arg, NULL, 0);
413 +                       link_by_hash_dir = (char *)arg;
414 +                       break;
415 +#else
416 +                       snprintf(err_buf, sizeof err_buf,
417 +                                "hard links are not supported on this %s\n",
418 +                                am_server ? "server" : "client");
419 +                       rprintf(FERROR, "ERROR: %s", err_buf);
420 +                       return 0;
421 +#endif
422 +
423                 default:
424                         /* A large opt value means that set_refuse_options()
425                          * turned this option off. */
426 @@ -1708,6 +1726,11 @@ void server_options(char **args,int *arg
427                 }
428         }
429  
430 +       if (link_by_hash_dir && am_sender) {
431 +               args[ac++] = "--link-by-hash";
432 +               args[ac++] = link_by_hash_dir;
433 +       }
434 +
435         if (files_from && (!am_sender || filesfrom_host)) {
436                 if (filesfrom_host) {
437                         args[ac++] = "--files-from";
438 --- old/receiver.c
439 +++ new/receiver.c
440 @@ -54,6 +54,7 @@ extern int delay_updates;
441  extern struct stats stats;
442  extern char *log_format;
443  extern char *tmpdir;
444 +extern char *link_by_hash_dir;
445  extern char *partial_dir;
446  extern char *basis_dir[];
447  extern struct file_list *the_file_list;
448 @@ -125,12 +126,13 @@ static int get_tmpname(char *fnametmp, c
449  
450  
451  static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
452 -                       char *fname, int fd, OFF_T total_size)
453 +                       char *fname, int fd, OFF_T total_size, char *md4)
454  {
455         static char file_sum1[MD4_SUM_LENGTH];
456         static char file_sum2[MD4_SUM_LENGTH];
457         struct map_struct *mapbuf;
458         struct sum_struct sum;
459 +       struct mdfour mdfour_data;
460         int32 len;
461         OFF_T offset = 0;
462         OFF_T offset2;
463 @@ -150,6 +152,9 @@ static int receive_data(int f_in, char *
464         } else
465                 mapbuf = NULL;
466  
467 +       if (md4)
468 +               mdfour_begin(&mdfour_data);
469 +
470         sum_init(checksum_seed);
471  
472         if (append_mode) {
473 @@ -192,6 +197,8 @@ static int receive_data(int f_in, char *
474                         cleanup_got_literal = 1;
475  
476                         sum_update(data, i);
477 +                       if (md4)
478 +                               mdfour_update(&mdfour_data,data,i);
479  
480                         if (fd != -1 && write_file(fd,data,i) != i)
481                                 goto report_write_error;
482 @@ -218,6 +225,8 @@ static int receive_data(int f_in, char *
483  
484                         see_token(map, len);
485                         sum_update(map, len);
486 +                       if (md4)
487 +                               mdfour_update(&mdfour_data,map,len);
488                 }
489  
490                 if (inplace) {
491 @@ -258,6 +267,8 @@ static int receive_data(int f_in, char *
492         }
493  
494         sum_end(file_sum1);
495 +       if (md4)
496 +               mdfour_result(&mdfour_data, (unsigned char*)md4);
497  
498         if (mapbuf)
499                 unmap_file(mapbuf);
500 @@ -273,7 +284,7 @@ static int receive_data(int f_in, char *
501  
502  static void discard_receive_data(int f_in, OFF_T length)
503  {
504 -       receive_data(f_in, NULL, -1, 0, NULL, -1, length);
505 +       receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
506  }
507  
508  static void handle_delayed_updates(struct file_list *flist, char *local_name)
509 @@ -605,8 +616,12 @@ int recv_files(int f_in, struct file_lis
510                         rprintf(FINFO, "%s\n", fname);
511  
512                 /* recv file data */
513 +#if HAVE_LINK
514 +               if (link_by_hash_dir)
515 +                       file->u.sum = new_array(char, MD4_SUM_LENGTH);
516 +#endif
517                 recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
518 -                                      fname, fd2, file->length);
519 +                                      fname, fd2, file->length, file->u.sum);
520  
521                 if (!log_before_transfer)
522                         log_item(file, &initial_stats, iflags, NULL);
523 --- old/rsync.c
524 +++ new/rsync.c
525 @@ -50,6 +50,7 @@ extern int inplace;
526  extern int keep_dirlinks;
527  extern int make_backups;
528  extern struct stats stats;
529 +extern char *link_by_hash_dir;
530  
531  #if defined HAVE_ICONV_OPEN && defined HAVE_ICONV_H
532  iconv_t ic_chck = (iconv_t)-1;
533 @@ -266,8 +267,15 @@ void finish_transfer(char *fname, char *
534         /* move tmp file over real file */
535         if (verbose > 2)
536                 rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname);
537 -       ret = robust_rename(fnametmp, fname, partialptr,
538 -                           file->mode & INITACCESSPERMS);
539 +#if HAVE_LINK
540 +       if (link_by_hash_dir)
541 +               ret = link_by_hash(fnametmp, fname, file);
542 +       else
543 +#endif
544 +       {
545 +               ret = robust_rename(fnametmp, fname, partialptr,
546 +                                   file->mode & INITACCESSPERMS);
547 +       }
548         if (ret < 0) {
549                 rsyserr(FERROR, errno, "%s %s -> \"%s\"",
550                         ret == -2 ? "copy" : "rename",
551 --- old/rsync.h
552 +++ new/rsync.h
553 @@ -640,6 +640,14 @@ struct stats {
554         int current_file_index;
555  };
556  
557 +struct hashfile_struct { /* one opened --link-by-hash candidate file */
558 +       struct hashfile_struct *next; /* following node; the list is circular */
559 +       struct hashfile_struct *prev; /* preceding node; the list is circular */
560 +       char *name; /* allocated path of the entry; freed by kill_hashfile() */
561 +       int fd; /* read-only descriptor; closed by kill_hashfile() */
562 +       uint32 nlink; /* st_nlink of the entry when it was stat'ed */
563 +};
564 +
565  struct chmod_mode_struct;
566  
567  #include "byteorder.h"
568 --- old/rsync.yo
569 +++ new/rsync.yo
570 @@ -361,6 +361,7 @@ to the detailed description below for a 
571       --compare-dest=DIR      also compare received files relative to DIR
572       --copy-dest=DIR         ... and include copies of unchanged files
573       --link-dest=DIR         hardlink to files in DIR when unchanged
574 +     --link-by-hash=DIR      create hardlinks by hash into DIR
575   -z, --compress              compress file data during the transfer
576       --compress-level=NUM    explicitly set compression level
577   -C, --cvs-exclude           auto-ignore files in the same way CVS does