9e5ffac2437103c59ac5f1177cf33c072b7fdc61
[rsync-patches.git] / link-by-hash.diff
1 Jason M. Felice wrote:
2
3 This patch adds the --link-by-hash=DIR option, which hard-links each received
4 file into a link farm under DIR arranged by MD4 file hash.  The result is that
5 the system stores only one copy of each unique file content, regardless of
6 the file's name.
7
8 To use this patch, run these commands for a successful build:
9
10     patch -p1 <patches/link-by-hash.diff
11     ./prepare-source
12     ./configure
13     make
14
15 --- old/Makefile.in
16 +++ new/Makefile.in
17 @@ -35,7 +35,7 @@ OBJS1=flist.o rsync.o generator.o receiv
18         util.o main.o checksum.o match.o syscall.o log.o backup.o
19  OBJS2=options.o io.o compat.o hlink.o token.o uidlist.o socket.o hashtable.o \
20         fileio.o batch.o clientname.o chmod.o acls.o xattrs.o
21 -OBJS3=progress.o pipe.o
22 +OBJS3=progress.o pipe.o hashlink.o
23  DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
24  popt_OBJS=popt/findme.o  popt/popt.o  popt/poptconfig.o \
25         popt/popthelp.o popt/poptparse.o
26 --- old/flist.c
27 +++ new/flist.c
28 @@ -67,6 +67,7 @@ extern int need_unsorted_flist;
29  extern int unsort_ndx;
30  extern struct stats stats;
31  extern char *filesfrom_host;
32 +extern char *link_by_hash_dir;
33  
34  extern char curr_dir[MAXPATHLEN];
35  
36 @@ -815,7 +816,7 @@ static struct file_struct *recv_file_ent
37                 extra_len += (S_ISDIR(mode) ? 2 : 1) * EXTRA_LEN;
38  #endif
39  
40 -       if (always_checksum && S_ISREG(mode))
41 +       if ((always_checksum || link_by_hash_dir) && S_ISREG(mode))
42                 extra_len += SUM_EXTRA_CNT * EXTRA_LEN;
43  
44         if (file_length > 0xFFFFFFFFu && S_ISREG(mode))
45 --- old/hashlink.c
46 +++ new/hashlink.c
47 @@ -0,0 +1,336 @@
48 +/*
49 +   Copyright (C) Cronosys, LLC 2004
50 +
51 +   This program is free software; you can redistribute it and/or modify
52 +   it under the terms of the GNU General Public License as published by
53 +   the Free Software Foundation; either version 2 of the License, or
54 +   (at your option) any later version.
55 +
56 +   This program is distributed in the hope that it will be useful,
57 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
58 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
59 +   GNU General Public License for more details.
60 +
61 +   You should have received a copy of the GNU General Public License
62 +   along with this program; if not, write to the Free Software
63 +   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
64 +*/
65 +
66 +/* This file contains code used by the --link-by-hash option. */
67 +
68 +#include "rsync.h"
69 +
70 +extern char *link_by_hash_dir;
71 +
72 +#ifdef HAVE_LINK
73 +/* Map F_SUM(file) to its link-farm directory, "<link_by_hash_dir>/<8 hex>/<24 hex>". */
74 +char *make_hash_name(struct file_struct *file)
75 +{
76 +       char hash[34], *dst; /* 8 hex digits + '/' + 24 hex digits + NUL */
77 +       uchar c, *src = (uchar*)F_SUM(file);
78 +       int i;
79 +
80 +       for (dst = hash, i = 0; i < 4; i++, src++) { /* first 4 sum bytes: upper directory */
81 +               c = *src >> 4;
82 +               *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
83 +               c = *src & 0x0f;
84 +               *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
85 +       }
86 +       *dst++ = '/';
87 +       for (i = 0; i < 12; i++, src++) { /* next 12 sum bytes: directory below it */
88 +               c = *src >> 4;
89 +               *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
90 +               c = *src & 0x0f;
91 +               *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
92 +       }
93 +       *dst = 0;
94 +
95 +       asprintf(&dst,"%s/%s",link_by_hash_dir,hash); /* heap string; the caller frees it */
96 +       return dst;
97 +}
98 +
99 +/* Release one candidate: free its name, close its descriptor, free the node. */
100 +void kill_hashfile(struct hashfile_struct *hashfile)
101 +{
102 +       if (hashfile) { /* a NULL node is silently ignored */
103 +               free(hashfile->name);
104 +               close(hashfile->fd);
105 +               free(hashfile);
106 +       }
107 +}
108 +
109 +/* Release every node of the circular list that starts at hashfiles (may be NULL). */
110 +void kill_hashfiles(struct hashfile_struct *hashfiles)
111 +{
112 +       struct hashfile_struct *node = hashfiles, *following;
113 +       if (!node)
114 +               return;
115 +       do {
116 +               following = node->next; /* fetch the link before the node is freed */
117 +               kill_hashfile(node);
118 +               node = following;
119 +       } while (node != hashfiles); /* back at the start: the whole ring is done */
120 +}
121 +
122 +/* List the files in directory hashname whose size equals size, each opened read-only; *fnbr gets the largest numeric name seen. */
123 +struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
124 +{
125 +       DIR *d;
126 +       struct dirent *di;
127 +       struct hashfile_struct *hashfiles = NULL, *hashfile;
128 +       STRUCT_STAT st;
129 +       long this_fnbr;
130 +
131 +       *fnbr = 0;
132 +
133 +       /* Build a list of potential candidates and open
134 +        * them. */
135 +       if ((d = opendir(hashname)) == NULL) {
136 +               rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
137 +               /* hashname belongs to the caller, which still uses and frees it. */
138 +               return NULL;
139 +       }
140 +       while ((di = readdir(d)) != NULL) {
141 +               if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
142 +                       continue;
143 +               }
144 +
145 +               /* We need to have the largest fnbr in case we need to store
146 +                * a new file. */
147 +               this_fnbr = atol(di->d_name);
148 +               if (this_fnbr > *fnbr)
149 +                       *fnbr = this_fnbr;
150 +
151 +               hashfile = new_array(struct hashfile_struct, 1);
152 +               hashfile->fd = -1; /* not opened yet: kill_hashfile() must not close a stray descriptor */
153 +               asprintf(&hashfile->name,"%s/%s",hashname,di->d_name);
154 +               if (do_stat(hashfile->name,&st) == -1) {
155 +                       rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
156 +                       kill_hashfile(hashfile);
157 +                       continue;
158 +               }
159 +               if (st.st_size != size) { /* a different size can never be identical */
160 +                       kill_hashfile(hashfile);
161 +                       continue;
162 +               }
163 +               hashfile->nlink = st.st_nlink;
164 +               hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
165 +               if (hashfile->fd == -1) {
166 +                       rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
167 +                       kill_hashfile(hashfile);
168 +                       continue;
169 +               }
170 +               if (hashfiles == NULL) /* first candidate: a ring of one */
171 +                       hashfiles = hashfile->next = hashfile->prev = hashfile;
172 +               else { /* append at the tail of the circular list */
173 +                       hashfile->next = hashfiles;
174 +                       hashfile->prev = hashfiles->prev;
175 +                       hashfile->next->prev = hashfile;
176 +                       hashfile->prev->next = hashfile;
177 +               }
178 +       }
179 +       closedir(d);
180 +
181 +       return hashfiles; /* NULL when nothing qualified */
182 +}
183 +
184 +/* Return the candidate in files whose content equals that of fd, or NULL; every other candidate is released. */
185 +struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
186 +{
187 +       int amt, hamt, at_end;
188 +       char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
189 +       struct hashfile_struct *iter, *next, *best;
190 +       uint32 nlink;
191 +
192 +       /* No candidates, no match. */
193 +       if (!files)
194 +               return NULL;
195 +
196 +       /* Compare chunk by chunk.  Every surviving candidate must be read
197 +        * exactly once per chunk of fd; a candidate that misses a read has
198 +        * its file offset fall behind and is compared against the wrong
199 +        * data from then on. */
200 +       while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
201 +               iter = files;
202 +               do {
203 +                       next = iter->next;
204 +                       /* Decide whether this is the last node of the pass
205 +                        * before unlinking anything: dropping the head moves
206 +                        * "files", which must not cut the pass short. */
207 +                       at_end = (next == files);
208 +
209 +                       /* A failed or short read (hamt != amt) or differing
210 +                        * bytes rules this candidate out. */
211 +                       hamt = read(iter->fd, cmpbuffer, BUFSIZ);
212 +                       if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
213 +                               if (next == iter) {
214 +                                       /* The only candidate left differs
215 +                                        * too: there are no matches. */
216 +                                       kill_hashfile(iter);
217 +                                       return NULL;
218 +                               }
219 +                               /* Keep the list head on a live node. */
220 +                               if (iter == files)
221 +                                       files = next;
222 +                               iter->next->prev = iter->prev;
223 +                               iter->prev->next = iter->next;
224 +                               kill_hashfile(iter);
225 +                       }
226 +
227 +                       iter = next;
228 +               } while (!at_end);
229 +       }
230 +
231 +       /* The received file itself could not be read to the end, so no
232 +        * candidate has been verified: drop them all. */
233 +       if (amt == -1) {
234 +               rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
235 +               kill_hashfiles(files);
236 +               return NULL;
237 +       }
238 +
239 +       /* If we only have one file left, use it. */
240 +       if (files == files->next) {
241 +               return files;
242 +       }
243 +
244 +       /* All files which remain in the list are identical and should have
245 +        * the same size.  We pick the one with the lowest link count (we
246 +        * may have rolled over because we hit the maximum link count for
247 +        * the filesystem). */
248 +       best = iter = files;
249 +       nlink = iter->nlink;
250 +       do {
251 +               if (iter->nlink < nlink) {
252 +                       nlink = iter->nlink;
253 +                       best = iter;
254 +               }
255 +               iter = iter->next;
256 +       } while (iter != files);
257 +
258 +       best->next->prev = best->prev;
259 +       best->prev->next = best->next;
260 +       if (files == best)
261 +               files = files->next;
262 +       kill_hashfiles(files);
263 +       return best;
264 +}
265 +
266 +/* Install fnametmp as fname through the hash farm: link to an identical farm file, or add the new file to the farm. */
267 +int link_by_hash(const char *fnametmp, const char *fname, struct file_struct *file)
268 +{
269 +       STRUCT_STAT st;
270 +       char *hashname;
271 +       int first = 0, rc;
272 +       char *linkname;
273 +       long last_fnbr;
274 +
275 +       if (F_LENGTH(file) == 0)
276 +               return robust_rename(fnametmp, fname, NULL, 0644);
277 +       hashname = make_hash_name(file); /* built only past the early return, so it cannot leak */
278 +       if (do_stat(hashname, &st) == -1) {
279 +               char *dirname;
280 +
281 +               /* Directory does not exist. */
282 +               dirname = strdup(hashname);
283 +               *strrchr(dirname,'/') = 0;
284 +               if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
285 +                       rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
286 +                       free(hashname);
287 +                       free(dirname);
288 +                       return robust_rename(fnametmp, fname, NULL, 0644);
289 +               }
290 +               free(dirname);
291 +
292 +               if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
293 +                       rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
294 +                       free(hashname);
295 +                       return robust_rename(fnametmp, fname, NULL, 0644);
296 +               }
297 +
298 +               first = 1;
299 +               asprintf(&linkname,"%s/0",hashname);
300 +               rprintf(FINFO, "(1) linkname = %s\n", linkname);
301 +       } else {
302 +               struct hashfile_struct *hashfiles, *hashfile;
303 +
304 +               if (do_stat(fnametmp,&st) == -1) {
305 +                       rsyserr(FERROR, errno, "stat failed: %s", fnametmp);
306 +                       free(hashname);
307 +                       return -1;
308 +               }
309 +               hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
310 +               if (hashfiles == NULL) {
311 +                       first = 1;
312 +                       asprintf(&linkname,"%s/0",hashname);
313 +                       rprintf(FINFO, "(2) linkname = %s\n", linkname);
314 +               } else {
315 +                       int fd;
316 +                       /* Search for one identical to us. */
317 +                       if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
318 +                               rsyserr(FERROR, errno, "open failed: %s", fnametmp);
319 +                               kill_hashfiles(hashfiles);
320 +                               free(hashname);
321 +                               return -1;
322 +                       }
323 +                       hashfile = compare_hashfiles(fd, hashfiles);
324 +                       close(fd);
325 +
326 +                       if (hashfile) {
327 +                               first = 0;
328 +                               linkname = strdup(hashfile->name);
329 +                               rprintf(FINFO, "(3) linkname = %s\n", linkname);
330 +                               kill_hashfile(hashfile);
331 +                       } else {
332 +                               first = 1;
333 +                               asprintf(&linkname, "%s/%ld", hashname,
334 +                                        last_fnbr + 1);
335 +                               rprintf(FINFO, "(4) linkname = %s\n", linkname);
336 +                       }
337 +               }
338 +       }
339 +
340 +       if (!first) {
341 +               rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
342 +                               linkname, full_fname(fname));
343 +               robust_unlink(fname);
344 +               rc = do_link(linkname, fname);
345 +               if (rc == -1) {
346 +                       if (errno == EMLINK) {
347 +                               first = 1;
348 +                               free(linkname);
349 +                               asprintf(&linkname,"%s/%ld",hashname,
350 +                                        last_fnbr + 1);
351 +                               rprintf(FINFO, "(5) linkname = %s\n", linkname);
352 +                               rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
353 +                       } else {
354 +                               rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
355 +                                       linkname, full_fname(fname));
356 +                               rc = robust_rename(fnametmp, fname, NULL, 0644);
357 +                       }
358 +               } else {
359 +                       do_unlink(fnametmp);
360 +               }
361 +       }
362 +
363 +       if (first) {
364 +               rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
365 +                               full_fname(fname),linkname);
366 +
367 +               rc = robust_rename(fnametmp, fname, NULL, 0644);
368 +               if (rc < 0) {
369 +                       /* fname does not hold the new data, so it must not be
370 +                        * published in the link farm under this hash. */
371 +                       rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
372 +                               full_fname(fnametmp), full_fname(fname));
373 +               } else if ((rc = do_link(fname,linkname)) != 0) {
374 +                       rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
375 +                               full_fname(fname), linkname);
376 +               }
377 +       }
378 +
379 +       free(linkname);
380 +       free(hashname);
381 +       return rc;
382 +}
383 +#endif
384 --- old/options.c
385 +++ new/options.c
386 @@ -154,6 +154,7 @@ char *backup_suffix = NULL;
387  char *tmpdir = NULL;
388  char *partial_dir = NULL;
389  char *basis_dir[MAX_BASIS_DIRS+1];
390 +char *link_by_hash_dir = NULL;
391  char *config_file = NULL;
392  char *shell_cmd = NULL;
393  char *logfile_name = NULL;
394 @@ -386,6 +387,7 @@ void usage(enum logcode F)
395    rprintf(F,"     --compare-dest=DIR      also compare destination files relative to DIR\n");
396    rprintf(F,"     --copy-dest=DIR         ... and include copies of unchanged files\n");
397    rprintf(F,"     --link-dest=DIR         hardlink to files in DIR when unchanged\n");
398 +  rprintf(F,"     --link-by-hash=DIR      create hardlinks by hash into DIR\n");
399    rprintf(F," -z, --compress              compress file data during the transfer\n");
400    rprintf(F,"     --compress-level=NUM    explicitly set compression level\n");
401    rprintf(F,"     --skip-compress=LIST    skip compressing files with a suffix in LIST\n");
402 @@ -438,7 +440,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OP
403        OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP,
404        OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
405        OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
406 -      OPT_NO_D, OPT_APPEND,
407 +      OPT_NO_D, OPT_APPEND, OPT_LINK_BY_HASH,
408        OPT_SERVER, OPT_REFUSED_BASE = 9000};
409  
410  static struct poptOption long_options[] = {
411 @@ -561,6 +563,7 @@ static struct poptOption long_options[] 
412    {"compare-dest",     0,  POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
413    {"copy-dest",        0,  POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
414    {"link-dest",        0,  POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
415 +  {"link-by-hash",     0,  POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
416    {"fuzzy",           'y', POPT_ARG_NONE,   &fuzzy_basis, 0, 0, 0 },
417    {"compress",        'z', POPT_ARG_NONE,   0, 'z', 0, 0 },
418    {"no-compress",      0,  POPT_ARG_VAL,    &do_compression, 0, 0, 0 },
419 @@ -1221,6 +1224,21 @@ int parse_arguments(int *argc_p, const c
420                         return 0;
421  #endif
422  
423 +                case OPT_LINK_BY_HASH:
424 +#ifdef HAVE_LINK
425 +                       arg = poptGetOptArg(pc);
426 +                       if (sanitize_paths)
427 +                               arg = sanitize_path(NULL, arg, NULL, 0, NULL);
428 +                       link_by_hash_dir = (char *)arg;
429 +                       break;
430 +#else
431 +                       snprintf(err_buf, sizeof err_buf,
432 +                                "hard links are not supported on this %s\n",
433 +                                am_server ? "server" : "client");
434 +                       rprintf(FERROR, "ERROR: %s", err_buf);
435 +                       return 0;
436 +#endif
437 +
438                 default:
439                         /* A large opt value means that set_refuse_options()
440                          * turned this option off. */
441 @@ -1963,6 +1981,11 @@ void server_options(char **args, int *ar
442         } else if (inplace)
443                 args[ac++] = "--inplace";
444  
445 +       if (link_by_hash_dir && am_sender) {
446 +               args[ac++] = "--link-by-hash";
447 +               args[ac++] = link_by_hash_dir;
448 +       }
449 +
450         if (files_from && (!am_sender || filesfrom_host)) {
451                 if (filesfrom_host) {
452                         args[ac++] = "--files-from";
453 --- old/receiver.c
454 +++ new/receiver.c
455 @@ -162,12 +162,14 @@ int open_tmpfile(char *fnametmp, const c
456  }
457  
458  static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
459 -                       const char *fname, int fd, OFF_T total_size)
460 +                       const char *fname, int fd, OFF_T total_size,
461 +                       const char *md4)
462  {
463         static char file_sum1[MAX_DIGEST_LEN];
464         static char file_sum2[MAX_DIGEST_LEN];
465         struct map_struct *mapbuf;
466         struct sum_struct sum;
467 +       md_context mdfour_data;
468         int32 len, sum_len;
469         OFF_T offset = 0;
470         OFF_T offset2;
471 @@ -187,6 +189,9 @@ static int receive_data(int f_in, char *
472         } else
473                 mapbuf = NULL;
474  
475 +       if (md4)
476 +               mdfour_begin(&mdfour_data);
477 +
478         sum_init(checksum_seed);
479  
480         if (append_mode > 0) {
481 @@ -231,6 +236,8 @@ static int receive_data(int f_in, char *
482                         cleanup_got_literal = 1;
483  
484                         sum_update(data, i);
485 +                       if (md4)
486 +                               mdfour_update(&mdfour_data, (uchar*)data, i);
487  
488                         if (fd != -1 && write_file(fd,data,i) != i)
489                                 goto report_write_error;
490 @@ -257,6 +264,8 @@ static int receive_data(int f_in, char *
491  
492                         see_token(map, len);
493                         sum_update(map, len);
494 +                       if (md4)
495 +                               mdfour_update(&mdfour_data, (uchar*)map, len);
496                 }
497  
498                 if (updating_basis_or_equiv) {
499 @@ -299,6 +308,8 @@ static int receive_data(int f_in, char *
500         }
501  
502         sum_len = sum_end(file_sum1);
503 +       if (md4)
504 +               mdfour_result(&mdfour_data, (uchar*)md4);
505  
506         if (mapbuf)
507                 unmap_file(mapbuf);
508 @@ -314,7 +325,7 @@ static int receive_data(int f_in, char *
509  
510  static void discard_receive_data(int f_in, OFF_T length)
511  {
512 -       receive_data(f_in, NULL, -1, 0, NULL, -1, length);
513 +       receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
514  }
515  
516  static void handle_delayed_updates(char *local_name)
517 @@ -673,7 +684,7 @@ int recv_files(int f_in, char *local_nam
518  
519                 /* recv file data */
520                 recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
521 -                                      fname, fd2, F_LENGTH(file));
522 +                                      fname, fd2, F_LENGTH(file), F_SUM(file));
523  
524                 log_item(log_code, file, &initial_stats, iflags, NULL);
525  
526 --- old/rsync.c
527 +++ new/rsync.c
528 @@ -48,6 +48,7 @@ extern int inplace;
529  extern int flist_eof;
530  extern int keep_dirlinks;
531  extern int make_backups;
532 +extern char *link_by_hash_dir;
533  extern struct file_list *cur_flist, *first_flist, *dir_flist;
534  extern struct chmod_mode_struct *daemon_chmod_modes;
535  #ifdef ICONV_OPTION
536 @@ -530,8 +531,15 @@ void finish_transfer(const char *fname, 
537         /* move tmp file over real file */
538         if (verbose > 2)
539                 rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname);
540 -       ret = robust_rename(fnametmp, fname, partialptr,
541 -                           file->mode & INITACCESSPERMS);
542 +#ifdef HAVE_LINK
543 +       if (link_by_hash_dir)
544 +               ret = link_by_hash(fnametmp, fname, file);
545 +       else
546 +#endif
547 +       {
548 +               ret = robust_rename(fnametmp, fname, partialptr,
549 +                                   file->mode & INITACCESSPERMS);
550 +       }
551         if (ret < 0) {
552                 rsyserr(FERROR, errno, "%s %s -> \"%s\"",
553                         ret == -2 ? "copy" : "rename",
554 --- old/rsync.h
555 +++ new/rsync.h
556 @@ -807,6 +807,14 @@ struct stats {
557         int current_file_index;
558  };
559  
560 +struct hashfile_struct { /* one --link-by-hash candidate file, built in hashlink.c */
561 +       struct hashfile_struct *next; /* next/prev form a circular doubly-linked list */
562 +       struct hashfile_struct *prev;
563 +       char *name; /* heap-allocated path of the file; freed by kill_hashfile() */
564 +       int fd; /* descriptor opened O_RDONLY on name; closed by kill_hashfile() */
565 +       uint32 nlink; /* st_nlink of the file at the time it was listed */
566 +};
567 +
568  struct chmod_mode_struct;
569  
570  #define EMPTY_ITEM_LIST {NULL, 0, 0}
571 --- old/rsync.yo
572 +++ new/rsync.yo
573 @@ -387,6 +387,7 @@ to the detailed description below for a 
574       --compare-dest=DIR      also compare received files relative to DIR
575       --copy-dest=DIR         ... and include copies of unchanged files
576       --link-dest=DIR         hardlink to files in DIR when unchanged
577 +     --link-by-hash=DIR      create hardlinks by hash into DIR
578   -z, --compress              compress file data during the transfer
579       --compress-level=NUM    explicitly set compression level
580       --skip-compress=LIST    skip compressing files with suffix in LIST