b319873a8dfe46e271e565ea93b1c66c37738c5b
[rsync-patches.git] / link-by-hash.diff
1 After applying this patch and running configure, you MUST run this
2 command before "make":
3
4     make proto
5
6 Jason M. Felice writes:
7
8 This patch adds the --link-by-hash=DIR option, which hard links received
9 files in a link farm arranged by MD4 file hash.  The result is that the system
10 will only store one copy of the unique contents of each file, regardless of
11 the file's name.
12
13
14 --- Makefile.in 15 May 2004 00:48:11 -0000      1.101
15 +++ Makefile.in 6 Jun 2004 21:24:41 -0000
16 @@ -35,7 +35,7 @@ OBJS1=rsync.o generator.o receiver.o cle
17         main.o checksum.o match.o syscall.o log.o backup.o
18  OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
19         fileio.o batch.o clientname.o
20 -OBJS3=progress.o pipe.o
21 +OBJS3=progress.o pipe.o hashlink.o
22  DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
23  popt_OBJS=popt/findme.o  popt/popt.o  popt/poptconfig.o \
24         popt/popthelp.o popt/poptparse.o
25 --- /dev/null   1 Jan 1970 00:00:00 -0000
26 +++ hashlink.c  6 Jun 2004 21:24:41 -0000
27 @@ -0,0 +1,342 @@
28 +/*
29 +   Copyright (C) Cronosys, LLC 2004
30 +
31 +   This program is free software; you can redistribute it and/or modify
32 +   it under the terms of the GNU General Public License as published by
33 +   the Free Software Foundation; either version 2 of the License, or
34 +   (at your option) any later version.
35 +
36 +   This program is distributed in the hope that it will be useful,
37 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
38 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
39 +   GNU General Public License for more details.
40 +
41 +   You should have received a copy of the GNU General Public License
42 +   along with this program; if not, write to the Free Software
43 +   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
44 +*/
45 +
46 +/* This file contains code used by the --link-by-hash option. */
47 +
48 +#include "rsync.h"
49 +
50 +extern char *link_by_hash_dir;
51 +
52 +#ifdef HAVE_LINK
53 +
54 +char* make_hash_name(struct file_struct *file)
55 +{
56 +       char hash[33], *dst;
57 +       unsigned char *src;
58 +       unsigned char c;
59 +       int i;
60 +
61 +       src = (unsigned char*)file->u.sum;
62 +       for (dst = hash, i = 0; i < 4; i++, src++) {
63 +               c = *src >> 4;
64 +               *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
65 +               c = *src & 0x0f;
66 +               *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
67 +       }
68 +       *dst++ = '/';
69 +       for (i = 0; i < 12; i++, src++) {
70 +               c = *src >> 4;
71 +               *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
72 +               c = *src & 0x0f;
73 +               *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
74 +       }
75 +       *dst = 0;
76 +
77 +       asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
78 +       return dst;
79 +}
80 +
81 +
82 +void kill_hashfile(struct hashfile_struct *hashfile)
83 +{
84 +       if (!hashfile)
85 +               return;
86 +       free(hashfile->name);
87 +       close(hashfile->fd);
88 +       free(hashfile);
89 +}
90 +
91 +
92 +void kill_hashfiles(struct hashfile_struct *hashfiles)
93 +{
94 +       struct hashfile_struct *iter, *next;
95 +       if ((iter = hashfiles) != NULL) {
96 +               do {
97 +                       next = iter->next;
98 +                       kill_hashfile(iter);
99 +                       iter = next;
100 +               } while (iter != hashfiles);
101 +       }
102 +}
103 +
104 +
105 +struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
106 +{
107 +       DIR *d;
108 +       struct dirent *di;
109 +       struct hashfile_struct *hashfiles = NULL, *hashfile;
110 +       STRUCT_STAT st;
111 +       long this_fnbr;
112 +
113 +       *fnbr = 0;
114 +       
115 +       /* Build a list of potential candidates and open
116 +        * them. */
117 +       if ((d = opendir(hashname)) == NULL) {
118 +               rsyserr(FERROR, errno, "opendir \"%s\"", hashname);
119 +               free(hashname);
120 +               return NULL;
121 +       }
122 +       while ((di = readdir(d)) != NULL) {
123 +               if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
124 +                       continue;
125 +               }
126 +
127 +               /* We need to have the largest fnbr in case we need to store
128 +                * a new file. */
129 +               this_fnbr = atol(di->d_name);
130 +               if (this_fnbr > *fnbr)
131 +                       *fnbr = this_fnbr;
132 +
133 +               hashfile = (struct hashfile_struct*)malloc(sizeof(struct hashfile_struct));
134 +               asprintf(&hashfile->name,"%s/%s",hashname,
135 +                        di->d_name);
136 +               if (do_stat(hashfile->name,&st) == -1) {
137 +                       rsyserr(FERROR, errno, "%s: %s", hashfile->name);
138 +                       kill_hashfile(hashfile);
139 +                       continue;
140 +               }
141 +               if (st.st_size != size) {
142 +                       kill_hashfile(hashfile);
143 +                       continue;
144 +               }
145 +               hashfile->nlink = st.st_nlink;
146 +               hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
147 +               if (hashfile->fd == -1) {
148 +                       rsyserr(FERROR, errno, "%s", hashfile->name);
149 +                       kill_hashfile(hashfile);
150 +                       continue;
151 +               }
152 +               if (hashfiles == NULL)
153 +                       hashfiles = hashfile->next = hashfile->prev = hashfile;
154 +               else {
155 +                       hashfile->next = hashfiles;
156 +                       hashfile->prev = hashfiles->prev;
157 +                       hashfile->next->prev = hashfile;
158 +                       hashfile->prev->next = hashfile;
159 +               }
160 +       }
161 +       closedir(d);
162 +
163 +       return hashfiles;
164 +}
165 +
166 +
167 +struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
168 +{
169 +       int amt, hamt;
170 +       char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
171 +       struct hashfile_struct *iter, *next, *best;
172 +       uint32 nlink;
173 +
174 +       if (!files)
175 +               return NULL;
176 +
177 +       iter = files; /* in case files are 0 bytes */
178 +       while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
179 +               iter = files;
180 +               do {
181 +                       /* Icky bit to resync when we steal the first node. */
182 +                       if (!files)
183 +                               files = iter;
184 +
185 +                       next = iter->next;
186 +
187 +                       hamt = read(iter->fd, cmpbuffer, BUFSIZ);
188 +                       if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
189 +                               if (iter == files) {
190 +                                       files = files->prev;
191 +                               }
192 +                               if (iter->next == iter) {
193 +                                       files = next = NULL;
194 +                               } else {
195 +                                       next = iter->next;
196 +                                       if (iter == files) {
197 +                                               /* So we know to resync */
198 +                                               files = NULL;
199 +                                       }
200 +                               }
201 +                               iter->next->prev = iter->prev;
202 +                               iter->prev->next = iter->next;
203 +                               kill_hashfile(iter);
204 +                       }
205 +
206 +                       iter = next;
207 +               } while (iter != files);
208 +
209 +               if (iter == NULL && files == NULL) {
210 +                       /* There are no matches. */
211 +                       return NULL;
212 +               }
213 +               
214 +       }
215 +
216 +       if (amt == -1) {
217 +               rsyserr(FERROR, errno, "%s");
218 +               kill_hashfiles(files);
219 +               return NULL;
220 +       }
221 +
222 +       /* If we only have one file left, use it. */
223 +       if (files == files->next) {
224 +               return files;
225 +       }
226 +
227 +       /* All files which remain in the list are identical and should have
228 +        * the same size.  We pick the one with the lowest link count (we
229 +        * may have rolled over because we hit the maximum link count for
230 +        * the filesystem). */
231 +       best = iter = files;
232 +       nlink = iter->nlink;
233 +       do {
234 +               if (iter->nlink < nlink) {
235 +                       nlink = iter->nlink;
236 +                       best = iter;
237 +               }
238 +               iter = iter->next;
239 +       } while (iter != files);
240 +
241 +       best->next->prev = best->prev;
242 +       best->prev->next = best->next;
243 +       if (files == best)
244 +               files = files->next;
245 +       kill_hashfiles(files);
246 +       return best;
247 +}
248 +
249 +
250 +int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
251 +{
252 +       STRUCT_STAT st;
253 +       char *hashname = make_hash_name(file);          
254 +       int first = 0, rc;
255 +       char *linkname;
256 +       long last_fnbr;
257 +
258 +       if (file->length == 0) {
259 +               return robust_rename(fnametmp,fname,0644);
260 +       }
261 +
262 +       if (do_stat(hashname, &st) == -1) {
263 +               char *dirname;
264 +
265 +               /* Directory does not exist. */
266 +               dirname = strdup(hashname);
267 +               *strrchr(dirname,'/') = 0;
268 +               if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
269 +                       rsyserr(FERROR, errno, "mkdir %s", dirname);
270 +                       free(hashname);
271 +                       free(dirname);
272 +                       return robust_rename(fnametmp,fname,0644);
273 +               }
274 +               free(dirname);
275 +
276 +               if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
277 +                       rsyserr(FERROR, errno, "mkdir %s", hashname);
278 +                       free(hashname);
279 +                       return robust_rename(fnametmp,fname,0644);
280 +               }
281 +
282 +               first = 1;
283 +               asprintf(&linkname,"%s/0",hashname);
284 +               rprintf(FINFO, "(1) linkname = %s\n", linkname);
285 +                       
286 +       } else {
287 +               struct hashfile_struct *hashfiles, *hashfile;
288 +               int fd;
289 +
290 +               if (do_stat(fnametmp,&st) == -1) {
291 +                       rsyserr(FERROR, errno, "%s", fname);
292 +                       return -1;
293 +               }
294 +               hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
295 +
296 +               if (hashfiles == NULL) {
297 +                       first = 1;
298 +                       asprintf(&linkname,"%s/0",hashname);
299 +                       rprintf(FINFO, "(2) linkname = %s\n", linkname);
300 +               } else {
301 +                       
302 +                       /* Search for one identical to us. */
303 +                       if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
304 +                               rsyserr(FERROR, errno, "%s", fnametmp);
305 +                               kill_hashfiles(hashfiles);
306 +                               return -1;
307 +                       }
308 +                       hashfile = compare_hashfiles(fd, hashfiles);
309 +                       hashfiles = NULL;
310 +
311 +                       if (hashfile) {
312 +                               first = 0;
313 +                               linkname = strdup(hashfile->name);
314 +                               rprintf(FINFO, "(3) linkname = %s\n", linkname);
315 +                               kill_hashfile(hashfile);
316 +                       } else {
317 +                               first = 1;
318 +                               asprintf(&linkname, "%s/%ld", hashname,
319 +                                        last_fnbr + 1);
320 +                               rprintf(FINFO, "(4) linkname = %s\n", linkname);
321 +                       }
322 +               }
323 +       }
324 +
325 +       if (!first) {
326 +               rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
327 +                               linkname, full_fname(fname));
328 +               rc = do_link(linkname, fname);
329 +               if (rc == -1) {
330 +                       if (errno == EMLINK) {
331 +                               first = 1;
332 +                               free(linkname);
333 +                               asprintf(&linkname,"%s/%ld",hashname,
334 +                                        last_fnbr + 1);
335 +                               rprintf(FINFO, "(5) linkname = %s\n", linkname);
336 +                               rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
337 +                       } else {
338 +                               rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
339 +                                       linkname, full_fname(fname));
340 +                               robust_unlink(fname);
341 +                               rc = robust_rename(fnametmp,fname,0644);
342 +                       }
343 +               } else {
344 +                       do_unlink(fnametmp);
345 +               }
346 +       }
347 +
348 +       if (first) {
349 +               rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
350 +                               full_fname(fname),linkname);
351 +
352 +               rc = robust_rename(fnametmp,fname,0644);
353 +               if (rc != 0) {
354 +                       rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
355 +                               full_fname(fnametmp), full_fname(fname));
356 +               }
357 +               rc = do_link(fname,linkname);
358 +               if (rc != 0) {
359 +                       rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
360 +                               full_fname(fname), linkname);
361 +               }
362 +       }
363 +
364 +       free(linkname);
365 +       free(hashname);
366 +       return rc;
367 +}
368 +
369 +#endif
370 --- options.c   6 Jun 2004 19:02:40 -0000       1.155
371 +++ options.c   6 Jun 2004 21:24:41 -0000
372 @@ -124,6 +124,7 @@ char *log_format = NULL;
373  char *password_file = NULL;
374  char *rsync_path = RSYNC_PATH;
375  char *backup_dir = NULL;
376 +char *link_by_hash_dir = NULL;
377  char backup_dir_buf[MAXPATHLEN];
378  int rsync_port = RSYNC_PORT;
379  int link_dest = 0;
380 @@ -270,6 +271,7 @@ void usage(enum logcode F)
381    rprintf(F," -T  --temp-dir=DIR          create temporary files in directory DIR\n");
382    rprintf(F,"     --compare-dest=DIR      also compare destination files relative to DIR\n");
383    rprintf(F,"     --link-dest=DIR         create hardlinks to DIR for unchanged files\n");
384 +  rprintf(F,"     --link-by-hash=DIR      create hardlinks by hash to DIR for regular files\n");
385    rprintf(F," -P                          equivalent to --partial --progress\n");
386    rprintf(F," -z, --compress              compress file data\n");
387    rprintf(F," -C, --cvs-exclude           auto ignore files in the same way CVS does\n");
388 @@ -310,7 +312,7 @@ void usage(enum logcode F)
389  enum {OPT_VERSION = 1000, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
390        OPT_DELETE_AFTER, OPT_DELETE_EXCLUDED, OPT_LINK_DEST,
391        OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
392 -      OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_TIMEOUT,
393 +      OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_TIMEOUT, OPT_LINK_BY_HASH,
394        OPT_REFUSED_BASE = 9000};
395  
396  static struct poptOption long_options[] = {
397 @@ -368,6 +370,7 @@ static struct poptOption long_options[] 
398    {"temp-dir",        'T', POPT_ARG_STRING, &tmpdir, 0, 0, 0 },
399    {"compare-dest",     0,  POPT_ARG_STRING, &compare_dest, 0, 0, 0 },
400    {"link-dest",        0,  POPT_ARG_STRING, &compare_dest,  OPT_LINK_DEST, 0, 0 },
401 +  {"link-by-hash",     0,  POPT_ARG_STRING, 0,              OPT_LINK_BY_HASH, 0, 0},
402    /* TODO: Should this take an optional int giving the compression level? */
403    {"compress",        'z', POPT_ARG_NONE,   &do_compression, 0, 0, 0 },
404    {"daemon",           0,  POPT_ARG_NONE,   &daemon_opt, 0, 0, 0 },
405 @@ -601,6 +604,19 @@ int parse_arguments(int *argc, const cha
406                         return 0;
407  #endif
408  
409 +                case OPT_LINK_BY_HASH:
410 +#if HAVE_LINK
411 +                       link_by_hash_dir = (char *)poptGetOptArg(pc);
412 +                       checksum_seed = FIXED_CHECKSUM_SEED;
413 +                       break;
414 +#else
415 +                       snprintf(err_buf, sizeof err_buf,
416 +                                "hard links are not supported on this %s\n",
417 +                                am_server ? "server" : "client");
418 +                       rprintf(FERROR, "ERROR: %s", err_buf);
419 +                       return 0;
420 +#endif
421 +
422                 default:
423                         /* A large opt value means that set_refuse_options()
424                          * turned this option off (opt-BASE is its index). */
425 @@ -982,6 +998,11 @@ void server_options(char **args,int *arg
426                 args[ac++] = compare_dest;
427         }
428  
429 +       if (link_by_hash_dir && am_sender) {
430 +               args[ac++] = "--link-by-hash";
431 +               args[ac++] = link_by_hash_dir;
432 +       }
433 +
434         if (files_from && (!am_sender || remote_filesfrom_file)) {
435                 if (remote_filesfrom_file) {
436                         args[ac++] = "--files-from";
437 --- receiver.c  21 May 2004 08:27:04 -0000      1.79
438 +++ receiver.c  6 Jun 2004 21:24:41 -0000
439 @@ -47,6 +47,7 @@ extern int ignore_errors;
440  extern int orig_umask;
441  extern int keep_partial;
442  extern int checksum_seed;
443 +extern char *link_by_hash_dir;
444  
445  static void delete_one(char *fn, int is_dir)
446  {
447 @@ -193,10 +194,11 @@ static int get_tmpname(char *fnametmp, c
448  
449  
450  static int receive_data(int f_in,struct map_struct *mapbuf,int fd,char *fname,
451 -                       OFF_T total_size)
452 +                       OFF_T total_size,char *md4)
453  {
454         int i;
455         struct sum_struct sum;
456 +       struct mdfour mdfour_data;
457         unsigned int len;
458         OFF_T offset = 0;
459         OFF_T offset2;
460 @@ -207,6 +209,9 @@ static int receive_data(int f_in,struct 
461  
462         read_sum_head(f_in, &sum);
463  
464 +       if (md4)
465 +               mdfour_begin(&mdfour_data);
466 +
467         sum_init(checksum_seed);
468  
469         while ((i = recv_token(f_in, &data)) != 0) {
470 @@ -223,6 +228,8 @@ static int receive_data(int f_in,struct 
471                         cleanup_got_literal = 1;
472  
473                         sum_update(data,i);
474 +                       if (md4)
475 +                               mdfour_update(&mdfour_data,data,i);
476  
477                         if (fd != -1 && write_file(fd,data,i) != i) {
478                                 rsyserr(FERROR, errno, "write failed on %s",
479 @@ -250,6 +257,8 @@ static int receive_data(int f_in,struct 
480  
481                         see_token(map, len);
482                         sum_update(map,len);
483 +                       if (md4)
484 +                               mdfour_update(&mdfour_data,map,len);
485                 }
486  
487                 if (fd != -1 && write_file(fd,map,len) != (int) len) {
488 @@ -272,6 +281,8 @@ static int receive_data(int f_in,struct 
489         }
490  
491         sum_end(file_sum1);
492 +       if (md4)
493 +               mdfour_result(&mdfour_data, (unsigned char*)md4);
494  
495         read_buf(f_in,file_sum2,MD4_SUM_LENGTH);
496         if (verbose > 2) {
497 @@ -375,7 +386,7 @@ int recv_files(int f_in,struct file_list
498                 if (fd1 != -1 && do_fstat(fd1,&st) != 0) {
499                         rsyserr(FERROR, errno, "fstat %s failed",
500                                 full_fname(fnamecmp));
501 -                       receive_data(f_in,NULL,-1,NULL,file->length);
502 +                       receive_data(f_in,NULL,-1,NULL,file->length,NULL);
503                         close(fd1);
504                         continue;
505                 }
506 @@ -388,7 +399,7 @@ int recv_files(int f_in,struct file_list
507                          */
508                         rprintf(FERROR,"recv_files: %s is a directory\n",
509                                 full_fname(fnamecmp));
510 -                       receive_data(f_in, NULL, -1, NULL, file->length);
511 +                       receive_data(f_in,NULL,-1,NULL,file->length,NULL);
512                         close(fd1);
513                         continue;
514                 }
515 @@ -440,7 +451,7 @@ int recv_files(int f_in,struct file_list
516                 if (fd2 == -1) {
517                         rsyserr(FERROR, errno, "mkstemp %s failed",
518                                 full_fname(fnametmp));
519 -                       receive_data(f_in,mapbuf,-1,NULL,file->length);
520 +                       receive_data(f_in,mapbuf,-1,NULL,file->length,NULL);
521                         if (mapbuf) unmap_file(mapbuf);
522                         if (fd1 != -1) close(fd1);
523                         continue;
524 @@ -453,7 +464,11 @@ int recv_files(int f_in,struct file_list
525                 }
526  
527                 /* recv file data */
528 -               recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length);
529 +#ifdef HAVE_LINK
530 +               if (link_by_hash_dir)
531 +                       file->u.sum = (char*)malloc(MD4_SUM_LENGTH);
532 +#endif
533 +               recv_ok = receive_data(f_in,mapbuf,fd2,fname,file->length,file->u.sum);
534  
535                 log_recv(file, &initial_stats);
536  
537 --- rsync.c     21 May 2004 08:43:03 -0000      1.140
538 +++ rsync.c     6 Jun 2004 21:24:41 -0000
539 @@ -34,6 +34,7 @@ extern int force_delete;
540  extern int recurse;
541  extern int make_backups;
542  extern char *backup_dir;
543 +extern char *link_by_hash_dir;
544  
545  
546  /*
547 @@ -239,8 +240,12 @@ void finish_transfer(char *fname, char *
548         if (make_backups && !make_backup(fname))
549                 return;
550  
551 -       /* move tmp file over real file */
552 -       ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
553 +#ifdef HAVE_LINK
554 +       if (link_by_hash_dir)
555 +               ret = link_by_hash(fnametmp, fname, file);
556 +       else
557 +#endif
558 +               ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
559         if (ret < 0) {
560                 rsyserr(FERROR, errno, "%s %s -> \"%s\"",
561                     ret == -2 ? "copy" : "rename",
562 --- rsync.h     16 May 2004 07:28:24 -0000      1.204
563 +++ rsync.h     6 Jun 2004 21:24:41 -0000
564 @@ -522,6 +522,14 @@ struct stats {
565         int current_file_index;
566  };
567  
568 +struct hashfile_struct {
569 +       struct hashfile_struct *next;
570 +       struct hashfile_struct *prev;
571 +       char *name;
572 +       int fd;
573 +       uint32 nlink;
574 +};
575 +
576  
577  /* we need this function because of the silly way in which duplicate
578     entries are handled in the file lists - we can't change this