-After applying this patch and running configure, you MUST run this
-command before "make":
+Jason M. Felice wrote:
- make proto
+This patch adds the --link-by-hash=DIR option, which hard links received files
+in a link farm arranged by MD4 or MD5 file hash. The result is that the system
+will only store one copy of the unique contents of each file, regardless of the
+file's name.
-Jason M. Felice writes:
+To use this patch, run these commands for a successful build:
-This patch adds the --link-by-hash=DIR option, which hard links received
-files in a link farm arranged by MD4 file hash. The result is that the system
-will only store one copy of the unique contents of each file, regardless of
-the file's name.
+ patch -p1 <patches/link-by-hash.diff
+ ./prepare-source
+ ./configure
+ make
-
---- orig/Makefile.in 2005-07-07 23:11:07
-+++ Makefile.in 2004-07-03 20:20:15
-@@ -34,7 +34,7 @@ OBJS1=rsync.o generator.o receiver.o cle
- main.o checksum.o match.o syscall.o log.o backup.o
- OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
- fileio.o batch.o clientname.o
+based-on: 8946cfc6f8018e30740ee1db4cc2e2008e4f7e7e
+diff --git a/Makefile.in b/Makefile.in
+--- a/Makefile.in
++++ b/Makefile.in
+@@ -39,7 +39,7 @@ OBJS1=flist.o rsync.o generator.o receiver.o cleanup.o sender.o exclude.o \
+ util.o util2.o main.o checksum.o match.o syscall.o log.o backup.o delete.o
+ OBJS2=options.o io.o compat.o hlink.o token.o uidlist.o socket.o hashtable.o \
+ fileio.o batch.o clientname.o chmod.o acls.o xattrs.o
-OBJS3=progress.o pipe.o
+OBJS3=progress.o pipe.o hashlink.o
DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
popt/popthelp.o popt/poptparse.o
---- orig/hashlink.c 2004-09-24 16:44:25
-+++ hashlink.c 2004-09-24 16:44:25
-@@ -0,0 +1,340 @@
+diff --git a/checksum.c b/checksum.c
+--- a/checksum.c
++++ b/checksum.c
+@@ -21,8 +21,11 @@
+
+ #include "rsync.h"
+
++extern int checksum_len;
+ extern int checksum_seed;
+ extern int protocol_version;
++extern char *link_by_hash_dir;
++extern char link_by_hash_extra_sum[MAX_DIGEST_LEN];
+
+ /*
+ a simple 32 bit checksum that can be upadted from either end
+@@ -151,7 +154,7 @@ void file_checksum(char *fname, char *sum, OFF_T size)
+ }
+
+ static int32 sumresidue;
+-static md_context md;
++static md_context md, md2;
+
+ void sum_init(int seed)
+ {
+@@ -164,6 +167,8 @@ void sum_init(int seed)
+ sumresidue = 0;
+ SIVAL(s, 0, seed);
+ sum_update(s, 4);
++ if (link_by_hash_dir)
++ md5_begin(&md2);
+ }
+ }
+
+@@ -182,6 +187,9 @@ void sum_update(const char *p, int32 len)
+ return;
+ }
+
++ if (link_by_hash_dir)
++ md5_update(&md2, (uchar *)p, len);
++
+ if (len + sumresidue < CSUM_CHUNK) {
+ memcpy(md.buffer + sumresidue, p, len);
+ sumresidue += len;
+@@ -214,6 +222,9 @@ int sum_end(char *sum)
+ return MD5_DIGEST_LEN;
+ }
+
++ if (link_by_hash_dir)
++ md5_result(&md2, (uchar *)link_by_hash_extra_sum);
++
+ if (sumresidue || protocol_version >= 27)
+ mdfour_update(&md, (uchar *)md.buffer, sumresidue);
+
+diff --git a/clientserver.c b/clientserver.c
+--- a/clientserver.c
++++ b/clientserver.c
+@@ -50,6 +50,7 @@ extern int logfile_format_has_i;
+ extern int logfile_format_has_o_or_i;
+ extern char *bind_address;
+ extern char *config_file;
++extern char *link_by_hash_dir;
+ extern char *logfile_format;
+ extern char *files_from;
+ extern char *tmpdir;
+@@ -548,6 +549,9 @@ static int rsync_module(int f_in, int f_out, int i, const char *addr, const char
+ return -1;
+ }
+
++ if (*lp_link_by_hash_dir(i))
++ link_by_hash_dir = lp_link_by_hash_dir(i);
++
+ if (am_daemon && am_server) {
+ rprintf(FLOG, "rsync allowed access on module %s from %s (%s)\n",
+ name, host, addr);
+diff --git a/compat.c b/compat.c
+--- a/compat.c
++++ b/compat.c
+@@ -55,6 +55,7 @@ extern char *partial_dir;
+ extern char *dest_option;
+ extern char *files_from;
+ extern char *filesfrom_host;
++extern char *link_by_hash_dir;
+ extern filter_rule_list filter_list;
+ extern int need_unsorted_flist;
+ #ifdef ICONV_OPTION
+diff --git a/hashlink.c b/hashlink.c
+new file mode 100644
+--- /dev/null
++++ b/hashlink.c
+@@ -0,0 +1,92 @@
+/*
+ Copyright (C) Cronosys, LLC 2004
+
+/* This file contains code used by the --link-by-hash option. */
+
+#include "rsync.h"
++#include "inums.h"
+
++extern int protocol_version;
+extern char *link_by_hash_dir;
++extern char sender_file_sum[MAX_DIGEST_LEN];
+
-+#if HAVE_LINK
-+
-+char* make_hash_name(struct file_struct *file)
-+{
-+ char hash[33], *dst;
-+ unsigned char *src;
-+ unsigned char c;
-+ int i;
-+
-+ src = (unsigned char*)file->u.sum;
-+ for (dst = hash, i = 0; i < 4; i++, src++) {
-+ c = *src >> 4;
-+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
-+ c = *src & 0x0f;
-+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
-+ }
-+ *dst++ = '/';
-+ for (i = 0; i < 12; i++, src++) {
-+ c = *src >> 4;
-+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
-+ c = *src & 0x0f;
-+ *(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
-+ }
-+ *dst = 0;
++char link_by_hash_extra_sum[MAX_DIGEST_LEN]; /* Only used when md4 sums are in the transfer */
+
-+ asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
-+ return dst;
-+}
++#ifdef HAVE_LINK
+
-+
-+void kill_hashfile(struct hashfile_struct *hashfile)
-+{
-+ if (!hashfile)
-+ return;
-+ free(hashfile->name);
-+ close(hashfile->fd);
-+ free(hashfile);
-+}
-+
-+
-+void kill_hashfiles(struct hashfile_struct *hashfiles)
-+{
-+ struct hashfile_struct *iter, *next;
-+ if ((iter = hashfiles) != NULL) {
-+ do {
-+ next = iter->next;
-+ kill_hashfile(iter);
-+ iter = next;
-+ } while (iter != hashfiles);
-+ }
-+}
-+
-+
-+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
++/* This function is always called after a file is received, so the
++ * sender_file_sum buffer has whatever the last checksum was for the
++ * transferred file. */
++void link_by_hash(const char *fname, const char *fnametmp, struct file_struct *file)
+{
-+ DIR *d;
-+ struct dirent *di;
-+ struct hashfile_struct *hashfiles = NULL, *hashfile;
+ STRUCT_STAT st;
-+ long this_fnbr;
-+
-+ *fnbr = 0;
-+
-+ /* Build a list of potential candidates and open
-+ * them. */
-+ if ((d = opendir(hashname)) == NULL) {
-+ rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
-+ free(hashname);
-+ return NULL;
-+ }
-+ while ((di = readdir(d)) != NULL) {
-+ if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
-+ continue;
-+ }
-+
-+ /* We need to have the largest fnbr in case we need to store
-+ * a new file. */
-+ this_fnbr = atol(di->d_name);
-+ if (this_fnbr > *fnbr)
-+ *fnbr = this_fnbr;
-+
-+ hashfile = new_array(struct hashfile_struct, 1);
-+ asprintf(&hashfile->name,"%s/%s",hashname,
-+ di->d_name);
-+ if (do_stat(hashfile->name,&st) == -1) {
-+ rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
-+ kill_hashfile(hashfile);
-+ continue;
-+ }
-+ if (st.st_size != size) {
-+ kill_hashfile(hashfile);
-+ continue;
-+ }
-+ hashfile->nlink = st.st_nlink;
-+ hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
-+ if (hashfile->fd == -1) {
-+ rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
-+ kill_hashfile(hashfile);
-+ continue;
-+ }
-+ if (hashfiles == NULL)
-+ hashfiles = hashfile->next = hashfile->prev = hashfile;
-+ else {
-+ hashfile->next = hashfiles;
-+ hashfile->prev = hashfiles->prev;
-+ hashfile->next->prev = hashfile;
-+ hashfile->prev->next = hashfile;
-+ }
-+ }
-+ closedir(d);
-+
-+ return hashfiles;
-+}
-+
-+
-+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
-+{
-+ int amt, hamt;
-+ char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
-+ struct hashfile_struct *iter, *next, *best;
-+ uint32 nlink;
-+
-+ if (!files)
-+ return NULL;
-+
-+ iter = files; /* in case files are 0 bytes */
-+ while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
-+ iter = files;
-+ do {
-+ /* Icky bit to resync when we steal the first node. */
-+ if (!files)
-+ files = iter;
-+
-+ next = iter->next;
-+
-+ hamt = read(iter->fd, cmpbuffer, BUFSIZ);
-+ if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
-+ if (iter == files) {
-+ files = files->prev;
-+ }
-+ if (iter->next == iter) {
-+ files = next = NULL;
-+ } else {
-+ next = iter->next;
-+ if (iter == files) {
-+ /* So we know to resync */
-+ files = NULL;
-+ }
-+ }
-+ iter->next->prev = iter->prev;
-+ iter->prev->next = iter->next;
-+ kill_hashfile(iter);
-+ }
-+
-+ iter = next;
-+ } while (iter != files);
-+
-+ if (iter == NULL && files == NULL) {
-+ /* There are no matches. */
-+ return NULL;
-+ }
-+ }
-+
-+ if (amt == -1) {
-+ rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
-+ kill_hashfiles(files);
-+ return NULL;
-+ }
-+
-+ /* If we only have one file left, use it. */
-+ if (files == files->next) {
-+ return files;
-+ }
++ char *hashname, *last_slash, *num_str;
++ const char *hex;
++ int num = 0;
+
-+ /* All files which remain in the list are identical and should have
-+ * the same size. We pick the one with the lowest link count (we
-+ * may have rolled over because we hit the maximum link count for
-+ * the filesystem). */
-+ best = iter = files;
-+ nlink = iter->nlink;
-+ do {
-+ if (iter->nlink < nlink) {
-+ nlink = iter->nlink;
-+ best = iter;
-+ }
-+ iter = iter->next;
-+ } while (iter != files);
-+
-+ best->next->prev = best->prev;
-+ best->prev->next = best->next;
-+ if (files == best)
-+ files = files->next;
-+ kill_hashfiles(files);
-+ return best;
-+}
-+
-+
-+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
-+{
-+ STRUCT_STAT st;
-+ char *hashname = make_hash_name(file);
-+ int first = 0, rc;
-+ char *linkname;
-+ long last_fnbr;
++ /* We don't bother to hard-link 0-length files. */
++ if (F_LENGTH(file) == 0)
++ return;
+
-+ if (file->length == 0) {
-+ return robust_rename(fnametmp,fname,0644);
++ hex = sum_as_hex(protocol_version >= 30 ? sender_file_sum : link_by_hash_extra_sum);
++ if (asprintf(&hashname, "%s/%.3s/%.3s/%.3s/%s.%s.000000",
++ link_by_hash_dir, hex, hex+3, hex+6, hex+9, big_num(F_LENGTH(file))) < 0)
++ {
++ out_of_memory("make_hash_name");
+ }
+
-+ if (do_stat(hashname, &st) == -1) {
-+ char *dirname;
-+
-+ /* Directory does not exist. */
-+ dirname = strdup(hashname);
-+ *strrchr(dirname,'/') = 0;
-+ if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
-+ rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
-+ free(hashname);
-+ free(dirname);
-+ return robust_rename(fnametmp,fname,0644);
-+ }
-+ free(dirname);
++ last_slash = strrchr(hashname, '/');
++ num_str = strrchr(last_slash, '.') + 1;
+
-+ if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
-+ rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
-+ free(hashname);
-+ return robust_rename(fnametmp,fname,0644);
++ while (1) {
++ if (num >= 999999) { /* Surely we'll never reach this... */
++ if (DEBUG_GTE(HASHLINK, 1))
++ rprintf(FINFO, "link-by-hash: giving up after \"%s\".\n", hashname);
++ goto cleanup;
+ }
++ if (num > 0 && DEBUG_GTE(HASHLINK, 1))
++ rprintf(FINFO, "link-by-hash: max link count exceeded, starting new file \"%s\".\n", hashname);
+
-+ first = 1;
-+ asprintf(&linkname,"%s/0",hashname);
-+ rprintf(FINFO, "(1) linkname = %s\n", linkname);
-+ } else {
-+ struct hashfile_struct *hashfiles, *hashfile;
-+
-+ if (do_stat(fnametmp,&st) == -1) {
-+ rsyserr(FERROR, errno, "stat failed: %s", fname);
-+ return -1;
-+ }
-+ hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
++ snprintf(num_str, 7, "%d", num++);
++ if (do_stat(hashname, &st) < 0)
++ break;
+
-+ if (hashfiles == NULL) {
-+ first = 1;
-+ asprintf(&linkname,"%s/0",hashname);
-+ rprintf(FINFO, "(2) linkname = %s\n", linkname);
++ if (do_link(hashname, fnametmp) < 0) {
++ if (errno == EMLINK)
++ continue;
++ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"", hashname, full_fname(fname));
+ } else {
-+ int fd;
-+ /* Search for one identical to us. */
-+ if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
-+ rsyserr(FERROR, errno, "open failed: %s", fnametmp);
-+ kill_hashfiles(hashfiles);
-+ return -1;
-+ }
-+ hashfile = compare_hashfiles(fd, hashfiles);
-+ hashfiles = NULL;
-+ close(fd);
-+
-+ if (hashfile) {
-+ first = 0;
-+ linkname = strdup(hashfile->name);
-+ rprintf(FINFO, "(3) linkname = %s\n", linkname);
-+ kill_hashfile(hashfile);
-+ } else {
-+ first = 1;
-+ asprintf(&linkname, "%s/%ld", hashname,
-+ last_fnbr + 1);
-+ rprintf(FINFO, "(4) linkname = %s\n", linkname);
-+ }
++ if (DEBUG_GTE(HASHLINK, 2))
++ rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n", hashname, full_fname(fname));
++ robust_rename(fnametmp, fname, NULL, 0644);
+ }
-+ }
+
-+ if (!first) {
-+ rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
-+ linkname, full_fname(fname));
-+ robust_unlink(fname);
-+ rc = do_link(linkname, fname);
-+ if (rc == -1) {
-+ if (errno == EMLINK) {
-+ first = 1;
-+ free(linkname);
-+ asprintf(&linkname,"%s/%ld",hashname,
-+ last_fnbr + 1);
-+ rprintf(FINFO, "(5) linkname = %s\n", linkname);
-+ rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
-+ } else {
-+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
-+ linkname, full_fname(fname));
-+ rc = robust_rename(fnametmp,fname,0644);
-+ }
-+ } else {
-+ do_unlink(fnametmp);
-+ }
++ goto cleanup;
+ }
+
-+ if (first) {
-+ rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
-+ full_fname(fname),linkname);
++ if (DEBUG_GTE(HASHLINK, 2))
++ rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n", full_fname(fname), hashname);
+
-+ rc = robust_rename(fnametmp,fname,0644);
-+ if (rc != 0) {
-+ rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
-+ full_fname(fnametmp), full_fname(fname));
-+ }
-+ rc = do_link(fname,linkname);
-+ if (rc != 0) {
-+ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
-+ full_fname(fname), linkname);
-+ }
-+ }
++ if (do_link(fname, hashname) < 0
++ && (errno != ENOENT || make_path(hashname, MKP_DROP_NAME) < 0 || do_link(fname, hashname) < 0))
++ rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"", full_fname(fname), hashname);
+
-+ free(linkname);
++ cleanup:
+ free(hashname);
-+ return rc;
+}
-+
+#endif
---- orig/options.c 2005-08-27 21:11:26
-+++ options.c 2005-05-19 08:55:42
-@@ -141,6 +141,7 @@ char *log_format = NULL;
- char *password_file = NULL;
- char *rsync_path = RSYNC_PATH;
- char *backup_dir = NULL;
+diff --git a/loadparm.c b/loadparm.c
+--- a/loadparm.c
++++ b/loadparm.c
+@@ -119,6 +119,7 @@ typedef struct {
+ char *include;
+ char *include_from;
+ char *incoming_chmod;
++ char *link_by_hash_dir;
+ char *lock_file;
+ char *log_file;
+ char *log_format;
+@@ -195,6 +196,7 @@ static const all_vars Defaults = {
+ /* include; */ NULL,
+ /* include_from; */ NULL,
+ /* incoming_chmod; */ NULL,
++ /* link_by_hash_dir; */ NULL,
+ /* lock_file; */ DEFAULT_LOCK_FILE,
+ /* log_file; */ NULL,
+ /* log_format; */ "%o %h [%a] %m (%u) %f %l",
+@@ -336,6 +338,7 @@ static struct parm_struct parm_table[] =
+ {"include from", P_STRING, P_LOCAL, &Vars.l.include_from, NULL,0},
+ {"include", P_STRING, P_LOCAL, &Vars.l.include, NULL,0},
+ {"incoming chmod", P_STRING, P_LOCAL, &Vars.l.incoming_chmod, NULL,0},
++ {"link by hash dir", P_STRING, P_LOCAL, &Vars.l.link_by_hash_dir, NULL,0},
+ {"list", P_BOOL, P_LOCAL, &Vars.l.list, NULL,0},
+ {"lock file", P_STRING, P_LOCAL, &Vars.l.lock_file, NULL,0},
+ {"log file", P_STRING, P_LOCAL, &Vars.l.log_file, NULL,0},
+@@ -464,6 +467,7 @@ FN_LOCAL_STRING(lp_hosts_deny, hosts_deny)
+ FN_LOCAL_STRING(lp_include, include)
+ FN_LOCAL_STRING(lp_include_from, include_from)
+ FN_LOCAL_STRING(lp_incoming_chmod, incoming_chmod)
++FN_LOCAL_STRING(lp_link_by_hash_dir, link_by_hash_dir)
+ FN_LOCAL_STRING(lp_lock_file, lock_file)
+ FN_LOCAL_STRING(lp_log_file, log_file)
+ FN_LOCAL_STRING(lp_log_format, log_format)
+diff --git a/options.c b/options.c
+--- a/options.c
++++ b/options.c
+@@ -158,6 +158,7 @@ char *backup_suffix = NULL;
+ char *tmpdir = NULL;
+ char *partial_dir = NULL;
+ char *basis_dir[MAX_BASIS_DIRS+1];
+char *link_by_hash_dir = NULL;
- char backup_dir_buf[MAXPATHLEN];
- int rsync_port = 0;
- int compare_dest = 0;
-@@ -322,6 +323,7 @@ void usage(enum logcode F)
+ char *config_file = NULL;
+ char *shell_cmd = NULL;
+ char *logfile_name = NULL;
+@@ -207,7 +208,7 @@ static const char *debug_verbosity[] = {
+ /*2*/ "BIND,CMD,CONNECT,DEL,DELTASUM,DUP,FILTER,FLIST,ICONV",
+ /*3*/ "ACL,BACKUP,CONNECT2,DELTASUM2,DEL2,EXIT,FILTER2,FLIST2,FUZZY,GENR,OWN,RECV,SEND,TIME",
+ /*4*/ "CMD2,DELTASUM3,DEL3,EXIT2,FLIST3,ICONV2,OWN2,PROTO,TIME2",
+- /*5*/ "CHDIR,DELTASUM4,FLIST4,FUZZY2,HASH,HLINK",
++ /*5*/ "CHDIR,DELTASUM4,FLIST4,FUZZY2,HASH,HASHLINK,HLINK",
+ };
+
+ #define MAX_VERBOSITY ((int)(sizeof debug_verbosity / sizeof debug_verbosity[0]) - 1)
+@@ -277,6 +278,7 @@ static struct output_struct debug_words[COUNT_DEBUG+1] = {
+ DEBUG_WORD(FUZZY, W_REC, "Debug fuzzy scoring (levels 1-2)"),
+ DEBUG_WORD(GENR, W_REC, "Debug generator functions"),
+ DEBUG_WORD(HASH, W_SND|W_REC, "Debug hashtable code"),
++ DEBUG_WORD(HASHLINK, W_REC, "Debug hashlink code (levels 1-2)"),
+ DEBUG_WORD(HLINK, W_SND|W_REC, "Debug hard-link actions (levels 1-3)"),
+ DEBUG_WORD(ICONV, W_CLI|W_SRV, "Debug iconv character conversions (levels 1-2)"),
+ DEBUG_WORD(IO, W_CLI|W_SRV, "Debug I/O routines (levels 1-4)"),
+@@ -761,6 +763,7 @@ void usage(enum logcode F)
rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
+ rprintf(F," --link-by-hash=DIR create hardlinks by hash into DIR\n");
rprintf(F," -z, --compress compress file data during the transfer\n");
- rprintf(F," -C, --cvs-exclude auto-ignore files the same way CVS does\n");
- rprintf(F," -f, --filter=RULE add a file-filtering RULE\n");
-@@ -362,7 +364,7 @@ void usage(enum logcode F)
-
- enum {OPT_VERSION = 1000, OPT_DAEMON, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
- OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST,
-- OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
-+ OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_LINK_BY_HASH,
+ rprintf(F," --compress-level=NUM explicitly set compression level\n");
+ rprintf(F," --skip-compress=LIST skip compressing files with a suffix in LIST\n");
+@@ -817,7 +820,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
+ OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP,
+ OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
- OPT_REFUSED_BASE = 9000};
+- OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_INFO, OPT_DEBUG,
++ OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_INFO, OPT_DEBUG, OPT_LINK_BY_HASH,
+ OPT_USERMAP, OPT_GROUPMAP, OPT_CHOWN, OPT_BWLIMIT,
+ OPT_SERVER, OPT_REFUSED_BASE = 9000};
-@@ -446,6 +448,7 @@ static struct poptOption long_options[]
+@@ -961,6 +964,7 @@ static struct poptOption long_options[] = {
{"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
{"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
{"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
+ {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
- {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 },
- {"compress", 'z', POPT_ARG_NONE, &do_compression, 0, 0, 0 },
- {0, 'P', POPT_ARG_NONE, 0, 'P', 0, 0 },
-@@ -916,6 +919,21 @@ int parse_arguments(int *argc, const cha
- basis_dir[basis_dir_cnt++] = (char *)arg;
- break;
+ {"fuzzy", 'y', POPT_ARG_NONE, 0, 'y', 0, 0 },
+ {"no-fuzzy", 0, POPT_ARG_VAL, &fuzzy_basis, 0, 0, 0 },
+ {"no-y", 0, POPT_ARG_VAL, &fuzzy_basis, 0, 0, 0 },
+@@ -1308,6 +1312,9 @@ int parse_arguments(int *argc_p, const char ***argv_p)
+ iconv_opt = strdup(arg);
+ #endif
+
++ if (*lp_link_by_hash_dir(module_id))
++ set_refuse_options("link-by-hash");
++
+ /* TODO: Call poptReadDefaultConfig; handle errors. */
+
+ /* The context leaks in case of an error, but if there's a
+@@ -1794,6 +1801,21 @@ int parse_arguments(int *argc_p, const char ***argv_p)
+ return 0;
+ #endif
+ case OPT_LINK_BY_HASH:
-+#if HAVE_LINK
++#ifdef HAVE_LINK
+ arg = poptGetOptArg(pc);
+ if (sanitize_paths)
-+ arg = sanitize_path(NULL, arg, NULL, 0);
++ arg = sanitize_path(NULL, arg, NULL, 0, SP_DEFAULT);
+ link_by_hash_dir = (char *)arg;
+ break;
+#else
default:
/* A large opt value means that set_refuse_options()
* turned this option off. */
-@@ -1507,6 +1525,11 @@ void server_options(char **args,int *arg
- }
+@@ -2069,6 +2091,8 @@ int parse_arguments(int *argc_p, const char ***argv_p)
+ tmpdir = sanitize_path(NULL, tmpdir, NULL, 0, SP_DEFAULT);
+ if (backup_dir)
+ backup_dir = sanitize_path(NULL, backup_dir, NULL, 0, SP_DEFAULT);
++ if (link_by_hash_dir)
++ link_by_hash_dir = sanitize_path(NULL, link_by_hash_dir, NULL, 0, SP_DEFAULT);
}
+ if (daemon_filter_list.head && !am_sender) {
+ filter_rule_list *elp = &daemon_filter_list;
+@@ -2717,6 +2741,12 @@ void server_options(char **args, int *argc_p)
+ } else if (inplace)
+ args[ac++] = "--inplace";
+ if (link_by_hash_dir && am_sender) {
+ args[ac++] = "--link-by-hash";
+ args[ac++] = link_by_hash_dir;
++ link_by_hash_dir = NULL; /* optimize sending-side checksums */
+ }
+
if (files_from && (!am_sender || filesfrom_host)) {
if (filesfrom_host) {
args[ac++] = "--files-from";
---- orig/receiver.c 2005-08-17 06:45:08
-+++ receiver.c 2005-01-15 21:29:13
-@@ -53,6 +53,7 @@ extern int delay_updates;
- extern struct stats stats;
- extern char *log_format;
- extern char *tmpdir;
-+extern char *link_by_hash_dir;
- extern char *partial_dir;
- extern char *basis_dir[];
- extern struct file_list *the_file_list;
-@@ -186,12 +187,13 @@ static int get_tmpname(char *fnametmp, c
-
-
- static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
-- char *fname, int fd, OFF_T total_size)
-+ char *fname, int fd, OFF_T total_size, char *md4)
- {
- static char file_sum1[MD4_SUM_LENGTH];
- static char file_sum2[MD4_SUM_LENGTH];
- struct map_struct *mapbuf;
- struct sum_struct sum;
-+ struct mdfour mdfour_data;
- int32 len;
- OFF_T offset = 0;
- OFF_T offset2;
-@@ -211,6 +213,9 @@ static int receive_data(int f_in, char *
- } else
- mapbuf = NULL;
-
-+ if (md4)
-+ mdfour_begin(&mdfour_data);
-+
- sum_init(checksum_seed);
-
- if (append_mode) {
-@@ -253,6 +258,8 @@ static int receive_data(int f_in, char *
- cleanup_got_literal = 1;
-
- sum_update(data, i);
-+ if (md4)
-+ mdfour_update(&mdfour_data,data,i);
-
- if (fd != -1 && write_file(fd,data,i) != i)
- goto report_write_error;
-@@ -279,6 +286,8 @@ static int receive_data(int f_in, char *
-
- see_token(map, len);
- sum_update(map, len);
-+ if (md4)
-+ mdfour_update(&mdfour_data,map,len);
- }
-
- if (inplace) {
-@@ -319,6 +328,8 @@ static int receive_data(int f_in, char *
- }
-
- sum_end(file_sum1);
-+ if (md4)
-+ mdfour_result(&mdfour_data, (unsigned char*)md4);
-
- if (mapbuf)
- unmap_file(mapbuf);
-@@ -334,7 +345,7 @@ static int receive_data(int f_in, char *
-
- static void discard_receive_data(int f_in, OFF_T length)
- {
-- receive_data(f_in, NULL, -1, 0, NULL, -1, length);
-+ receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
- }
-
- static void handle_delayed_updates(struct file_list *flist, char *local_name)
-@@ -663,8 +674,12 @@ int recv_files(int f_in, struct file_lis
- rprintf(FINFO, "%s\n", safe_fname(fname));
-
- /* recv file data */
-+#if HAVE_LINK
-+ if (link_by_hash_dir)
-+ file->u.sum = new_array(char, MD4_SUM_LENGTH);
-+#endif
- recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
-- fname, fd2, file->length);
-+ fname, fd2, file->length, file->u.sum);
-
- if (!log_before_transfer)
- log_item(file, &initial_stats, iflags, NULL);
---- orig/rsync.c 2005-07-27 23:31:12
-+++ rsync.c 2005-02-21 11:04:36
-@@ -38,6 +38,7 @@ extern int inplace;
+diff --git a/rsync.c b/rsync.c
+--- a/rsync.c
++++ b/rsync.c
+@@ -49,6 +49,7 @@ extern int flist_eof;
+ extern int file_old_total;
extern int keep_dirlinks;
extern int make_backups;
- extern struct stats stats;
+extern char *link_by_hash_dir;
-
-
- /*
-@@ -190,7 +191,12 @@ void finish_transfer(char *fname, char *
- rprintf(FINFO, "renaming %s to %s\n",
- safe_fname(fnametmp), safe_fname(fname));
+ extern struct file_list *cur_flist, *first_flist, *dir_flist;
+ extern struct chmod_mode_struct *daemon_chmod_modes;
+ #ifdef ICONV_OPTION
+@@ -679,6 +680,10 @@ int finish_transfer(const char *fname, const char *fnametmp,
}
-- ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
-+#if HAVE_LINK
-+ if (link_by_hash_dir)
-+ ret = link_by_hash(fnametmp, fname, file);
-+ else
+ if (ret == 0) {
+ /* The file was moved into place (not copied), so it's done. */
++#ifdef HAVE_LINK
++ if (link_by_hash_dir)
++ link_by_hash(fname, fnametmp, file);
+#endif
-+ ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
- if (ret < 0) {
- rsyserr(FERROR, errno, "%s %s -> \"%s\"",
- ret == -2 ? "copy" : "rename",
---- orig/rsync.h 2005-08-17 06:45:08
-+++ rsync.h 2004-07-03 20:20:15
-@@ -639,6 +639,14 @@ struct stats {
- int current_file_index;
- };
-
-+struct hashfile_struct {
-+ struct hashfile_struct *next;
-+ struct hashfile_struct *prev;
-+ char *name;
-+ int fd;
-+ uint32 nlink;
-+};
-+
-
- #include "byteorder.h"
- #include "lib/mdfour.h"
---- orig/rsync.yo 2005-08-27 21:05:12
-+++ rsync.yo 2005-02-13 06:58:47
-@@ -356,6 +356,7 @@ to the detailed description below for a
+ return 1;
+ }
+ /* The file was copied, so tweak the perms of the copied file. If it
+diff --git a/rsync.h b/rsync.h
+--- a/rsync.h
++++ b/rsync.h
+@@ -1265,7 +1265,8 @@ extern short info_levels[], debug_levels[];
+ #define DEBUG_FUZZY (DEBUG_FLIST+1)
+ #define DEBUG_GENR (DEBUG_FUZZY+1)
+ #define DEBUG_HASH (DEBUG_GENR+1)
+-#define DEBUG_HLINK (DEBUG_HASH+1)
++#define DEBUG_HASHLINK (DEBUG_HASH+1)
++#define DEBUG_HLINK (DEBUG_HASHLINK+1)
+ #define DEBUG_ICONV (DEBUG_HLINK+1)
+ #define DEBUG_IO (DEBUG_ICONV+1)
+ #define DEBUG_OWN (DEBUG_IO+1)
+diff --git a/rsync.yo b/rsync.yo
+--- a/rsync.yo
++++ b/rsync.yo
+@@ -416,6 +416,7 @@ to the detailed description below for a complete description. verb(
--compare-dest=DIR also compare received files relative to DIR
--copy-dest=DIR ... and include copies of unchanged files
--link-dest=DIR hardlink to files in DIR when unchanged
+ --link-by-hash=DIR create hardlinks by hash into DIR
-z, --compress compress file data during the transfer
- -C, --cvs-exclude auto-ignore files in the same way CVS does
- -f, --filter=RULE add a file-filtering RULE
+ --compress-level=NUM explicitly set compression level
+ --skip-compress=LIST skip compressing files with suffix in LIST
+@@ -1849,6 +1850,48 @@ bf(--link-dest) from working properly for a non-super-user when bf(-o) was
+ specified (or implied by bf(-a)). You can work-around this bug by avoiding
+ the bf(-o) option when sending to an old rsync.
+
++dit(bf(--link-by-hash=DIR)) This option hard links the destination files into
++em(DIR), a link farm arranged by MD5 file hash. The result is that the system
++will only store (usually) one copy of the unique contents of each file,
++regardless of the file's name (it will use extra files if the links overflow
++the available maximum).
++
++This patch does not take into account file permissions, extended attributes,
++or ACLs when linking things together, so you should only use this if you
++don't care about preserving those extra file attributes (or if they are
++always the same for identical files).
++
++The DIR is relative to the destination directory, so either specify a full
++path to the hash hierarchy, or specify a relative path that puts the links
++outside the destination (e.g. "../links").
++
++Keep in mind that the hierarchy is never pruned, so if you need to reclaim
++space, you should remove any files that have just one link (since they are not
++linked into any destination dirs anymore):
++
++ find $DIR -links 1 -delete
++
++The link farm's directory hierarchy is determined by the file's (32-char) MD5
++hash and the file-length. The hash is split up into directory shards. For
++example, if a file is 54321 bytes long, it could be stored like this:
++
++ $DIR/123/456/789/01234567890123456789012.54321.0
++
++Note that the directory layout in this patch was modified for version 3.1.0,
++so anyone using an older version of this patch should move their existing
++link hierarchy out of the way and then use the newer rsync to copy the saved
++hierarchy into its new layout. Assuming that no files have overflowed their
++link limits, this would work:
++
++ mv $DIR $DIR.old
++ rsync -aiv --link-by-hash=$DIR $DIR.old/ $DIR.tmp/
++ rm -rf $DIR.tmp
++ rm -rf $DIR.old
++
++If some of your files are at their link limit, you'd be better off using a
++script to calculate the md5 sum of each file in the hierarchy and move it
++to its new location.
++
+ dit(bf(-z, --compress)) With this option, rsync compresses the file data
+ as it is sent to the destination machine, which reduces the amount of data
+ being transmitted -- something that is useful over a slow connection.
+diff --git a/rsyncd.conf.yo b/rsyncd.conf.yo
+--- a/rsyncd.conf.yo
++++ b/rsyncd.conf.yo
+@@ -284,6 +284,21 @@ message telling them to try later. The default is 0, which means no limit.
+ A negative value disables the module.
+ See also the "lock file" parameter.
+
++dit(bf(link by hash dir)) When the "link by hash dir" parameter is set to a
++non-empty string, received files will be hard linked into em(DIR), a link farm
++arranged by MD5 file hash. See the bf(--link-by-hash) option for a full
++explanation.
++
++The em(DIR) must be accessible inside any chroot restrictions for the module,
++but can exist outside the transfer location if there is an inside-the-chroot
++path to the module (see "use chroot"). Note that a user-specified option does
++not allow this outside-the-transfer-area placement.
++
++If this parameter is set, it will disable the bf(--link-by-hash) command-line
++option for copies into the module.
++
++The default is for this parameter to be unset.
++
+ dit(bf(log file)) When the "log file" parameter is set to a non-empty
+ string, the rsync daemon will log messages to the indicated file rather
+ than using syslog. This is particularly useful on systems (such as AIX)