2 this is an attempted implementation of Broders shingles algorithm for similarity testing of documents
4 see http://gatekeeper.dec.com/pub/DEC/SRC/publications/broder/positano-final-wpnums.pdf
19 #define MAX_WORD_SIZE 8
21 #define SKETCH_SIZE 32
23 #define HASH_PRIME 0x01000193
24 #define HASH_INIT 0x28021967
26 #define FLAG_IGNORE_HEADERS 1
28 #define ROLLING_WINDOW 20
31 typedef unsigned char uchar;
33 #define QSORT_CAST (int (*)(const void *, const void *))
42 static void parse_sketch(const uchar *str, struct u32_set *v)
46 for (i=0;i<SKETCH_SIZE;i++) {
50 v->v[i] = (v->v[i] << 7) | *str++;
56 static int u32_comp(u32 *v1, u32 *v2)
67 static void u32_union(struct u32_set *v1, struct u32_set *v2, struct u32_set *u)
70 u32 vv[SKETCH_SIZE*2];
72 n = v1->count + v2->count;
73 memcpy(vv, v1->v, v1->count * sizeof(u32));
74 memcpy(&vv[v1->count], v2->v, v2->count * sizeof(u32));
76 qsort(vv, n, sizeof(u32), QSORT_CAST u32_comp);
82 if (vv[i] != u->v[u->count-1]) {
83 u->v[u->count++] = vv[i];
88 static void u32_intersect(struct u32_set *v1, struct u32_set *v2, struct u32_set *u)
90 u32 vv[SKETCH_SIZE*2];
93 memcpy(vv, v1->v, v1->count*sizeof(u32));
94 memcpy(&vv[v1->count], v2->v, v2->count*sizeof(u32));
95 qsort(vv, v1->count + v2->count, sizeof(u32), QSORT_CAST u32_comp);
99 for (i=1;i<v1->count + v2->count; i++) {
100 if (vv[i] == vv[i-1]) {
101 u->v[u->count++] = vv[i];
106 static void u32_min(struct u32_set *v)
108 qsort(v->v, v->count, sizeof(u32), QSORT_CAST u32_comp);
109 if (v->count > SKETCH_SIZE) {
110 v->count = SKETCH_SIZE;
115 given two sketch strings return a value indicating the degree to which they match.
117 static u32 sketch_match(const char *str1, const char *str2)
120 struct u32_set v1, v2;
122 struct u32_set v3, v4;
124 parse_sketch(str1, &v1);
125 parse_sketch(str2, &v2);
127 u32_union(&v1, &v2, &u1);
130 u32_intersect(&u1, &v1, &v3);
131 u32_intersect(&v3, &v2, &v4);
133 score = (100 * v4.count) / u1.count;
139 return the maximum match for a file containing a list of sketchs
141 static u32 sketch_match_db(const char *fname, const char *sum, u32 threshold)
147 f = fopen(fname, "r");
150 /* on each line of the database we compute the sketch match
151 score. We then pick the best score */
152 while (fgets(line, sizeof(line)-1, f)) {
156 if (line[len-1] == '\n') line[len-1] = 0;
158 score = sketch_match(sum, line);
162 if (best >= threshold) break;
172 /* a simple non-rolling hash, based on the FNV hash */
173 static inline u32 sum_hash(uchar c, u32 h)
181 uchar window[ROLLING_WINDOW];
187 a rolling hash, based on the Adler checksum. By using a rolling hash
188 we can perform auto resynchronisation after inserts/deletes
190 internally, h1 is the sum of the bytes in the window and h2
191 is the sum of the bytes times the index
193 h3 is a shift/xor based rolling hash, and is mostly needed to ensure that
194 we can cope with large blocksize values
196 static inline u32 roll_hash(uchar c)
198 roll_state.h2 -= roll_state.h1;
199 roll_state.h2 += ROLLING_WINDOW * c;
202 roll_state.h1 -= roll_state.window[roll_state.n % ROLLING_WINDOW];
204 roll_state.window[roll_state.n % ROLLING_WINDOW] = c;
207 roll_state.h3 = (roll_state.h3 << 5) & 0xFFFFFFFF;
210 return roll_state.h1 + roll_state.h2 + roll_state.h3;
214 reset the state of the rolling hash and return the initial rolling hash value
216 static u32 roll_reset(void)
218 memset(&roll_state, 0, sizeof(roll_state));
222 static uchar *advance(uchar *in, unsigned w, uchar *limit)
227 for (i=0;i<MAX_WORD_SIZE;i++) {
229 if (in >= limit || !isalnum(in[i])) break;
232 while (in < limit && !isalnum(in[0])) {
240 static u32 hash_emit(uchar *start, size_t len)
244 if (len == 0) return 0;
247 if (isalnum(*start)) {
248 v = sum_hash(*start, v);
260 static int el_comp(struct el *v1, struct el *v2)
262 if (v2->hash > v1->hash) {
265 if (v2->hash < v1->hash) {
271 static int el_comp2(struct el *v1, struct el *v2)
273 return v1->i - v2->i;
276 static uchar *shingle_sketch(uchar *in_0, size_t size, u32 flags)
280 struct el *sketch, *sketch2;
284 const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
287 return strdup("NULL");
290 /* if we are ignoring email headers then skip past them now */
291 if (flags & FLAG_IGNORE_HEADERS) {
292 uchar *s = strstr(in, "\n\n");
299 /* allocate worst case size */
300 sketch = calloc(sizeof(struct el), SKETCH_SIZE + size+1);
301 sketch2 = calloc(sizeof(struct el), SKETCH_SIZE + size+1);
302 ret = calloc(SKETCH_SIZE+1, 4);
306 p = advance(in, W_SHINGLES, in_0 + size);
308 sketch[sketch_size].hash = hash_emit(in, p-in);
309 sketch[sketch_size].i = sketch_size++;
311 while (p < in_0 + size) {
312 in = advance(in, 1, in_0 + size);
313 p = advance(p, 1, in_0 + size);
314 sketch[sketch_size].hash = hash_emit(in, p-in);
315 sketch[sketch_size].i = sketch_size++;
318 qsort(sketch, sketch_size, sizeof(struct el), QSORT_CAST el_comp);
320 sketch2[0] = sketch[0];
322 for (i=1;i<sketch_size;i++) {
323 if (sketch[i].hash != sketch2[n-1].hash) {
324 sketch2[n++] = sketch[i];
331 if (sketch_size > SKETCH_SIZE) {
332 qsort(sketch, SKETCH_SIZE, sizeof(struct el), QSORT_CAST el_comp2);
335 for (i=0;i<SKETCH_SIZE && i < sketch_size;i++) {
336 u32 v = sketch[i].hash;
339 ret[4*i + j] = b64[v % 64];
351 return the shingles sketch on stdin
353 static char *sketch_stdin(u32 flags)
361 msg = malloc(sizeof(buf));
362 if (!msg) return NULL;
364 /* load the file, expanding the allocation as needed. */
366 n = read(0, buf, sizeof(buf));
367 if (n == -1 && errno == EINTR) continue;
370 msg = realloc(msg, length + n);
371 if (!msg) return NULL;
373 memcpy(msg+length, buf, n);
377 sum = shingle_sketch(msg, length, flags);
386 return the sketch on a file
388 char *sketch_file(const char *fname, u32 flags)
395 if (strcmp(fname, "-") == 0) {
396 return sketch_stdin(flags);
399 fd = open(fname, O_RDONLY);
405 if (fstat(fd, &st) == -1) {
410 msg = mmap(NULL, st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0);
411 if (msg == (uchar *)-1) {
417 sum = shingle_sketch(msg, st.st_size, flags);
419 munmap(msg, st.st_size);
425 static void show_help(void)
428 shingles v1.0 written by Andrew Tridgell <tridge@samba.org>
431 shingles [options] <files>
433 shingles [options] -d sigs.txt -c SIG
435 shingles [options] -d sigs.txt -C file
437 When called with a list of filenames shingles will write out the
438 signatures of each file on a separate line. You can specify the
439 filename '-' for standard input.
441 When called with the second form, shingles will print the best score
442 for the given signature with the signatures in the given database. A
443 score of 100 means a perfect match, and a score of 0 means a complete
446 When checking, shingles returns 0 (success) when the message *is* spam,
447 1 for internal errors, and 2 for messages whose signature is not
450 The 3rd form is just like the second form, but you pass a file
451 containing a message instead of a pre-computed signature.
455 -H skip past mail headers
456 -T <threshold> set the threshold above which shingles will stop
462 int main(int argc, char *argv[])
474 while ((c = getopt(argc, argv, "B:WHd:c:C:hT:")) != -1) {
477 flags |= FLAG_IGNORE_HEADERS;
485 threshold = atoi(optarg);
493 score = sketch_match_db(dbname, optarg,
495 printf("%u\n", score);
496 exit(score >= threshold ? 0 : 2);
503 score = sketch_match_db(dbname,
504 sketch_file(optarg, flags),
506 printf("%u\n", score);
507 exit(score >= threshold ? 0 : 2);
524 /* compute the sketch on a list of files */
525 for (i=0;i<argc;i++) {
526 sum = sketch_file(argv[i], flags);