1 /*****************************************************************************
2 A simple C program to detect plagiarism
7 - collect groups of N tokens, overlapping by M tokens
9 - sort the resultant hashes
11 Andrew Tridgell <tridge@samba.org> July 2003
13 ******************************************************************************/
/* home-grown boolean type; this C89-era file predates <stdbool.h> */
21 typedef enum {False=0,True=1} BOOL;
26 /* should I be case sensitive? */
27 BOOL case_sensitive = False;
29 /* how big can a word get? */
30 #define MAX_WORD_LEN 500
32 /* how many files are we processing? */
/* NOTE(review): the num_files declaration itself is not visible in this view */
35 /* the names of the files we are processing - normally stolen from argv */
36 char **file_names = NULL;
38 /* how many words have we found? */
/* NOTE(review): the num_words declaration itself is not visible in this view */
41 /* a list of chars to ignore - can be interesting */
42 char ignore_chars[MAX_WORD_LEN] = "";
44 /* what debug level? */
/* NOTE(review): the debug declaration itself is not visible in this view */
48 /* will I add new words to the word list ? */
49 BOOL add_words = True;
51 /* shall I weight the answers ? */
52 BOOL do_weighting = True;
54 /* shall I concatenate all adjacent words to form word pairs? */
55 BOOL concat_words = False;
57 /* a suspicious filename */
58 char *suspicious_file = NULL;
60 /* how to write a boolean */
61 #define bool_str(v) ((v)?"True":"False")
/* struct word: one entry per distinct token seen across all files.
   NOTE(review): the enclosing "struct word {" line and the words[]
   array definition are not visible in this view */
64 a list of all the words found and where they occurred
68 int count; /* how many files this word occurs in */
69 char *word; /* text of word */
70 BOOL *occurred; /* yes/no has occurred in this file */
73 /*****************************************************************************
74 like realloc() but can start from scratch, and exits if there is an error
75 ******************************************************************************/
76 void *Realloc(void *ptr,int size)
/* realloc(NULL, size) behaves like malloc(), which is what lets
   callers "start from scratch" by passing a NULL pointer */
81 ptr = realloc(ptr,size);
/* allocation failed - report it; per the header comment the program
   then exits (the exit call itself is not visible in this view), so
   callers may safely overwrite their pointer with the result */
84 printf("Can't allocate %d bytes\n",size);
90 /*****************************************************************************
map a file number to its file name: indices 0..num_files-1 come from
file_names[] (argv); presumably the extra index num_files refers to the
suspicious file - TODO confirm, the function signature and the guard
between the two returns are not visible in this view
92 ******************************************************************************/
96 return(file_names[i]);
97 return(suspicious_file);
101 /*****************************************************************************
102 decide if two words are equal
103 ******************************************************************************/
104 BOOL wordequal(char *word1,char *word2)
/* case-sensitive comparison; NOTE(review): the case_sensitive guard
   that selects between the two returns is not visible in this view */
107 return(strcmp(word1,word2) == 0);
/* case-insensitive comparison (strcasecmp is POSIX, not ISO C) */
109 return(strcasecmp(word1,word2) == 0);
113 /*****************************************************************************
114 find an insertion position for a new word
(i.e. the index in the sorted words[] array where word belongs,
keeping the array ordered)
115 ******************************************************************************/
116 int insert_position(char *word)
118 int high_i = num_words;
122 /* do a bisection search - this assumes the list is kept ordered */
123 while (high_i > low_i)
125 guess = (high_i + low_i)/2;
/* compare with the same case rules as wordequal() so the ordering
   used for searching matches the ordering used for equality */
127 ret = strcmp(word,words[guess].word);
129 ret = strcasecmp(word,words[guess].word);
141 /*****************************************************************************
142 ask if a word has occurred before - return word index if it has
(presumably returns a negative value on a miss, given the
"wordnum < 0" check in insert_word() - TODO confirm, the return
statements are not visible in this view)
144 ******************************************************************************/
145 int find_word(char *word)
149 /* we can take advantage of the order of the words */
150 ret = insert_position(word);
/* a miss is either off the end of the list or a slot holding a
   different word */
151 if (ret >= num_words || !wordequal(word,words[ret].word))
158 /*****************************************************************************
159 Insert a word into the list of words that have occurred.
160 If the word already exists then just set the occurred bit for the filenum
161 ******************************************************************************/
162 void insert_word(char *word,int filenum)
165 wordnum = find_word(word);
/* unknown word while we are in a pass that must not grow the list */
167 if (wordnum < 0 && !add_words)
173 printf("new word %s from filenum %d\n",word,filenum);
/* grow the sorted array and shift entries up to open a hole at the
   insertion position */
175 wordnum = insert_position(word);
/* overwriting words directly is safe only because Realloc() exits
   on failure rather than returning NULL */
176 words = (struct word *)Realloc(words,sizeof(struct word)*(num_words+1));
179 for (i=num_words-2;i>=wordnum;i--)
180 words[i+1] = words[i];
182 words[wordnum].count=0;
183 words[wordnum].word = strdup(word);
/* one occurrence flag per file plus one extra slot - presumably for
   the suspicious file, which process_files() passes as num_files */
184 words[wordnum].occurred =
185 (BOOL *)Realloc(NULL,sizeof(BOOL)*(num_files+1));
187 for (i=0;i<=num_files;i++) words[wordnum].occurred[i] = False;
/* count each file only once no matter how often the word repeats */
190 if (!words[wordnum].occurred[filenum]) words[wordnum].count++;
191 words[wordnum].occurred[filenum] = True;
194 /*****************************************************************************
195 dump the word occurrence table (debugging aid: one row per word,
one column per file)
196 ******************************************************************************/
197 void dump_word_table(void)
200 for (i=0;i<num_words;i++)
/* fixed-width field so the per-file columns line up */
202 printf("%20.20s ",words[i].word);
203 for (j=0;j<=num_files;j++)
205 if (words[i].occurred[j])
214 /*****************************************************************************
215 read one character - skipping over the chars in ignore
216 ******************************************************************************/
217 int Read(FILE *f,char *ignore)
220 if (ch == EOF) return(EOF);
/* skip ignored chars via tail recursion; NOTE(review): a very long
   run of ignored characters could recurse deeply if the compiler
   does not optimise the tail call - a loop would be safer */
221 if (*ignore && (strchr(ignore,(char)ch) != NULL))
222 return(Read(f,ignore));
226 /*****************************************************************************
227 get a word from a stream. return the word. return NULL if no more words
228 exist. possibly skip comments?
NOTE: returns a pointer to a static buffer, so the result is
overwritten by the next call and the function is not reentrant
229 ******************************************************************************/
230 char *get_word(FILE *f)
232 static char word[MAX_WORD_LEN+1];
238 /* skip over non alphanum chars */
239 while ((ch = Read(f,ignore_chars)) != EOF)
241 if (isalnum(ch)) break;
243 if (ch == EOF) return(NULL);
/* keep the first alphanumeric char that ended the skip loop */
245 word[word_len++] = ch;
247 /* grab all the alphanums */
/* words longer than MAX_WORD_LEN are silently truncated */
248 while (word_len < MAX_WORD_LEN && (isalnum(ch = Read(f,ignore_chars))))
249 word[word_len++] = ch;
251 word[word_len] = '\0';
256 /* a pair of file indices and their match count - used for the cheating matrix */
263 /*****************************************************************************
264 compare 2 pair structures - used for sorting
265 ******************************************************************************/
266 int pair_compare(struct pair *t1,struct pair *t2)
268 return(t2->count - t1->count);
271 /*****************************************************************************
272 compare 2 word structures - used for sorting
273 ******************************************************************************/
274 int words_compare(struct word *t1,struct word *t2)
276 if (t2->count == t1->count)
277 return(strcmp(t1->word,t2->word));
278 return(t1->count - t2->count);
281 /*****************************************************************************
282 form the cheating matrix.
283 The match between two files is increased if
284 the word occurs in both files but doesn't occur in more than THRESH % of files.
285 ******************************************************************************/
286 void cheat_matrix(double threshold1,double threshold2,double output_thresh)
289 int max_matches1,max_matches2;
/* thresholds >= 1.0 are absolute file counts; values below 1.0 are
   proportions of num_files (see the -t/-T/-n options) */
292 /* max matches can't be less than 2 */
293 if (threshold1 >= 1.0)
294 max_matches1 = threshold1;
296 max_matches1 = num_files * threshold1;
297 if (max_matches1 < 2) max_matches1 = 2;
/* NOTE(review): this uses `> 1.0' while threshold1 uses `>= 1.0';
   a value of exactly 1.0 is treated as a proportion here but as an
   absolute count above - looks unintentional, confirm */
299 if (threshold2 > 1.0)
300 max_matches2 = threshold2;
302 max_matches2 = num_files * threshold2;
/* threshold2 may never be tighter than threshold1 */
303 if (max_matches2 < max_matches1) max_matches2 = max_matches1;
306 if (output_thresh >= 1.0)
307 output_count = output_thresh;
309 output_count = output_thresh * num_files;
310 if (output_count < 1) output_count = 1;
311 if (output_count > num_files) output_count = num_files;
/* NOTE(review): `j' should be parenthesised in this macro; safe for
   the simple indices used below but fragile for expressions */
314 #define CMAT(i,j) mat[(i)*(num_files+1) + j]
316 /* alloc the cheat matrix */
/* the +1 row/column leaves room for the suspicious file's slot */
317 mat = (struct pair *)Realloc(mat,(num_files+1)*(num_files+1)*sizeof(*mat));
/* initialise every cell with its own file-pair coordinates so the
   pair identities survive the qsort below */
320 for (i=0;i<=num_files;i++)
321 for (j=0;j<=num_files;j++)
323 CMAT(i,j).i = i; CMAT(i,j).j = j;
327 /* process the words one at a time */
328 for (wordnum=0;wordnum<num_words;wordnum++)
330 int occurrances = words[wordnum].count;
332 /* if the word is very common then forget it */
333 if (occurrances > max_matches1)
336 printf("ignoring common word %s (%d occurrances)\n",
337 words[wordnum].word,occurrances);
/* beyond threshold2 the word is also dropped from the per-pair
   word listing printed at the end (count forced to 0) */
339 if (occurrances > max_matches2)
340 words[wordnum].count = 0;
346 printf("%s occurred %d times\n",words[wordnum].word,occurrances);
348 /* increment possible cheaters */
349 for (i=0;i<=num_files;i++)
351 if (words[wordnum].occurred[i])
352 for (j=i+1;j<=num_files;j++)
353 if (words[wordnum].occurred[j]) {
/* rarer words score higher: weight is (threshold1 - frequency) + 1.
   Presumably guarded by do_weighting - the alternative branch is
   not visible in this view, confirm */
355 CMAT(i,j).count += ((max_matches1+1) - words[wordnum].count);
/* NOTE(review): the casts below silence qsort's prototype; ISO C
   requires comparators taking (const void *, const void *) */
364 qsort(mat,(num_files+1)*(num_files+1),sizeof(*mat),(int (*)())pair_compare);
366 /* sort the wordlist so least frequent words are at the top */
367 qsort(words,num_words,sizeof(*words),(int (*)())words_compare);
369 /* find the highest */
/* only the top output_count pairs with a non-zero score are shown */
372 for (f=0;f<output_count;f++)
373 if (mat[f].count > 0)
375 i = mat[f].i; j = mat[f].j;
376 printf("scored %3d in (%s %s)\n",mat[f].count,
/* list the words this pair shares, with each word's file count */
380 for (wordnum=0;wordnum<num_words;wordnum++)
382 if (words[wordnum].count>0 &&
383 words[wordnum].occurred[i] &&
384 words[wordnum].occurred[j])
385 printf("%s(%d) ",words[wordnum].word,words[wordnum].count);
393 /*****************************************************************************
process one file: read its tokens (optionally piping the file through
a filter program first) and insert each into the global word table
under file number filenum
395 ******************************************************************************/
396 void process_one(char *filename,char *filter_prog,int filenum)
/* static so the previous word survives between loop iterations in
   concat mode; makes this function non-reentrant */
400 static char lastword[MAX_WORD_LEN+1];
/* NOTE(review): unbounded sprintf, and the filename is interpolated
   into a shell command line - buffer-overflow and shell-injection
   risk if filenames are untrusted; prefer snprintf and quoting */
405 sprintf(cmd,"cat %s | %s",filename,filter_prog);
409 f = fopen(filename,"r");
418 printf("processing file %s\n",filename);
422 while ((word = get_word(f)) != NULL) {
423 insert_word(word,filenum);
/* concat mode: also record each adjacent word pair as one token.
   NOTE(review): strcat can overflow lastword when two consecutive
   words together exceed MAX_WORD_LEN - confirm a bound exists */
425 strcat(lastword, word);
426 insert_word(lastword,filenum);
427 strcpy(lastword, word);
437 /*****************************************************************************
438 process the files by opening them and getting the words
439 ******************************************************************************/
440 void process_files(char *filter_prog)
/* the suspicious file, if any, occupies the extra slot num_files */
446 process_one(suspicious_file,filter_prog,num_files);
450 for (i=0;i<num_files;i++)
/* skip the suspicious file here so it is not processed twice */
452 if (suspicious_file && strcmp(suspicious_file,FileName(i))==0)
456 process_one(FileName(i),filter_prog,i);
459 printf("\nfinished initial processing\n");
462 /*****************************************************************************
print a brief usage/option summary (the function signature is not
visible in this view)
464 ******************************************************************************/
467 printf("detect: find cheaters by looking at token similarities between files\n");
468 printf("\ndetect [options] <files..>\n");
469 printf("\t-h = give extended help and examples of use (RECOMMENDED)\n");
470 printf("\t-c = be case sensitive\n");
471 printf("\t-n num = output the top n pairs (default top 20%%)\n");
472 printf("\t-t thresh = set threshold1 (default 3)\n");
473 printf("\t-T thresh = set threshold2 (default 10)\n");
474 printf("\t-C concatenate adjacent tokens to make new tokens (very useful!)\n");
475 printf("\t-I ignore_str = ignore these chars in the file\n");
476 printf("\t-f prog = filter the files through the given program\n");
477 printf("\t-s filename = only look for matches to this particular suspicious file\n");
478 printf("\t-w toggle weighting of the score by (threshold1-frequency) (default True)\n");
482 /*****************************************************************************
483 provide extended help
484 ******************************************************************************/
487 /* NOTE: this bit of text relies on a compiler that can handle new */
488 /* lines in string literals. gcc is good at this */
491 This program tries to identify common words between a list of files
492 in an attempt to find cases of plagiarism.
497 1) Build a list of words that occur in the files, and a bitmap of which
498 words occur in which files.
499 2) Discard any words that occur in more than <threshold2> files
500 3) Produce a matrix M where M(i,j) is the number of times a word was
501 used both in file i and file j, not counting words that occur in
502 more than <threshold1> files.
503 4) Sort this matrix to produce the top <n> pairs, weighting by
504 how infrequent the words are.
505 5) Write out the pairs along with what words matched between them and
506 the frequency of those words.
509 Interpreting the output:
510 ========================
512 Here is some sample output.
514 scored 13 in (stu1.mod,stu2.mod)
515 AveAss(2) CalculateAss(2) OutputResult(2) StuMark(2) stuNum(2)
516 InputFile(3) MaxAss(3) ReadAss(3) index(4) TutTotal(5)
518 This means that these two files (stu1.mod and stu2.mod) both contained
519 all the identifiers listed. The identifiers with a (2) after them
520 occurred only in these two files. The identifier TutTotal was used in
521 5 files and is therefore less suspicious.
528 This will find common tokens between a list of modula files, listing
529 which files have lots of common tokens, what the common tokens are and
530 how common those tokens are across all the files.
535 detect -c -n 10 -t 2 -T 20 -I ',_ ' *.c
537 This will look in a bunch of C progs, and rank the pairs that used
538 tokens that were used by only 2 files. It will list common tokens
539 between the files up to those tokens used by 20 files. It will skip
540 over all underscores, commas and spaces. This last step is useful to
541 catch people who just inserted underscores. Skipping spaces and commas
542 has the effect of treating whole argument lists as a single word which
543 can be a very effective technique. This example will also be case
544 sensitive in comparisons. It will only show the top 10 matching pairs.
549 detect -f 'grep PROCEDURE' *.mod
551 This will only look at lines that have the word PROCEDURE in them.
552 This can be good for checking only whether procedure names match.
558 detect -s stu1.mod *.mod
560 This will only look for matches between the 'suspicious file' stu1.mod
561 and all the other files. This is useful to detect cheating where you
562 suspect a particular person, or to find a group who collaborated once
563 you find one pair that are a good match.
570 This will disable weighting of the scores by (threshold1-frequency)
571 thus making all matches of equal weight.
576 detect -C `find . -name \"*.txt\"`
578 This will process all text files below the current directory and will
579 also enable token concatenation. Token concatenation is very useful as
580 it builds a much larger token pool with much less common tokens. It is
581 particularly useful for essay type assignments which lack unique
582 programming identifiers.
587 *) The filter programs used by -f can be any program that reads from
588 standard input and writes to standard output.
590 *) The two thresholds and the output count can be given either as a
591 proportion of the total number of files or as an absolute count. Thus
592 `-t 0.05' will set the first threshold to 5% of the total number of
593 files. Any number less than 1 is interpreted as a proportion.
600 Andrew.Tridgell@anu.edu.au
612 /*****************************************************************************
main: parse command-line options, process the input files, then build
and print the cheat matrix
614 ******************************************************************************/
615 int main(int argc,char **argv)
/* defaults: values >= 1 are absolute file counts, values < 1 are
   proportions of the total file count (see cheat_matrix) */
617 double threshold1 = 3;
618 double threshold2 = 10;
619 double output_thresh = 0.2;
620 char *filter_prog=NULL;
626 /* get some options */
627 while ((opt = getopt (argc, argv, "Cwct:T:I:d:hbf:n:s:")) != EOF)
631 output_thresh = atof(optarg);
634 filter_prog = strdup(optarg);
/* the boolean options toggle, so each flag flips its default */
640 case_sensitive = !case_sensitive;
643 concat_words = !concat_words;
646 do_weighting = !do_weighting;
652 threshold1 = atof(optarg);
655 threshold2 = atof(optarg);
/* NOTE(review): strcat can overflow ignore_chars if -I is given
   repeatedly with arguments totalling more than MAX_WORD_LEN */
658 strcat(ignore_chars,optarg);
661 debug = atoi(optarg);
664 suspicious_file = strdup(optarg);
671 /* get the file names */
/* file_names aliases argv directly - nothing is copied */
672 num_files = argc-optind;
673 file_names = &argv[optind];
/* summarise the effective configuration before doing any work */
681 printf("\nProcessing %d files\n",num_files);
684 printf("Suspicious file = %s\n",suspicious_file);
686 printf("Weighting=%s\tCase Sensitive=%s\n",
687 bool_str(do_weighting),bool_str(case_sensitive));
689 printf("Threshold1=%g\tThreshold2=%g\n",threshold1,threshold2);
692 printf("Ignoring chars `%s'\n",ignore_chars);
695 printf("Filtering with `%s'\n",filter_prog);
697 /* process the files */
698 process_files(filter_prog);
703 /* and get the matches */
704 cheat_matrix(threshold1,threshold2,output_thresh);