1 /*****************************************************************************
2 A simple C program to detect plagiarism
6 - create a list of words in the files
7 - foreach word create a bitmap of where they occurred
8 - ignore words that occur in a large proportion of files
9 - produce a matrix of co-occurances
10 - output the pairs with the highest co-occurances
14 - two thresholds, one for rating matches, one for display of matches
15 - ignore strings; allowing certain chars to be skipped in the input
16 - enable ther use of a filter script to read files
18 - added "suspicious" files.
19 - added token concatenation
21 Andrew Tridgell, May 1994
23 ******************************************************************************/
31 typedef enum {False=0,True=1} BOOL;
36 /* should I be case sensitive? */
37 BOOL case_sensitive = False;
39 /* how big can a word get? */
40 #define MAX_WORD_LEN 500
42 /* how many files are we processing? */
45 /* the names of the files we are processing - normally stolen from argv */
46 char **file_names = NULL;
48 /* how many words have we found? */
51 /* a list of chars to ignore - can be interesting */
52 char ignore_chars[MAX_WORD_LEN] = "";
54 /* what debug level? */
58 /* will I add new words to the word list ? */
59 BOOL add_words = True;
61 /* shall I weight the answers ? */
62 BOOL do_weighting = True;
64 /* shall I concatenate all adjacent words to form word pairs? */
65 BOOL concat_words = False;
67 /* a suspicious filename */
68 char *suspicious_file = NULL;
70 /* how to write a boolean */
71 #define bool_str(v) ((v)?"True":"False")
74 a list of all the words found and where they occurred
78 int count; /* how many files this word occurs in */
79 char *word; /* text of word */
80 BOOL *occurred; /* yes/no has occurred in this file */
83 /*****************************************************************************
84 like realloc() but can start from scratch, and exits if there is an error
85 ******************************************************************************/
86 void *Realloc(void *ptr,int size)
91 ptr = realloc(ptr,size);
94 printf("Can't allocate %d bytes\n",size);
100 /*****************************************************************************
102 ******************************************************************************/
103 char *FileName(int i)
106 return(file_names[i]);
107 return(suspicious_file);
111 /*****************************************************************************
112 decide if two words are equal
113 ******************************************************************************/
114 BOOL wordequal(char *word1,char *word2)
117 return(strcmp(word1,word2) == 0);
119 return(strcasecmp(word1,word2) == 0);
123 /*****************************************************************************
124 find an insertion position for a new word
125 ******************************************************************************/
126 int insert_position(char *word)
128 int high_i = num_words;
132 /* do a bisection search - this assumes the list is kept ordered */
133 while (high_i > low_i)
135 guess = (high_i + low_i)/2;
137 ret = strcmp(word,words[guess].word);
139 ret = strcasecmp(word,words[guess].word);
151 /*****************************************************************************
152 ask if a word has occurred before - return word index if it has
154 ******************************************************************************/
155 int find_word(char *word)
159 /* we can take advantage of the order of the words */
160 ret = insert_position(word);
161 if (ret >= num_words || !wordequal(word,words[ret].word))
168 /*****************************************************************************
169 Insert a word into the list of words that have occurred.
170 If the word already exists then just set the occured bit for the filenum
171 ******************************************************************************/
172 void insert_word(char *word,int filenum)
175 wordnum = find_word(word);
177 if (wordnum < 0 && !add_words)
183 printf("new word %s from filenum %d\n",word,filenum);
185 wordnum = insert_position(word);
186 words = (struct word *)Realloc(words,sizeof(struct word)*(num_words+1));
189 for (i=num_words-2;i>=wordnum;i--)
190 words[i+1] = words[i];
192 words[wordnum].count=0;
193 words[wordnum].word = strdup(word);
194 words[wordnum].occurred =
195 (BOOL *)Realloc(NULL,sizeof(BOOL)*(num_files+1));
197 for (i=0;i<=num_files;i++) words[wordnum].occurred[i] = False;
200 if (!words[wordnum].occurred[filenum]) words[wordnum].count++;
201 words[wordnum].occurred[filenum] = True;
204 /*****************************************************************************
205 dump the word occurrance table
206 ******************************************************************************/
207 void dump_word_table(void)
210 for (i=0;i<num_words;i++)
212 printf("%20.20s ",words[i].word);
213 for (j=0;j<=num_files;j++)
215 if (words[i].occurred[j])
224 /*****************************************************************************
225 read one character - skipping over the chars in ignore
226 ******************************************************************************/
227 int Read(FILE *f,char *ignore)
230 if (ch == EOF) return(EOF);
231 if (*ignore && (strchr(ignore,(char)ch) != NULL))
232 return(Read(f,ignore));
236 /*****************************************************************************
237 get a word from a stream. return the word. return NULL if no more words
238 exist. possibly skip comments?
239 ******************************************************************************/
240 char *get_word(FILE *f)
242 static char word[MAX_WORD_LEN+1];
248 /* skip over non alphanum chars */
249 while ((ch = Read(f,ignore_chars)) != EOF)
251 if (isalnum(ch)) break;
253 if (ch == EOF) return(NULL);
255 word[word_len++] = ch;
257 /* grab all the alphanums */
258 while (word_len < MAX_WORD_LEN && (isalnum(ch = Read(f,ignore_chars))))
259 word[word_len++] = ch;
261 word[word_len] = '\0';
266 /* used for cheating matrix */
273 /*****************************************************************************
274 compare 2 pair structures - used for sorting
275 ******************************************************************************/
276 int pair_compare(struct pair *t1,struct pair *t2)
278 return(t2->count - t1->count);
281 /*****************************************************************************
282 compare 2 word structures - used for sorting
283 ******************************************************************************/
284 int words_compare(struct word *t1,struct word *t2)
286 if (t2->count == t1->count)
287 return(strcmp(t1->word,t2->word));
288 return(t1->count - t2->count);
291 /*****************************************************************************
292 form the cheating matrix.
293 The match between two files is increased if
294 the word occurs in both files but doesn't occur in more than THRESH % of files.
295 ******************************************************************************/
296 void cheat_matrix(double threshold1,double threshold2,double output_thresh)
299 int max_matches1,max_matches2;
302 /* max matches can't be less than 2 */
303 if (threshold1 >= 1.0)
304 max_matches1 = threshold1;
306 max_matches1 = num_files * threshold1;
307 if (max_matches1 < 2) max_matches1 = 2;
309 if (threshold2 > 1.0)
310 max_matches2 = threshold2;
312 max_matches2 = num_files * threshold2;
313 if (max_matches2 < max_matches1) max_matches2 = max_matches1;
316 if (output_thresh >= 1.0)
317 output_count = output_thresh;
319 output_count = output_thresh * num_files;
320 if (output_count < 1) output_count = 1;
321 if (output_count > num_files) output_count = num_files;
324 #define CMAT(i,j) mat[(i)*(num_files+1) + j]
326 /* alloc the cheat matrix */
327 mat = (struct pair *)Realloc(mat,(num_files+1)*(num_files+1)*sizeof(*mat));
330 for (i=0;i<=num_files;i++)
331 for (j=0;j<=num_files;j++)
333 CMAT(i,j).i = i; CMAT(i,j).j = j;
337 /* process the words one at a time */
338 for (wordnum=0;wordnum<num_words;wordnum++)
340 int occurrances = words[wordnum].count;
342 /* if the word is very common then forget it */
343 if (occurrances > max_matches1)
346 printf("ignoring common word %s (%d occurrances)\n",
347 words[wordnum].word,occurrances);
349 if (occurrances > max_matches2)
350 words[wordnum].count = 0;
356 printf("%s occurred %d times\n",words[wordnum].word,occurrances);
358 /* increment possible cheaters */
359 for (i=0;i<=num_files;i++)
361 if (words[wordnum].occurred[i])
362 for (j=i+1;j<=num_files;j++)
363 if (words[wordnum].occurred[j]) {
365 CMAT(i,j).count += ((max_matches1+1) - words[wordnum].count);
374 qsort(mat,(num_files+1)*(num_files+1),sizeof(*mat),(int (*)())pair_compare);
376 /* sort the wordlist so least frequent words are at the top */
377 qsort(words,num_words,sizeof(*words),(int (*)())words_compare);
379 /* find the highest */
382 for (f=0;f<output_count;f++)
383 if (mat[f].count > 0)
385 i = mat[f].i; j = mat[f].j;
386 printf("scored %3d in (%s %s)\n",mat[f].count,
390 for (wordnum=0;wordnum<num_words;wordnum++)
392 if (words[wordnum].count>0 &&
393 words[wordnum].occurred[i] &&
394 words[wordnum].occurred[j])
395 printf("%s(%d) ",words[wordnum].word,words[wordnum].count);
403 /*****************************************************************************
405 ******************************************************************************/
406 void process_one(char *filename,char *filter_prog,int filenum)
410 static char lastword[MAX_WORD_LEN+1];
415 sprintf(cmd,"cat %s | %s",filename,filter_prog);
419 f = fopen(filename,"r");
428 printf("processing file %s\n",filename);
432 while ((word = get_word(f)) != NULL) {
433 insert_word(word,filenum);
435 strcat(lastword, word);
436 insert_word(lastword,filenum);
437 strcpy(lastword, word);
447 /*****************************************************************************
448 process the files by opening them and getting the words
449 ******************************************************************************/
450 void process_files(char *filter_prog)
456 process_one(suspicious_file,filter_prog,num_files);
460 for (i=0;i<num_files;i++)
462 if (suspicious_file && strcmp(suspicious_file,FileName(i))==0)
466 process_one(FileName(i),filter_prog,i);
469 printf("\nfinished initial processing\n");
472 /*****************************************************************************
474 ******************************************************************************/
477 printf("detect: find cheaters by looking at token similarities between files\n");
478 printf("\ndetect [options] <files..>\n");
479 printf("\t-h = give extended help and examples of use (RECOMMENDED)\n");
480 printf("\t-c = be case sensitive\n");
481 printf("\t-n num = output the top n pairs (default top 20%%)\n");
482 printf("\t-t thresh = set threshold1 (default 3)\n");
483 printf("\t-T thresh = set threshold2 (default 10)\n");
484 printf("\t-C concatenate adjacent tokens to make new tokens (very useful!)\n");
485 printf("\t-I ignore_str = ignore these chars in the file\n");
486 printf("\t-f prog = filter the files through the given program\n");
487 printf("\t-s filename = only look for matches to this particular suspicious file\n");
488 printf("\t-w toggle weighting of the score by (threshold1-frequency) (default True)\n");
492 /*****************************************************************************
493 provide extended help
494 ******************************************************************************/
497 /* NOTE: this bit of text relies on a compiler that can handle new */
498 /* lines in string literals. gcc is good at this */
501 This program tries to identify common words between a list of files
502 in an attempt to find cases of plagiarism.
507 1) Build a list words that occur in the files, and a bitmap of which
508 words occur in which files.
509 2) Discard any words that occur in more than <threshold2> files
510 3) Produce a matrix M where M(i,j) is the number of times a word was
511 used both in file i and file j, not counting words that occur in
512 more than <threshold1> files.
513 4) Sort this matrix to produce the top <n> pairs, weighting by
514 how infrequent the words are.
515 5) Write out the pairs along with what words matched between them and
516 the frequency of those words.
519 Interpreting the output:
520 ========================
522 Here is some sample output.
524 scored 13 in (stu1.mod,stu2.mod)
525 AveAss(2) CalculateAss(2) OutputResult(2) StuMark(2) stuNum(2)
526 InputFile(3) MaxAss(3) ReadAss(3) index(4) TutTotal(5)
528 This means that these two files (stu1.mod and stu2.mod) both contained
529 all the identifiers listed. The identifiers with a (2) after them
530 occurred only in these two files. The identifier TutTotal was used in
531 5 files and is therefore less suspicious.
538 This will find common tokens between a list of modula files, listing
539 which files have lots of common tokens, what the common tokens are and
540 how common those tokens are across all the files.
545 detect -c -n 10 -t 2 -T 20 -I ',_ ' *.c
547 This will look in a bunch of C progs, and rank the pairs that used
548 tokens that were used by only 2 files. It will list common tokens
549 between the files up to those tokens used by 20 files. It will skip
550 over all underscores, commas and spaces. This last step is useful to
551 catch people who just inserted underscores. Skipping spaces and commas
552 has the effect of treating whole argument lists as a single word which
553 can be a very effective technique. This example will also be case
554 sensitive in comparisons. It will only show the top 10 matching pairs.
559 detect -f 'grep PROCEDURE' *.mod
561 This will only look at lines that have the word PROCEDURE in them.
562 This can be good for checking only whether procedure names match.
568 detect -s stu1.mod *.mod
570 This will only look for matches between the 'suspicious file' stu1.mod
571 and all the other files. This is useful to detect cheating where you
572 suspect a particular person, or to find a group who collaborated once
573 you find one pair that are a good match.
580 This will disable weighting of the scores by (threshold1-frequency)
581 thus making all matches of equal weight.
586 detect -C `find . -name \"*.txt\"`
588 This will process all text files below the current directory and will
589 also enable token concatenation. Token concatenation is very useful as
590 it builds a much larger token pool with much less common tokens. It is
591 particularly useful for essay type assignments which lack unique
592 programming identifiers.
597 *) The filter programs used by -f can be any program that reads from
598 standard input and writes to standard output.
600 *) The two thesholds and the output count can be given either as a
601 proportion of the total number of files or as a absolute count. Thus
602 `-t 0.05' will set the first threshold to 5% of the total number of
603 files. Any number less than 1 is interpreted as a proportion.
610 Andrew.Tridgell@anu.edu.au
622 /*****************************************************************************
624 ******************************************************************************/
625 int main(int argc,char **argv)
627 double threshold1 = 3;
628 double threshold2 = 10;
629 double output_thresh = 0.2;
630 char *filter_prog=NULL;
636 /* get some options */
637 while ((opt = getopt (argc, argv, "Cwct:T:I:d:hbf:n:s:")) != EOF)
641 output_thresh = atof(optarg);
644 filter_prog = strdup(optarg);
650 case_sensitive = !case_sensitive;
653 concat_words = !concat_words;
656 do_weighting = !do_weighting;
662 threshold1 = atof(optarg);
665 threshold2 = atof(optarg);
668 strcat(ignore_chars,optarg);
671 debug = atoi(optarg);
674 suspicious_file = strdup(optarg);
681 /* get the file names */
682 num_files = argc-optind;
683 file_names = &argv[optind];
691 printf("\nProcessing %d files\n",num_files);
694 printf("Suspicious file = %s\n",suspicious_file);
696 printf("Weighting=%s\tCase Sensitive=%s\n",
697 bool_str(do_weighting),bool_str(case_sensitive));
699 printf("Threshold1=%g\tThreshold2=%g\n",threshold1,threshold2);
702 printf("Ignoring chars `%s'\n",ignore_chars);
705 printf("Filtering with `%s'\n",filter_prog);
707 /* process the files */
708 process_files(filter_prog);
713 /* and get the matches */
714 cheat_matrix(threshold1,threshold2,output_thresh);