detect.c

   1 /*****************************************************************************
   2 A simple C program to detect plagiarism
   3
   4 Idea:
   5
   6 - create a list of words in the files
   7 - foreach word create a bitmap of where they occurred
   8 - ignore words that occur in a large proportion of files
   9 - produce a matrix of co-occurances
  10 - output the pairs with the highest co-occurances
  11
  12
  13 Enhancements:
  14 - two thresholds, one for rating matches, one for display of matches
  15 - ignore strings; allowing certain chars to be skipped in the input
  16 - enable ther use of a filter script to read files
  17 - added weighting
  18 - added "suspicious" files.
  19 - added token concatenation
  20
  21 Andrew Tridgell, May 1994
  22
  23 ******************************************************************************/
  24 #include <stdio.h>
  25 #include <unistd.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <math.h>
  29 #include <ctype.h>
  30
  31 typedef enum {False=0,True=1} BOOL;
  32
  33 /* experimental ? */
  34 BOOL beta = False;
  35
  36 /* should I be case sensitive? */
  37 BOOL case_sensitive = False;
  38
  39 /* how big can a word get? */
  40 #define MAX_WORD_LEN 500
  41
  42 /* how many files are we processing? */
  43 int num_files = 0;
  44
  45 /* the names of the files we are processing - normally stolen from argv */
  46 char **file_names = NULL;
  47
  48 /* how many words have we found? */
  49 int num_words = 0;
  50
  51 /* a list of chars to ignore - can be interesting */
  52 char ignore_chars[MAX_WORD_LEN] = "";
  53
  54 /* what debug level? */
  55 int debug = 0;
  56
  57
  58 /* will I add new words to the word list ? */
  59 BOOL add_words = True;
  60
  61 /* shall I weight the answers ? */
  62 BOOL do_weighting = True;
  63
  64 /* shall I concatenate all adjacent words to form word pairs? */
  65 BOOL concat_words = False;
  66
  67 /* a suspicious filename */
  68 char *suspicious_file = NULL;
  69
  70 /* how to write a boolean */
  71 #define bool_str(v) ((v)?"True":"False")
  72
  73 /*
  74 a list of all the words found and where they occurred
  75 */
  76 struct word
  77 {
  78   int count; /* how many files this word occurs in */
  79   char *word; /* text of word */
  80   BOOL *occurred; /* yes/no has occurred in this file */
  81 } *words = NULL;
  82
  83 /*****************************************************************************
  84 like realloc() but can start from scratch, and exits if there is an error
  85 ******************************************************************************/
  86 void *Realloc(void *ptr,int size)
  87 {
  88   if (!ptr)
  89     ptr = malloc(size);
  90   else
  91     ptr = realloc(ptr,size);
  92   if (!ptr)
  93     {
  94       printf("Can't allocate %d bytes\n",size);
  95       exit(0);
  96     }
  97   return(ptr);
  98 }
  99
 100 /*****************************************************************************
 101 return a filename
 102 ******************************************************************************/
 103 char *FileName(int i)
 104 {
 105   if (i < num_files)
 106     return(file_names[i]);
 107   return(suspicious_file);
 108 }
 109
 110
 111 /*****************************************************************************
 112 decide if two words are equal
 113 ******************************************************************************/
 114 BOOL wordequal(char *word1,char *word2)
 115 {
 116   if (case_sensitive)
 117     return(strcmp(word1,word2) == 0);
 118   else
 119     return(strcasecmp(word1,word2) == 0);
 120 }
 121
 122
 123 /*****************************************************************************
 124 find an insertion position for a new word
 125 ******************************************************************************/
 126 int insert_position(char *word)
 127 {
 128   int high_i = num_words;
 129   int low_i = 0;
 130   int guess,ret;
 131
 132   /* do a bisection search - this assumes the list is kept ordered */
 133   while (high_i > low_i)
 134     {
 135       guess = (high_i + low_i)/2;
 136       if (case_sensitive)
 137         ret = strcmp(word,words[guess].word);
 138       else
 139         ret = strcasecmp(word,words[guess].word);
 140       if (ret == 0)
 141         return(guess);
 142       if (ret > 0)
 143         low_i = guess+1;
 144       if (ret < 0)
 145         high_i = guess;
 146     }
 147   return(low_i);
 148 }
 149
 150
 151 /*****************************************************************************
 152 ask if a word has occurred before - return word index if it has
 153 return -1 if not
 154 ******************************************************************************/
 155 int find_word(char *word)
 156 {
 157   int ret;
 158
 159   /* we can take advantage of the order of the words */
 160   ret = insert_position(word);
 161   if (ret >= num_words || !wordequal(word,words[ret].word))
 162     return(-1);
 163   return(ret);
 164
 165 }
 166
 167
 168 /*****************************************************************************
 169 Insert a word into the list of words that have occurred.
 170 If the word already exists then just set the occured bit for the filenum
 171 ******************************************************************************/
 172 void insert_word(char *word,int filenum)
 173 {
 174   int wordnum,i;
 175   wordnum = find_word(word);
 176
 177   if (wordnum < 0 && !add_words)
 178     return;
 179
 180   if (wordnum < 0)
 181     {
 182       if (debug > 0)
 183         printf("new word %s from filenum %d\n",word,filenum);
 184
 185       wordnum = insert_position(word);
 186       words = (struct word *)Realloc(words,sizeof(struct word)*(num_words+1));
 187       num_words++;
 188
 189       for (i=num_words-2;i>=wordnum;i--)
 190         words[i+1] = words[i];
 191
 192       words[wordnum].count=0;
 193       words[wordnum].word = strdup(word);
 194       words[wordnum].occurred =
 195         (BOOL *)Realloc(NULL,sizeof(BOOL)*(num_files+1));
 196
 197       for (i=0;i<=num_files;i++) words[wordnum].occurred[i] = False;
 198     }
 199
 200   if (!words[wordnum].occurred[filenum]) words[wordnum].count++;
 201   words[wordnum].occurred[filenum] = True;
 202 }
 203
 204 /*****************************************************************************
 205 dump the word occurrance table
 206 ******************************************************************************/
 207 void dump_word_table(void)
 208 {
 209   int i,j;
 210   for (i=0;i<num_words;i++)
 211     {
 212       printf("%20.20s ",words[i].word);
 213       for (j=0;j<=num_files;j++)
 214         {
 215           if (words[i].occurred[j])
 216             printf("1 ");
 217           else
 218             printf("  ");
 219         }
 220       printf("\n");
 221     }
 222 }
 223
 224 /*****************************************************************************
 225 read one character - skipping over the chars in ignore
 226 ******************************************************************************/
 227 int Read(FILE *f,char *ignore)
 228 {
 229   int ch = fgetc(f);
 230   if (ch == EOF) return(EOF);
 231   if (*ignore && (strchr(ignore,(char)ch) != NULL))
 232     return(Read(f,ignore));
 233   return(ch);
 234 }
 235
 236 /*****************************************************************************
 237 get a word from a stream. return the word. return NULL if no more words
 238 exist. possibly skip comments?
 239 ******************************************************************************/
 240 char *get_word(FILE *f)
 241 {
 242   static char word[MAX_WORD_LEN+1];
 243   int word_len = 0;
 244   char ch;
 245
 246   strcpy(word,"");
 247
 248   /* skip over non alphanum chars */
 249   while ((ch = Read(f,ignore_chars)) != EOF)
 250     {
 251       if (isalnum(ch)) break;
 252     }
 253   if (ch == EOF) return(NULL);
 254
 255   word[word_len++] = ch;
 256
 257   /* grab all the alphanums */
 258   while (word_len < MAX_WORD_LEN && (isalnum(ch = Read(f,ignore_chars))))
 259     word[word_len++] = ch;
 260
 261   word[word_len] = '\0';
 262   return(word);
 263 }
 264
 265
 266 /* used for cheating matrix */
 267 struct pair
 268 {
 269   int i,j;
 270   int count;
 271 } *mat = NULL;
 272
 273 /*****************************************************************************
 274    compare 2 pair structures - used for sorting
 275 ******************************************************************************/
 276 int pair_compare(struct pair *t1,struct pair *t2)
 277 {
 278   return(t2->count - t1->count);
 279 }
 280
 281 /*****************************************************************************
 282    compare 2 word structures - used for sorting
 283 ******************************************************************************/
 284 int words_compare(struct word *t1,struct word *t2)
 285 {
 286   if (t2->count == t1->count)
 287     return(strcmp(t1->word,t2->word));
 288   return(t1->count - t2->count);
 289 }
 290
 291 /*****************************************************************************
 292 form the cheating matrix.
 293 The match between two files is increased if
 294 the word occurs in both files but doesn't occur in more than THRESH % of files.
 295 ******************************************************************************/
 296 void cheat_matrix(double threshold1,double threshold2,double output_thresh)
 297 {
 298   int i,j,wordnum;
 299   int max_matches1,max_matches2;
 300   int output_count;
 301
 302   /* max matches can't be less than 2 */
 303   if (threshold1 >= 1.0)
 304     max_matches1 = threshold1;
 305   else
 306     max_matches1 = num_files * threshold1;
 307   if (max_matches1 < 2) max_matches1 = 2;
 308
 309   if (threshold2 > 1.0)
 310     max_matches2 = threshold2;
 311   else
 312     max_matches2 = num_files * threshold2;
 313   if (max_matches2 < max_matches1) max_matches2 = max_matches1;
 314
 315
 316   if (output_thresh >= 1.0)
 317     output_count = output_thresh;
 318   else
 319     output_count = output_thresh * num_files;
 320   if (output_count < 1) output_count = 1;
 321   if (output_count > num_files) output_count = num_files;
 322
 323
 324 #define CMAT(i,j) mat[(i)*(num_files+1) + j]
 325
 326   /* alloc the cheat matrix */
 327   mat = (struct pair *)Realloc(mat,(num_files+1)*(num_files+1)*sizeof(*mat));
 328
 329   /* reset it */
 330   for (i=0;i<=num_files;i++)
 331     for (j=0;j<=num_files;j++)
 332       {
 333         CMAT(i,j).i = i; CMAT(i,j).j = j;
 334         CMAT(i,j).count=0;
 335       }
 336
 337   /* process the words one at a time */
 338   for (wordnum=0;wordnum<num_words;wordnum++)
 339     {
 340       int occurrances = words[wordnum].count;
 341
 342       /* if the word is very common then forget it */
 343       if (occurrances > max_matches1)
 344         {
 345           if (debug > 3)
 346             printf("ignoring common word %s (%d occurrances)\n",
 347                    words[wordnum].word,occurrances);
 348
 349           if (occurrances > max_matches2)
 350             words[wordnum].count = 0;
 351
 352           continue;
 353         }
 354
 355       if (debug > 3)
 356         printf("%s occurred %d times\n",words[wordnum].word,occurrances);
 357
 358       /* increment possible cheaters */
 359       for (i=0;i<=num_files;i++)
 360         {
 361           if (words[wordnum].occurred[i])
 362             for (j=i+1;j<=num_files;j++)
 363                     if (words[wordnum].occurred[j]) {
 364                             if (do_weighting) {
 365                                     CMAT(i,j).count += ((max_matches1+1) - words[wordnum].count);
 366                             }   else {
 367                                     CMAT(i,j).count++;
 368                             }
 369                     }
 370         }
 371     }
 372
 373   /* sort them */
 374   qsort(mat,(num_files+1)*(num_files+1),sizeof(*mat),(int (*)())pair_compare);
 375
 376   /* sort the wordlist so least frequent words are at the top */
 377   qsort(words,num_words,sizeof(*words),(int (*)())words_compare);
 378
 379   /* find the highest */
 380   {
 381     int f;
 382     for (f=0;f<output_count;f++)
 383       if (mat[f].count > 0)
 384         {
 385           i = mat[f].i; j = mat[f].j;
 386           printf("scored %3d in (%s %s)\n",mat[f].count,
 387                  FileName(i),
 388                  FileName(j));
 389
 390           for (wordnum=0;wordnum<num_words;wordnum++)
 391             {
 392               if (words[wordnum].count>0 &&
 393                   words[wordnum].occurred[i] &&
 394                   words[wordnum].occurred[j])
 395                 printf("%s(%d) ",words[wordnum].word,words[wordnum].count);
 396             }
 397           printf("\n\n");
 398         }
 399   }
 400 }
 401
 402
 403 /*****************************************************************************
 404 process one file
 405 ******************************************************************************/
 406 void process_one(char *filename,char *filter_prog,int filenum)
 407 {
 408   FILE *f;
 409   char *word;
 410   static char lastword[MAX_WORD_LEN+1];
 411
 412   if (filter_prog)
 413     {
 414       char cmd[1000];
 415       sprintf(cmd,"cat %s | %s",filename,filter_prog);
 416       f = popen(cmd,"r");
 417     }
 418   else
 419     f = fopen(filename,"r");
 420
 421   if (!f)
 422     {
 423       perror(filename);
 424       return;
 425     }
 426
 427   if (debug > 0)
 428     printf("processing file %s\n",filename);
 429
 430   fflush(stdout);
 431
 432   while ((word = get_word(f)) != NULL) {
 433           insert_word(word,filenum);
 434           if (concat_words) {
 435                   strcat(lastword, word);
 436                   insert_word(lastword,filenum);
 437                   strcpy(lastword, word);
 438           }
 439   }
 440   if (filter_prog)
 441     pclose(f);
 442   else
 443     fclose(f);
 444 }
 445
 446
 447 /*****************************************************************************
 448 process the files by opening them and getting the words
 449 ******************************************************************************/
 450 void process_files(char *filter_prog)
 451 {
 452   int i;
 453
 454   if (suspicious_file)
 455     {
 456       process_one(suspicious_file,filter_prog,num_files);
 457       add_words = False;
 458     }
 459
 460   for (i=0;i<num_files;i++)
 461     {
 462       if (suspicious_file && strcmp(suspicious_file,FileName(i))==0)
 463         continue;
 464
 465       printf(".");
 466       process_one(FileName(i),filter_prog,i);
 467     }
 468
 469   printf("\nfinished initial processing\n");
 470 }
 471
 472 /*****************************************************************************
 473 usage of the program
 474 ******************************************************************************/
 475 void usage(void)
 476 {
 477   printf("detect: find cheaters by looking at token similarities between files\n");
 478   printf("\ndetect [options] <files..>\n");
 479   printf("\t-h = give extended help and examples of use (RECOMMENDED)\n");
 480   printf("\t-c = be case sensitive\n");
 481   printf("\t-n num = output the top n pairs (default top 20%%)\n");
 482   printf("\t-t thresh = set threshold1 (default 3)\n");
 483   printf("\t-T thresh = set threshold2 (default 10)\n");
 484   printf("\t-C concatenate adjacent tokens to make new tokens (very useful!)\n");
 485   printf("\t-I ignore_str = ignore these chars in the file\n");
 486   printf("\t-f prog = filter the files through the given program\n");
 487   printf("\t-s filename = only look for matches to this particular suspicious file\n");
 488   printf("\t-w toggle weighting of the score by (threshold1-frequency) (default True)\n");
 489   printf("\n");
 490 }
 491
 492 /*****************************************************************************
 493 provide extended help
 494 ******************************************************************************/
 495 void help(void)
 496 {
 497   /* NOTE: this bit of text relies on a compiler that can handle new */
 498   /* lines in string literals. gcc is good at this */
 499
 500   char *help_txt = "
 501 This program tries to identify common words between a list of files
 502 in an attempt to find cases of plagiarism.
 503
 504 Algorithm:
 505 ==========
 506
 507 1) Build a list words that occur in the files, and a bitmap of which
 508    words occur in which files.
 509 2) Discard any words that occur in more than <threshold2> files
 510 3) Produce a matrix M where M(i,j) is the number of times a word was
 511    used both in file i and file j, not counting words that occur in
 512    more than <threshold1> files.
 513 4) Sort this matrix to produce the top <n> pairs, weighting by
 514    how infrequent the words are.
 515 5) Write out the pairs along with what words matched between them and
 516    the frequency of those words.
 517
 518
 519 Interpreting the output:
 520 ========================
 521
 522 Here is some sample output.
 523
 524 scored  13 in (stu1.mod,stu2.mod)
 525 AveAss(2) CalculateAss(2) OutputResult(2) StuMark(2) stuNum(2)
 526 InputFile(3) MaxAss(3) ReadAss(3) index(4) TutTotal(5)
 527
 528 This means that these two files (stu1.mod and stu2.mod) both contained
 529 all the identifiers listed. The identifiers with a (2) after them
 530 occurred only in these two files. The identifier TutTotal was used in
 531 5 files and is therefore less suspicious.
 532
 533 Example 1:
 534 ==========
 535
 536    detect *.mod
 537
 538 This will find common tokens between a list of modula files, listing
 539 which files have lots of common tokens, what the common tokens are and
 540 how common those tokens are across all the files.
 541
 542 Example 2:
 543 ==========
 544
 545   detect -c -n 10 -t 2 -T 20 -I ',_ ' *.c
 546
 547 This will look in a bunch of C progs, and rank the pairs that used
 548 tokens that were used by only 2 files. It will list common tokens
 549 between the files up to those tokens used by 20 files. It will skip
 550 over all underscores, commas and spaces. This last step is useful to
 551 catch people who just inserted underscores. Skipping spaces and commas
 552 has the effect of treating whole argument lists as a single word which
 553 can be a very effective technique. This example will also be case
 554 sensitive in comparisons. It will only show the top 10 matching pairs.
 555
 556 Example 3:
 557 ==========
 558
 559   detect -f 'grep PROCEDURE' *.mod
 560
 561 This will only look at lines that have the word PROCEDURE in them.
 562 This can be good for checking only whether procedure names match.
 563
 564
 565 Example 4:
 566 ==========
 567
 568    detect -s stu1.mod *.mod
 569
 570 This will only look for matches between the 'suspicious file' stu1.mod
 571 and all the other files. This is useful to detect cheating where you
 572 suspect a particular person, or to find a group who collaborated once
 573 you find one pair that are a good match.
 574
 575 Example 5:
 576 ==========
 577
 578   detect -w *.mod
 579
 580 This will disable weighting of the scores by (threshold1-frequency)
 581 thus making all matches of equal weight.
 582
 583 Example 6:
 584 ==========
 585
 586   detect -C `find . -name \"*.txt\"`
 587
 588 This will process all text files below the current directory and will
 589 also enable token concatenation. Token concatenation is very useful as
 590 it builds a much larger token pool with much less common tokens. It is
 591 particularly useful for essay type assignments which lack unique
 592 programming identifiers.
 593
 594 Advanced Notes:
 595 ===============
 596
 597 *) The filter programs used by -f can be any program that reads from
 598    standard input and writes to standard output.
 599
 600 *) The two thesholds and the output count can be given either as a
 601    proportion of the total number of files or as a absolute count. Thus
 602    `-t 0.05' will set the first threshold to 5% of the total number of
 603    files. Any number less than 1 is interpreted as a proportion.
 604
 605
 606 Author:
 607 =======
 608
 609  Andrew Tridgell
 610  Andrew.Tridgell@anu.edu.au
 611
 612 ";
 613
 614
 615   usage();
 616   puts(help_txt);
 617
 618 }
 619
 620
 621
 622 /*****************************************************************************
 623 the main program
 624 ******************************************************************************/
 625 int main(int argc,char **argv)
 626 {
 627   double threshold1 = 3;
 628   double threshold2 = 10;
 629   double output_thresh = 0.2;
 630   char *filter_prog=NULL;
 631
 632   int opt;
 633   extern char *optarg;
 634   extern int optind;
 635
 636   /* get some options */
 637   while ((opt = getopt (argc, argv, "Cwct:T:I:d:hbf:n:s:")) != EOF)
 638     switch (opt)
 639       {
 640       case 'n':
 641         output_thresh = atof(optarg);
 642         break;
 643       case 'f':
 644         filter_prog = strdup(optarg);
 645         break;
 646       case 'h':
 647         help();
 648         exit(0);
 649       case 'c':
 650         case_sensitive = !case_sensitive;
 651         break;
 652       case 'C':
 653         concat_words = !concat_words;
 654         break;
 655       case 'w':
 656         do_weighting = !do_weighting;
 657         break;
 658       case 'b':
 659         beta = !beta;
 660         break;
 661       case 't':
 662         threshold1 = atof(optarg);
 663         break;
 664       case 'T':
 665         threshold2 = atof(optarg);
 666         break;
 667       case 'I':
 668         strcat(ignore_chars,optarg);
 669         break;
 670       case 'd':
 671         debug = atoi(optarg);
 672         break;
 673       case 's':
 674         suspicious_file = strdup(optarg);
 675         break;
 676       default:
 677         usage();
 678         exit(0);
 679       }
 680
 681   /* get the file names */
 682   num_files = argc-optind;
 683   file_names = &argv[optind];
 684
 685   if (num_files < 2)
 686     {
 687       usage();
 688       exit(0);
 689     }
 690
 691   printf("\nProcessing %d files\n",num_files);
 692
 693   if (suspicious_file)
 694     printf("Suspicious file = %s\n",suspicious_file);
 695
 696   printf("Weighting=%s\tCase Sensitive=%s\n",
 697          bool_str(do_weighting),bool_str(case_sensitive));
 698
 699   printf("Threshold1=%g\tThreshold2=%g\n",threshold1,threshold2);
 700
 701   if (*ignore_chars)
 702     printf("Ignoring chars `%s'\n",ignore_chars);
 703
 704   if (filter_prog)
 705     printf("Filtering with `%s'\n",filter_prog);
 706
 707   /* process the files */
 708   process_files(filter_prog);
 709
 710   if (debug > 2)
 711     dump_word_table();
 712
 713   /* and get the matches */
 714   cheat_matrix(threshold1,threshold2,output_thresh);
 715
 716   return 0;
 717 }