detect2.c

   1 /*****************************************************************************
   2 A simple C program to detect plagiarism
   3
   4 Idea:
   5
   6 - tokenise input
   7 - collect groups of N tokens, overlapping my M tokens
   8 - hash each group
   9 - sort the resultant hashes
  10
  11 Andrew Tridgell <tridge@samba.org> July 2003
  12
  13 ******************************************************************************/
  14 #include <stdio.h>
  15 #include <unistd.h>
  16 #include <stdlib.h>
  17 #include <string.h>
  18 #include <math.h>
  19 #include <ctype.h>
  20
  21 typedef enum {False=0,True=1} BOOL;
  22
  23 /* experimental ? */
  24 BOOL beta = False;
  25
  26 /* should I be case sensitive? */
  27 BOOL case_sensitive = False;
  28
  29 /* how big can a word get? */
  30 #define MAX_WORD_LEN 500
  31
  32 /* how many files are we processing? */
  33 int num_files = 0;
  34
  35 /* the names of the files we are processing - normally stolen from argv */
  36 char **file_names = NULL;
  37
  38 /* how many words have we found? */
  39 int num_words = 0;
  40
  41 /* a list of chars to ignore - can be interesting */
  42 char ignore_chars[MAX_WORD_LEN] = "";
  43
  44 /* what debug level? */
  45 int debug = 0;
  46
  47
  48 /* will I add new words to the word list ? */
  49 BOOL add_words = True;
  50
  51 /* shall I weight the answers ? */
  52 BOOL do_weighting = True;
  53
  54 /* shall I concatenate all adjacent words to form word pairs? */
  55 BOOL concat_words = False;
  56
  57 /* a suspicious filename */
  58 char *suspicious_file = NULL;
  59
  60 /* how to write a boolean */
  61 #define bool_str(v) ((v)?"True":"False")
  62
  63 /*
  64 a list of all the words found and where they occurred
  65 */
  66 struct word
  67 {
  68   int count; /* how many files this word occurs in */
  69   char *word; /* text of word */
  70   BOOL *occurred; /* yes/no has occurred in this file */
  71 } *words = NULL;
  72
  73 /*****************************************************************************
  74 like realloc() but can start from scratch, and exits if there is an error
  75 ******************************************************************************/
  76 void *Realloc(void *ptr,int size)
  77 {
  78   if (!ptr)
  79     ptr = malloc(size);
  80   else
  81     ptr = realloc(ptr,size);
  82   if (!ptr)
  83     {
  84       printf("Can't allocate %d bytes\n",size);
  85       exit(0);
  86     }
  87   return(ptr);
  88 }
  89
  90 /*****************************************************************************
  91 return a filename
  92 ******************************************************************************/
  93 char *FileName(int i)
  94 {
  95   if (i < num_files)
  96     return(file_names[i]);
  97   return(suspicious_file);
  98 }
  99
 100
 101 /*****************************************************************************
 102 decide if two words are equal
 103 ******************************************************************************/
 104 BOOL wordequal(char *word1,char *word2)
 105 {
 106   if (case_sensitive)
 107     return(strcmp(word1,word2) == 0);
 108   else
 109     return(strcasecmp(word1,word2) == 0);
 110 }
 111
 112
 113 /*****************************************************************************
 114 find an insertion position for a new word
 115 ******************************************************************************/
 116 int insert_position(char *word)
 117 {
 118   int high_i = num_words;
 119   int low_i = 0;
 120   int guess,ret;
 121
 122   /* do a bisection search - this assumes the list is kept ordered */
 123   while (high_i > low_i)
 124     {
 125       guess = (high_i + low_i)/2;
 126       if (case_sensitive)
 127         ret = strcmp(word,words[guess].word);
 128       else
 129         ret = strcasecmp(word,words[guess].word);
 130       if (ret == 0)
 131         return(guess);
 132       if (ret > 0)
 133         low_i = guess+1;
 134       if (ret < 0)
 135         high_i = guess;
 136     }
 137   return(low_i);
 138 }
 139
 140
 141 /*****************************************************************************
 142 ask if a word has occurred before - return word index if it has
 143 return -1 if not
 144 ******************************************************************************/
 145 int find_word(char *word)
 146 {
 147   int ret;
 148
 149   /* we can take advantage of the order of the words */
 150   ret = insert_position(word);
 151   if (ret >= num_words || !wordequal(word,words[ret].word))
 152     return(-1);
 153   return(ret);
 154
 155 }
 156
 157
 158 /*****************************************************************************
 159 Insert a word into the list of words that have occurred.
 160 If the word already exists then just set the occured bit for the filenum
 161 ******************************************************************************/
 162 void insert_word(char *word,int filenum)
 163 {
 164   int wordnum,i;
 165   wordnum = find_word(word);
 166
 167   if (wordnum < 0 && !add_words)
 168     return;
 169
 170   if (wordnum < 0)
 171     {
 172       if (debug > 0)
 173         printf("new word %s from filenum %d\n",word,filenum);
 174
 175       wordnum = insert_position(word);
 176       words = (struct word *)Realloc(words,sizeof(struct word)*(num_words+1));
 177       num_words++;
 178
 179       for (i=num_words-2;i>=wordnum;i--)
 180         words[i+1] = words[i];
 181
 182       words[wordnum].count=0;
 183       words[wordnum].word = strdup(word);
 184       words[wordnum].occurred =
 185         (BOOL *)Realloc(NULL,sizeof(BOOL)*(num_files+1));
 186
 187       for (i=0;i<=num_files;i++) words[wordnum].occurred[i] = False;
 188     }
 189
 190   if (!words[wordnum].occurred[filenum]) words[wordnum].count++;
 191   words[wordnum].occurred[filenum] = True;
 192 }
 193
 194 /*****************************************************************************
 195 dump the word occurrance table
 196 ******************************************************************************/
 197 void dump_word_table(void)
 198 {
 199   int i,j;
 200   for (i=0;i<num_words;i++)
 201     {
 202       printf("%20.20s ",words[i].word);
 203       for (j=0;j<=num_files;j++)
 204         {
 205           if (words[i].occurred[j])
 206             printf("1 ");
 207           else
 208             printf("  ");
 209         }
 210       printf("\n");
 211     }
 212 }
 213
 214 /*****************************************************************************
 215 read one character - skipping over the chars in ignore
 216 ******************************************************************************/
 217 int Read(FILE *f,char *ignore)
 218 {
 219   int ch = fgetc(f);
 220   if (ch == EOF) return(EOF);
 221   if (*ignore && (strchr(ignore,(char)ch) != NULL))
 222     return(Read(f,ignore));
 223   return(ch);
 224 }
 225
 226 /*****************************************************************************
 227 get a word from a stream. return the word. return NULL if no more words
 228 exist. possibly skip comments?
 229 ******************************************************************************/
 230 char *get_word(FILE *f)
 231 {
 232   static char word[MAX_WORD_LEN+1];
 233   int word_len = 0;
 234   char ch;
 235
 236   strcpy(word,"");
 237
 238   /* skip over non alphanum chars */
 239   while ((ch = Read(f,ignore_chars)) != EOF)
 240     {
 241       if (isalnum(ch)) break;
 242     }
 243   if (ch == EOF) return(NULL);
 244
 245   word[word_len++] = ch;
 246
 247   /* grab all the alphanums */
 248   while (word_len < MAX_WORD_LEN && (isalnum(ch = Read(f,ignore_chars))))
 249     word[word_len++] = ch;
 250
 251   word[word_len] = '\0';
 252   return(word);
 253 }
 254
 255
 256 /* used for cheating matrix */
 257 struct pair
 258 {
 259   int i,j;
 260   int count;
 261 } *mat = NULL;
 262
 263 /*****************************************************************************
 264    compare 2 pair structures - used for sorting
 265 ******************************************************************************/
 266 int pair_compare(struct pair *t1,struct pair *t2)
 267 {
 268   return(t2->count - t1->count);
 269 }
 270
 271 /*****************************************************************************
 272    compare 2 word structures - used for sorting
 273 ******************************************************************************/
 274 int words_compare(struct word *t1,struct word *t2)
 275 {
 276   if (t2->count == t1->count)
 277     return(strcmp(t1->word,t2->word));
 278   return(t1->count - t2->count);
 279 }
 280
 281 /*****************************************************************************
 282 form the cheating matrix.
 283 The match between two files is increased if
 284 the word occurs in both files but doesn't occur in more than THRESH % of files.
 285 ******************************************************************************/
 286 void cheat_matrix(double threshold1,double threshold2,double output_thresh)
 287 {
 288   int i,j,wordnum;
 289   int max_matches1,max_matches2;
 290   int output_count;
 291
 292   /* max matches can't be less than 2 */
 293   if (threshold1 >= 1.0)
 294     max_matches1 = threshold1;
 295   else
 296     max_matches1 = num_files * threshold1;
 297   if (max_matches1 < 2) max_matches1 = 2;
 298
 299   if (threshold2 > 1.0)
 300     max_matches2 = threshold2;
 301   else
 302     max_matches2 = num_files * threshold2;
 303   if (max_matches2 < max_matches1) max_matches2 = max_matches1;
 304
 305
 306   if (output_thresh >= 1.0)
 307     output_count = output_thresh;
 308   else
 309     output_count = output_thresh * num_files;
 310   if (output_count < 1) output_count = 1;
 311   if (output_count > num_files) output_count = num_files;
 312
 313
 314 #define CMAT(i,j) mat[(i)*(num_files+1) + j]
 315
 316   /* alloc the cheat matrix */
 317   mat = (struct pair *)Realloc(mat,(num_files+1)*(num_files+1)*sizeof(*mat));
 318
 319   /* reset it */
 320   for (i=0;i<=num_files;i++)
 321     for (j=0;j<=num_files;j++)
 322       {
 323         CMAT(i,j).i = i; CMAT(i,j).j = j;
 324         CMAT(i,j).count=0;
 325       }
 326
 327   /* process the words one at a time */
 328   for (wordnum=0;wordnum<num_words;wordnum++)
 329     {
 330       int occurrances = words[wordnum].count;
 331
 332       /* if the word is very common then forget it */
 333       if (occurrances > max_matches1)
 334         {
 335           if (debug > 3)
 336             printf("ignoring common word %s (%d occurrances)\n",
 337                    words[wordnum].word,occurrances);
 338
 339           if (occurrances > max_matches2)
 340             words[wordnum].count = 0;
 341
 342           continue;
 343         }
 344
 345       if (debug > 3)
 346         printf("%s occurred %d times\n",words[wordnum].word,occurrances);
 347
 348       /* increment possible cheaters */
 349       for (i=0;i<=num_files;i++)
 350         {
 351           if (words[wordnum].occurred[i])
 352             for (j=i+1;j<=num_files;j++)
 353                     if (words[wordnum].occurred[j]) {
 354                             if (do_weighting) {
 355                                     CMAT(i,j).count += ((max_matches1+1) - words[wordnum].count);
 356                             }   else {
 357                                     CMAT(i,j).count++;
 358                             }
 359                     }
 360         }
 361     }
 362
 363   /* sort them */
 364   qsort(mat,(num_files+1)*(num_files+1),sizeof(*mat),(int (*)())pair_compare);
 365
 366   /* sort the wordlist so least frequent words are at the top */
 367   qsort(words,num_words,sizeof(*words),(int (*)())words_compare);
 368
 369   /* find the highest */
 370   {
 371     int f;
 372     for (f=0;f<output_count;f++)
 373       if (mat[f].count > 0)
 374         {
 375           i = mat[f].i; j = mat[f].j;
 376           printf("scored %3d in (%s %s)\n",mat[f].count,
 377                  FileName(i),
 378                  FileName(j));
 379
 380           for (wordnum=0;wordnum<num_words;wordnum++)
 381             {
 382               if (words[wordnum].count>0 &&
 383                   words[wordnum].occurred[i] &&
 384                   words[wordnum].occurred[j])
 385                 printf("%s(%d) ",words[wordnum].word,words[wordnum].count);
 386             }
 387           printf("\n\n");
 388         }
 389   }
 390 }
 391
 392
 393 /*****************************************************************************
 394 process one file
 395 ******************************************************************************/
 396 void process_one(char *filename,char *filter_prog,int filenum)
 397 {
 398   FILE *f;
 399   char *word;
 400   static char lastword[MAX_WORD_LEN+1];
 401
 402   if (filter_prog)
 403     {
 404       char cmd[1000];
 405       sprintf(cmd,"cat %s | %s",filename,filter_prog);
 406       f = popen(cmd,"r");
 407     }
 408   else
 409     f = fopen(filename,"r");
 410
 411   if (!f)
 412     {
 413       perror(filename);
 414       return;
 415     }
 416
 417   if (debug > 0)
 418     printf("processing file %s\n",filename);
 419
 420   fflush(stdout);
 421
 422   while ((word = get_word(f)) != NULL) {
 423           insert_word(word,filenum);
 424           if (concat_words) {
 425                   strcat(lastword, word);
 426                   insert_word(lastword,filenum);
 427                   strcpy(lastword, word);
 428           }
 429   }
 430   if (filter_prog)
 431     pclose(f);
 432   else
 433     fclose(f);
 434 }
 435
 436
 437 /*****************************************************************************
 438 process the files by opening them and getting the words
 439 ******************************************************************************/
 440 void process_files(char *filter_prog)
 441 {
 442   int i;
 443
 444   if (suspicious_file)
 445     {
 446       process_one(suspicious_file,filter_prog,num_files);
 447       add_words = False;
 448     }
 449
 450   for (i=0;i<num_files;i++)
 451     {
 452       if (suspicious_file && strcmp(suspicious_file,FileName(i))==0)
 453         continue;
 454
 455       printf(".");
 456       process_one(FileName(i),filter_prog,i);
 457     }
 458
 459   printf("\nfinished initial processing\n");
 460 }
 461
 462 /*****************************************************************************
 463 usage of the program
 464 ******************************************************************************/
 465 void usage(void)
 466 {
 467   printf("detect: find cheaters by looking at token similarities between files\n");
 468   printf("\ndetect [options] <files..>\n");
 469   printf("\t-h = give extended help and examples of use (RECOMMENDED)\n");
 470   printf("\t-c = be case sensitive\n");
 471   printf("\t-n num = output the top n pairs (default top 20%%)\n");
 472   printf("\t-t thresh = set threshold1 (default 3)\n");
 473   printf("\t-T thresh = set threshold2 (default 10)\n");
 474   printf("\t-C concatenate adjacent tokens to make new tokens (very useful!)\n");
 475   printf("\t-I ignore_str = ignore these chars in the file\n");
 476   printf("\t-f prog = filter the files through the given program\n");
 477   printf("\t-s filename = only look for matches to this particular suspicious file\n");
 478   printf("\t-w toggle weighting of the score by (threshold1-frequency) (default True)\n");
 479   printf("\n");
 480 }
 481
 482 /*****************************************************************************
 483 provide extended help
 484 ******************************************************************************/
 485 void help(void)
 486 {
 487   /* NOTE: this bit of text relies on a compiler that can handle new */
 488   /* lines in string literals. gcc is good at this */
 489
 490   char *help_txt = "
 491 This program tries to identify common words between a list of files
 492 in an attempt to find cases of plagiarism.
 493
 494 Algorithm:
 495 ==========
 496
 497 1) Build a list words that occur in the files, and a bitmap of which
 498    words occur in which files.
 499 2) Discard any words that occur in more than <threshold2> files
 500 3) Produce a matrix M where M(i,j) is the number of times a word was
 501    used both in file i and file j, not counting words that occur in
 502    more than <threshold1> files.
 503 4) Sort this matrix to produce the top <n> pairs, weighting by
 504    how infrequent the words are.
 505 5) Write out the pairs along with what words matched between them and
 506    the frequency of those words.
 507
 508
 509 Interpreting the output:
 510 ========================
 511
 512 Here is some sample output.
 513
 514 scored  13 in (stu1.mod,stu2.mod)
 515 AveAss(2) CalculateAss(2) OutputResult(2) StuMark(2) stuNum(2)
 516 InputFile(3) MaxAss(3) ReadAss(3) index(4) TutTotal(5)
 517
 518 This means that these two files (stu1.mod and stu2.mod) both contained
 519 all the identifiers listed. The identifiers with a (2) after them
 520 occurred only in these two files. The identifier TutTotal was used in
 521 5 files and is therefore less suspicious.
 522
 523 Example 1:
 524 ==========
 525
 526    detect *.mod
 527
 528 This will find common tokens between a list of modula files, listing
 529 which files have lots of common tokens, what the common tokens are and
 530 how common those tokens are across all the files.
 531
 532 Example 2:
 533 ==========
 534
 535   detect -c -n 10 -t 2 -T 20 -I ',_ ' *.c
 536
 537 This will look in a bunch of C progs, and rank the pairs that used
 538 tokens that were used by only 2 files. It will list common tokens
 539 between the files up to those tokens used by 20 files. It will skip
 540 over all underscores, commas and spaces. This last step is useful to
 541 catch people who just inserted underscores. Skipping spaces and commas
 542 has the effect of treating whole argument lists as a single word which
 543 can be a very effective technique. This example will also be case
 544 sensitive in comparisons. It will only show the top 10 matching pairs.
 545
 546 Example 3:
 547 ==========
 548
 549   detect -f 'grep PROCEDURE' *.mod
 550
 551 This will only look at lines that have the word PROCEDURE in them.
 552 This can be good for checking only whether procedure names match.
 553
 554
 555 Example 4:
 556 ==========
 557
 558    detect -s stu1.mod *.mod
 559
 560 This will only look for matches between the 'suspicious file' stu1.mod
 561 and all the other files. This is useful to detect cheating where you
 562 suspect a particular person, or to find a group who collaborated once
 563 you find one pair that are a good match.
 564
 565 Example 5:
 566 ==========
 567
 568   detect -w *.mod
 569
 570 This will disable weighting of the scores by (threshold1-frequency)
 571 thus making all matches of equal weight.
 572
 573 Example 6:
 574 ==========
 575
 576   detect -C `find . -name \"*.txt\"`
 577
 578 This will process all text files below the current directory and will
 579 also enable token concatenation. Token concatenation is very useful as
 580 it builds a much larger token pool with much less common tokens. It is
 581 particularly useful for essay type assignments which lack unique
 582 programming identifiers.
 583
 584 Advanced Notes:
 585 ===============
 586
 587 *) The filter programs used by -f can be any program that reads from
 588    standard input and writes to standard output.
 589
 590 *) The two thesholds and the output count can be given either as a
 591    proportion of the total number of files or as a absolute count. Thus
 592    `-t 0.05' will set the first threshold to 5% of the total number of
 593    files. Any number less than 1 is interpreted as a proportion.
 594
 595
 596 Author:
 597 =======
 598
 599  Andrew Tridgell
 600  Andrew.Tridgell@anu.edu.au
 601
 602 ";
 603
 604
 605   usage();
 606   puts(help_txt);
 607
 608 }
 609
 610
 611
 612 /*****************************************************************************
 613 the main program
 614 ******************************************************************************/
 615 int main(int argc,char **argv)
 616 {
 617   double threshold1 = 3;
 618   double threshold2 = 10;
 619   double output_thresh = 0.2;
 620   char *filter_prog=NULL;
 621
 622   int opt;
 623   extern char *optarg;
 624   extern int optind;
 625
 626   /* get some options */
 627   while ((opt = getopt (argc, argv, "Cwct:T:I:d:hbf:n:s:")) != EOF)
 628     switch (opt)
 629       {
 630       case 'n':
 631         output_thresh = atof(optarg);
 632         break;
 633       case 'f':
 634         filter_prog = strdup(optarg);
 635         break;
 636       case 'h':
 637         help();
 638         exit(0);
 639       case 'c':
 640         case_sensitive = !case_sensitive;
 641         break;
 642       case 'C':
 643         concat_words = !concat_words;
 644         break;
 645       case 'w':
 646         do_weighting = !do_weighting;
 647         break;
 648       case 'b':
 649         beta = !beta;
 650         break;
 651       case 't':
 652         threshold1 = atof(optarg);
 653         break;
 654       case 'T':
 655         threshold2 = atof(optarg);
 656         break;
 657       case 'I':
 658         strcat(ignore_chars,optarg);
 659         break;
 660       case 'd':
 661         debug = atoi(optarg);
 662         break;
 663       case 's':
 664         suspicious_file = strdup(optarg);
 665         break;
 666       default:
 667         usage();
 668         exit(0);
 669       }
 670
 671   /* get the file names */
 672   num_files = argc-optind;
 673   file_names = &argv[optind];
 674
 675   if (num_files < 2)
 676     {
 677       usage();
 678       exit(0);
 679     }
 680
 681   printf("\nProcessing %d files\n",num_files);
 682
 683   if (suspicious_file)
 684     printf("Suspicious file = %s\n",suspicious_file);
 685
 686   printf("Weighting=%s\tCase Sensitive=%s\n",
 687          bool_str(do_weighting),bool_str(case_sensitive));
 688
 689   printf("Threshold1=%g\tThreshold2=%g\n",threshold1,threshold2);
 690
 691   if (*ignore_chars)
 692     printf("Ignoring chars `%s'\n",ignore_chars);
 693
 694   if (filter_prog)
 695     printf("Filtering with `%s'\n",filter_prog);
 696
 697   /* process the files */
 698   process_files(filter_prog);
 699
 700   if (debug > 2)
 701     dump_word_table();
 702
 703   /* and get the matches */
 704   cheat_matrix(threshold1,threshold2,output_thresh);
 705
 706   return 0;
 707 }