1 /*****************************************************************************
2 A simple C program to detect cheating
6 - create a list of words in the files
7 - foreach word create a bitmap of where they occurred
8 - ignore words that occur in a large proportion of files
9 - produce a matrix of co-occurances
10 - output the pairs with the highest co-occurances
14 - two thresholds, one for rating matches, one for display of matches
15 - ignore strings; allowing certain chars to be skipped in the input
17 Andrew Tridgell, May 1994
19 ******************************************************************************/
25 typedef enum {False=0,True=1} BOOL;
30 /* should I be case sensitive? */
31 BOOL case_sensitive = False;
33 /* how big can a word get? */
34 #define MAX_WORD_LEN 500
36 /* how many files are we processing? */
39 /* the names of the files we are processing - normally stolen from argv */
40 char **file_names = NULL;
42 /* how many words have we found? */
45 /* a list of chars to ignore - can be interesting */
46 char ignore_chars[MAX_WORD_LEN] = "";
48 /* what debug level? */
52 a list of all the words found and where they occurred
56 int count; /* how many files this word occurs in */
57 char *word; /* text of word */
58 int *occurred; /* count of occurrance in each file */
61 int *file_totals=NULL;
63 /*****************************************************************************
64 like realloc() but can start from scratch, and exits if there is an error
65 ******************************************************************************/
66 void *Realloc(void *ptr,int size)
71 ptr = realloc(ptr,size);
74 printf("Can't allocate %d bytes\n",size);
80 /*****************************************************************************
81 decide if two words are equal
82 ******************************************************************************/
83 BOOL wordequal(char *word1,char *word2)
86 return(strcmp(word1,word2) == 0);
88 return(strcasecmp(word1,word2) == 0);
92 /*****************************************************************************
93 ask if a word has occurred before - return word index if it has
95 ******************************************************************************/
96 int find_word(char *word)
101 /* we can take advantage of the order of the words */
102 ret = insert_position(word);
103 if (ret >= num_words || !wordequal(word,words[ret].word))
109 /*****************************************************************************
110 find an insertion position for a new word
111 ******************************************************************************/
112 int insert_position(char *word)
114 int high_i = num_words;
118 /* do a bisection search - this assumes the list is kept ordered */
119 while (high_i > low_i)
121 guess = (high_i + low_i)/2;
123 ret = strcmp(word,words[guess].word);
125 ret = strcasecmp(word,words[guess].word);
137 /*****************************************************************************
138 Insert a word into the list of words that have occurred.
139 If the word already exists then just set the occured bit for the filenum
140 ******************************************************************************/
141 void insert_word(char *word,int filenum)
144 wordnum = find_word(word);
148 printf("new word %s from filenum %d\n",word,filenum);
150 wordnum = insert_position(word);
151 words = (struct word *)Realloc(words,sizeof(struct word)*(num_words+1));
154 for (i=num_words-2;i>=wordnum;i--)
155 words[i+1] = words[i];
157 words[wordnum].count=0;
158 words[wordnum].word = strdup(word);
159 words[wordnum].occurred =
160 (int *)Realloc(NULL,sizeof(int)*num_files);
162 for (i=0;i<num_files;i++) words[wordnum].occurred[i] = 0;
165 if (!words[wordnum].occurred[filenum]) words[wordnum].count++;
166 words[wordnum].occurred[filenum]++;
169 /*****************************************************************************
170 dump the word occurrance table
171 ******************************************************************************/
172 void dump_word_table(void)
175 for (i=0;i<num_files;i++)
176 printf("%s has %d words\n",file_names[i],file_totals[i]);
179 for (i=0;i<num_words;i++)
181 printf("%-20.20s\t",words[i].word);
182 for (j=0;j<num_files;j++)
183 printf("%6.2f (%5d) \t",
184 1000.0*100.0*words[i].occurred[j]/file_totals[j],
185 words[i].occurred[j]);
190 /*****************************************************************************
191 read one character - skipping over the chars in ignore
192 ******************************************************************************/
193 int Read(FILE *f,char *ignore)
196 if (ch == EOF) return(EOF);
197 if (*ignore && (strchr(ignore,(char)ch) != NULL))
198 return(Read(f,ignore));
202 /*****************************************************************************
203 get a word from a stream. return the word. return NULL if no more words
204 exist. possibly skip comments?
205 ******************************************************************************/
206 char *get_word(FILE *f)
208 static char word[MAX_WORD_LEN+1];
214 /* skip over non alphanum chars */
215 while ((ch = Read(f,ignore_chars)) != EOF)
217 if (isalnum(ch)) break;
219 if (ch == EOF) return(NULL);
221 word[word_len++] = ch;
223 /* grab all the alphanums */
224 while (word_len < MAX_WORD_LEN && (isalnum(ch = Read(f,ignore_chars))))
225 word[word_len++] = ch;
227 word[word_len] = '\0';
232 /* used for cheating matrix */
239 /*****************************************************************************
240 compare 2 pair structures - used for sorting
241 ******************************************************************************/
242 int pair_compare(struct pair *t1,struct pair *t2)
244 return(t2->count - t1->count);
247 /*****************************************************************************
248 compare 2 word structures - used for sorting
249 ******************************************************************************/
250 int words_compare(struct word *t1,struct word *t2)
252 if (t2->count == t1->count)
253 return(strcmp(t1->word,t2->word));
254 return(t1->count - t2->count);
258 /*****************************************************************************
259 process the files by opening them and getting the words
260 ******************************************************************************/
261 void process_files(void)
265 file_totals = (int *)malloc(sizeof(int)*num_files);
266 bzero(file_totals,sizeof(int)*num_files);
268 for (i=0;i<num_files;i++)
270 FILE *f = fopen(file_names[i],"r");
274 printf("Can't open %s\n",file_names[i]);
278 printf("processing file %s\n",file_names[i]);
282 while ((word = get_word(f)) != NULL) {
291 /*****************************************************************************
292 provide extended help
293 ******************************************************************************/
299 This program tries to identify common words between a list of files,
300 in an attempt to find cases of plagiarism. It first builds a list of
301 all words in all the files, and a bitmap of which words occur in which
302 files. It then ignores very common words
305 printf("extended help is not written yet - are you volunteering?\n\n");
309 /*****************************************************************************
311 ******************************************************************************/
314 printf("detect: find cheaters by looking at token similarities between files\n");
315 printf("\ndetect [options] <files..>\n");
316 printf("\t-c = be case sensitive\n");
317 printf("\t-t thresh = set threshold1\n");
318 printf("\t-T thresh = set threshold2\n");
319 printf("\t-I ignore_str = ignore these chars in the file\n");
325 /*****************************************************************************
327 ******************************************************************************/
328 int main(int argc,char **argv)
330 double threshold1 = 0.01;
331 double threshold2 = 0.1;
337 /* get some options */
338 while ((opt = getopt (argc, argv, "ct:T:I:d:hb")) != EOF)
345 case_sensitive = !case_sensitive;
351 threshold1 = atof(optarg);
354 threshold2 = atof(optarg);
357 strcat(ignore_chars,optarg);
360 debug = atoi(optarg);
366 /* get the file names */
367 num_files = argc-optind;
368 file_names = &argv[optind];
373 /* process the files */