1 /* COPYRIGHT (C) The Australian National University 1995 */
2 /*************************************************************************/
4 /* Digraph Boyer-Moore-Gosper algorithm */
6 /***************************************************************************/
8 /* $Id: bmg2.c,v 1.1 2000/10/10 02:16:17 tridge Exp $ */
11 a digraph version of the bmg algorithm
18 This module has the following interface:
20 int bmg_build_table(uchar *target_string,uchar *char_map,int wsmode);
21 void bmg_search(uchar *buf,int len,void (*fn)());
23 You call bmg_build_table() with the list of search terms, separated by
24 | The char_map[] is a character array which maps characters from the
25 data into characters in the search string. If this is null then a
26 default mapping is performed. The default mapping maps all non
27 alphanumeric characters to a space and other characters to themselves.
29 If wsmode is true then the search routine will only find words that begin
30 with the target strings. If wsmode is 0 then it will accept the
31 targets at any alignment.
33 bmg_search() is called with a pointer to the data, the length of the
34 data and a function to call when a match is found. This function will
35 be called with a pointer to the point at which the match was
36 found. When this function returns the search will automatically
37 continue from where it left off, calling fn() at each matching string.
39 Calling bmg_build_table() with a NULL target_string will free the tables
40 that are created by bmg_build_table().
45 #include <sys/fcntl.h>
54 typedef unsigned short tag;
59 /* if you have targets longer than 255 chars they will be truncated */
60 #define MAX_TARGET_LENGTH 255
62 /* this is what is used to delimit search alternates in the target
63 string passed to the bmg_build_table() routine */
64 #define DELIMITERS "|"
65 #define SPACE_CHAR ' '
67 /* the following defines are optimisation constants. Leave them
68 alone unless you know the code */
69 #define DIGRAPH_THRESHOLD 3
72 #define MIN(a,b) ((a)<(b)?(a):(b))
75 #define MAX(a,b) ((a)>(b)?(a):(b))
85 static u_char space_char = SPACE_CHAR;
86 static u_char char_bits = 0;
87 static int max_char = 0;
88 static int table_size = 0;
89 static u_char *delta_table = NULL;
91 static u_char *delta_table2 = NULL;
93 static tag *tag_table = NULL;
94 static struct target *targets = NULL;
95 static tag num_targets = 0;
96 static int min_target_length = MAX_TARGET_LENGTH;
97 static int max_target_length = 0;
98 static tag char_table[256];
99 static tag char_table_shifted[256];
101 static int digraph=1;
102 static int tagsize=2;
105 #define TAGV2(buf) ((((tag)(((u_char *)buf)[1])&0x7F)<<7)|(((u_char *)buf)[0]&0x7f))
108 #define TAGV2(buf) ((((tag)(((u_char *)buf)[1]))<<8)|(((u_char *)buf)[0]))
111 /* extract a digraph tag value from a buffer. This must be fast, and
112 is a clear candidate for assembly optimisation */
113 static inline tag tag_value16(u_char *buf)
115 return(char_table_shifted[buf[1]]|char_table[buf[0]]);
118 /* this is like tag_value16 but it uses the unmapped characters */
119 static inline tag tag_value16u(u_char *buf)
121 return((buf[1]<<char_bits)|buf[0]);
124 /* check if two characters match */
125 static inline int u_char_match(u_char c,u_char b)
127 return(char_table[c] == b);
130 /* compare two strings. We unroll a bit for speed. */
131 static inline int strnequal(u_char *s1,u_char *s2,int n)
134 if (!(u_char_match(s2[0],s1[0]) &&
135 u_char_match(s2[1],s1[1]) &&
136 u_char_match(s2[2],s1[2]))) return(0);
141 if (!u_char_match(s2[0],s1[0])) return(0);
148 /* add a tag to the delta table. Possibly update the list of targets
149 for the tag as well */
150 static inline void add_to_table(tag t,int i,int j)
152 int skip = targets[i].length-(j+tagsize);
155 delta_table[t] = MIN(delta_table[t],skip);
158 targets[i].next = MAX(targets[i].next,(int)t);
159 tag_table[t] = (tag)i;
164 /* remap the character tables and the targets to pack them into a small
165 area in the delta table - this helps the cacheing */
166 static void remap_char_table(void)
169 u_char target_chars[256];
171 /* work out what chars exist in the targets */
172 memset(target_chars,0,256);
173 for (i=0;i<num_targets;i++)
174 for (j=0;j<targets[i].length;j++)
175 target_chars[targets[i].str[j]] = 1;
177 /* force a space to exist */
178 if (wsmode) target_chars[SPACE_CHAR] = 1;
180 /* map each of the chars to a number */
183 if (target_chars[i]) target_chars[i] = j++;
186 /* remember what the space is */
187 space_char = target_chars[SPACE_CHAR];
189 /* remap the targets */
190 for (i=0;i<num_targets;i++)
191 for (j=0;j<targets[i].length;j++)
192 targets[i].str[j] = target_chars[targets[i].str[j]];
194 /* remap the char table */
196 char_table[i] = target_chars[char_table[i]];
198 /* find how many bits are needed for the char table */
206 /* create the shifted table */
208 char_table_shifted[i] = char_table[i]<<char_bits;
212 int compare_targets(struct target *t1,struct target *t2)
214 return(t1->next - t2->next);
217 int compare_targets2(int *t1,int *t2)
219 return(targets[*t1].next - targets[*t2].next);
223 static void free_old_tables(void)
226 /* free up from the last search */
227 for (i=0;i<num_targets;i++) free(targets[i].str);
228 if (targets) {free(targets); targets = NULL;}
229 if (tag_table) {free(tag_table); tag_table = NULL;}
230 if (delta_table) {free(delta_table); delta_table = NULL;}
232 if (delta_table2) {free(delta_table2); delta_table2 = NULL;}
237 /* build the tables needed for the bmg search */
238 int bmg_build_table(u_char *target_string,u_char *char_map,int mode)
246 /* remember the wsmode */
249 /* clean up from a previous run */
251 space_char = SPACE_CHAR;
253 if (!target_string) return(-1);
257 char_table[i] = char_map[i];
260 char_table[i] = isalnum(i)?i:SPACE_CHAR;
263 min_target_length = MAX_TARGET_LENGTH;
264 max_target_length = 0;
266 /* count the targets */
267 p = (u_char *)strdup(target_string);
268 for (tok=strtok(p,DELIMITERS); tok; tok = strtok(NULL,DELIMITERS))
272 /* allocate the targets */
273 targets = (struct target *)malloc(sizeof(targets[0])*num_targets);
280 /* parse the command */
281 p = (u_char *)strdup(target_string);
287 for (tok=strtok(p,DELIMITERS); tok; tok = strtok(NULL,DELIMITERS)) {
289 targets[i].next = -1;
290 targets[i].str = strdup(tok);
291 if (!targets[i].str) {
295 l = MIN(l,MAX_TARGET_LENGTH);
296 targets[i].length = l;
297 min_target_length = MIN(min_target_length,targets[i].length);
298 max_target_length = MAX(max_target_length,targets[i].length);
299 if (targets[i].str[l-1]==' ') dodigraph=1;
305 /* determine if we will do a digraph search or not */
306 if (!dodigraph && num_targets<DIGRAPH_THRESHOLD) {
321 /* build the bmg table - this is a mess */
323 table_size = 1<<(char_bits*2);
327 delta_table = (u_char *)malloc(table_size*sizeof(delta_table[0]));
328 tag_table = (tag *)malloc(table_size*sizeof(tag_table[0]));
330 if (!delta_table || !tag_table) {
342 delta_table2 = (u_char *)malloc(tabsize*sizeof(delta_table[0]));
350 for (t=0;t<table_size;t++) {
351 delta_table[t] = min_target_length;
352 tag_table[t] = num_targets;
357 for (i=0;i<num_targets;i++)
359 int length = targets[i].length;
360 char *str = targets[i].str;
363 if (length == min_target_length) {
367 add_to_table(t,i,-1);
371 /* this little loop handles the first char in each
372 target - our tags must match for any possible char
375 add_to_table(t,i,-1);
376 if (c[0] == (max_char-1)) break;
380 for (j=MAX(0,length-(min_target_length+1));j<=(length-2);j++)
382 t = tag_value16u(str+j);
390 for (i=0;i<num_targets;i++)
392 int length = targets[i].length;
393 char *str = targets[i].str;
394 for (j=(length-1);j>=(length-min_target_length);j--)
395 if (char_table[t] == str[j]) {
397 break; /* go to the next target */
405 int *target_map1 = (int *)malloc(sizeof(int)*num_targets);
406 int *target_map2 = (int *)malloc(sizeof(int)*num_targets);
407 if (!target_map1 || !target_map2) {
412 for (i=0;i<num_targets;i++) target_map1[i]=i;
414 /* sort the targets by the "next" value thay have in the target struct */
415 qsort(target_map1,num_targets,sizeof(int),(int (*)())compare_targets2);
416 qsort(targets,num_targets,sizeof(targets[0]),(int (*)())compare_targets);
418 for (i=0;i<num_targets;i++) target_map2[target_map1[i]]=i;
420 for (t=0;t<table_size;t++) {
421 if (delta_table[t]) continue;
422 tag_table[t] = target_map2[tag_table[t]];
423 while (tag_table[t] &&
424 targets[tag_table[t]-1].next == targets[tag_table[t]].next)
431 for (i=0;i<num_targets-1;i++)
432 if (targets[i].next == targets[i+1].next)
433 targets[i].next = i+1;
435 targets[i].next = -1;
436 targets[num_targets-1].next = -1;
441 for (c[0]=0;c[0]<(1<<7);c[0]++)
442 for (c[1]=0;c[1]<(1<<7);c[1]++)
443 delta_table2[TAGV2(c)] = delta_table[tag_value16(c)];
449 u_char *c = (u_char *)&v;
450 for (t=0;t<(1<<16);t++) {
452 delta_table2[TAGV2(c)] = delta_table[tag_value16(c)];
461 /* check if a single proposed match is valid */
462 static inline int check_target(u_char *buf,int i,void (*fn)())
464 if (wsmode && char_table[buf[-1]]!=space_char) return(0);
465 if (strnequal(targets[i].str,buf,targets[i].length) && fn) {
473 /* check if a proposed match is valid against the list of alternates
475 static void check_targets(u_char *buf,tag v,void (*fn)())
477 int i = tag_table[v];
479 u_char *buf2 = buf-targets[i].length;
480 u_char *str = targets[i].str;
481 if (u_char_match(*buf2,*str))
482 check_target(buf2,i,fn);
488 /* the search proper, 8 bit version - it takes a buffer and finds the
489 targets previously specified when building the bmg tables */
490 #define TAGV(buf) ((buf)[0])
491 #define DELTAB(buf) delta_table[TAGV(buf)]
493 static void bmg_search8(u_char *buf,int len,void (*fn)())
495 u_char *bufend = buf + len - (TAGSIZE-1);
498 buf += min_target_length - TAGSIZE;
499 bufend -= 8*MAX_TARGET_LENGTH;
506 buf += (k = DELTAB(buf));
508 if (DELTAB(buf-2) < 3 && DELTAB(buf-1) < 2)
509 check_targets(buf+TAGSIZE,TAGV(buf),fn);
514 bufend += 8*MAX_TARGET_LENGTH;
518 buf += (k = DELTAB(buf));
520 if (DELTAB(buf-2) < 3 && DELTAB(buf-1) < 2)
521 check_targets(buf+TAGSIZE,TAGV(buf),fn);
532 #define TAGV(buf) tag_value16(buf)
534 #define DELTAB(buf) delta_table2[TAGV2(buf)]
536 #define DELTAB(buf) delta_table[TAGV(buf)]
539 /* this code is identical to the 8 bit search, except for the above two
541 static void bmg_search16(u_char *buf,int len,void (*fn)())
543 u_char *bufend = buf + len - (TAGSIZE-1);
546 buf += min_target_length - TAGSIZE;
547 bufend -= 8*MAX_TARGET_LENGTH;
554 buf += (k=DELTAB(buf));
556 if (DELTAB(buf-2) < 3 && DELTAB(buf-1) < 2)
557 check_targets(buf+TAGSIZE,TAGV(buf),fn);
562 bufend += 8*MAX_TARGET_LENGTH;
566 buf += (k = DELTAB(buf));
568 if (DELTAB(buf-2) < 3 && DELTAB(buf-1) < 2)
569 check_targets(buf+TAGSIZE,TAGV(buf),fn);
577 /* choose 8 or 16 bit search */
578 void bmg_search(u_char *buf,int len,void (*fn)())
581 if (!num_targets || !buf || !fn || len<1) return;
583 /* a special case - check the start of the buffer */
584 for (j=0;j<max_target_length;j++) {
585 for (i=0;i<num_targets;i++)
586 if (len >= targets[i].length && check_target(buf,i,fn)) break;
593 bmg_search16(buf,len,fn);
595 bmg_search8(buf,len,fn);