2 a quick hack to test an idea for 16 bit BMG string searches
4 Inspired by the fact that Padre is slow when lots of alternates are
7 This method reduces the density of zero entries in the delta_table and
8 also makes the list of candidate targets much smaller when a zero
11 It still needs a lot of optimisation. In particular it needs to be
12 hard wired for 16 bit operation. Currently it can compile for ngraph
13 based matching, although n>2 would be silly.
17 compile it with "gcc -O2 -o bmg16 -DTAG=short bmg.c"
19 run it with "bmg16 'hello|goodbye|yellow|blue' /usr/dict/words"
26 #include <sys/fcntl.h>
31 /* this determines if the algorithm is 8 or 16 bit - it works for both */
36 typedef unsigned TAG tag;
37 typedef unsigned char uchar;
40 #define TABLE_SIZE (1<<(sizeof(tag)*8))
41 #define DELIMITERS "|"
42 #define MAX_ALTERNATES 255
43 #define AVG_ALTERNATES 4
44 #define PUNCTUATION " \t,;:.\"'`"
45 #define CASE_MASK (~(1<<5))
48 #define MIN(a,b) ((a)<(b)?(a):(b))
51 /* this is a case insensitive character match */
52 #define CHAR_EQUAL(c1,c2) (((c1)&CASE_MASK) == ((c2)&CASE_MASK))
54 static uchar delta_table[TABLE_SIZE];
55 static uchar target_list[TABLE_SIZE][AVG_ALTERNATES];
56 static uchar target_list_len[TABLE_SIZE];
57 static uchar *targets[MAX_ALTERNATES];
58 static int target_lengths[MAX_ALTERNATES];
59 static int num_targets = 0;
60 static int min_target_length = (1<<30);
62 /* extract a tag value from a buffer. Obviously if the tag size
63 were fixed (8 or 16 bit) this would be much faster */
64 static inline tag tag_value(uchar *buf)
68 for (i=0;i<sizeof(tag);i++)
69 ret |= (buf[i]<<(8*i));
73 /* check if two characters match using the rules for
75 static inline int uchar_match(uchar c,uchar b)
77 return(CHAR_EQUAL(c,b) || (b==' ' && strchr(PUNCTUATION,c)));
80 /* check if two tags are equal. This is quite general, and slow */
81 static inline int tag_equal(tag t,uchar *buf,int offset)
84 if (offset>0) offset=0;
85 for (i=-offset;i<sizeof(tag);i++)
86 if (!uchar_match((t>>(8*i))&0xFF,buf[i])) return(0);
90 /* compare two strings */
91 static inline int strnequal(uchar *s1,uchar *s2,int n)
94 if (!uchar_match(*s2,*s1)) return(0);
101 /* generate a table of what chars occur at all in the targets */
102 static void generate_char_table(char *char_table)
107 uchar *p = (uchar *)PUNCTUATION;
109 memset(char_table,0,256);
110 for (i=0;i<num_targets;i++)
111 for (j=0;j<target_lengths[i];j++) {
116 char_table[c & CASE_MASK] = char_table[c | ~CASE_MASK] = 1;
119 for (i=0;i<strlen(p);i++)
120 char_table[p[i]] = 1;
125 /* check if a tag is possible */
126 static inline int valid_tag(char *char_table,tag t)
128 if (!char_table[(t>>(8*(sizeof(tag)-1)))&0xFF]) return(0);
133 /* build the tables needed for the bmg search */
134 void bmg_build_table(uchar *target_string)
136 uchar *p = (uchar *)strdup(target_string);
140 char char_table[256];
144 /* parse the command */
145 for (tok=strtok(p,DELIMITERS); tok; tok = strtok(NULL,DELIMITERS)) {
146 targets[num_targets] = strdup(tok);
147 target_lengths[num_targets] = strlen(tok);
148 min_target_length = MIN(min_target_length,target_lengths[num_targets]);
153 if (min_target_length < sizeof(tag))
154 printf("Error! you have a target smaller than the tag\n");
156 if (!num_targets) return;
158 generate_char_table(char_table);
160 /* built the bmg table - this is a mess */
161 for (t=0;t<TABLE_SIZE;t++) {
163 delta_table[t] = min_target_length;
164 target_list_len[t] = 0;
166 if (!valid_tag(char_table,t))
169 for (i=0;i<num_targets;i++)
171 for (j=(target_lengths[i]-sizeof(tag));j+sizeof(tag)>=1;j--)
172 if (tag_equal(t,targets[i]+j,j))
174 delta_table[t] = MIN(delta_table[t],
175 target_lengths[i]-(j+sizeof(tag)));
176 if (target_list_len[t] < AVG_ALTERNATES) {
177 target_list[t][target_list_len[t]] = i;
178 target_list_len[t]++;
180 target_list_len[t] = MAX_ALTERNATES;
189 /* some counters to see how we did */
190 static int stringops=0;
191 static int comparisons=0;
193 /* check if a proposed match is valid */
194 static inline int check_target(uchar *buf,int pos,int i)
196 int posi = pos-target_lengths[i];
197 if (posi<0) return(0);
199 if (strnequal(targets[i],buf+posi,target_lengths[i])) {
200 printf("Found target %d (%s) at %d\n",i,targets[i],posi);
207 /* the search proper - it takes a buffer and finds the targets previously
208 specified when building the bmg tables */
209 void bmg_search(uchar *buf,int len)
219 for (pos=min_target_length; pos <= len;)
221 tag v = tag_value(buf+pos-sizeof(tag));
222 int skip = delta_table[v];
229 if (target_list_len[v] <= AVG_ALTERNATES)
230 for (found=0,i=0;i<target_list_len[v] && !found;i++)
231 found = check_target(buf,pos,target_list[v][i]);
233 for (found=0,i=0;i<num_targets && !found;i++)
234 found = check_target(buf,pos,i);
240 printf("\nGot %d hits with %d comparisons and %d stringops\n",
241 hits,comparisons,stringops);
245 /* a simple harness to test the above */
246 int main(int argc,char *argv[])
255 printf("Usage: bmg string|string|string... file\n");
259 target = (uchar *)argv[1];
261 fd = open(fname,O_RDONLY);
263 printf("couldn't open %s\n",fname);
266 len = lseek(fd,0,SEEK_END);
267 lseek(fd,0,SEEK_SET);
268 buf = (uchar *)malloc(len);
270 printf("failed to alloc buffer\n");
273 len = read(fd,buf,len);
275 printf("Loaded %d bytes\n",len);
277 bmg_build_table(target);
278 printf("Finished building tables\n");