Merge Samba3 and Samba4 together
[bbaumbach/samba-autobuild/.git] / source4 / lib / appweb / ejs-2.0 / exml / exmlParser.c
1 /*
2  *      exml.c -- A simple SAX style XML parser
3  */
4
5 /********************************* Description ********************************/
6 /*
7  *      This is a recursive descent parser for XML text files. It is a one-pass
8  *      simple parser that invokes a user supplied callback for key tokens in the
9  *      XML file. The user supplies a read function so that XML files can be parsed
10  *      from disk or in-memory. 
11  */
12 /********************************** Includes **********************************/
13
14 #include        "exml.h"
15
16 /****************************** Forward Declarations **************************/
17 /* MOB -- FIX */
18 #if BLD_FEATURE_EXML || 1
19
20 static int               parseNext(Exml *xp, int state);
21 static ExmlToken getToken(Exml *xp, int state);
22 static int               getNextChar(Exml *xp);
23 static int               scanFor(Exml *xp, char *str);
24 static int               putLastChar(Exml *xp, int c);
25 static void      error(Exml *xp, char *fmt, ...);
26 static void      trimToken(Exml *xp);
27
28 /************************************ Code ************************************/
29
30 Exml *exmlOpen(MprCtx ctx, int initialSize, int maxSize)
31 {
32         Exml    *xp;
33
34         xp = mprAllocTypeZeroed(ctx, Exml);
35         
36         xp->inBuf = mprCreateBuf(xp, EXML_BUFSIZE, EXML_BUFSIZE);
37         xp->tokBuf = mprCreateBuf(xp, initialSize, maxSize);
38
39         return xp;
40 }
41
42 /******************************************************************************/
43
44 void exmlClose(Exml *xp)
45 {
46         mprAssert(xp);
47
48         mprFree(xp);
49 }
50
51 /******************************************************************************/
52
53 void exmlSetParserHandler(Exml *xp, ExmlHandler h)
54 {
55         mprAssert(xp);
56
57         xp->handler = h;
58 }
59
60 /******************************************************************************/
61
62 void exmlSetInputStream(Exml *xp, ExmlInputStream s, void *arg)
63 {
64         mprAssert(xp);
65
66         xp->readFn = s;
67         xp->inputArg = arg;
68 }
69
70 /******************************************************************************/
71 /*
72  *      Set the parse arg
73  */ 
74
75 void exmlSetParseArg(Exml *xp, void *parseArg)
76 {
77         mprAssert(xp);
78
79         xp->parseArg = parseArg;
80 }
81
82 /******************************************************************************/
83 /*
84  *      Set the parse arg
85  */ 
86
87 void *exmlGetParseArg(Exml *xp)
88 {
89         mprAssert(xp);
90
91         return xp->parseArg;
92 }
93
94 /******************************************************************************/
95 /*
96  *      Parse an XML file. Return 0 for success, -1 for error.
97  */ 
98
99 int     exmlParse(Exml *xp)
100 {
101         mprAssert(xp);
102
103         return parseNext(xp, EXML_BEGIN);
104 }
105
106 /******************************************************************************/
107 /*
108  *      XML parser. This is a recursive descent parser. Return -1 for errors, 0 for
109  *      EOF and 1 if there is still more data to parse.
110  */
111
112 static int parseNext(Exml *xp, int state)
113 {
114         ExmlHandler     handler;
115         ExmlToken       token;
116         MprBuf          *tokBuf;
117         char            *tname, *aname;
118         int                     rc;
119
120         mprAssert(state >= 0);
121
122         tokBuf = xp->tokBuf;
123         handler = xp->handler;
124         tname = aname = 0;
125         rc = 0;
126         
127         /*
128          *      In this parse loop, the state is never assigned EOF or ERR. In
129          *      such cases we always return EOF or ERR.
130          */
131         while (1) {
132
133                 token = getToken(xp, state);
134
135                 if (token == TOKEN_TOO_BIG) {
136                         error(xp, "XML token is too big");
137                         goto err;
138                 }
139
140                 switch (state) {
141                 case EXML_BEGIN:                /* ------------------------------------------ */
142                         /*
143                          *      Expect to get an element, comment or processing instruction 
144                          */
145                         switch (token) {
146                         case TOKEN_EOF:
147                                 goto exit;
148
149                         case TOKEN_LS:
150                                 /*
151                                  *      Recurse to handle the new element, comment etc.
152                                  */
153                                 rc = parseNext(xp, EXML_AFTER_LS);
154                                 if (rc < 0) {
155                                         goto exit;
156                                 }
157                                 break;
158
159                         default:
160                                 error(xp, "Syntax error");
161                                 goto err;
162                         }
163                         break;
164
165                 case EXML_AFTER_LS: /* ------------------------------------------ */
166                         switch (token) {
167                         case TOKEN_COMMENT:
168                                 state = EXML_COMMENT;
169                                 rc = (*handler)(xp, state, "!--", 0, mprGetBufStart(tokBuf));
170                                 if (rc < 0) {
171                                         goto err;
172                                 }
173                                 rc = 1;
174                                 goto exit;
175
176                         case TOKEN_CDATA:
177                                 state = EXML_CDATA;
178                                 rc = (*handler)(xp, state, "!--", 0, mprGetBufStart(tokBuf));
179                                 if (rc < 0) {
180                                         goto err;
181                                 }
182                                 rc = 1;
183                                 goto exit;
184
185                         case TOKEN_INSTRUCTIONS:
186                                 /* Just ignore processing instructions */
187                                 rc = 1;
188                                 goto exit;
189
190                         case TOKEN_TEXT:
191                                 state = EXML_NEW_ELT;
192                                 tname = mprStrdup(xp, mprGetBufStart(tokBuf));
193                                 if (tname == 0) {
194                                         rc = MPR_ERR_MEMORY;
195                                         goto exit;
196                                 }
197                                 rc = (*handler)(xp, state, tname, 0, 0);
198                                 if (rc < 0) {
199                                         goto err;
200                                 }
201                                 break;
202
203                         default:
204                                 error(xp, "Syntax error");
205                                 goto err;
206                         }
207                         break;
208
209                 case EXML_NEW_ELT:      /* ------------------------------------------ */
210                         /*
211                          *      We have seen the opening "<element" for a new element and have
212                          *      not yet seen the terminating ">" of the opening element.
213                          */
214                         switch (token) {
215                         case TOKEN_TEXT:
216                                 /*
217                                  *      Must be an attribute name
218                                  */
219                                 aname = mprStrdup(xp, mprGetBufStart(tokBuf));
220                                 token = getToken(xp, state);
221                                 if (token != TOKEN_EQ) {
222                                         error(xp, "Missing assignment for attribute \"%s\"", aname);
223                                         goto err;
224                                 }
225
226                                 token = getToken(xp, state);
227                                 if (token != TOKEN_TEXT) {
228                                         error(xp, "Missing value for attribute \"%s\"", aname);
229                                         goto err;
230                                 }
231                                 state = EXML_NEW_ATT;
232                                 rc = (*handler)(xp, state, tname, aname,
233                                                 mprGetBufStart(tokBuf));
234                                 if (rc < 0) {
235                                         goto err;
236                                 }
237                                 state = EXML_NEW_ELT;
238                                 break;
239
240                         case TOKEN_GR:
241                                 /*
242                                  *      This is ">" the termination of the opening element
243                                  */
244                                 if (*tname == '\0') {
245                                         error(xp, "Missing element name");
246                                         goto err;
247                                 }
248
249                                 /*
250                                  *      Tell the user that the opening element is now complete
251                                  */
252                                 state = EXML_ELT_DEFINED;
253                                 rc = (*handler)(xp, state, tname, 0, 0);
254                                 if (rc < 0) {
255                                         goto err;
256                                 }
257                                 state = EXML_ELT_DATA;
258                                 break;
259
260                         case TOKEN_SLASH_GR:
261                                 /*
262                                  *      If we see a "/>" then this is a solo element
263                                  */
264                                 if (*tname == '\0') {
265                                         error(xp, "Missing element name");
266                                         goto err;
267                                 }
268                                 state = EXML_SOLO_ELT_DEFINED;
269                                 rc = (*handler)(xp, state, tname, 0, 0);
270                                 if (rc < 0) {
271                                         goto err;
272                                 }
273                                 rc = 1;
274                                 goto exit;
275         
276                         default:
277                                 error(xp, "Syntax error");
278                                 goto err;
279                         }
280                         break;
281
282                 case EXML_ELT_DATA:             /* -------------------------------------- */
283                         /*
284                          *      We have seen the full opening element "<name ...>" and now 
285                          *      await data or another element.
286                          */
287                         if (token == TOKEN_LS) {
288                                 /*
289                                  *      Recurse to handle the new element, comment etc.
290                                  */
291                                 rc = parseNext(xp, EXML_AFTER_LS);
292                                 if (rc < 0) {
293                                         goto exit;
294                                 }
295                                 break;
296
297                         } else if (token == TOKEN_LS_SLASH) {
298                                 state = EXML_END_ELT;
299                                 break;
300
301                         } else if (token != TOKEN_TEXT) {
302                                 goto err;
303                         }
304                         if (mprGetBufLength(tokBuf) > 0) {
305                                 /*
306                                  *      Pass the data between the element to the user
307                                  */
308                                 rc = (*handler)(xp, state, tname, 0, mprGetBufStart(tokBuf));
309                                 if (rc < 0) {
310                                         goto err;
311                                 }
312                         }
313                         break;
314
315                 case EXML_END_ELT:                      /* -------------------------------------- */
316                         if (token != TOKEN_TEXT) {
317                                 error(xp, "Missing closing element name for \"%s\"", tname);
318                                 goto err;
319                         }
320                         /*
321                          *      The closing element name must match the opening element name 
322                          */
323                         if (strcmp(tname, mprGetBufStart(tokBuf)) != 0) {
324                                 error(xp, 
325                                         "Closing element name \"%s\" does not match on line %d"
326                                         "opening name \"%s\"",
327                                         mprGetBufStart(tokBuf), xp->lineNumber, tname);
328                                 goto err;
329                         }
330                         rc = (*handler)(xp, state, tname, 0, 0);
331                         if (rc < 0) {
332                                 goto err;
333                         }
334                         if (getToken(xp, state) != TOKEN_GR) {
335                                 error(xp, "Syntax error");
336                                 goto err;
337                         }
338                         return 1;
339
340                 case EXML_EOF:          /* ---------------------------------------------- */
341                         goto exit;
342
343                 case EXML_ERR:          /* ---------------------------------------------- */
344                 default:
345                         goto err;
346                 }
347         }
348         mprAssert(0);
349
350 err:
351         rc = -1;
352
353 exit:
354         mprFree(tname);
355         mprFree(aname);
356
357         return rc;
358 }
359
360 /******************************************************************************/
361 /*
362  *      Lexical analyser for XML. Return the next token reading input as required.
363  *      It uses a one token look ahead and push back mechanism (LAR1 parser).
364  *      Text token identifiers are left in the tokBuf parser buffer on exit.
365  *      This Lex has special cases for the states EXML_ELT_DATA where we
366  *      have an optimized read of element data, and EXML_AFTER_LS where we 
367  *      distinguish between element names, processing instructions and comments. 
368  */
369
370 static ExmlToken getToken(Exml *xp, int state)
371 {
372         MprBuf          *tokBuf, *inBuf;
373         uchar           *cp;
374         int                     c, rc;
375
376         tokBuf = xp->tokBuf;
377         inBuf = xp->inBuf;
378
379         mprAssert(state >= 0);
380
381         if ((c = getNextChar(xp)) < 0) {
382                 return TOKEN_EOF;
383         }
384         mprFlushBuf(tokBuf);
385
386         /*
387          *      Special case parsing for names and for element data. We do this for
388          *      performance so we can return to the caller the largest token possible
389          */
390         if (state == EXML_ELT_DATA) {
391                 /*
392                  *      Read all the data up to the start of the closing element "<" or the
393                  *      start of a sub-element.
394                  */
395 #if UNUSED
396                 while (isspace(c)) {
397                         if ((c = getNextChar(xp)) < 0) {
398                                 return TOKEN_EOF;
399                         }
400                 }
401 #endif
402                 if (c == '<') {
403                         if ((c = getNextChar(xp)) < 0) {
404                                 return TOKEN_EOF;
405                         }
406                         if (c == '/') {
407                                 return TOKEN_LS_SLASH;
408                         }
409                         putLastChar(xp, c);
410                         return TOKEN_LS;
411                 }
412                 do {
413                         if (mprPutCharToBuf(tokBuf, c) < 0) {
414                                 return TOKEN_TOO_BIG;
415                         }
416                         if ((c = getNextChar(xp)) < 0) {
417                                 return TOKEN_EOF;
418                         }
419                 } while (c != '<');
420
421                 /*
422                  *      Put back the last look-ahead character
423                  */
424                 putLastChar(xp, c);
425
426                 /*
427                  *      If all white space, then zero the token buffer
428                  */
429                 for (cp = tokBuf->start; *cp; cp++) {
430                         if (!isspace(*cp)) {
431                                 return TOKEN_TEXT;
432                         }
433                 }
434                 mprFlushBuf(tokBuf);
435                 return TOKEN_TEXT;
436         }
437
438         while (1) {
439                 switch (c) {
440                 case ' ':
441                 case '\n':
442                 case '\t':
443                 case '\r':
444                         break;
445
446                 case '<':
447                         if ((c = getNextChar(xp)) < 0) {
448                                 return TOKEN_EOF;
449                         }
450                         if (c == '/') {
451                                 return TOKEN_LS_SLASH;
452                         }
453                         putLastChar(xp, c);
454                         return TOKEN_LS;
455         
456                 case '=':
457                         return TOKEN_EQ;
458
459                 case '>':
460                         return TOKEN_GR;
461
462                 case '/':
463                         if ((c = getNextChar(xp)) < 0) {
464                                 return TOKEN_EOF;
465                         }
466                         if (c == '>') {
467                                 return TOKEN_SLASH_GR;
468                         }
469                         return TOKEN_ERR;
470                 
471                 case '\"':
472                 case '\'':
473                         xp->quoteChar = c;
474                         /* Fall through */
475
476                 default:
477                         /*
478                          *      We handle element names, attribute names and attribute values 
479                          *      here. We do NOT handle data between elements here. Read the 
480                          *      token.  Stop on white space or a closing element ">"
481                          */
482                         if (xp->quoteChar) {
483                                 if ((c = getNextChar(xp)) < 0) {
484                                         return TOKEN_EOF;
485                                 }
486                                 while (c != xp->quoteChar) {
487                                         if (mprPutCharToBuf(tokBuf, c) < 0) {
488                                                 return TOKEN_TOO_BIG;
489                                         }
490                                         if ((c = getNextChar(xp)) < 0) {
491                                                 return TOKEN_EOF;
492                                         }
493                                 }
494                                 xp->quoteChar = 0;
495
496                         } else {
497                                 while (!isspace(c) && c != '>' && c != '/' && c != '=') {
498                                         if (mprPutCharToBuf(tokBuf, c) < 0) {
499                                                 return TOKEN_TOO_BIG;
500                                         }
501                                         if ((c = getNextChar(xp)) < 0) {
502                                                 return TOKEN_EOF;
503                                         }
504                                 }
505                                 putLastChar(xp, c);
506                         }
507                         if (mprGetBufLength(tokBuf) <= 0) {
508                                 return TOKEN_ERR;
509                         }
510                         mprAddNullToBuf(tokBuf);
511
512                         if (state == EXML_AFTER_LS) {
513                                 /*
514                                  *      If we are just inside an element "<", then analyze what we
515                                  *      have to see if we have an element name, instruction or
516                                  *      comment. Tokbuf will hold "?" for instructions or "!--"
517                                  *      for comments.
518                                  */
519                                 if (mprLookAtNextCharInBuf(tokBuf) == '?') {
520                                         /*      Just ignore processing instructions */
521                                         rc = scanFor(xp, "?>");
522                                         if (rc < 0) {
523                                                 return TOKEN_TOO_BIG;
524                                         } else if (rc == 0) {
525                                                 return TOKEN_ERR;
526                                         }
527                                         return TOKEN_INSTRUCTIONS;
528
529                                 } else if (mprLookAtNextCharInBuf(tokBuf) == '!') {
530                                         /*
531                                          *      First discard the comment leadin "!--" and eat leading 
532                                          *      white space.
533                                          */
534                                         if (strcmp((char*) tokBuf->start, "![CDATA[") == 0) {
535                                                 mprFlushBuf(tokBuf);
536 #if UNUSED
537                                                 c = mprLookAtNextCharInBuf(inBuf);
538                                                 while (isspace(c)) {
539                                                         if ((c = getNextChar(xp)) < 0) {
540                                                                 return TOKEN_EOF;
541                                                         }
542                                                         c = mprLookAtNextCharInBuf(inBuf);
543                                                 }
544 #endif
545                                                 rc = scanFor(xp, "]]>");
546                                                 if (rc < 0) {
547                                                         return TOKEN_TOO_BIG;
548                                                 } else if (rc == 0) {
549                                                         return TOKEN_ERR;
550                                                 }
551                                                 return TOKEN_CDATA;
552
553                                         } else {
554                                                 mprFlushBuf(tokBuf);
555 #if UNUSED
556                                                 c = mprLookAtNextCharInBuf(inBuf);
557                                                 while (isspace(c)) {
558                                                         if ((c = getNextChar(xp)) < 0) {
559                                                                 return TOKEN_EOF;
560                                                         }
561                                                         c = mprLookAtNextCharInBuf(inBuf);
562                                                 }
563 #endif
564                                                 rc = scanFor(xp, "-->");
565                                                 if (rc < 0) {
566                                                         return TOKEN_TOO_BIG;
567                                                 } else if (rc == 0) {
568                                                         return TOKEN_ERR;
569                                                 }
570                                                 return TOKEN_COMMENT;
571                                         }
572                                 }
573                         }
574                         trimToken(xp);
575                         return TOKEN_TEXT;
576                 }
577                 if ((c = getNextChar(xp)) < 0) {
578                         return TOKEN_EOF;
579                 }
580         }
581
582         /* Should never get here */
583         mprAssert(0);
584         return TOKEN_ERR;
585 }
586
587 /******************************************************************************/
588 /*
589  *      Scan for a pattern. Eat and discard input up to the pattern. Return 1 if
590  *      the pattern was found, return 0 if not found. Return < 0 on errors.
591  */
592
593 static int scanFor(Exml *xp, char *str)
594 {
595         MprBuf  *tokBuf;
596         char    *cp;
597         int             c;
598
599         mprAssert(str);
600
601         tokBuf = xp->tokBuf;
602
603         while (1) {
604                 for (cp = str; *cp; cp++) {
605                         if ((c = getNextChar(xp)) < 0) {
606                                 return 0;
607                         }
608                         if (tokBuf) {
609                                 if (mprPutCharToBuf(tokBuf, c) < 0) {
610                                         return -1;
611                                 }
612                         }
613                         if (c != *cp) {
614                                 break;
615                         }
616                 }
617                 if (*cp == '\0') {
618                         /*
619                          *      Remove the pattern from the tokBuf
620                          */
621                         if (tokBuf) {
622                                 mprAdjustBufEnd(tokBuf, -(int) strlen(str));
623                                 trimToken(xp);
624                         }
625                         return 1;
626                 }
627         }
628 }
629
630 /******************************************************************************/
631 /*
632  *      Get another character. We read and buffer blocks of data if we need more
633  *      data to parse.
634  */
635
636 static int getNextChar(Exml *xp)
637 {
638         MprBuf  *inBuf;
639         char    c;
640         int             l;
641
642         inBuf = xp->inBuf;
643         if (mprGetBufLength(inBuf) <= 0) {
644                 /*
645                  *      Flush to reset the servp/endp pointers to the start of the buffer
646                  *      so we can do a maximal read 
647                  */
648                 mprFlushBuf(inBuf);
649                 l = (xp->readFn)(xp, xp->inputArg, mprGetBufStart(inBuf), 
650                         mprGetBufLinearSpace(inBuf));
651                 if (l <= 0) {
652                         return -1;
653                 }
654                 mprAdjustBufEnd(inBuf, l);
655         }
656         c = mprGetCharFromBuf(inBuf);
657
658         if (c == '\n') {
659                 xp->lineNumber++;
660         }
661         return c;
662 }
663
664 /******************************************************************************/
665 /*
666  *      Put back a character in the input buffer
667  */
668
669 static int putLastChar(Exml *xp, int c)
670 {
671         if (mprInsertCharToBuf(xp->inBuf, (char) c) < 0) {
672                 mprAssert(0);
673                 return -1;
674         }
675         if (c == '\n') {
676                 xp->lineNumber--;
677         }
678         return 0;
679 }
680
681 /******************************************************************************/
682 /*
683  *      Output a parse message
684  */ 
685
686 static void error(Exml *xp, char *fmt, ...)
687 {
688         va_list         args;
689         char            *buf;
690
691         mprAssert(fmt);
692
693         va_start(args, fmt);
694         mprAllocVsprintf(MPR_LOC_ARGS(xp), &buf, MPR_MAX_STRING, fmt, args);
695         va_end(args);
696
697         /*
698          *      MOB need to add the failing line text and a pointer to which column
699          */
700         mprFree(xp->errMsg);
701         mprAllocSprintf(MPR_LOC_ARGS(xp), &xp->errMsg, MPR_MAX_STRING, 
702                 "XML error: %s\nAt line %d\n", buf, xp->lineNumber);
703
704         mprFree(buf);
705 }
706
707 /******************************************************************************/
708 /*
709  *      Remove trailing whitespace in a token and ensure it is terminated with
710  *      a NULL for easy parsing
711  */
712
713 static void trimToken(Exml *xp)
714 {
715         while (isspace(mprLookAtLastCharInBuf(xp->tokBuf))) {
716                 mprAdjustBufEnd(xp->tokBuf, -1);
717         }
718         mprAddNullToBuf(xp->tokBuf);
719 }
720
721 /******************************************************************************/
722
723 const char *exmlGetErrorMsg(Exml *xp)
724 {
725         if (xp->errMsg == 0) {
726                 return "";
727         }
728         return xp->errMsg;
729 }
730
731 /******************************************************************************/
732
733 int exmlGetLineNumber(Exml *xp)
734 {
735         return xp->lineNumber;
736 }
737
738 /******************************************************************************/
739 #else
740
/*
 *	Placeholder so this translation unit is not empty when the parser is
 *	compiled out. Declared with (void) to give it a proper prototype.
 */
void exmlParserDummy(void) {}
742 #endif /* BLD_FEATURE_EXML */
743
744 /*
745  * Local variables:
746  * tab-width: 4
747  * c-basic-offset: 4
748  * End:
749  * vim:tw=78
750  * vim600: sw=4 ts=4 fdm=marker
751  * vim<600: sw=4 ts=4
752  */