3 * @brief EJS Lexical Analyser
4 * @overview EJS lexical analyser. This implements a lexical analyser
5 * for a subset of the JavaScript language.
7 /********************************* Copyright **********************************/
11 * Copyright (c) Mbedthis Software LLC, 2003-2005. All Rights Reserved.
12 * Portions Copyright (c) GoAhead Software, 1995-2000. All Rights Reserved.
14 * This software is distributed under commercial and open source licenses.
15 * You may use the GPL open source license described below or you may acquire
16 * a commercial license from Mbedthis Software. You agree to be fully bound
17 * by the terms of either license. Consult the LICENSE.TXT distributed with
18 * this software for full details.
20 * This software is open source; you can redistribute it and/or modify it
21 * under the terms of the GNU General Public License as published by the
22 * Free Software Foundation; either version 2 of the License, or (at your
23 * option) any later version. See the GNU General Public License for more
24 * details at: http://www.mbedthis.com/downloads/gplLicense.html
26 * This program is distributed WITHOUT ANY WARRANTY; without even the
27 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
29 * This GPL license does NOT permit incorporating this software into
30 * proprietary programs. If you are unable to comply with the GPL, you must
31 * acquire a commercial license to use this software. Commercial licenses
32 * for this software and support services are available from Mbedthis
33 * Software at http://www.mbedthis.com
37 /********************************** Includes **********************************/
39 #include "ejsInternal.h"
43 /****************************** Forward Declarations **************************/
45 static int getLexicalToken(Ejs *ep, int state);
46 static int tokenAddChar(Ejs *ep, int c);
47 static int inputGetc(Ejs *ep);
48 static void inputPutback(Ejs *ep, int c);
49 static int charConvert(Ejs *ep, int base, int maxDig);
51 /************************************* Code ***********************************/
53 * Open a new input script
56 int ejsLexOpenScript(Ejs *ep, char *script)
63 if ((ip = mprMalloc(sizeof(EjsInput))) == NULL) {
66 memset(ip, 0, sizeof(*ip));
69 ip->procName = ep->proc?ep->proc->procName:NULL;
72 * Create the parse token buffer and script buffer
74 ip->tokbuf = mprMalloc(EJS_PARSE_INCR);
75 ip->tokSize = EJS_PARSE_INCR;
76 ip->tokServp = ip->tokbuf;
77 ip->tokEndp = ip->tokbuf;
79 ip->script = mprStrdup(script);
80 ip->scriptSize = strlen(script);
81 ip->scriptServp = ip->script;
88 ip->putBackIndex = -1;
93 /******************************************************************************/
95 * Close the input script
98 void ejsLexCloseScript(Ejs *ep)
107 ep->input = ip->next;
109 for (i = 0; i < EJS_TOKEN_STACK; i++) {
110 mprFree(ip->putBack[i].token);
111 ip->putBack[i].token = 0;
121 /******************************************************************************/
123 * Initialize an input state structure
126 int ejsInitInputState(EjsInput *ip)
130 memset(ip, 0, sizeof(*ip));
131 ip->putBackIndex = -1;
135 /******************************************************************************/
137 * Save the input state
140 void ejsLexSaveInputState(Ejs *ep, EjsInput *state)
152 for (i = 0; i < ip->putBackIndex; i++) {
153 state->putBack[i].token = mprStrdup(ip->putBack[i].token);
154 state->putBack[i].id = ip->putBack[i].id;
156 for (; i < EJS_TOKEN_STACK; i++) {
157 state->putBack[i].token = 0;
160 state->line = mprMalloc(ip->lineLength);
161 mprStrcpy(state->line, ip->lineLength, ip->line);
163 state->lineColumn = ip->lineColumn;
164 state->lineNumber = ip->lineNumber;
165 state->lineLength = ip->lineLength;
168 /******************************************************************************/
170 * Restore the input state
173 void ejsLexRestoreInputState(Ejs *ep, EjsInput *state)
184 ip->tokbuf = state->tokbuf;
185 ip->tokServp = state->tokServp;
186 ip->tokEndp = state->tokEndp;
187 ip->tokSize = state->tokSize;
189 ip->script = state->script;
190 ip->scriptServp = state->scriptServp;
191 ip->scriptSize = state->scriptSize;
193 ip->putBackIndex = state->putBackIndex;
194 for (i = 0; i < ip->putBackIndex; i++) {
195 mprFree(ip->putBack[i].token);
196 ip->putBack[i].id = state->putBack[i].id;
197 ip->putBack[i].token = mprStrdup(state->putBack[i].token);
201 ip->line = mprMalloc(state->lineLength);
202 mprStrcpy(ip->line, state->lineLength, state->line);
204 ip->lineColumn = state->lineColumn;
205 ip->lineNumber = state->lineNumber;
206 ip->lineLength = state->lineLength;
209 /******************************************************************************/
211 * Free a saved input state
214 void ejsLexFreeInputState(Ejs *ep, EjsInput *state)
221 for (i = 0; i < EJS_TOKEN_STACK; i++) {
222 mprFree(state->putBack[i].token);
224 state->putBackIndex = -1;
225 mprFree(state->line);
226 state->lineLength = 0;
227 state->lineColumn = 0;
230 /******************************************************************************/
232 * Get the next EJS token
235 int ejsLexGetToken(Ejs *ep, int state)
239 ep->tid = getLexicalToken(ep, state);
243 /******************************************************************************/
246 * Check for reserved words "if", "else", "var", "for", "foreach",
247 * "delete", "function", and "return". "new", "in" and "function"
248 * are handled below. "true", "false", "null", "undefined" are handled
251 * Other reserved words not supported:
252 * "break", "case", "catch", "continue", "default", "do",
253 * "finally", "instanceof", "switch", "this", "throw", "try",
254 * "typeof", "while", "with"
256 * ECMA extensions reserved words (not supported):
257 * "abstract", "boolean", "byte", "char", "class", "const",
258 * "debugger", "double", "enum", "export", "extends",
259 * "final", "float", "goto", "implements", "import", "int",
260 * "interface", "long", "native", "package", "private",
261 * "protected", "public", "short", "static", "super",
262 * "synchronized", "throws", "transient", "volatile"
265 static int checkReservedWord(Ejs *ep, int state, int c, int tid)
267 if (state == EJS_STATE_STMT) {
268 if (strcmp(ep->token, "if") == 0) {
271 } else if (strcmp(ep->token, "else") == 0) {
274 } else if (strcmp(ep->token, "var") == 0) {
277 } else if (strcmp(ep->token, "for") == 0) {
280 } else if (strcmp(ep->token, "delete") == 0) {
282 return EJS_TOK_DELETE;
283 } else if (strcmp(ep->token, "function") == 0) {
285 return EJS_TOK_FUNCTION;
286 } else if (strcmp(ep->token, "return") == 0) {
287 if ((c == ';') || (c == '(')) {
290 return EJS_TOK_RETURN;
292 } else if (state == EJS_STATE_EXPR) {
293 if (strcmp(ep->token, "new") == 0) {
296 } else if (strcmp(ep->token, "in") == 0) {
299 } else if (strcmp(ep->token, "function") == 0) {
301 return EJS_TOK_FUNCTION;
307 /******************************************************************************/
309 * Get the next EJS token
312 static int getLexicalToken(Ejs *ep, int state)
316 int done, tid, c, quote, style, idx;
324 type = BLD_FEATURE_NUM_TYPE_ID;
327 * Use any putback tokens first. Don't free the strings as the caller needs access.
329 if (ip->putBackIndex >= 0) {
330 idx = ip->putBackIndex;
331 tid = ip->putBack[idx].id;
332 ep->token = (char*) ip->putBack[idx].token;
333 tid = checkReservedWord(ep, state, 0, tid);
337 ep->token = ip->tokServp = ip->tokEndp = ip->tokbuf;
338 *ip->tokServp = '\0';
340 if ((c = inputGetc(ep)) < 0) {
345 * Main lexical analyser
347 for (done = 0; !done; ) {
356 if ((c = inputGetc(ep)) < 0)
358 } while (c == ' ' || c == '\t' || c == '\r');
362 return EJS_TOK_NEWLINE;
366 return EJS_TOK_LPAREN;
370 return EJS_TOK_RPAREN;
374 return EJS_TOK_LBRACKET;
378 return EJS_TOK_RBRACKET;
382 return EJS_TOK_PERIOD;
386 return EJS_TOK_LBRACE;
390 return EJS_TOK_RBRACE;
393 if ((c = inputGetc(ep)) < 0) {
394 ejsError(ep, "Syntax Error");
399 tokenAddChar(ep, EJS_EXPR_PLUS);
402 tokenAddChar(ep, EJS_EXPR_INC);
403 return EJS_TOK_INC_DEC;
406 if ((c = inputGetc(ep)) < 0) {
407 ejsError(ep, "Syntax Error");
412 tokenAddChar(ep, EJS_EXPR_MINUS);
415 tokenAddChar(ep, EJS_EXPR_DEC);
416 return EJS_TOK_INC_DEC;
419 tokenAddChar(ep, EJS_EXPR_MUL);
423 tokenAddChar(ep, EJS_EXPR_MOD);
428 * Handle the division operator and comments
430 if ((c = inputGetc(ep)) < 0) {
431 ejsError(ep, "Syntax Error");
434 if (c != '*' && c != '/') {
436 tokenAddChar(ep, EJS_EXPR_DIV);
441 * Eat comments. Both C and C++ comment styles are supported.
444 if ((c = inputGetc(ep)) < 0) {
448 ejsError(ep, "Syntax Error");
451 if (c == '\n' && style == '/') {
453 } else if (c == '*') {
467 * Continue looking for a token, so get the next character
469 if ((c = inputGetc(ep)) < 0) {
474 case '<': /* < and <= */
475 if ((c = inputGetc(ep)) < 0) {
476 ejsError(ep, "Syntax Error");
480 tokenAddChar(ep, EJS_EXPR_LSHIFT);
482 } else if (c == '=') {
483 tokenAddChar(ep, EJS_EXPR_LESSEQ);
486 tokenAddChar(ep, EJS_EXPR_LESS);
490 case '>': /* > and >= */
491 if ((c = inputGetc(ep)) < 0) {
492 ejsError(ep, "Syntax Error");
496 tokenAddChar(ep, EJS_EXPR_RSHIFT);
498 } else if (c == '=') {
499 tokenAddChar(ep, EJS_EXPR_GREATEREQ);
502 tokenAddChar(ep, EJS_EXPR_GREATER);
507 if ((c = inputGetc(ep)) < 0) {
508 ejsError(ep, "Syntax Error");
512 tokenAddChar(ep, EJS_EXPR_EQ);
516 return EJS_TOK_ASSIGNMENT;
518 case '!': /* "!=" or "!"*/
519 if ((c = inputGetc(ep)) < 0) {
520 ejsError(ep, "Syntax Error");
524 tokenAddChar(ep, EJS_EXPR_NOTEQ);
528 tokenAddChar(ep, EJS_EXPR_BOOL_COMP);
537 return EJS_TOK_COMMA;
540 if ((c = inputGetc(ep)) < 0 || c != '|') {
541 ejsError(ep, "Syntax Error");
544 tokenAddChar(ep, EJS_COND_OR);
545 return EJS_TOK_LOGICAL;
548 if ((c = inputGetc(ep)) < 0 || c != '&') {
549 ejsError(ep, "Syntax Error");
552 tokenAddChar(ep, EJS_COND_AND);
553 return EJS_TOK_LOGICAL;
555 case '\"': /* String quote */
558 if ((c = inputGetc(ep)) < 0) {
559 ejsError(ep, "Syntax Error");
565 * Check for escape sequence characters
572 * Octal support, \101 maps to 65 = 'A'. Put first
573 * char back so converter will work properly.
576 c = charConvert(ep, 8, 3);
592 * Hex support, \x41 maps to 65 = 'A'
594 c = charConvert(ep, 16, 2);
598 * Unicode support, \x0401 maps to 65 = 'A'
600 c = charConvert(ep, 16, 2);
601 c = c*16 + charConvert(ep, 16, 2);
609 ejsError(ep, "Invalid Escape Sequence");
613 if (tokenAddChar(ep, c) < 0) {
617 if (tokenAddChar(ep, c) < 0) {
621 if ((c = inputGetc(ep)) < 0) {
622 ejsError(ep, "Unmatched Quote");
626 return EJS_TOK_LITERAL;
629 if (tokenAddChar(ep, c) < 0) {
632 if ((c = inputGetc(ep)) < 0) {
635 if (tolower(c) == 'x') {
637 if (tokenAddChar(ep, c) < 0) {
640 if ((c = inputGetc(ep)) < 0) {
643 } while (isdigit(c) || (tolower(c) >= 'a' && tolower(c) <= 'f'));
645 mprDestroyVar(&ep->tokenNumber);
646 ep->tokenNumber = mprParseVar(ep->token, type);
648 return EJS_TOK_NUMBER;
651 #if BLD_FEATURE_FLOATING_POINT
652 if (c == '.' || tolower(c) == 'e' || c == '+' || c == '-') {
654 type = MPR_TYPE_FLOAT;
658 mprDestroyVar(&ep->tokenNumber);
659 ep->tokenNumber = mprParseVar(ep->token, type);
661 return EJS_TOK_NUMBER;
664 /* Fall through to get more digits */
666 case '1': case '2': case '3': case '4':
667 case '5': case '6': case '7': case '8': case '9':
669 if (tokenAddChar(ep, c) < 0) {
672 if ((c = inputGetc(ep)) < 0) {
675 #if BLD_FEATURE_FLOATING_POINT
676 if (c == '.' || tolower(c) == 'e' || tolower(c) == 'f') {
677 type = MPR_TYPE_FLOAT;
679 } while (isdigit(c) || c == '.' || tolower(c) == 'e' || tolower(c) == 'f' ||
680 ((type == MPR_TYPE_FLOAT) && (c == '+' || c == '-')));
682 } while (isdigit(c));
685 mprDestroyVar(&ep->tokenNumber);
686 ep->tokenNumber = mprParseVar(ep->token, type);
688 return EJS_TOK_NUMBER;
692 * Identifiers or function names
696 if ((c = inputGetc(ep)) < 0) {
699 if (c == '\n' || c == '\r') {
702 } else if (tokenAddChar(ep, c) < 0) {
705 if ((c = inputGetc(ep)) < 0) {
708 if (!isalnum(c) && c != '$' && c != '_' && c != '\\') {
712 if (*ep->token == '\0') {
716 if (! isalpha((int) *ep->token) && *ep->token != '$' &&
718 ejsError(ep, "Invalid identifier %s", ep->token);
722 tid = checkReservedWord(ep, state, c, EJS_TOK_ID);
723 if (tid != EJS_TOK_ID) {
728 * Skip white space after token to find out whether this is
731 while (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
732 if ((c = inputGetc(ep)) < 0)
742 * Putback the last extra character for next time
748 /******************************************************************************/
750 * Convert a hex or octal character back to binary, return original char if
754 static int charConvert(Ejs *ep, int base, int maxDig)
756 int i, c, lval, convChar;
759 for (i = 0; i < maxDig; i++) {
760 if ((c = inputGetc(ep)) < 0) {
764 * Initialize to out of range value
769 } else if (c >= 'a' && c <= 'f') {
770 convChar = c - 'a' + 10;
771 } else if (c >= 'A' && c <= 'F') {
772 convChar = c - 'A' + 10;
775 * If unexpected character then return it to buffer.
777 if (convChar >= base) {
781 lval = (lval * base) + convChar;
786 /******************************************************************************/
788 * Putback the last token read. Accept at most one push back token.
791 void ejsLexPutbackToken(Ejs *ep, int tid, char *string)
800 ip->putBackIndex += 1;
801 idx = ip->putBackIndex;
802 ip->putBack[idx].id = tid;
804 if (ip->putBack[idx].token) {
805 if (ip->putBack[idx].token == string) {
808 mprFree(ip->putBack[idx].token);
810 ip->putBack[idx].token = mprStrdup(string);
813 /******************************************************************************/
815 * Add a character to the token buffer
818 static int tokenAddChar(Ejs *ep, int c)
827 if (ip->tokEndp >= &ip->tokbuf[ip->tokSize - 1]) {
828 ip->tokSize += EJS_PARSE_INCR;
830 ip->tokbuf = mprRealloc(ip->tokbuf, ip->tokSize);
831 if (ip->tokbuf == 0) {
832 ejsError(ep, "Token too big");
835 ip->tokEndp += (int) ((uchar*) ip->tokbuf - oldbuf);
836 ip->tokServp += (int) ((uchar*) ip->tokbuf - oldbuf);
837 ep->token += (int) ((uchar*) ip->tokbuf - oldbuf);
845 /******************************************************************************/
847 * Get another input character
850 static int inputGetc(Ejs *ep)
858 if (ip->scriptSize <= 0) {
862 c = (uchar) (*ip->scriptServp++);
866 * For debugging, accumulate the line number and the currently parsed line
870 if (ip->lineColumn > 0) {
871 printf("PARSED: %s\n", ip->line);
877 if ((ip->lineColumn + 2) >= ip->lineLength) {
878 ip->lineLength += 80;
879 ip->line = mprRealloc(ip->line, ip->lineLength * sizeof(char));
881 ip->line[ip->lineColumn++] = c;
882 ip->line[ip->lineColumn] = '\0';
887 /******************************************************************************/
889 * Putback a character onto the input queue
892 static void inputPutback(Ejs *ep, int c)
900 *--ip->scriptServp = c;
903 ip->line[ip->lineColumn] = '\0';
907 /******************************************************************************/
910 void ejsLexDummy() {}
912 /******************************************************************************/
913 #endif /* BLD_FEATURE_EJS */
921 * vim600: sw=4 ts=4 fdm=marker