3 * @brief EJS Lexical Analyser
4 * @overview EJS lexical analyser. This implements a lexical analyser
5 * for a subset of the JavaScript language.
7 /********************************* Copyright **********************************/
11 * Copyright (c) Mbedthis Software LLC, 2003-2005. All Rights Reserved.
12 * Portions Copyright (c) GoAhead Software, 1995-2000. All Rights Reserved.
14 * This software is distributed under commercial and open source licenses.
15 * You may use the GPL open source license described below or you may acquire
16 * a commercial license from Mbedthis Software. You agree to be fully bound
17 * by the terms of either license. Consult the LICENSE.TXT distributed with
18 * this software for full details.
20 * This software is open source; you can redistribute it and/or modify it
21 * under the terms of the GNU General Public License as published by the
22 * Free Software Foundation; either version 2 of the License, or (at your
23 * option) any later version. See the GNU General Public License for more
24 * details at: http://www.mbedthis.com/downloads/gplLicense.html
26 * This program is distributed WITHOUT ANY WARRANTY; without even the
27 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
29 * This GPL license does NOT permit incorporating this software into
30 * proprietary programs. If you are unable to comply with the GPL, you must
31 * acquire a commercial license to use this software. Commercial licenses
32 * for this software and support services are available from Mbedthis
33 * Software at http://www.mbedthis.com
37 /********************************** Includes **********************************/
39 #include "ejsInternal.h"
43 /****************************** Forward Declarations **************************/
45 static int getLexicalToken(Ejs *ep, int state);
46 static int tokenAddChar(Ejs *ep, int c);
47 static int inputGetc(Ejs *ep);
48 static void inputPutback(Ejs *ep, int c);
49 static int charConvert(Ejs *ep, int base, int maxDig);
51 /************************************* Code ***********************************/
53 * Open a new input script
/*
 * ejsLexOpenScript: allocate a new EjsInput for ep->input, zero the
 * structure referenced by ip (presumably ep->input), create the token
 * buffer and take a private copy of the script text.
 * NOTE(review): this listing omits some lines of the function, so the
 * return value and the allocation-failure path are not visible here.
 */
56 int ejsLexOpenScript(Ejs *ep, char *script)
63 if ((ep->input = mprMalloc(sizeof(EjsInput))) == NULL) {
67 memset(ip, 0, sizeof(*ip));
70 * Create the parse token buffer and script buffer
72 ip->tokbuf = mprMalloc(EJS_PARSE_INCR);
/* NOTE(review): no NULL check on tokbuf appears before it is used below. */
73 ip->tokSize = EJS_PARSE_INCR;
/* Both cursors start at the beginning of the (empty) token buffer. */
74 ip->tokServp = ip->tokbuf;
75 ip->tokEndp = ip->tokbuf;
77 ip->script = mprStrdup(script);
/*
 * NOTE(review): strlen() requires script to be non-NULL and the result of
 * mprStrdup is not checked on the adjacent lines; any guard would have to
 * be in lines not shown -- confirm.
 */
78 ip->scriptSize = strlen(script);
79 ip->scriptServp = ip->script;
/* -1 marks the putback stack as empty (getLexicalToken tests >= 0). */
86 ip->putBackIndex = -1;
91 /******************************************************************************/
93 * Close the input script
/*
 * ejsLexCloseScript: release resources held by the current input. The
 * visible loop frees every slot of the putback stack and then clears the
 * slot pointer. Any other cleanup is in lines not shown in this listing.
 * NOTE(review): unused slots hold 0, so this presumably relies on mprFree
 * accepting a NULL pointer -- confirm.
 */
96 void ejsLexCloseScript(Ejs *ep)
106 for (i = 0; i < EJS_TOKEN_STACK; i++) {
107 mprFree(ip->putBack[i].token);
108 ip->putBack[i].token = 0;
118 /******************************************************************************/
120 * Initialize an input state structure
/*
 * ejsInitInputState: reset *ip to all-zero bytes and mark its putback stack
 * as empty (putBackIndex == -1).
 * NOTE(review): the memset discards any pointers stored in *ip without
 * freeing them, so the caller must not pass a structure that still owns
 * buffers -- confirm against callers. The return value is not visible here.
 */
123 int ejsInitInputState(EjsInput *ip)
127 memset(ip, 0, sizeof(*ip));
128 ip->putBackIndex = -1;
132 /******************************************************************************/
134 * Save the input state
/*
 * ejsLexSaveInputState: copy the lexer state of ep's input (ip) into
 * *state, duplicating the putback token strings and the current line
 * buffer so the saved copy owns its own strings.
 * NOTE(review): some lines of this function are not shown in this listing.
 */
137 void ejsLexSaveInputState(Ejs *ep, EjsInput *state)
/*
 * NOTE(review): putBackIndex is the index of the top entry (it is -1 when
 * empty and ejsLexPutbackToken increments it before storing), so the
 * condition `i < putBackIndex` appears to skip the top-of-stack token.
 * Looks like it should be `<=` -- confirm.
 */
149 for (i = 0; i < ip->putBackIndex; i++) {
150 state->putBack[i].token = mprStrdup(ip->putBack[i].token);
151 state->putBack[i].id = ip->putBack[i].id;
153 for (; i < EJS_TOKEN_STACK; i++) {
/* Remaining slots are cleared so they are safe to free later. */
154 state->putBack[i].token = 0;
157 state->line = mprMalloc(ip->lineLength);
/*
 * NOTE(review): the allocation above is not checked before this copy, and
 * ip->line is assumed to be a valid string -- confirm.
 */
158 mprStrcpy(state->line, ip->lineLength, ip->line);
160 state->lineColumn = ip->lineColumn;
161 state->lineNumber = ip->lineNumber;
162 state->lineLength = ip->lineLength;
165 /******************************************************************************/
167 * Restore the input state
/*
 * ejsLexRestoreInputState: copy a previously saved lexer state from *state
 * back into ep's input (ip).
 * NOTE(review): some lines of this function are not shown in this listing.
 */
170 void ejsLexRestoreInputState(Ejs *ep, EjsInput *state)
/*
 * The token and script buffer pointers are copied as-is (not duplicated),
 * so ip and state refer to the same buffers after these assignments.
 */
181 ip->tokbuf = state->tokbuf;
182 ip->tokServp = state->tokServp;
183 ip->tokEndp = state->tokEndp;
184 ip->tokSize = state->tokSize;
186 ip->script = state->script;
187 ip->scriptServp = state->scriptServp;
188 ip->scriptSize = state->scriptSize;
190 ip->putBackIndex = state->putBackIndex;
/*
 * NOTE(review): putBackIndex is the index of the top entry (-1 == empty),
 * so `i < putBackIndex` appears to leave the top-of-stack token
 * unrestored. Looks like it should be `<=` -- confirm.
 */
191 for (i = 0; i < ip->putBackIndex; i++) {
/* Release the current string before replacing it with a fresh copy. */
192 mprFree(ip->putBack[i].token);
193 ip->putBack[i].id = state->putBack[i].id;
194 ip->putBack[i].token = mprStrdup(state->putBack[i].token);
198 ip->line = mprMalloc(state->lineLength);
/* NOTE(review): the allocation above is not checked before this copy. */
199 mprStrcpy(ip->line, state->lineLength, state->line);
201 ip->lineColumn = state->lineColumn;
202 ip->lineNumber = state->lineNumber;
203 ip->lineLength = state->lineLength;
206 /******************************************************************************/
208 * Free a saved input state
/*
 * ejsLexFreeInputState: release the strings owned by a saved input state
 * (all putback token slots and the line buffer) and reset its putback
 * index and line counters.
 * NOTE(review): unlike ejsLexCloseScript, the freed token pointers and
 * state->line do not appear to be cleared here, so calling this twice on
 * the same state would free them again -- confirm.
 */
211 void ejsLexFreeInputState(Ejs *ep, EjsInput *state)
218 for (i = 0; i < EJS_TOKEN_STACK; i++) {
219 mprFree(state->putBack[i].token);
221 state->putBackIndex = -1;
222 mprFree(state->line);
223 state->lineLength = 0;
224 state->lineColumn = 0;
227 /******************************************************************************/
229 * Get the next EJS token
/*
 * ejsLexGetToken: public entry point for the lexer. Calls getLexicalToken
 * with the given state and caches the resulting token id in ep->tid.
 * NOTE(review): the return statement is not shown in this listing;
 * presumably it returns ep->tid -- confirm.
 */
232 int ejsLexGetToken(Ejs *ep, int state)
236 ep->tid = getLexicalToken(ep, state);
240 /******************************************************************************/
243 * Check for reserved words "if", "else", "var", "for", "foreach",
244 * "delete", "function", and "return". "new", "in" and "function"
245 * done below. "true", "false", "null", "undefined" are handled
248 * Other reserved words not supported:
249 * "break", "case", "catch", "continue", "default", "do",
250 * "finally", "instanceof", "switch", "this", "throw", "try",
251 * "typeof", "while", "with"
253 * ECMA extensions reserved words (not supported):
254 * "abstract", "boolean", "byte", "char", "class", "const",
255 * "debugger", "double", "enum", "export", "extends",
256 * "final", "float", "goto", "implements", "import", "int",
257 * "interface", "long", "native", "package", "private",
258 * "protected", "public", "short", "static", "super",
259 * "synchronized", "throws", "transient", "volatile"
/*
 * checkReservedWord: compare ep->token (exact, case-sensitive strcmp)
 * against the reserved words recognised in the given state.
 *   EJS_STATE_STMT: "if", "else", "var", "for", "delete", "function",
 *                   "return"
 *   EJS_STATE_EXPR: "new", "in", "function"
 * c is the lookahead character supplied by the caller and tid is the token
 * id the caller already has.
 * NOTE(review): several branch bodies and the final return are not shown
 * in this listing; presumably tid is returned unchanged when nothing
 * matches. No "foreach" comparison is visible -- confirm.
 */
262 static int checkReservedWord(Ejs *ep, int state, int c, int tid)
264 if (state == EJS_STATE_STMT) {
265 if (strcmp(ep->token, "if") == 0) {
268 } else if (strcmp(ep->token, "else") == 0) {
271 } else if (strcmp(ep->token, "var") == 0) {
274 } else if (strcmp(ep->token, "for") == 0) {
277 } else if (strcmp(ep->token, "delete") == 0) {
279 return EJS_TOK_DELETE;
280 } else if (strcmp(ep->token, "function") == 0) {
282 return EJS_TOK_FUNCTION;
283 } else if (strcmp(ep->token, "return") == 0) {
/* A ';' or '(' directly after "return" is special-cased (body not shown). */
284 if ((c == ';') || (c == '(')) {
287 return EJS_TOK_RETURN;
289 } else if (state == EJS_STATE_EXPR) {
290 if (strcmp(ep->token, "new") == 0) {
293 } else if (strcmp(ep->token, "in") == 0) {
296 } else if (strcmp(ep->token, "function") == 0) {
298 return EJS_TOK_FUNCTION;
304 /******************************************************************************/
306 * Get the next EJS token
/*
 * getLexicalToken: core tokenizer. Returns a token id (EJS_TOK_*). For
 * operator tokens the operator code (EJS_EXPR_* or EJS_COND_*) is stored in
 * the token buffer with tokenAddChar. Put-back tokens are consumed before
 * any new input is read. `state` is forwarded to checkReservedWord.
 * NOTE(review): many lines (case labels, returns, braces, declarations)
 * are not shown in this listing, so the notes below cover only what is
 * visible.
 */
309 static int getLexicalToken(Ejs *ep, int state)
313 int done, tid, c, quote, style, idx;
321 type = BLD_FEATURE_NUM_TYPE_ID;
324 * Use a putback tokens first. Don't free strings as caller needs access.
326 if (ip->putBackIndex >= 0) {
/* The top-of-stack entry is putBack[putBackIndex]; -1 means empty. */
327 idx = ip->putBackIndex;
328 tid = ip->putBack[idx].id;
329 ep->token = (char*) ip->putBack[idx].token;
/* Re-run the reserved-word check for this state; c is passed as 0 here. */
330 tid = checkReservedWord(ep, state, 0, tid);
334 ep->token = ip->tokServp = ip->tokEndp = ip->tokbuf;
335 *ip->tokServp = '\0';
337 if ((c = inputGetc(ep)) < 0) {
342 * Main lexical analyser
344 for (done = 0; !done; ) {
353 if ((c = inputGetc(ep)) < 0)
355 } while (c == ' ' || c == '\t' || c == '\r');
359 return EJS_TOK_NEWLINE;
363 return EJS_TOK_LPAREN;
367 return EJS_TOK_RPAREN;
371 return EJS_TOK_LBRACKET;
375 return EJS_TOK_RBRACKET;
379 return EJS_TOK_PERIOD;
383 return EJS_TOK_LBRACE;
387 return EJS_TOK_RBRACE;
390 if ((c = inputGetc(ep)) < 0) {
391 ejsError(ep, "Syntax Error");
396 tokenAddChar(ep, EJS_EXPR_PLUS);
399 tokenAddChar(ep, EJS_EXPR_INC);
400 return EJS_TOK_INC_DEC;
403 if ((c = inputGetc(ep)) < 0) {
404 ejsError(ep, "Syntax Error");
409 tokenAddChar(ep, EJS_EXPR_MINUS);
412 tokenAddChar(ep, EJS_EXPR_DEC);
413 return EJS_TOK_INC_DEC;
416 tokenAddChar(ep, EJS_EXPR_MUL);
420 tokenAddChar(ep, EJS_EXPR_MOD);
425 * Handle the division operator and comments
427 if ((c = inputGetc(ep)) < 0) {
428 ejsError(ep, "Syntax Error");
431 if (c != '*' && c != '/') {
433 tokenAddChar(ep, EJS_EXPR_DIV);
438 * Eat comments. Both C and C++ comment styles are supported.
441 if ((c = inputGetc(ep)) < 0) {
445 ejsError(ep, "Syntax Error");
448 if (c == '\n' && style == '/') {
450 } else if (c == '*') {
464 * Continue looking for a token, so get the next character
466 if ((c = inputGetc(ep)) < 0) {
471 case '<': /* < and <= */
472 if ((c = inputGetc(ep)) < 0) {
473 ejsError(ep, "Syntax Error");
477 tokenAddChar(ep, EJS_EXPR_LSHIFT);
479 } else if (c == '=') {
480 tokenAddChar(ep, EJS_EXPR_LESSEQ);
483 tokenAddChar(ep, EJS_EXPR_LESS);
487 case '>': /* > and >= */
488 if ((c = inputGetc(ep)) < 0) {
489 ejsError(ep, "Syntax Error");
493 tokenAddChar(ep, EJS_EXPR_RSHIFT);
495 } else if (c == '=') {
496 tokenAddChar(ep, EJS_EXPR_GREATEREQ);
499 tokenAddChar(ep, EJS_EXPR_GREATER);
504 if ((c = inputGetc(ep)) < 0) {
505 ejsError(ep, "Syntax Error");
509 tokenAddChar(ep, EJS_EXPR_EQ);
513 return EJS_TOK_ASSIGNMENT;
515 case '!': /* "!=" or "!"*/
516 if ((c = inputGetc(ep)) < 0) {
517 ejsError(ep, "Syntax Error");
521 tokenAddChar(ep, EJS_EXPR_NOTEQ);
525 tokenAddChar(ep, EJS_EXPR_BOOL_COMP);
534 return EJS_TOK_COMMA;
537 if ((c = inputGetc(ep)) < 0 || c != '|') {
/* Anything other than a second '|' is reported as a syntax error. */
538 ejsError(ep, "Syntax Error");
541 tokenAddChar(ep, EJS_COND_OR);
542 return EJS_TOK_LOGICAL;
545 if ((c = inputGetc(ep)) < 0 || c != '&') {
/* Anything other than a second '&' is reported as a syntax error. */
546 ejsError(ep, "Syntax Error");
549 tokenAddChar(ep, EJS_COND_AND);
550 return EJS_TOK_LOGICAL;
552 case '\"': /* String quote */
555 if ((c = inputGetc(ep)) < 0) {
556 ejsError(ep, "Syntax Error");
562 * Check for escape sequence characters
569 * Octal support, \101 maps to 65 = 'A'. Put first
570 * char back so converter will work properly.
573 c = charConvert(ep, 8, 3);
589 * Hex support, \x41 maps to 65 = 'A'
591 c = charConvert(ep, 16, 2);
595 * Unicode support, \x0401 maps to 65 = 'A'
597 c = charConvert(ep, 16, 2);
/*
 * NOTE(review): each charConvert(ep, 16, 2) call reads at most two hex
 * digits (one byte). Merging two bytes needs a factor of 256; multiplying
 * by 16 makes the high byte overlap the upper nibble of the low byte.
 * Looks like a bug -- confirm.
 */
598 c = c*16 + charConvert(ep, 16, 2);
606 ejsError(ep, "Invalid Escape Sequence");
610 if (tokenAddChar(ep, c) < 0) {
614 if (tokenAddChar(ep, c) < 0) {
618 if ((c = inputGetc(ep)) < 0) {
619 ejsError(ep, "Unmatched Quote");
623 return EJS_TOK_LITERAL;
626 if (tokenAddChar(ep, c) < 0) {
629 if ((c = inputGetc(ep)) < 0) {
632 if (tolower(c) == 'x') {
633 if (tokenAddChar(ep, c) < 0) {
636 if ((c = inputGetc(ep)) < 0) {
641 #if BLD_FEATURE_FLOATING_POINT
642 if (c == '.' || tolower(c) == 'e' || c == '+' || c == '-') {
644 type = MPR_TYPE_FLOAT;
648 mprDestroyVar(&ep->tokenNumber);
649 ep->tokenNumber = mprParseVar(ep->token, type);
651 return EJS_TOK_NUMBER;
654 /* Fall through to get more digits */
656 case '1': case '2': case '3': case '4':
657 case '5': case '6': case '7': case '8': case '9':
659 if (tokenAddChar(ep, c) < 0) {
662 if ((c = inputGetc(ep)) < 0) {
665 #if BLD_FEATURE_FLOATING_POINT
666 if (c == '.' || tolower(c) == 'e' || tolower(c) == 'f') {
667 type = MPR_TYPE_FLOAT;
669 } while (isdigit(c) || c == '.' || tolower(c) == 'e' || tolower(c) == 'f' ||
670 ((type == MPR_TYPE_FLOAT) && (c == '+' || c == '-')));
672 } while (isdigit(c));
675 mprDestroyVar(&ep->tokenNumber);
676 ep->tokenNumber = mprParseVar(ep->token, type);
678 return EJS_TOK_NUMBER;
682 * Identifiers or a function names
686 if ((c = inputGetc(ep)) < 0) {
689 if (c == '\n' || c == '\r') {
692 } else if (tokenAddChar(ep, c) < 0) {
695 if ((c = inputGetc(ep)) < 0) {
698 if (!isalnum(c) && c != '$' && c != '_' && c != '\\') {
702 if (*ep->token == '\0') {
/*
 * NOTE(review): ep->token is a char pointer, so on platforms where plain
 * char is signed a byte >= 0x80 becomes a negative int here; passing that
 * to isalpha() is undefined. A cast through unsigned char would avoid it
 * -- confirm.
 */
706 if (! isalpha((int) *ep->token) && *ep->token != '$' &&
708 ejsError(ep, "Invalid identifier %s", ep->token);
/* c is the character that ended the identifier; it is used as lookahead. */
712 tid = checkReservedWord(ep, state, c, EJS_TOK_ID);
713 if (tid != EJS_TOK_ID) {
718 * Skip white space after token to find out whether this is
721 while (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
722 if ((c = inputGetc(ep)) < 0)
732 * Putback the last extra character for next time
738 /******************************************************************************/
740 * Convert a hex or octal character back to binary, return original char if
/*
 * charConvert: read up to maxDig characters from the input and accumulate
 * them as digits of the given base into lval (lval = lval * base + digit).
 * Letters a-f / A-F map to 10..15. A character whose digit value is not
 * below base is treated as unexpected.
 * NOTE(review): the return statement, the decimal-digit branch and the
 * handling of an unexpected character are not shown in this listing.
 */
744 static int charConvert(Ejs *ep, int base, int maxDig)
746 int i, c, lval, convChar;
749 for (i = 0; i < maxDig; i++) {
750 if ((c = inputGetc(ep)) < 0) {
754 * Initialize to out of range value
759 } else if (c >= 'a' && c <= 'f') {
760 convChar = c - 'a' + 10;
761 } else if (c >= 'A' && c <= 'F') {
762 convChar = c - 'A' + 10;
765 * If unexpected character then return it to buffer.
767 if (convChar >= base) {
771 lval = (lval * base) + convChar;
776 /******************************************************************************/
778 * Putback the last token read. Accept at most one push back token.
/*
 * ejsLexPutbackToken: push a token (id plus a private copy of its text)
 * onto the putback stack so that getLexicalToken returns it before reading
 * new input.
 * NOTE(review): the index is incremented on every call and the stack has
 * EJS_TOKEN_STACK slots; no bound check is visible in this listing (it
 * may be in lines not shown) -- confirm. The "at most one push back token"
 * wording in the comment preceding this function looks stale for the same
 * reason.
 */
781 void ejsLexPutbackToken(Ejs *ep, int tid, char *string)
790 ip->putBackIndex += 1;
791 idx = ip->putBackIndex;
792 ip->putBack[idx].id = tid;
794 if (ip->putBack[idx].token) {
/*
 * The caller may pass back the very string stored in this slot; that case
 * is handled separately (body not shown), presumably so the string is not
 * freed before it is duplicated below.
 */
795 if (ip->putBack[idx].token == string) {
798 mprFree(ip->putBack[idx].token);
800 ip->putBack[idx].token = mprStrdup(string);
803 /******************************************************************************/
805 * Add a character to the token buffer
/*
 * tokenAddChar: append character c to the token buffer, growing the buffer
 * by EJS_PARSE_INCR bytes when the end pointer reaches the last usable
 * slot. Callers treat a negative return as failure.
 * NOTE(review): the store of c and the return statements are not shown in
 * this listing.
 */
808 static int tokenAddChar(Ejs *ep, int c)
817 if (ip->tokEndp >= &ip->tokbuf[ip->tokSize - 1]) {
818 ip->tokSize += EJS_PARSE_INCR;
/*
 * NOTE(review): assigning the result straight back to ip->tokbuf loses the
 * only pointer to the old buffer if mprRealloc fails the way realloc does
 * (returns 0 and leaves the old block allocated) -- confirm.
 */
820 ip->tokbuf = mprRealloc(ip->tokbuf, ip->tokSize);
821 if (ip->tokbuf == 0) {
822 ejsError(ep, "Token too big");
/*
 * Rebase every pointer into the old buffer by the distance the buffer
 * moved. oldbuf is set in a line not shown (presumably the buffer address
 * before the realloc). NOTE(review): the difference is narrowed to int,
 * which can truncate where pointers are wider than int -- confirm.
 */
825 ip->tokEndp += (int) ((uchar*) ip->tokbuf - oldbuf);
826 ip->tokServp += (int) ((uchar*) ip->tokbuf - oldbuf);
827 ep->token += (int) ((uchar*) ip->tokbuf - oldbuf);
835 /******************************************************************************/
837 * Get another input character
/*
 * inputGetc: return the next character of the script and advance the read
 * pointer. Callers treat a negative return as end of input. The character
 * is also appended to ip->line, which tracks the line currently being
 * parsed.
 * NOTE(review): the return statements and the newline handling are only
 * partly visible in this listing.
 */
840 static int inputGetc(Ejs *ep)
848 if (ip->scriptSize <= 0) {
/* The uchar cast keeps bytes >= 0x80 from being returned as negative. */
852 c = (uchar) (*ip->scriptServp++);
856 * For debugging, accumulate the line number and the currenly parsed line
860 if (ip->lineColumn > 0) {
861 printf("PARSED: %s\n", ip->line);
/* Keep room for the new character plus the terminating NUL. */
867 if ((ip->lineColumn + 2) >= ip->lineLength) {
868 ip->lineLength += 80;
869 ip->line = mprRealloc(ip->line, ip->lineLength * sizeof(char));
/*
 * NOTE(review): no check of the reallocation result is visible before the
 * buffer is written below -- confirm.
 */
871 ip->line[ip->lineColumn++] = c;
872 ip->line[ip->lineColumn] = '\0';
877 /******************************************************************************/
879 * Putback a character onto the input queue
/*
 * inputPutback: push character c back onto the input by stepping the
 * script read pointer back one position and storing c there, then
 * re-terminate the tracked line at lineColumn.
 * NOTE(review): this writes into the script buffer itself; any adjustment
 * of scriptSize / lineColumn and any guard against stepping before the
 * start of the buffer would be in lines not shown -- confirm.
 */
882 static void inputPutback(Ejs *ep, int c)
890 *--ip->scriptServp = c;
893 ip->line[ip->lineColumn] = '\0';
897 /******************************************************************************/
/*
 * ejsLexDummy: empty placeholder function; it does nothing.
 * NOTE(review): presumably it exists so this translation unit is not empty
 * when BLD_FEATURE_EJS is disabled (see the #endif that follows) --
 * confirm.
 */
900 void ejsLexDummy() {}
902 /******************************************************************************/
903 #endif /* BLD_FEATURE_EJS */
911 * vim600: sw=4 ts=4 fdm=marker