Jabber WindowGram Client (JWGC)

00001 /*
00002  *      Copyright (c) 1989 by the Massachusetts Institute of Technology.
00003  *      For copying and distribution information, see the file
00004  *      "mit-copyright.h".
00005  *
00006  *      Modified for jwgc by Daniel Henninger.
00007  */
00008 
00009 #include "mit-copyright.h"
00010 
00011 /****************************************************************************/
00012 /* */
00013 /* The lexer for the jwgc description language:               */
00014 /* */
00015 /****************************************************************************/
00016 
00017 #include "main.h"
00018 #include "new_string.h"
00019 #include "int_dictionary.h"
00020 #include "lexer.h"
00021 #include "parser.h"
00022 
/*
 * yylineno    - the line number the lexer is currently on.  input() bumps
 *               it every time it hands out a newline and unput() drops it
 *               again when a newline is pushed back.
 *
 * yybufferpos - index of the next unread character of the in-memory buffer.
 *               Only used while lexing a buffer rather than a file; reset
 *               to 0 by lex_open_buffer().
 */

int yylineno;
int yybufferpos;
00030 
00031 /*
00032  * keyword_dict - this dictionary maps keyword names to their token numbers.
00033  */
00034 
00035 static int_dictionary keyword_dict = NULL;
00036 
00037 /****************************************************************************/
00038 /* */
00039 /* I/O functions:                             */
00040 /* */
00041 /****************************************************************************/
00042 
/*
 * The lexer reads from exactly one of two sources:
 *
 * input_file   - the FILE currently being lexed, or NULL when the lexer is
 *                reading from input_buffer instead.
 * input_buffer - the NUL terminated text being lexed when input_file is NULL.
 */

static FILE *input_file = NULL;
static char *input_buffer = NULL;

/*
 * pushback - one character of look-behind: -1 when empty, otherwise the
 *            character most recently handed to unput() that input() has
 *            not yet returned.
 */

static int pushback = -1;
00056 
00057 static char 
00058 input()
00059 {
00060         int c;
00061 
00062         if (pushback != -1) {
00063                 c = pushback;
00064                 pushback = -1;
00065                 if (c == '\n')
00066                         yylineno++;
00067                 return (c);
00068         }
00069 
00070         if (input_file) {
00071                 c = getc(input_file);
00072         }
00073         else {
00074                 c = input_buffer[yybufferpos++];
00075         }
00076         if (c == '\n')
00077                 yylineno++;
00078         if (c == EOF)
00079                 c = 0;
00080 
00081         return (c);
00082 }
00083 
00084 static void 
00085 unput(c)
00086         int c;
00087 {
00088         pushback = c;
00089         if (c == '\n')
00090                 yylineno--;
00091 }
00092 
00093 /****************************************************************************/
00094 /* */
00095 /* Initialization routines:                       */
00096 /* */
00097 /****************************************************************************/
00098 
00099 struct keyword_info {
00100         string keyword;
00101         int keyword_number;
00102 };
00103 
00104 /*
00105  * keywords - This table holds a copy of the mapping from keyword name to
00106  *            token number and is used to initialize keyword_dict:
00107  */
00108 
00109 static struct keyword_info keywords[] = {
00110         {"and", '&'},
00111         {"appendport", APPENDPORT},
00112         {"buffer", BUFFER},
00113         {"break", BREAK},
00114         {"closeinput", CLOSEINPUT},
00115         {"closeoutput", CLOSEOUTPUT},
00116         {"closeport", CLOSEPORT},
00117         {"case", CASE},
00118         {"clearbuf", CLEARBUF},
00119         {"default", DEFAULT},
00120         {"do", DO},
00121         {"downcase", DOWNCASE},
00122         {"else", ELSE},
00123         {"elseif", ELSEIF},
00124         {"endcase", ENDCASE},
00125         {"endif", ENDIF},
00126         {"endwhile", ENDWHILE},
00127         {"exec", EXEC},
00128         {"execport", EXECPORT},
00129         {"exit", EXIT},
00130         {"fields", FIELDS},
00131         {"get", GET},
00132         {"getenv", GETENV},
00133         {"if", IF},
00134         {"inputport", INPUTPORT},
00135         {"lany", LANY},
00136         {"lbreak", LBREAK},
00137         {"lspan", LSPAN},
00138         {"match", MATCH},
00139         {"noop", NOOP},
00140         {"not", '!'},
00141         {"or", '|'},
00142         {"outputport", OUTPUTPORT},
00143         {"print", PRINT},
00144         {"protect", PROTECT},
00145         {"put", PUT},
00146         {"rany", RANY},
00147         {"rbreak", RBREAK},
00148         {"rspan", RSPAN},
00149         {"set", SET},
00150         {"show", SHOW},
00151         {"substitute", SUBSTITUTE},
00152         {"then", THEN},
00153         {"upcase", UPCASE},
00154         {"while", WHILE},
00155         {"verbatim", VERBATIM},
00156         {"jvar", JVAR},
00157 {"paragraph", PARAGRAPH}};
00158 
00159 /*
00160  * lex_open - this routine [re]initializes the lexer & prepares it to lex
00161  *            a file.  Resets current line # to 1.
00162  */
00163 
00164 void 
00165 lex_open(file)
00166         FILE *file;
00167 {
00168         /*
00169          * Initialize I/O:
00170          */
00171         input_file = file;
00172         input_buffer = NULL;
00173         yylineno = 1;
00174         pushback = -1;
00175 
00176         /*
00177          * Initialize keyword_dict from keywords if needed:
00178          */
00179         if (!keyword_dict) {
00180                 int i;
00181 
00182                 keyword_dict = int_dictionary_Create(101);
00183 
00184                 for (i = 0; i < sizeof(keywords) / sizeof(struct keyword_info); i++)
00185                         int_dictionary_Define(keyword_dict, keywords[i].keyword,
00186                                      0)->value = keywords[i].keyword_number;
00187         }
00188 }
00189 
00190 void 
00191 lex_open_buffer(buffer)
00192         char *buffer;
00193 {
00194         /*
00195          * Initialize I/O:
00196          */
00197         input_buffer = buffer;
00198         input_file = NULL;
00199         yylineno = 1;
00200         yybufferpos = 0;
00201         pushback = -1;
00202 
00203         /*
00204          * Initialize keyword_dict from keywords if needed:
00205          */
00206         if (!keyword_dict) {
00207                 int i;
00208 
00209                 keyword_dict = int_dictionary_Create(101);
00210 
00211                 for (i = 0; i < sizeof(keywords) / sizeof(struct keyword_info); i++)
00212                         int_dictionary_Define(keyword_dict, keywords[i].keyword,
00213                                      0)->value = keywords[i].keyword_number;
00214         }
00215 }
00216 
00217 /****************************************************************************/
00218 /* */
00219 /* lex subroutines:                              */
00220 /* */
00221 /****************************************************************************/
00222 
/*
 * eat_escape_code - this routine eats an escape code (the part that follows
 *                   a '\\' in quoted text) & returns the character it
 *                   stands for, or 0 if it stands for nothing at all.
 *                   The escape codes understood are:
 *
 *                       "n"          == '\n'
 *                       "t"          == '\t'
 *                       "b"          == '\b'
 *                       "\n"         == "" (i.e., returns 0)
 *                       <EOF>        == "" (the EOF is pushed back)
 *                       [0-7]{1,3}   == the character whose code is that
 *                                       octal number
 *                       anything else stands for itself, i.e., "*" == '*'
 */

#define  is_octal_digit(c)           (((c)>='0') && ((c)<='7'))

static char 
eat_escape_code()
{
        int c = input();
        int value, digits_read;

        /*
         * Octal escapes: take up to three digits, stopping early at (and
         * pushing back) the first character that is not an octal digit.
         */
        if (is_octal_digit(c)) {
                value = c - '0';
                for (digits_read = 1; digits_read < 3; digits_read++) {
                        c = input();
                        if (!is_octal_digit(c)) {
                                unput(c);
                                break;
                        }
                        value = value * 8 + c - '0';
                }
                return (value);
        }

        switch (c) {
                case 0: /* EOF: leave it for the caller to see */
                        unput(c);
                        return (0);
                case '\n':      /* escaped newline: line continuation */
                        return (0);
                case 'n':
                        return ('\n');
                case 't':
                        return ('\t');
                case 'b':
                        return ('\b');
                default:
                        return (c);
        }
}
00285 
00286 /*
00287  * eat_string - this routine eats characters allowing escape codes via '\\'
00288  *              until a '"' is eaten.  If no '"' is seen before a '\n' or
00289  *              the <EOF>, a parse_error is set & 0 is returned.  Otherwise,
00290  *              the string represented by what has been eaten is returned.
00291  *              I.e., 'hello \n there"' would cause "hello \n there" to be
00292  *              returned.  (thats not a <cr> in the first case, a <cr> in the
00293  *              second)  The returned string is on the heap & must be freed
00294  *              eventually.  This routine should be passed the line # that the
00295  *              string we are eating started on.
00296  */
00297 
00298 static char *
00299 eat_string(starting_line)
00300         int starting_line;
00301 {
00302         int c;
00303         char buffer[500];
00304         char *ptr = buffer;
00305 
00306         for (;;) {
00307                 /*
00308                  * Get the next input character, handling EOF:
00309                  */
00310                 c = input();
00311                 if (!c) {
00312                         unput(c);
00313                         report_parse_error("unterminated string found beginning",
00314                                            starting_line);
00315                         return (0);
00316                 }
00317 
00318                 /*
00319                  * Deal with special characters ('\\', '"', and '\n'):
00320                  */
00321                 if (c == '\\') {
00322                         c = eat_escape_code();
00323                         if (!c)
00324                                 continue;
00325                 }
00326                 else if (c == '"') {
00327                         *ptr = 0;
00328                         return (string_Copy(buffer));
00329                 }
00330                 else if (c == '\n') {
00331                         unput(c);       /* fix line # reference to right line
00332                                          * # */
00333                         report_parse_error("carriage return found in string", yylineno);
00334                         return (0);
00335                 }
00336 
00337                 /*
00338                  * Add the character c to the current string:
00339                  */
00340                 *ptr = c;
00341                 ptr++;
00342 
00343                 /*
00344                  * If out of buffer space, do a recursive call then
00345                  * concatanate the result to the string read in so far to get the
00346                  * entire string and return that:
00347                  */
00348                 if (ptr > buffer + sizeof(buffer) - 20) {
00349                         string rest_of_string, result;
00350 
00351                         rest_of_string = eat_string(starting_line);
00352                         if (!rest_of_string)
00353                                 return (0);
00354 
00355                         *ptr = 0;
00356                         result = string_Concat(buffer, rest_of_string);
00357                         free(rest_of_string);
00358                         return (result);
00359                 }
00360         }
00361 }
00362 
00363 /*
00364  * eat_show_line - internal routine for eat_show:
00365  *
00366  *        This routine reads in a physical line of text allowing escape
00367  *    codes via '\\'.  If the line ends with a newline, the newline is eaten.
00368  *    If the line ends with a EOF, the EOF is not eaten.  The string
00369  *    represented by what has been eaten is returned.  The returned string
00370  *    is on the heap & must be freed eventually.  If test_for_endshow is
00371  *    true and the line read in starts off with "endshow" exactly
00372  *    (i.e., no escape codes) followed by any non-identifier-char, then
00373  *    instead of doing the above, we just eat the "endshow" & return 0.
00374  */
00375 
00376 static char *
00377 eat_show_line(test_for_endshow)
00378         int test_for_endshow;
00379 {
00380         int c;
00381         int saw_escape_code = 0;
00382         int starting_line = yylineno;
00383         char buffer[200];       /* This must be large enough to hold
00384                                  * "endshow" */
00385         char *ptr = buffer;
00386 
00387         while (yylineno == starting_line) {
00388                 c = input();
00389                 if (!c) {
00390                         unput(c);
00391                         *ptr = '\0';
00392                         return (string_Copy(buffer));
00393                 }
00394                 else if (c == '\\') {
00395                         saw_escape_code = 1;
00396                         c = eat_escape_code();
00397                         if (!c)
00398                                 continue;
00399                 }
00400 
00401                 *ptr = c;
00402                 ptr++;
00403 
00404                 if ((ptr == buffer + strlen("endshow")) && test_for_endshow)
00405                         if (!strncmp(buffer, "endshow", strlen("endshow"))
00406                             && !saw_escape_code) {
00407                                 c = input();
00408                                 unput(c);
00409                                 if (!is_identifier_char(c))
00410                                         return (0);
00411                         }
00412 
00413                 if (ptr > buffer + sizeof(buffer) - 2) {
00414                         string the_line;
00415                         string rest_of_line = eat_show_line(0);
00416 
00417                         *ptr = '\0';
00418                         the_line = string_Concat(buffer, rest_of_line);
00419                         free(rest_of_line);
00420                         return (the_line);
00421                 }
00422         }
00423 
00424         *ptr = '\0';
00425         return (string_Copy(buffer));
00426 }
00427 
00428 /*
00429  * eat_til_endshow - this routine eats characters allowing escape codes via
00430  *                   '\\' up to a endshow\{nonalpha} found at the
00431  *                   start of a line not counting leading whitespace.
00432  *                   If <EOF> is seen before the terminator, a parse_error
00433  *                   is set & 0 returned.  Otherwise, the string represented
00434  *                   by what has been eaten (escape codes replaced by what
00435  *                   they stand for and leading spaces and tabs removed from
00436  *                   each physical line) is returned.  The returned string
00437  *                   is on the heap & must be freed eventually.  Note that
00438  *                   to embed endshow in a message, endsho\w can be used.
00439  *                   This routine should be passed the line # of the show
00440  *                   command it is being used to process for use in error
00441  *                   messages.
00442  */
00443 
00444 static char *
00445 eat_til_endshow(start_line_no)
00446         int start_line_no;
00447 {
00448         register int c;
00449         string text_so_far = string_Copy("");
00450         string next_line;
00451 
00452         for (;;) {
00453                 /*
00454                  * Skip the spaces & tabs at the start of the current line:
00455                  */
00456                 while ((c = input()), c == ' ' || c == '\t');
00457                 unput(c);
00458 
00459                 /*
00460                  * Handle unterminated shows:
00461                  */
00462                 if (!c) {
00463                         report_parse_error("unterminated show beginning", start_line_no);
00464                         free(text_so_far);
00465                         return (0);
00466                 }
00467 
00468                 /*
00469                  * Read in rest of the line (including the <cr> at end), allowing
00470                  * for escape codes and checking for "endshow{nonalpha}" at the
00471                  * start of the line.  (Note: <newline> is considered the
00472                  * end of a line here!)
00473                  */
00474                 next_line = eat_show_line(1);
00475 
00476                 if (!next_line) /* i.e., is this the endshow line? */
00477                         return (text_so_far);
00478 
00479                 text_so_far = string_Concat2(text_so_far, next_line);
00480                 free(next_line);
00481         }
00482 }
00483 
00484 /*
00485  * handle_show - this routine is called after "show"\{nonalpha} is
00486  *               found to handle up to the endshow.  The token # is
00487  *               returned.
00488  */
00489 
00490 static int 
00491 handle_show()
00492 {
00493         int c;
00494         int start_line_no = yylineno;
00495 
00496         /*
00497          * Eat up ' ' and '\t's after show.  If the next character is a newline,
00498          * eat it.  This is so we don't get an extra newline when we call
00499          * eat_til_endshow:
00500          */
00501         while (c = input(), c == ' ' || c == '\t');
00502         if (c != '\n')
00503                 unput(c);
00504 
00505         if ((yylval.text = eat_til_endshow(start_line_no)))
00506                 return (SHOW);
00507         else
00508                 return (ERROR);
00509 }
00510 
00511 /****************************************************************************/
00512 /* */
00513 /* The main lexer itself:                           */
00514 /* */
00515 /****************************************************************************/
00516 
00517 /*
00518  * yylex - performs as per. the yacc manual's requirements
00519  */
00520 
00521 int 
00522 yylex()
00523 {
00524         register int c, last_char;
00525         register char *ptr;
00526         int start_line_no;
00527         int_dictionary_binding *binding;
00528         char varname[MAX_IDENTIFIER_LENGTH + 1];
00529 
00530         for (;;) {
00531                 switch (c = input()) {
00532 
00533                                 /*
00534                                  * Skip whitespace:
00535                                  */
00536                         case ' ':
00537                         case '\t':
00538                         case '\n':
00539                                 continue;
00540 
00541                                 /*
00542                                  * '#' comments out everything up to the and including
00543                                  * the next <cr>:
00544                                  */
00545                         case '#':
00546                                 while ((c = input()) && (c != '\n'));
00547                                 if (!c)
00548                                         unput(c);
00549                                 continue;
00550 
00551                                 /*
00552                                  * Handle c-style comments.  Note that "/[^*]" is not the start
00553                                  * of any valid token.
00554                                  */
00555                         case '/':
00556                                 start_line_no = yylineno;
00557 
00558                                 /* verify that next character is a '*': */
00559                                 if ((c = input()) != '*')
00560                                         return (ERROR);
00561 
00562                                 /* Scan until "*\/" or <EOF>: */
00563                                 for (last_char = 0;; last_char = c) {
00564                                         c = input();
00565                                         if (c == '/' && (last_char == '*'))
00566                                                 break;
00567                                         if (!c) {
00568                                                 unput(c);
00569                                                 report_parse_error("unterminated c style comment found beginning", start_line_no);
00570                                                 return (ERROR);
00571                                         }
00572                                 }
00573                                 continue;
00574 
00575                                 /*
00576                                  * The following characters lex as themselves:
00577                                  *   '+', '|', '&', '(', ')', '.', ',' and <EOF>:
00578                                  */
00579                         case 0:
00580                         case '+':
00581                         case '|':
00582                         case '&':
00583                         case '(':
00584                         case ')':
00585                         case '.':
00586                         case ',':
00587                                 return (c);
00588 
00589                                 /*
00590                                  * Handle "=[^~=]", "=~", and "==":
00591                                  */
00592                         case '=':
00593                                 switch (c = input()) {
00594                                         case '~':
00595                                                 return (REGEQ);
00596                                         case '=':
00597                                                 return (EQ);
00598                                         default:
00599                                                 unput(c);
00600                                                 return ('=');
00601                                 }
00602 
00603                                 /*
00604                                  * Handle "![^~=]", "!~", and "!=":
00605                                  */
00606                         case '!':
00607                                 switch (c = input()) {
00608                                         case '~':
00609                                                 return (REGNEQ);
00610                                         case '=':
00611                                                 return (NEQ);
00612                                         default:
00613                                                 unput(c);
00614                                                 return ('!');
00615                                 }
00616 
00617                                 /*
00618                                  * Handle identifiers and keywords:
00619                                  *
00620                                  * Note that the below set of characters is hard coded from
00621                                  * is_identifier_char from parser.h.
00622                                  */
00623                         case 'a':
00624                         case 'b':
00625                         case 'c':
00626                         case 'd':
00627                         case 'e':
00628                         case 'f':
00629                         case 'g':
00630                         case 'h':
00631                         case 'i':
00632                         case 'j':
00633                         case 'k':
00634                         case 'l':
00635                         case 'm':
00636                         case 'n':
00637                         case 'o':
00638                         case 'p':
00639                         case 'q':
00640                         case 'r':
00641                         case 's':
00642                         case 't':
00643                         case 'u':
00644                         case 'v':
00645                         case 'w':
00646                         case 'x':
00647                         case 'y':
00648                         case 'z':
00649                         case 'A':
00650                         case 'B':
00651                         case 'C':
00652                         case 'D':
00653                         case 'E':
00654                         case 'F':
00655                         case 'G':
00656                         case 'H':
00657                         case 'I':
00658                         case 'J':
00659                         case 'K':
00660                         case 'L':
00661                         case 'M':
00662                         case 'N':
00663                         case 'O':
00664                         case 'P':
00665                         case 'Q':
00666                         case 'R':
00667                         case 'S':
00668                         case 'T':
00669                         case 'U':
00670                         case 'V':
00671                         case 'W':
00672                         case 'X':
00673                         case 'Y':
00674                         case 'Z':
00675                         case '0':
00676                         case '1':
00677                         case '2':
00678                         case '3':
00679                         case '4':
00680                         case '5':
00681                         case '6':
00682                         case '7':
00683                         case '8':
00684                         case '9':
00685                         case '_':
00686                                 /*
00687                                  * Read in the first MAX_IDENTIFIER_LENGTH characters of the
00688                                  * identifier into varname null terminated.  Eat
00689                                  * the rest of the characters of the identifier:
00690                                  */
00691                                 for (ptr = varname;;) {
00692                                         if (ptr < varname + MAX_IDENTIFIER_LENGTH)
00693                                                 *(ptr++) = c;
00694                                         c = input();
00695                                         if (!is_identifier_char(c))
00696                                                 break;
00697                                 }
00698                                 unput(c);
00699                                 *ptr = '\0';
00700 
00701                                 /*
00702                                  * Look up the identifier in the keyword dictionary.
00703                                  * If its a match, return the keyword's #.  In the case
00704                                  * of show, call handle_show to do more processing.
00705                                  * If not a match, treat as a variable name.
00706                                  */
00707                                 binding = int_dictionary_Lookup(keyword_dict, varname);
00708                                 if (!binding) {
00709                                         yylval.text = string_Copy(varname);
00710                                         return (VARNAME);
00711                                 }
00712                                 if (binding->value == SHOW)
00713                                         return (handle_show());
00714                                 else
00715                                         return (binding->value);
00716 
00717                                 /*
00718                                  * Handle "${identifier}".  Note that $ followed by a
00719                                  * non-identifier character is not the start of any valid token.
00720                                  */
00721                         case '$':
00722                                 c = input();
00723                                 if (!is_identifier_char(c))
00724                                         return (ERROR);
00725 
00726                                 /*
00727                                  * Read in the first MAX_IDENTIFIER_LENGTH characters of the
00728                                  * identifier into varname null terminated.  Eat
00729                                  * the rest of the characters of the identifier:
00730                                  */
00731                                 for (ptr = varname;;) {
00732                                         if (ptr < varname + MAX_IDENTIFIER_LENGTH)
00733                                                 *(ptr++) = c;
00734                                         c = input();
00735                                         if (!is_identifier_char(c))
00736                                                 break;
00737                                 }
00738                                 unput(c);
00739                                 *ptr = '\0';
00740 
00741                                 yylval.text = string_Copy(varname);
00742                                 return (VARREF);
00743 
00744                                 /*
00745                                  * Handle constant strings:
00746                                  */
00747                         case '"':
00748                                 if ((yylval.text = eat_string(yylineno)))
00749                                         return (STRING);
00750                                 else
00751                                         return (ERROR);
00752 
00753                                 /*
00754                                  * All other characters do not start valid tokens:
00755                                  */
00756                         default:
00757                                 return (ERROR);
00758                 }
00759         }
00760 }
Jabber WindowGram Client (JWGC)

lexer.c