home *** CD-ROM | disk | FTP | other *** search
- /* (skel)lex.c
-
- When run through lr1p with a suitable grammar table file,
- this handles all source file opens and access, service defines,
- skip blanks and comments, fetch characters. The external
- interface is essentially one of initialization, then calling
- NEXT_TOKEN and READTOKEN (from the parser, mostly), then
- closing the lexical interface.
- Please note that certain conventions are observed in recognizing
- the special tokens <identifier>, <real>, and <string>. See the
- comments in 'get_symbol', 'get_number' and 'get_string'.
- Define CMODE to cause strings and comments to be handled ala C.
- The default is Pascal mode.
-
- If this file has been extensively modified, we suggest compiling it
- with option -DTEST, which will yield a standalone version. Pass
- it a sample source file and it will report all the tokens seen
- by name.
-
- */
-
- #include <stdio.h>
- #include <ctype.h>
- #include <string.h>
- #include <math.h>
- #include "decl.h"
-
- /* Definition of an EOF as a character in a scanned line */
- #define EOFCH 4
-
- /* Definition of the character for start of a comment field
- when <eol> is used in the grammar */
- #define COMMENT_START ';'
-
- /* Definition of the character that opens and closes a STRING and a
- CHARACTER in C or Pascal */
- #ifdef CMODE
- # define STRING_CHAR '"'
- # define CHAR_QUOTE '\''
- #else /* Pascal mode */
- # define STRING_CHAR '\''
- # define CHAR_QUOTE '\''
- #endif
-
- /* Definition of break character in a STRING */
- #define BREAK_CHAR '\\'
-
- /* Number of errors tolerated before quitting */
- #define MAXERRORS 10
-
- /* Maximum length of any input line */
- #define MAXLINELEN 256
- static char line[MAXLINELEN];
-
- static int lx, err_pos= 0, linenum= 0, show_source= 0;
- int errors= 0, err_count= 0;
-
- /* Source file */
- static FILE *sfile;
- static char ch;
- int tokenx= 0;
- int tokary[2];
-
- int token= 0;
- semrectype *lsemp, *lsempary[2];
-
- /* These limits apply to any particular string or symbol.
- They do not allocate permanent space */
- #define MAXSTRINGLEN 256
- #define MAXSYMLEN 72
-
- /* This is only used internally to skip over debug characters */
- #define DEBUG_TOKX -1
-
- /* ................. */
- static void getline()
- /* read the next source line from sfile, when nextch exhausts
- the current one. */
- {
- if (prompt_len) {
- printf("> ");
- fflush(stdout);
- }
- if (fgets(line, MAXLINELEN, sfile)) {
- /* introduces a \n character at the line end */
- if (show_source) printf("%s", line);
- linenum++;
- ch= *line;
- }
- else { /* end of file */
- if (show_source) printf("EOF\n");
- line[0]= EOFCH;
- line[1]= '\0';
- ch= EOFCH;
- }
- lx= 1;
- }
-
- /* lex is the first utility to use these, so we define them here */
-
- /* .......... */
- void readline(line, maxlen)
- char *line;
- int maxlen;
- /* reads a line from stdin */
- { char *lp;
-
- #ifdef UNIX
- read(0, line, maxlen);
- if (*line=='\0') *line= '\n';
- #else /* Microsoft on PC */
- fgets(line, maxlen, stdin);
- #endif
- lp= strchr(line, '\n');
- *lp= '\0';
- }
-
- /* ................ */
- char to_upper(ch)
- char ch;
- {
- if (islower(ch)) return toupper(ch);
- else return ch;
- }
-
- /* ........... */
- static int write_anychar(ch)
- char ch;
- {
- if (ch >= ' ' && ch <= '~') {
- printf("%c", ch);
- return 1;
- }
- else {
- char msg[20];
-
- sprintf(msg, "<%d>", (int) ch);
- printf(msg);
- return strlen(msg);
- }
- }
-
- /* ................ */
- static void report_line(pos, line, lstart)
- int pos, /* position of mark in 'line' */
- lstart; /* where printing of 'line' starts in current display line */
- char *line;
- { char *lp;
- int lpos= 0, newpos= -1;
-
- lp= line;
- while (*lp) {
- if (pos == lp-line) newpos= lpos;
- lpos += write_anychar(*(lp++));
- }
- printf("\n");
- if (newpos < 0) newpos= lpos;
- printf("%*s^\n", newpos+lstart, "");
- }
-
- /* ................ */
- void report_err(msg)
- char *msg;
- {
- printf("%s\nline %3d: ", msg, linenum);
- report_line(err_pos, line, strlen("line ddd: "));
- }
-
- #define ABORT_MSG "FATAL ERROR: "
-
- /* ................. */
- void abort_trap(msg)
- char *msg;
- { char *newmsg= (char *) malloc(strlen(msg) + strlen(ABORT_MSG) + 1);
-
- sprintf(newmsg, "%s%s", ABORT_MSG, msg);
- report_err(newmsg);
- printf("quitting...\n");
- exit(1);
- }
-
- #define ERROR_MSG "ERROR: "
-
- /* ................ */
- void error(msg)
- char *msg;
- { char *newmsg= (char *) malloc(strlen(msg) + strlen(ERROR_MSG) + 1);
- char rch;
-
- sprintf(newmsg, "%s%s", ERROR_MSG, msg);
- report_err(newmsg);
- errors++;
- if (errors>MAXERRORS)
- abort_trap("Error limit exceeded");
- else {
- rch= resp("...continue? ");
- if (!(rch=='y' || rch=='Y')) exit(1);
- }
- }
-
- #define WARN_MSG "WARNING: "
-
- /* ................. */
- void warn(msg)
- char *msg;
- { char *newmsg= (char *) malloc(strlen(msg) + strlen(WARN_MSG) + 1);
-
- sprintf(newmsg, "%s%s", WARN_MSG, msg);
- report_err(newmsg);
- }
-
- /* ................. */
- void symerror(sym, msg)
- char *sym, *msg;
- { char *newmsg= (char *) malloc(strlen(sym) + strlen(msg) + 3);
-
- sprintf(newmsg, "%s: %s", sym, msg);
- report_err(newmsg);
- }
-
- /* ............... */
- static char peekch()
- { /* return character past current one
- without advancing the read head */
- return line[lx];
- }
-
- /* ................. */
- static void nextch()
- /* put next character in CH, and advance read head */
- {
- if (ch) {
- if (ch==EOFCH) return;
- ch= line[lx++];
- }
- else getline(); /* end of the line */
- }
-
- /* ................. */
- static void backch()
- /* positions to previous character in line */
- {
- if (lx > 1)
- ch= line[(--lx)-1];
- else ch=' ';
- }
-
- {## copy(not eolseen) ##}
- /* when <eol> has NOT appeared in the grammar */
-
- #ifdef CMODE /* C-style comments */
-
- /* .................. */
- void skipblanks()
- /* This considers slash-star as an open comment and star-slash
- as a close-comment; comments may run over multiple lines. */
- {
- while (1) {
- if (ch == '\0') nextch();
- else if (ch == '/' && peekch()=='*') { /* open a comment */
- while (!(ch == '*' && peekch()=='/') &&
- ch != EOFCH) nextch();
- if (ch == EOFCH)
- error("unclosed comment");
- else
- nextch();
- }
- else if (!(ch==' ' || ch=='\t' || ch=='\n')) break;
- else nextch();
- }
- }
- #else /* Pascal comments */
-
- /* .................. */
- void skipblanks()
- /* This considers left brace as an open comment and right brace
- as a close-comment; comments may run over multiple lines. */
- {
- while (1) {
- if (ch == '\0') nextch();
- else if (ch == '{') { /* open a comment */
- while (ch != '}' && ch != EOFCH) nextch();
- if (ch == EOFCH)
- error("unclosed comment");
- else
- nextch();
- }
- else if (!(ch==' ' || ch=='\t' || ch=='\n')) break;
- else nextch();
- }
- }
- #endif
-
- {## copy(eolseen) ##}
- /* when <eol> HAS appeared in the grammar */
- /* .................. */
- void skipblanks()
- /* This version of skipblanks treats everything from COMMENT_START to the
- end of a line as a comment. */
- {
- while (1) {
- if (ch == '\0') nextch();
- else if (ch == COMMENT_START) {
- while (ch != '\n') nextch();
- break; /* next character is an EOL */
- }
- else if (!(ch == ' ' || ch == '\t')) break;
- else nextch();
- }
- }
-
- {## ##}
-
- /* .............. */
- static void get_symbol()
- { char symbol[MAXSYMLEN+1], *sp;
- symtabtype *stp;
-
- sp= symbol;
- /* Keep grabbing identifier characters.
- These follow C standard, except that an identifier cannot
- start with underbar,
- and all letters are upshifted. */
- while (1) {
- if (isalpha(ch)) *(sp++)= to_upper(ch);
- else if (isdigit(ch)) *(sp++)= ch;
- else if (ch=='_') *(sp++)= ch;
- else break;
- nextch();
- }
- *sp= '\0';
-
- /* makesym allocates a copy of symbol from the heap unless
- the thing is alread in the symtol table */
- stp = makesym(symbol, USER);
- if (stp->symt==RESERVED)
- /* a reserved keyword */
- token = stp->usym.tokval;
- else {
- lsemp= new_sem(IDENT, stp->symt);
- lsemp->usem.symp = stp;
- token = IDENT_TOKX;
- }
- }
-
- /* ............. */
- static long int get_integer()
- /* interpret a non-null sequence of digits as an integer. */
- { long int v= 0, sign= 1;
-
- if (ch=='+') nextch();
- else if (ch=='-') {
- nextch();
- sign= -1;
- }
- while (isdigit(ch)) {
- v = 10*v + ch - '0';
- nextch();
- }
- return v*sign;
- }
-
- /*................*/
- static double get_fraction()
- { double v= 0.0, p= 0.1;
-
- while (isdigit(ch)) {
- v = v + p*(ch-'0');
- p = p/10.0;
- nextch();
- }
- return v;
- }
-
- /* ............... */
- static double pwr10(exp)
- long int exp;
- {
- return pow((double) 10.0, (double) exp);
- }
-
- /* ............... */
- static void get_number()
- /* Accepts a literal integer, decimal or real number.
- The legal forms are defined by this grammar:
-
- Number -> Integer
- -> Decimal
- -> Integer Exponent
- -> Decimal Exponent
- Decimal -> Integer .
- -> Integer . Integer
- Exponent -> ExpTag Sign Integer
- ExpTag -> E
- -> e
- -> L
- -> l
- Sign -> +
- -> -
- -> <empty>
-
- Note that an initial sign (+/-) is not recognized. All the
- characters must be run together without spaces on the same line.
- A float is assembled as a 'double' precision value.
- An integer is assembled as a 'long' value.
- The conversion method may fail for certain extreme values.
- The C forms '0xNNN' and '0NNN' for hex and octal are not supported,
- but are easily added.
- */
-
- { long int v1;
- double rv;
-
- v1 = get_integer();
- if ((ch=='.')) {
- /* real number */
- nextch();
- rv = v1 + get_fraction();
- if (ch=='e' || ch=='E' || ch=='l' || ch=='L') {
- nextch();
- rv= rv * pwr10(get_integer());
- }
- token= REAL_TOKX;
- lsemp= new_sem(FLOAT, INTVAR);
- lsemp->usem.rval= rv;
- }
- else if (ch=='e' || ch=='E' || ch=='l' || ch=='L') {
- /* integer followed by exponent part */
- nextch();
- rv= v1 * pwr10(get_integer());
- token= REAL_TOKX;
- lsemp= new_sem(FLOAT, INTVAR);
- lsemp->usem.rval= rv;
- }
- else {
- token = INT_TOKX;
- lsemp=new_sem(FIXED, INTVAR);
- lsemp->usem.numval = v1;
- }
- } /* get_number */
-
- #ifdef CMODE
-
- /* ............... */
- static void get_string()
- /* Scans a string, allocating space for it, returning the pointer.
- The string follows C conventions, opening with the double quote
- mark ". An embedded double quote is represented as '\"'.
- Other embedded C characters, i.e. \n, etc. are not recognized
- in this simplified scanner. See directory CGRAM for a full
- C lexical analyzer, with extensions for numbers as well as
- strings -- cgram\skellex.c can be used instead of this file */
- { char tstring[MAXSTRINGLEN+1], *tp;
-
- nextch(); /* get past the first quote mark */
- lsemp= new_sem(STRNG, STRVAR);
- tp= tstring;
- while (1) {
- while (ch &&
- !(ch==EOFCH ||
- ch==STRING_CHAR ||
- ch==BREAK_CHAR)) {
- *(tp++)= ch;
- nextch();
- }
- if (ch == STRING_CHAR) {
- nextch();
- break; /* has to be the end */
- }
- else if (ch == BREAK_CHAR) {
- nextch();
- switch (ch) {
- /* a few are provided as an example --
- add other C break characters here as needed */
- case BREAK_CHAR: /* these two just get echoed */
- case STRING_CHAR:
- *(tp++)= ch;
- nextch();
- break;
- case 'n':
- *(tp++)= '\n';
- nextch();
- break;
- default:
- error("unrecognized \ option in string");
- nextch();
- break;
- }
- }
- else error("unterminated string within line or define");
- }
- *tp= '\0';
- lsemp->usem.strx= (char *) malloc(tp - tstring + 1);
- strcpy(lsemp->usem.strx, tstring);
- token = STR_TOKX;
- }
-
- /* ............... */
- static int get_char()
- /* Scans a C character, returning the int value as a SIGNED
- char -127 .. 128.
- +++ This ONLY recognizes simple character forms. See
- function skellex.c under the CGRAM directory for a complete
- recognizer of C character forms
- */
- { int charvalue;
- unsigned char tstring[5], *tp;
-
- nextch(); /* get past the first quote mark */
- if (ch == '\\') {
- nextch(); /* permits \\ and \' as characters */
- charvalue= (int) ch;
- }
- else charvalue= (int) ch;
- nextch(); /* get over the character in quotes */
- if (ch != '\'') error("expecting character quote");
- else nextch(); /* skip the terminating quote */
- lsemp= new_sem(CHAR, CHART);
- lsemp->usem.numval= charvalue;
- token = CHAR_TOKX;
- }
-
- #else /* alternative is PASCAL mode. A character and a string
- are equivalent, except that a character is a single-char string */
-
- /* ............... */
- static void get_string()
- /* Scans a string, allocating space for it, returning the pointer.
- The string follows PASCAL conventions, opening with the single quote
- mark '. An embedded double quote is represented as ''.
- */
- { char tstring[MAXSTRINGLEN+1], *tp;
-
- nextch(); /* get past the first quote mark */
- lsemp= new_sem(STRNG, STRVAR);
- tp= tstring;
- while (1) {
- while (ch &&
- !(ch==EOFCH ||
- ch=='\'')) {
- *(tp++)= ch;
- nextch();
- }
- if (ch == '\'') { /* duplicated quote? */
- nextch();
- if (ch=='\'') { /* yes, a duplicated quote */
- *(tp++)= ch;
- nextch(); /* then keep reading */
- }
- else break; /* this is the string end */
- }
- else error("unterminated string within line or define");
- }
- *tp= '\0';
- if (tp-tstring!=1) {
- lsemp->usem.strx= (char *) malloc(tp - tstring + 1);
- strcpy(lsemp->usem.strx, tstring);
- /* use this in case anyone cares about the distinction */
- token = STR_TOKX;
- }
- else {
- lsemp->usem.numval= *tstring;
- token= CHAR_TOKX;
- }
- }
-
- #endif
-
- /* ..................... */
- static int get_special()
- { /*This recognizes all those non-alphanumeric tokens that
- are such that the first character is a prefix of
- some other token. These are the `is_mult_char' tokens.
- The strategy is to collect all the characters that
- can follow the first character of such tokens, then
- search for the resulting string in the symbol
- table. The set SPECIAL_FOLSET is formed from the
- but-first characters of this token class.
- This class is loaded into the symbol table in the
- INITTABLES procedure.*/
- char symbol[MAXSYMLEN+1], *sp;
- symtabtype *symp;
-
- sp= symbol;
- *(sp++)= ch; /* this is the character sent into this thing */
- nextch();
- token= STOP_TOKX; /* this is just in case */
- {## var K, UD: integer;
-
- {This generates a while loop to look for characters in the
- 'mult_char_fol' set}
- begin
- indent:=2;
- ud:=udim(mult_char_fol);
- if ud >= ldim(mult_char_fol) then begin
- write('while (');
- for k:=ldim(mult_char_fol) to ud do begin
- write('ch==', qcharacter(mult_char_fol[k]));
- if k<ud then write(' || ');
- end;
- writeln(') {');
- writeln(' *(sp++)= ch;');
- writeln(' nextch();');
- writeln(' }');
- end
- end; ##}
-
- *sp='\0'; /* terminate the string formed in the loop */
- while (1) { /* if we can't find the string, reduce it by one
- character at a time until we can find it */
- symp= findsym(symbol);
- if (symp==NULL ||
- symp->symt!=RESERVED) {
- if (sp - symbol >= 2) {
- *(--sp)= '\0'; /* backup one character */
- backch();
- }
- else return 0;
- }
- else { /*should be a long non-alphanumeric token*/
- token= symp->usym.tokval;
- return 1;
- }
- }
- }
-
- /* .................. */
- static void get_token()
- /* C-style lexical analyzer -- sets TOKEN to token number */
- {
- lsemp= NULL; /* default case */
- skipblanks();
- err_pos= lx-1;
- if (isalpha(ch)) get_symbol();
- else if (isdigit(ch)) get_number();
- else switch (ch) {
- case STRING_CHAR: /* C-style string, i.e. "a string" */
- get_string();
- break;
- #if CHAR_QUOTE != STRING_CHAR
- /* a C-style character recognizer, i.e. '\n' */
- case CHAR_QUOTE:
- get_char();
- break;
- #endif
-
- #if DEBUG == 1
- case DEBUG_CHAR: {
- debug_level= 2;
- nextch();
- get_token();
- break;
- }
- #endif
-
- {## {the following generates inline case transfers for
- singlet tokens -- those not starting with an alphanumeric, and
- of length = 1}
- var K, L, U: integer;
- begin
- indent:= 4;
- for k:=1 to term_toks do begin
- if is_singlet[k] then begin {these can be recognized instantly
- as a single character}
- writeln('case ', qcharacter(tokens[k]), ': token=', k,
- '; nextch(); break;');
- end
- end;
-
- {the following generates inline case transfers for the
- multiplet tokens -- those not starting with an alphanumeric, and
- of length > 1 }
- l:=ldim(mult_char_pfx);
- u:=udim(mult_char_pfx);
- if u>=l then begin {don't do anything if there's no need}
- for k:=l to u do
- writeln('case ', qcharacter(mult_char_pfx[k]), ':');
- copy(true);
- end
- else copy(false);
- end;
- ##}
- if (!get_special()) {
- error("illegal token");
- get_token(); /* try again */
- };
- break;
- {## ##}
- default:
- if (ch == EOFCH) token = STOP_TOKX;
- else if (ch == '\n') {
- nextch();
- {## copy(eolseen) ##}
- token = EOL_TOKX; /* accept an end-of-line token */
- {## copy(not eolseen) ##}
- get_token(); /* go find another (significant) character */
- {## ##}
- }
- else {
- error("illegal character");
- nextch();
- get_token(); /* try again */
- }
- break;
- } /* end switch */
- if (err_count>0) err_count--;
- } /* get_token */
-
- /* ................. */
- void next_token()
- {
- if (tokenx>1) {
- tokenx = 1;
- get_token(); /* goes into token, lsemp */
- while (token==DEBUG_TOKX) get_token();
- tokary[1] = token;
- lsempary[1] = lsemp;
- }
- else { /*tokenx== 0 or 1*/
- /* is in tokary */
- token = tokary[tokenx];
- lsemp = lsempary[tokenx];
- }
- }
-
- /* ...............*/
- void tokenread()
- {
- tokenx++;
- }
-
- /*................*/
- static void putsym(str, tv)
- char *str;
- int tv;
- { symtabtype *symp= makesym(str, RESERVED);
-
- symp->usym.tokval=tv;
- }
-
- /* .................. */
- static void init_lex()
- {
- lsempary[0]=NULL;
- lsempary[1]=NULL;
- lsemp = NULL;
- tokenx=2;
- getline();
- }
-
- /*.................*/
- void inittables()
- {
- init_sym(); /*initialize symbol table*/
- clevel= -1;
- {##
- { generate a list of 'putsym' calls that associate each
- token name with a token number }
- var K: integer;
- begin
- indent:= 2;
- for k:=1 to term_toks do
- if not is_wild[k] and
- (like_ident[k] or is_mult_char[k]) then
- writeln('putsym(', qstring(tokens[k]), ', ', k, ');');
- end;
- ##}
- clevel= 0; /*goes to 0 for global level*/
- }
-
- /* ............... */
- int open_lex(fname, show_src)
- char *fname;
- int show_src;
- {
- if (strcmp(fname, "stdin")==0) sfile= stdin;
- else if ((sfile= fopen(fname, "r"))==NULL) {
- printf("Unable to open %s\n", fname);
- return 0;
- }
- show_source= show_src;
- init_lex();
- return 1;
- }
-
- /* ............... */
- void close_lex()
- {
- if (sfile != stdin) fclose(sfile);
- }
-
- #if DEBUG == 1
-
- /* The following is used by the debugger to describe tokens and
- lexical states */
-
- /* ............. */
- void show_lex()
- {
- if (lx > 0) report_line(lx-1, line, 0);
- else report_line(0, line, 0);
- printf("Next character CH= ");
- write_anychar(ch);
- printf(", current token= %s\n",
- (token >= 1 && token <= ALL_TOKS ? tokstring[token] : "???"));
- }
-
- #endif
-
- #ifdef TEST
-
- /* When developing a new lexical analyzer through extensive modification
- of this code, we recommend compiling with -DTEST, which makes this
- a standalone program. Run it with a source file to
- test the lexical analyzer.
- */
-
- /* .......... */
- main(argc, argv)
- int argc;
- char *argv[];
- { /* takes some file as arg 1, sends out copied strings and token lists */
-
- if (argc != 2) {
- printf("Usage: lex filename\n");
- exit(1);
- }
- if (open_lex(argv[1])) {
- while (1) {
- next_token();
- if (token >= 1 && token <= TERM_TOKS)
- printf("\ntoken= %s", tokstring[token]);
- else printf("\nUNRECOGNIZABLE TOKEN: %d", token);
- if (token==STOP_TOKX) break;
- }
- }
- exit(0);
- }
-
- #endif
-
-