The Education Master 1994 (4th Edition)

home *** CD-ROM | disk | FTP | other *** search

/ The Education Master 1994 (4th Edition) / EDUCATIONS_MASTER_4TH_EDITION.bin / files / progmisc / qparser2 / cskels / skellex.c < prev next >

Wrap

C/C++ Source or Header | 1993-11-07 | 21.1 KB | 843 lines

/* (skel)lex.c When run through lr1p with a suitable grammar table file, this handles all source file opens and access, service defines, skip blanks and comments, fetch characters. The external interface is essentially one of initialization, then calling NEXT_TOKEN and READTOKEN (from the parser, mostly), then closing the lexical interface. Please note that certain conventions are observed in recognizing the special tokens <identifier>, <real>, and <string>. See the comments in 'get_symbol', 'get_number' and 'get_string'. Define CMODE to cause strings and comments to be handled ala C. The default is Pascal mode. If this file has been extensively modified, we suggest compiling it with option -DTEST, which will yield a standalone version. Pass it a sample source file and it will report all the tokens seen by name. */ #include <stdio.h> #include <ctype.h> #include <string.h> #include <math.h> #include "decl.h" /* Definition of an EOF as a character in a scanned line */ #define EOFCH 4 /* Definition of the character for start of a comment field when <eol> is used in the grammar */ #define COMMENT_START ';' /* Definition of the character that opens and closes a STRING and a CHARACTER in C or Pascal */ #ifdef CMODE # define STRING_CHAR '"' # define CHAR_QUOTE '\'' #else /* Pascal mode */ # define STRING_CHAR '\'' # define CHAR_QUOTE '\'' #endif /* Definition of break character in a STRING */ #define BREAK_CHAR '\\' /* Number of errors tolerated before quitting */ #define MAXERRORS 10 /* Maximum length of any input line */ #define MAXLINELEN 256 static char line[MAXLINELEN]; static int lx, err_pos= 0, linenum= 0, show_source= 0; int errors= 0, err_count= 0; /* Source file */ static FILE *sfile; static char ch; int tokenx= 0; int tokary[2]; int token= 0; semrectype *lsemp, *lsempary[2]; /* These limits apply to any particular string or symbol. They do not allocate permanent space */ #define MAXSTRINGLEN 256 #define MAXSYMLEN 72 /* This is only used internally to skip over debug characters */ #define DEBUG_TOKX -1 /* ................. */ static void getline() /* read the next source line from sfile, when nextch exhausts the current one. */ { if (prompt_len) { printf("> "); fflush(stdout); } if (fgets(line, MAXLINELEN, sfile)) { /* introduces a \n character at the line end */ if (show_source) printf("%s", line); linenum++; ch= *line; } else { /* end of file */ if (show_source) printf("EOF\n"); line[0]= EOFCH; line[1]= '\0'; ch= EOFCH; } lx= 1; } /* lex is the first utility to use these, so we define them here */ /* .......... */ void readline(line, maxlen) char *line; int maxlen; /* reads a line from stdin */ { char *lp; #ifdef UNIX read(0, line, maxlen); if (*line=='\0') *line= '\n'; #else /* Microsoft on PC */ fgets(line, maxlen, stdin); #endif lp= strchr(line, '\n'); *lp= '\0'; } /* ................ */ char to_upper(ch) char ch; { if (islower(ch)) return toupper(ch); else return ch; } /* ........... */ static int write_anychar(ch) char ch; { if (ch >= ' ' && ch <= '~') { printf("%c", ch); return 1; } else { char msg[20]; sprintf(msg, "<%d>", (int) ch); printf(msg); return strlen(msg); } } /* ................ */ static void report_line(pos, line, lstart) int pos, /* position of mark in 'line' */ lstart; /* where printing of 'line' starts in current display line */ char *line; { char *lp; int lpos= 0, newpos= -1; lp= line; while (*lp) { if (pos == lp-line) newpos= lpos; lpos += write_anychar(*(lp++)); } printf("\n"); if (newpos < 0) newpos= lpos; printf("%*s^\n", newpos+lstart, ""); } /* ................ */ void report_err(msg) char *msg; { printf("%s\nline %3d: ", msg, linenum); report_line(err_pos, line, strlen("line ddd: ")); } #define ABORT_MSG "FATAL ERROR: " /* ................. */ void abort_trap(msg) char *msg; { char *newmsg= (char *) malloc(strlen(msg) + strlen(ABORT_MSG) + 1); sprintf(newmsg, "%s%s", ABORT_MSG, msg); report_err(newmsg); printf("quitting...\n"); exit(1); } #define ERROR_MSG "ERROR: " /* ................ */ void error(msg) char *msg; { char *newmsg= (char *) malloc(strlen(msg) + strlen(ERROR_MSG) + 1); char rch; sprintf(newmsg, "%s%s", ERROR_MSG, msg); report_err(newmsg); errors++; if (errors>MAXERRORS) abort_trap("Error limit exceeded"); else { rch= resp("...continue? "); if (!(rch=='y' || rch=='Y')) exit(1); } } #define WARN_MSG "WARNING: " /* ................. */ void warn(msg) char *msg; { char *newmsg= (char *) malloc(strlen(msg) + strlen(WARN_MSG) + 1); sprintf(newmsg, "%s%s", WARN_MSG, msg); report_err(newmsg); } /* ................. */ void symerror(sym, msg) char *sym, *msg; { char *newmsg= (char *) malloc(strlen(sym) + strlen(msg) + 3); sprintf(newmsg, "%s: %s", sym, msg); report_err(newmsg); } /* ............... */ static char peekch() { /* return character past current one without advancing the read head */ return line[lx]; } /* ................. */ static void nextch() /* put next character in CH, and advance read head */ { if (ch) { if (ch==EOFCH) return; ch= line[lx++]; } else getline(); /* end of the line */ } /* ................. */ static void backch() /* positions to previous character in line */ { if (lx > 1) ch= line[(--lx)-1]; else ch=' '; } {## copy(not eolseen) ##} /* when <eol> has NOT appeared in the grammar */ #ifdef CMODE /* C-style comments */ /* .................. */ void skipblanks() /* This considers slash-star as an open comment and star-slash as a close-comment; comments may run over multiple lines. */ { while (1) { if (ch == '\0') nextch(); else if (ch == '/' && peekch()=='*') { /* open a comment */ while (!(ch == '*' && peekch()=='/') && ch != EOFCH) nextch(); if (ch == EOFCH) error("unclosed comment"); else nextch(); } else if (!(ch==' ' || ch=='\t' || ch=='\n')) break; else nextch(); } } #else /* Pascal comments */ /* .................. */ void skipblanks() /* This considers left brace as an open comment and right brace as a close-comment; comments may run over multiple lines. */ { while (1) { if (ch == '\0') nextch(); else if (ch == '{') { /* open a comment */ while (ch != '}' && ch != EOFCH) nextch(); if (ch == EOFCH) error("unclosed comment"); else nextch(); } else if (!(ch==' ' || ch=='\t' || ch=='\n')) break; else nextch(); } } #endif {## copy(eolseen) ##} /* when <eol> HAS appeared in the grammar */ /* .................. */ void skipblanks() /* This version of skipblanks treats everything from COMMENT_START to the end of a line as a comment. */ { while (1) { if (ch == '\0') nextch(); else if (ch == COMMENT_START) { while (ch != '\n') nextch(); break; /* next character is an EOL */ } else if (!(ch == ' ' || ch == '\t')) break; else nextch(); } } {## ##} /* .............. */ static void get_symbol() { char symbol[MAXSYMLEN+1], *sp; symtabtype *stp; sp= symbol; /* Keep grabbing identifier characters. These follow C standard, except that an identifier cannot start with underbar, and all letters are upshifted. */ while (1) { if (isalpha(ch)) *(sp++)= to_upper(ch); else if (isdigit(ch)) *(sp++)= ch; else if (ch=='_') *(sp++)= ch; else break; nextch(); } *sp= '\0'; /* makesym allocates a copy of symbol from the heap unless the thing is alread in the symtol table */ stp = makesym(symbol, USER); if (stp->symt==RESERVED) /* a reserved keyword */ token = stp->usym.tokval; else { lsemp= new_sem(IDENT, stp->symt); lsemp->usem.symp = stp; token = IDENT_TOKX; } } /* ............. */ static long int get_integer() /* interpret a non-null sequence of digits as an integer. */ { long int v= 0, sign= 1; if (ch=='+') nextch(); else if (ch=='-') { nextch(); sign= -1; } while (isdigit(ch)) { v = 10*v + ch - '0'; nextch(); } return v*sign; } /*................*/ static double get_fraction() { double v= 0.0, p= 0.1; while (isdigit(ch)) { v = v + p*(ch-'0'); p = p/10.0; nextch(); } return v; } /* ............... */ static double pwr10(exp) long int exp; { return pow((double) 10.0, (double) exp); } /* ............... */ static void get_number() /* Accepts a literal integer, decimal or real number. The legal forms are defined by this grammar: Number -> Integer -> Decimal -> Integer Exponent -> Decimal Exponent Decimal -> Integer . -> Integer . Integer Exponent -> ExpTag Sign Integer ExpTag -> E -> e -> L -> l Sign -> + -> - -> <empty> Note that an initial sign (+/-) is not recognized. All the characters must be run together without spaces on the same line. A float is assembled as a 'double' precision value. An integer is assembled as a 'long' value. The conversion method may fail for certain extreme values. The C forms '0xNNN' and '0NNN' for hex and octal are not supported, but are easily added. */ { long int v1; double rv; v1 = get_integer(); if ((ch=='.')) { /* real number */ nextch(); rv = v1 + get_fraction(); if (ch=='e' || ch=='E' || ch=='l' || ch=='L') { nextch(); rv= rv * pwr10(get_integer()); } token= REAL_TOKX; lsemp= new_sem(FLOAT, INTVAR); lsemp->usem.rval= rv; } else if (ch=='e' || ch=='E' || ch=='l' || ch=='L') { /* integer followed by exponent part */ nextch(); rv= v1 * pwr10(get_integer()); token= REAL_TOKX; lsemp= new_sem(FLOAT, INTVAR); lsemp->usem.rval= rv; } else { token = INT_TOKX; lsemp=new_sem(FIXED, INTVAR); lsemp->usem.numval = v1; } } /* get_number */ #ifdef CMODE /* ............... */ static void get_string() /* Scans a string, allocating space for it, returning the pointer. The string follows C conventions, opening with the double quote mark ". An embedded double quote is represented as '\"'. Other embedded C characters, i.e. \n, etc. are not recognized in this simplified scanner. See directory CGRAM for a full C lexical analyzer, with extensions for numbers as well as strings -- cgram\skellex.c can be used instead of this file */ { char tstring[MAXSTRINGLEN+1], *tp; nextch(); /* get past the first quote mark */ lsemp= new_sem(STRNG, STRVAR); tp= tstring; while (1) { while (ch && !(ch==EOFCH || ch==STRING_CHAR || ch==BREAK_CHAR)) { *(tp++)= ch; nextch(); } if (ch == STRING_CHAR) { nextch(); break; /* has to be the end */ } else if (ch == BREAK_CHAR) { nextch(); switch (ch) { /* a few are provided as an example -- add other C break characters here as needed */ case BREAK_CHAR: /* these two just get echoed */ case STRING_CHAR: *(tp++)= ch; nextch(); break; case 'n': *(tp++)= '\n'; nextch(); break; default: error("unrecognized \ option in string"); nextch(); break; } } else error("unterminated string within line or define"); } *tp= '\0'; lsemp->usem.strx= (char *) malloc(tp - tstring + 1); strcpy(lsemp->usem.strx, tstring); token = STR_TOKX; } /* ............... */ static int get_char() /* Scans a C character, returning the int value as a SIGNED char -127 .. 128. +++ This ONLY recognizes simple character forms. See function skellex.c under the CGRAM directory for a complete recognizer of C character forms */ { int charvalue; unsigned char tstring[5], *tp; nextch(); /* get past the first quote mark */ if (ch == '\\') { nextch(); /* permits \\ and \' as characters */ charvalue= (int) ch; } else charvalue= (int) ch; nextch(); /* get over the character in quotes */ if (ch != '\'') error("expecting character quote"); else nextch(); /* skip the terminating quote */ lsemp= new_sem(CHAR, CHART); lsemp->usem.numval= charvalue; token = CHAR_TOKX; } #else /* alternative is PASCAL mode. A character and a string are equivalent, except that a character is a single-char string */ /* ............... */ static void get_string() /* Scans a string, allocating space for it, returning the pointer. The string follows PASCAL conventions, opening with the single quote mark '. An embedded double quote is represented as ''. */ { char tstring[MAXSTRINGLEN+1], *tp; nextch(); /* get past the first quote mark */ lsemp= new_sem(STRNG, STRVAR); tp= tstring; while (1) { while (ch && !(ch==EOFCH || ch=='\'')) { *(tp++)= ch; nextch(); } if (ch == '\'') { /* duplicated quote? */ nextch(); if (ch=='\'') { /* yes, a duplicated quote */ *(tp++)= ch; nextch(); /* then keep reading */ } else break; /* this is the string end */ } else error("unterminated string within line or define"); } *tp= '\0'; if (tp-tstring!=1) { lsemp->usem.strx= (char *) malloc(tp - tstring + 1); strcpy(lsemp->usem.strx, tstring); /* use this in case anyone cares about the distinction */ token = STR_TOKX; } else { lsemp->usem.numval= *tstring; token= CHAR_TOKX; } } #endif /* ..................... */ static int get_special() { /*This recognizes all those non-alphanumeric tokens that are such that the first character is a prefix of some other token. These are the `is_mult_char' tokens. The strategy is to collect all the characters that can follow the first character of such tokens, then search for the resulting string in the symbol table. The set SPECIAL_FOLSET is formed from the but-first characters of this token class. This class is loaded into the symbol table in the INITTABLES procedure.*/ char symbol[MAXSYMLEN+1], *sp; symtabtype *symp; sp= symbol; *(sp++)= ch; /* this is the character sent into this thing */ nextch(); token= STOP_TOKX; /* this is just in case */ {## var K, UD: integer; {This generates a while loop to look for characters in the 'mult_char_fol' set} begin indent:=2; ud:=udim(mult_char_fol); if ud >= ldim(mult_char_fol) then begin write('while ('); for k:=ldim(mult_char_fol) to ud do begin write('ch==', qcharacter(mult_char_fol[k])); if k<ud then write(' || '); end; writeln(') {'); writeln(' *(sp++)= ch;'); writeln(' nextch();'); writeln(' }'); end end; ##} *sp='\0'; /* terminate the string formed in the loop */ while (1) { /* if we can't find the string, reduce it by one character at a time until we can find it */ symp= findsym(symbol); if (symp==NULL || symp->symt!=RESERVED) { if (sp - symbol >= 2) { *(--sp)= '\0'; /* backup one character */ backch(); } else return 0; } else { /*should be a long non-alphanumeric token*/ token= symp->usym.tokval; return 1; } } } /* .................. */ static void get_token() /* C-style lexical analyzer -- sets TOKEN to token number */ { lsemp= NULL; /* default case */ skipblanks(); err_pos= lx-1; if (isalpha(ch)) get_symbol(); else if (isdigit(ch)) get_number(); else switch (ch) { case STRING_CHAR: /* C-style string, i.e. "a string" */ get_string(); break; #if CHAR_QUOTE != STRING_CHAR /* a C-style character recognizer, i.e. '\n' */ case CHAR_QUOTE: get_char(); break; #endif #if DEBUG == 1 case DEBUG_CHAR: { debug_level= 2; nextch(); get_token(); break; } #endif {## {the following generates inline case transfers for singlet tokens -- those not starting with an alphanumeric, and of length = 1} var K, L, U: integer; begin indent:= 4; for k:=1 to term_toks do begin if is_singlet[k] then begin {these can be recognized instantly as a single character} writeln('case ', qcharacter(tokens[k]), ': token=', k, '; nextch(); break;'); end end; {the following generates inline case transfers for the multiplet tokens -- those not starting with an alphanumeric, and of length > 1 } l:=ldim(mult_char_pfx); u:=udim(mult_char_pfx); if u>=l then begin {don't do anything if there's no need} for k:=l to u do writeln('case ', qcharacter(mult_char_pfx[k]), ':'); copy(true); end else copy(false); end; ##} if (!get_special()) { error("illegal token"); get_token(); /* try again */ }; break; {## ##} default: if (ch == EOFCH) token = STOP_TOKX; else if (ch == '\n') { nextch(); {## copy(eolseen) ##} token = EOL_TOKX; /* accept an end-of-line token */ {## copy(not eolseen) ##} get_token(); /* go find another (significant) character */ {## ##} } else { error("illegal character"); nextch(); get_token(); /* try again */ } break; } /* end switch */ if (err_count>0) err_count--; } /* get_token */ /* ................. */ void next_token() { if (tokenx>1) { tokenx = 1; get_token(); /* goes into token, lsemp */ while (token==DEBUG_TOKX) get_token(); tokary[1] = token; lsempary[1] = lsemp; } else { /*tokenx== 0 or 1*/ /* is in tokary */ token = tokary[tokenx]; lsemp = lsempary[tokenx]; } } /* ...............*/ void tokenread() { tokenx++; } /*................*/ static void putsym(str, tv) char *str; int tv; { symtabtype *symp= makesym(str, RESERVED); symp->usym.tokval=tv; } /* .................. */ static void init_lex() { lsempary[0]=NULL; lsempary[1]=NULL; lsemp = NULL; tokenx=2; getline(); } /*.................*/ void inittables() { init_sym(); /*initialize symbol table*/ clevel= -1; {## { generate a list of 'putsym' calls that associate each token name with a token number } var K: integer; begin indent:= 2; for k:=1 to term_toks do if not is_wild[k] and (like_ident[k] or is_mult_char[k]) then writeln('putsym(', qstring(tokens[k]), ', ', k, ');'); end; ##} clevel= 0; /*goes to 0 for global level*/ } /* ............... */ int open_lex(fname, show_src) char *fname; int show_src; { if (strcmp(fname, "stdin")==0) sfile= stdin; else if ((sfile= fopen(fname, "r"))==NULL) { printf("Unable to open %s\n", fname); return 0; } show_source= show_src; init_lex(); return 1; } /* ............... */ void close_lex() { if (sfile != stdin) fclose(sfile); } #if DEBUG == 1 /* The following is used by the debugger to describe tokens and lexical states */ /* ............. */ void show_lex() { if (lx > 0) report_line(lx-1, line, 0); else report_line(0, line, 0); printf("Next character CH= "); write_anychar(ch); printf(", current token= %s\n", (token >= 1 && token <= ALL_TOKS ? tokstring[token] : "???")); } #endif #ifdef TEST /* When developing a new lexical analyzer through extensive modification of this code, we recommend compiling with -DTEST, which makes this a standalone program. Run it with a source file to test the lexical analyzer. */ /* .......... */ main(argc, argv) int argc; char *argv[]; { /* takes some file as arg 1, sends out copied strings and token lists */ if (argc != 2) { printf("Usage: lex filename\n"); exit(1); } if (open_lex(argv[1])) { while (1) { next_token(); if (token >= 1 && token <= TERM_TOKS) printf("\ntoken= %s", tokstring[token]); else printf("\nUNRECOGNIZABLE TOKEN: %d", token); if (token==STOP_TOKX) break; } } exit(0); } #endif