home *** CD-ROM | disk | FTP | other *** search
- From nmyers@mntgfx.MENTOR.COM Wed Jan 14 17:35:26 1987
- Path: beno!seismo!husc6!panda!genrad!decvax!tektronix!sequent!mntgfx!nmyers
- From: nmyers@mntgfx.MENTOR.COM (Nathan Myers)
- Newsgroups: net.sources
- Subject: C++ Lexical Scanner (in C++)
- Keywords: object parse compiler cfront
- Message-ID: <431@mntgfx.MENTOR.COM>
- Date: 14 Jan 87 22:35:26 GMT
- Organization: Mentor Graphics, Beaverton OR
- Lines: 1054
-
- C++ Lexical Scanner (in C++)
-
- Following this note is a shar-format file containing a
- lexical scanner for C++. Use it however you like. It
- should be adequate to drop into a full compiler.
- For such a use, error reporting would probably need
- some refinement. Also, if the compiler were to have
- a built-in preprocessor (i.e., not using cpp), the handler
- for "#" lines would need a bit of work.
-
- I would be interested in bug reports and significant improvements.
- Nathan Myers tektronix!sequent!mntgfx!nmyers
-
- Share and enjoy!
- -------------------- cut here -----------------------------
- #! /bin/sh
- # This is a shar format file. Extract with sh, not csh.
- #
- echo "x - read.me"
- sed -e 's/^_ //' >read.me <<'%%%EOF%%%'
- _
- _ The following source files are included:
- _
- _ clex.c
- _ clex.h
- _ clex_sym.h
- _ clex_test.c
- _ kwhash.c
- _ Makefile
- _
- _ They implement a self-contained lexical scanner class for C++.
- _ It is extensible by derivation primarily in the area of
- _ processing "#" compiler directives (currently, it only
- _ interprets the "#line" construct).
- _
- _ It has one other degree of flexibility, in its handling
- _ of bracket-enclosed expressions "[]". These may be treated
- _ as a normal sequence of tokens or as delimited strings;
- _ the former is of greater use in a traditional parser, while
- _ the latter is favored for extraction of declarations by
- _ a code browser.
- _
- _ To allay some confusion, I should point out here that
- _ clex_sym.h is used in an unusual way: it is included
- _ twice in the module clex.c, once for the declaration
- _ part and once for the (static) definition part. It is
- _ built this way to keep all knowledge of keywords
- _ in a single place.
- _
- _ The file kwhash.c is a standard C standalone program used
- _ to arrive at the collision-free hash function used to
- _ recognize C++ keywords. Any new keyword stands about
- _ one chance in 3 of colliding with an existing keyword,
- _ thus requiring that a new hash function be generated.
- _
- _ The file clex_test.c compiles to a program which reads
- _ C or C++ code from standard input and emits token names
- _ on the standard output. Try it with different values
- _ in the constructor's second argument.
- %%%EOF%%%
- echo "x - Makefile"
- sed -e 's/^_ //' >Makefile <<'%%%EOF%%%'
- _ # Makefile for clex/cparse
- _ #
- _ # Builds two programs: kwhash (a plain C helper) and clex_test (the C++
- _ # scanner test driver, translated through cpp and cfront).
- _ # NOTE(review): recipe lines must begin with a TAB once extracted; the
- _ # leading whitespace appears to have been lost in this copy.
- _ # NOTE(review): CCC and cfront are under /user/... while INCLUDE is under
- _ # /usr/... -- presumably site-specific paths; adjust for your installation.
- _ INCLUDE=/usr/local/include/CC
- _ CCC=/user/local/CC
- _ CCOPTS= -g -O
- _ # -g: include debug info -O: optimize
- _
- _ all: kwhash clex_test
- _
- _ # kwhash is compiled directly with the C compiler.
- _ kwhash: kwhash.c /usr/include/stdio.h
- _ /bin/cc ${CCOPTS} -o kwhash kwhash.c
- _
- _ clex_test: clex_test.o clex.o
- _ ${CCC} -g -o clex_test clex_test.o clex.o
- _
- _ # C++ sources: preprocess with cpp, translate with cfront into $*..c,
- _ # compile that with cc, then rename the object file to $*.o.
- _ .c.o:
- _ /usr/lib/cpp -I${INCLUDE} -Dc_plusplus=1 $*.c >$*.cpp
- _ /user/mentor/local/cfront +L +f$*.c <$*.cpp >$*..c && /bin/rm $*.cpp
- _ /bin/cc -c ${CCOPTS} $*..c && /bin/mv $*..o $*.o
- _
- _ clex.o: ${INCLUDE}/stdio.h \
- _ ${INCLUDE}/stream.h \
- _ ${INCLUDE}/string.h \
- _ ${INCLUDE}/stdlib.h \
- _ ${INCLUDE}/ctype.h \
- _ ${INCLUDE}/assert.h \
- _ clex.h clex_sym.h
- _
- _ clex_test.o: ${INCLUDE}/stdio.h \
- _ clex.h clex_sym.h
- %%%EOF%%%
- echo "x - clex.h"
- sed -e 's/^_ //' >clex.h <<'%%%EOF%%%'
- _
- _
- _ #ifndef INCLUDED_CLEX
- _ #define INCLUDED_CLEX 1
- _
- _ #ifndef INCLUDED_STDIO
- _ #include <stdio.h>
- _ #endif
- _
- _ enum Boolean { FALSE, TRUE };
- _
- _ #include "clex_sym.h"
- _
- _ // Clex -- lexical scanner for C++ source read from a FILE*.
- _ // next() returns one Clex_sym per call; the text of tokens that carry
- _ // text is left in buf and is available through str()/strlen().
- _ class Clex
- _ {
- _ friend class Cparse;
- _ // Scanning context flags. The values are distinct bits; the
- _ // implementation ORs them together as scanning contexts nest.
- _ enum Clex_mode
- _ { CL_NONE=0, CL_COMMENT=1, CL_QUOTE=2, CL_POUND=4, CL_BRACK=8 };
- _
- _ protected:
- _ short look; // a one-char lookahead
- _ FILE* fp;
- _ Boolean block_brack; // if TRUE, treat contents of "[]" as a string
- _ long line_num; // line number in original source file
- _ char filename[256]; // name of original source file
- _ short bufsiz; // number of chars currently in buf
- _ char buf[256];
- _
- _ // Read the next character from fp into the lookahead.
- _ void eat_one() { look = short(getc(fp)); }
- _ // Append c to buf; silently drops c when only the terminator slot is left.
- _ void put_in_buf(char c) { if (bufsiz < sizeof(buf)-1) buf[bufsiz++] = c; }
- _ // Append the lookahead character to buf, then advance.
- _ void buf_one() { put_in_buf(look); eat_one(); }
- _ // NUL-terminate buf and pass the symbol through.
- _ Clex_sym terminate(Clex_sym s) { buf[bufsiz] = '\0'; return s; }
- _ Clex_sym eat_return(Clex_sym);
- _ Clex_sym num(char);
- _ Clex_sym ident(char);
- _ Clex_sym lbrack(Clex_mode);
- _ Clex_sym quote(char, Clex_sym, Clex_mode);
- _ void block_comment(Clex_mode);
- _ void line_comment();
- _ void eoln(Clex_mode);
- _
- _ // Handler for the text of a "#" line; virtual so that a derived
- _ // scanner can interpret additional directives.
- _ virtual Boolean pound(Clex_mode, char*, short len);
- _
- _ public:
- _ Clex_sym next(); // scan and return the next token symbol
- _ const char* str() { return buf; } // contents of the token buffer
- _ short strlen() { return bufsiz; } // number of chars in the token buffer
- _ long line_no() { return line_num; }
- _ const char* fname() { return filename; }
- _ const char* debug(Clex_sym); // printable name for a symbol
- _
- _ Clex(FILE*, Boolean block_brack);
- _ };
- _
- _ #endif
- _
- %%%EOF%%%
- echo "x - clex_sym.h"
- sed -e 's/^_ //' >clex_sym.h <<'%%%EOF%%%'
- _
- _ /* clex_sym.h:
- _ // This file defines both an enum {} type named "Clex_sym", and
- _ // a variable sym_str[] with string representations of the
- _ // symbols (keyword spellings are kept in a second array,
- _ // keywords[]). It is intended to maintain an exact
- _ // correspondence between array entries and symbol values.
- _ */
- _
- _ /*
- _ This file is #include'd twice: once for the enum
- _ (with CLEX_IMPLEMENTATION turned off) and once for
- _ the array initialization (with it turned on). The
- _ lower-numbered symbols have uppercase name strings,
- _ but the keyword symbol strings are stored separately.
- _
- _ If a keyword is to be added, add it first to the
- _ standalone program kwhash.c and generate a new
- _ perfect hash function for the new set. Then add
- _ it to both places below and modify the hash function
- _ and table size in clex.c.
- _ */
- _
- _ #ifndef CLEX_IMPLEMENTATION
- _
- _ /* Declaration pass: each entry expands to its enumerator. */
- _ #define CLEX_S(sym) sym
- _ #define CLEX_S2(sym1,sym2) sym1
- _ enum Clex_sym
- _ {
- _
- _ #else /* CLEX_IMPLEMENTATION */
- _
- _ /* Definition pass: each entry expands to a string.
- _ NOTE(review): "sym" relies on the preprocessor substituting macro
- _ arguments inside string literals; presumably an ANSI preprocessor
- _ would need #sym instead -- confirm before porting. */
- _ #undef CLEX_S
- _ #undef CLEX_S2
- _ #define CLEX_S(sym) "sym"
- _ #define CLEX_S2(sym1,sym2) sym2
- _ static char* sym_str[] =
- _ {
- _
- _ #endif /* CLEX_IMPLEMENTATION */
- _
- _ CLEX_S(NONE_S = 0), /* should never get this */
- _
- _ CLEX_S(ERROR_S),
- _ CLEX_S( ERROR_EOLN_S),
- _ CLEX_S( ERROR_EOF_S),
- _ CLEX_S( ERROR_UNKN_S),
- _ /* ERROR_MAX is an alias for ERROR_UNKN_S, so it gets no sym_str[] entry;
- _ this keeps array indices aligned with enum values. */
- _ #ifndef CLEX_IMPLEMENTATION
- _ CLEX_S(ERROR_MAX = ERROR_UNKN_S),
- _ #endif
- _
- _ CLEX_S(EOF_S),
- _ CLEX_S(EOLN_S), // \n
- _
- _ CLEX_S(BANG_S), // !
- _ CLEX_S( NE_S), // !=
- _ CLEX_S(QUOTE_S), // "
- _ CLEX_S(POUND_S), // #
- _ CLEX_S(MOD_S), // %
- _ CLEX_S( MODAS_S), // %=
- _ CLEX_S(AMPER_S), // &
- _ CLEX_S( LAND_S), // &&
- _ CLEX_S( ANDAS_S), // &=
- _ CLEX_S(APOS_S), // '
- _ CLEX_S(LPAR_S), // (
- _ CLEX_S(RPAR_S), // )
- _ CLEX_S(STAR_S), // *
- _ CLEX_S( MULAS_S), // *=
- _ CLEX_S(PLUS_S), // +
- _ CLEX_S( INCRE_S), // ++
- _ CLEX_S( ADDAS_S), // +=
- _ CLEX_S(COMMA_S), // ,
- _ CLEX_S(MINUS_S), // -
- _ CLEX_S( DECRE_S), // --
- _ CLEX_S( SUBAS_S), // -=
- _ CLEX_S( DEREF_S), // ->
- _ CLEX_S(DOT_S), // .
- _ CLEX_S( ELLIP_S), // ...
- _ CLEX_S(SLASH_S), // /
- _ CLEX_S( DIVAS_S), // /=
- _ CLEX_S(COLON_S), // :
- _ CLEX_S( SCOPE_S), // ::
- _ CLEX_S(SEMI_S), // ;
- _ CLEX_S(LT_S), // <
- _ CLEX_S( LE_S), // <=
- _ CLEX_S( SHL_S), // <<
- _ CLEX_S( SHLAS_S), // <<=
- _ CLEX_S(AS_S), // =
- _ CLEX_S( EQ_S), // ==
- _ CLEX_S(GT_S), // >
- _ CLEX_S( GE_S), // >=
- _ CLEX_S( SHR_S), // >>
- _ CLEX_S( SHRAS_S), // >>=
- _ CLEX_S(QUEST_S), // ?
- _ CLEX_S(AT_S), // @ (undefined)
- _ CLEX_S(LBRACK_S), // [
- _ CLEX_S(BSLASH_S), // \
- _ CLEX_S(RBRACK_S), // ]
- _ CLEX_S(CARET_S), // ^
- _ CLEX_S( XORAS_S), // ^=
- _ CLEX_S(GRAVE_S), // ` (undefined)
- _ CLEX_S(LBRACE_S), // {
- _ CLEX_S(VBAR_S), // |
- _ CLEX_S( LOR_S), // ||
- _ CLEX_S( ORAS_S), // |=
- _ CLEX_S(RBRACE_S), // }
- _ CLEX_S(TILDE_S), // ~
- _
- _ CLEX_S(IDENT_S), // a name, or string that could be a name
- _ CLEX_S(NUM_S), // a numeric string
- _ CLEX_S(FLOATNUM_S) // a recognizably floating-point num
- _
- _ /* In the declaration pass the enum simply continues with the keywords;
- _ in the definition pass sym_str[] is closed here and keywords[] begins. */
- _ #ifndef CLEX_IMPLEMENTATION
- _ , CLEX_S(KEYWORD_S),
- _
- _ #else
- _
- _ };
- _ static char *keywords[] =
- _ {
- _
- _ #endif
- _
- _ CLEX_S2(ASM_S = KEYWORD_S, "asm"),
- _ CLEX_S2(AUTO_S, "auto"),
- _ CLEX_S2(BREAK_S, "break"),
- _ CLEX_S2(CASE_S, "case"),
- _ CLEX_S2(CHAR_S, "char"),
- _ CLEX_S2(CLASS_S, "class"),
- _ CLEX_S2(CONST_S, "const"),
- _ CLEX_S2(CONTINUE_S, "continue"),
- _ CLEX_S2(DEFAULT_S, "default"),
- _ CLEX_S2(DELETE_S, "delete"),
- _ CLEX_S2(DO_S, "do"),
- _ CLEX_S2(DOUBLE_S, "double"),
- _ CLEX_S2(ELSE_S, "else"),
- _ CLEX_S2(ENUM_S, "enum"),
- _ CLEX_S2(EXTERN_S, "extern"),
- _ CLEX_S2(FLOAT_S, "float"),
- _ CLEX_S2(FOR_S, "for"),
- _ CLEX_S2(FRIEND_S, "friend"),
- _ CLEX_S2(GOTO_S, "goto"),
- _ CLEX_S2(IF_S, "if"),
- _ CLEX_S2(INLINE_S, "inline"),
- _ CLEX_S2(INT_S, "int"),
- _ CLEX_S2(LONG_S, "long"),
- _ CLEX_S2(NEW_S, "new"),
- _ CLEX_S2(OPERATOR_S, "operator"),
- _ CLEX_S2(OVERLOAD_S, "overload"),
- _ CLEX_S2(PRIVATE_S, "private"),
- _ CLEX_S2(PROTECTED_S,"protected"),
- _ CLEX_S2(PUBLIC_S, "public"),
- _ CLEX_S2(REGISTER_S, "register"),
- _ CLEX_S2(RETURN_S, "return"),
- _ CLEX_S2(SHORT_S, "short"),
- _ CLEX_S2(SIGNED_S, "signed"),
- _ CLEX_S2(SIZEOF_S, "sizeof"),
- _ CLEX_S2(STATIC_S, "static"),
- _ CLEX_S2(STRUCT_S, "struct"),
- _ CLEX_S2(SWITCH_S, "switch"),
- _ CLEX_S2(THIS_S, "this"),
- _ CLEX_S2(TYPEDEF_S, "typedef"),
- _ CLEX_S2(UNION_S, "union"),
- _ CLEX_S2(UNSIGNED_S, "unsigned"),
- _ CLEX_S2(VIRTUAL_S, "virtual"),
- _ CLEX_S2(VOLATILE_S, "volatile"),
- _ CLEX_S2(VOID_S, "void"),
- _ CLEX_S2(WHILE_S, "while"),
- _
- _ CLEX_S2(END_OF_SYMBOLS_S, NULL)
- _ };
- _
- _ /* Number of keyword entries, i.e. the symbols from KEYWORD_S up to
- _ (not including) END_OF_SYMBOLS_S. */
- _ #ifndef CLEX_IMPLEMENTATION
- _ const CLEX_NUMKEYS = (END_OF_SYMBOLS_S - KEYWORD_S);
- _ #endif
- _
- %%%EOF%%%
- echo "x - clex.c"
- sed -e 's/^_ //' >clex.c <<'%%%EOF%%%'
- _
- _ #ifndef INCLUDED_STREAM
- _ #include <stream.h>
- _ #endif
- _ #ifndef INCLUDED_STRING
- _ #include <string.h>
- _ #endif
- _ #ifndef INCLUDED_STDLIB
- _ #include <stdlib.h>
- _ #endif
- _ #ifndef INCLUDED_ASSERT
- _ #include <assert.h>
- _ #endif
- _ #ifndef INCLUDED_CTYPE
- _ #include <ctype.h>
- _ #endif
- _
- _ #include "clex.h"
- _
- _ // get string value tables, sym_str[] and keywords[] :
- _ #define CLEX_IMPLEMENTATION 1
- _ #include "clex_sym.h"
- _
- _ /******************************************************************************
- _ * *
- _ * KWTABLE -- keyword hash table (internal use only) *
- _ * KWtable implements a collision-free hash table of C++ keywords. The *
- _ * table size and hash function are computed by use of a standalone C *
- _ * program, kwhash.c, included in this directory. *
- _ * *
- _ ******************************************************************************/
- _
- _ #define U_short unsigned short
- _ #define U_char unsigned char
- _
- _ // Fixed-size keyword table indexed by hash(); each slot holds a keyword
- _ // spelling and its symbol. A NULL kwp marks an unused slot.
- _ struct KWtable
- _ {
- _ enum { HASHSIZE = 131 }; // as computed by kwhash.c, for a=9,b=2,c=2
- _
- _ struct {
- _ char* kwp; // keyword spelling, or NULL if the slot is empty
- _ Clex_sym sym; // symbol returned for that keyword
- _ } kwhash[HASHSIZE];
- _
- _ KWtable(char**); // fill the table from a list of keywords
- _ U_short hash(const U_char*, U_short len);
- _ void insert(char*, Clex_sym);
- _ Clex_sym lookup(char*, short len); // keyword symbol, or IDENT_S
- _ };
- _
- _ static KWtable kwt = KWtable(keywords); // keywords[] defined in clex_sym.h
- _
- _ // Build the keyword table: empty every slot, then insert the first
- _ // CLEX_NUMKEYS entries of kwl, numbering their symbols upward from
- _ // KEYWORD_S. insert() asserts on a collision, so adding a keyword may
- _ // require a new hash function or table size.
- _ KWtable::
- _ KWtable (char** kwl)
- _ {
- _     short int slot = 0;
- _     while (slot < HASHSIZE)
- _     {
- _         kwhash[slot].kwp = NULL;
- _         ++slot;
- _     }
- _
- _     short int k = 0;
- _     while (k < CLEX_NUMKEYS)
- _     {
- _         insert(kwl[k], KEYWORD_S + k);
- _         ++k;
- _     }
- _ }
- _
- _ // the values used in the following hash function, and HASHSIZE, were
- _ // determined by use of the standalone C program kwhash.c, to
- _ // ensure that no collisions occur.
- _
- _ // Hash a keyword candidate from its first, second and last characters
- _ // and its length. cp must hold at least two characters. The shift
- _ // amounts (9, 2, 2) and HASHSIZE come from kwhash.c.
- _ inline
- _ U_short KWtable::
- _ hash (const U_char* cp, U_short len)
- _ {
- _     int mix = (U_short)cp[0];
- _     mix ^= (U_short)cp[1] << 9;
- _     mix ^= (U_short)cp[len-1] << 2;
- _     mix ^= len << 2;
- _     return mix % HASHSIZE;
- _ }
- _
- _ // Store keyword cp with symbol s in this table's slot for hash(cp).
- _ // The slot must be empty: the hash function is expected to be
- _ // collision-free for the keyword set, and the assert enforces that.
- _ // Operates on this object's own table (not on the global kwt), so it is
- _ // correct even when the object being built is a temporary.
- _ void KWtable::
- _ insert (char* cp, Clex_sym s)
- _ {
- _     U_short h = hash((const U_char*) cp, (U_short) strlen(cp));
- _     assert(kwhash[h].kwp == NULL); // collisions not permitted.
- _     kwhash[h].kwp = cp;
- _     kwhash[h].sym = s;
- _ }
- _
- _ // Return the keyword symbol for the NUL-terminated string cp of length
- _ // len, or IDENT_S if it is not a keyword. Lengths outside 2..9 are
- _ // rejected up front (hash() reads cp[1], and no keyword is longer than
- _ // 9 characters). Searches this object's own table rather than the
- _ // global kwt.
- _ Clex_sym KWtable::
- _ lookup (char* cp, short len)
- _ {
- _     if (len < 2 || len > 9)
- _         return IDENT_S;
- _     U_short h = hash((const U_char*) cp, (U_short) len);
- _     if (kwhash[h].kwp == NULL)
- _         return IDENT_S;
- _     if (strcmp(kwhash[h].kwp, cp) != 0)
- _         return IDENT_S;
- _     return kwhash[h].sym;
- _ }
- _
- _ /******************************************************************************
- _ * *
- _ * CLEX -- c++ lexical scanner *
- _ * *
- _ ******************************************************************************/
- _
- _ // CONSTRUCTOR Clex:
- _ // The argument block_brack, if TRUE, dictates that the contents
- _ // of square brackets "[]" be returned as a string in the string
- _ // buffer. If false, square brackets are treated as simple tokens.
- _
- _ Clex::
- _ Clex (FILE* f, Boolean b)
- _ {
- _     // Pretend a newline was just read, so that a '#' appearing as the
- _     // very first character of the input is handled like any other.
- _     look = '\n';
- _     line_num = 0;
- _
- _     // empty token buffer and file name
- _     buf[0] = '\0';
- _     bufsiz = 0;
- _     filename[0] = '\0';
- _
- _     block_brack = b;
- _     fp = f;
- _ }
- _
- _ // Scan a numeric literal whose first character c (a digit, or a '.'
- _ // that is followed by a digit) has already been consumed. The text is
- _ // left in buf. Returns NUM_S for integers and FLOATNUM_S for anything
- _ // with a fraction or an exponent.
- _ //   - a literal that starts with '.' is floating point;
- _ //   - integer suffixes may be combined ("10UL");
- _ //   - an exponent sign is accepted only directly after 'e'/'E', so
- _ //     "1e5+3" is scanned as "1e5" followed by separate tokens.
- _ Clex_sym Clex::
- _ num (char c)
- _ {
- _     Clex_sym s = (c == '.') ? FLOATNUM_S : NUM_S;
- _
- _     bufsiz = 0;
- _     put_in_buf(c);
- _     while (isdigit(look))
- _         buf_one();
- _
- _     // hexadecimal: a lone "0" followed by 'x' or 'X'
- _     if (bufsiz == 1 && *buf == '0' && (look == 'x' || look == 'X'))
- _     {
- _         do { buf_one(); }
- _         while (isxdigit(look));
- _         while (look == 'L' || look == 'l' || look == 'U' || look == 'u')
- _             buf_one();
- _         return terminate(s);
- _     }
- _
- _     if (s == NUM_S)
- _     {
- _         // long or unsigned
- _         if (look == 'L' || look == 'l' || look == 'U' || look == 'u')
- _         {
- _             do { buf_one(); }
- _             while (look == 'L' || look == 'l' || look == 'U' || look == 'u');
- _             return terminate(NUM_S);
- _         }
- _
- _         // fraction
- _         if (look == '.')
- _         {
- _             s = FLOATNUM_S;
- _             do { buf_one(); }
- _             while (isdigit(look));
- _         }
- _     }
- _
- _     // scientific notation: 'e', optional sign, digits
- _     if (look == 'e' || look == 'E')
- _     {
- _         s = FLOATNUM_S;
- _         buf_one();
- _         if (look == '+' || look == '-')
- _             buf_one();
- _         while (isdigit(look))
- _             buf_one();
- _     }
- _     return terminate(s);
- _ }
- _
- _ // Scan an identifier or keyword whose first character has already been
- _ // consumed. Letters, digits, '_' and '$' continue the name. The text is
- _ // left in buf; characters beyond the buffer capacity are consumed but
- _ // not stored, so an over-long name is truncated instead of overrunning
- _ // buf. Returns a keyword symbol or IDENT_S.
- _ Clex_sym Clex::
- _ ident (char first)
- _ {
- _     register Boolean maybe_kw = TRUE;
- _     register short bs = 0;
- _     buf[bs++] = first;
- _     while (isalnum(look) || look == '_' || look == '$')
- _     {
- _         // note: this function accounts for 30% of the total scan time
- _         // Keywords contain no uppercase letters or underscores, so seeing
- _         // one lets us skip the table lookup.
- _         if (maybe_kw && (isupper(look) || look == '_' ))
- _             maybe_kw = FALSE;
- _         if (bs < (short)(sizeof(buf) - 1))
- _             buf[bs++] = look;
- _         eat_one();
- _     }
- _     buf[bs] = '\0';
- _     bufsiz = bs;
- _
- _     if (maybe_kw)
- _         return kwt.lookup(buf, bufsiz);
- _     return IDENT_S;
- _ }
- _
- _ // Scan the body of a quoted string or character constant. On entry the
- _ // opening quote has already been consumed; c is the closing quote
- _ // character to look for. With m == CL_NONE the buffer is reset first;
- _ // otherwise the text is appended to what is already in buf.
- _ // Returns s once the closing quote is found (the quote is consumed but
- _ // not stored), ERROR_EOF_S at end of file, or ERROR_EOLN_S at an
- _ // unescaped newline (the newline is left in the lookahead).
- _ Clex_sym Clex::
- _ quote (char c, Clex_sym s, Clex_mode m)
- _ {
- _ if (m == CL_NONE)
- _ bufsiz = 0;
- _ while (look != c)
- _ {
- _ if (look == EOF)
- _ { return terminate(ERROR_EOF_S); }
- _ else if (look == '\n')
- _ { return terminate(ERROR_EOLN_S); }
- _ else if (look == '\\')
- _ {
- _ eat_one();
- _ // backslash-newline is a continuation: drop both, count the line
- _ if (look == '\n')
- _ { eat_one(); eoln(m|CL_QUOTE); continue; }
- _ else if (look == EOF)
- _ { return terminate(ERROR_EOF_S); }
- _ else
- _ // keep the backslash; the escaped char is stored by buf_one() below
- _ put_in_buf('\\'); // this handles \' and \" too.
- _ }
- _ buf_one();
- _ }
- _ eat_one(); // eat the closing quote
- _ return terminate(s);
- _ }
- _
- _
- _ // lbrack() accumulates the contents between "[" and "]" into
- _ // the string buffer, handling syntactically quoted strings,
- _ // comments, and nested brackets. Note that lbrack() is
- _ // called recursively in the case of nested brackets.
- _
- _ // Collect everything up to the matching ']' into buf. On entry the
- _ // opening '[' has already been consumed. With m == CL_NONE the buffer is
- _ // reset first; nested calls append. Newlines and comments are dropped
- _ // (newlines are still counted), quoted text and nested brackets are
- _ // kept with their delimiters. Returns LBRACK_S on success, ERROR_EOF_S
- _ // if the input ends first.
- _ Clex_sym Clex::
- _ lbrack (Clex_mode m)
- _ {
- _     if (m == CL_NONE)
- _         bufsiz = 0;
- _     while (look != ']')
- _     {
- _         if (look == EOF)
- _             return terminate(ERROR_EOF_S);
- _
- _         else if (look == '\n')
- _             { eat_one(); eoln(m|CL_BRACK); }
- _         else if (look == '[')
- _         {
- _             buf_one();
- _             if (lbrack(m|CL_BRACK) == ERROR_EOF_S)
- _                 return ERROR_EOF_S; // already cleaned up.
- _             else put_in_buf(']');
- _         }
- _         else if (look == '\'' || look == '"')
- _         {
- _             char c = look;
- _             buf_one();
- _             (void) quote(c, NONE_S, m|CL_BRACK);
- _             put_in_buf(c);
- _         }
- _         else if (look == '/') // maybe a comment
- _         {
- _             eat_one();
- _             if (look == '/')
- _                 line_comment();
- _             else if (look == '*')
- _             {
- _                 block_comment(m|CL_BRACK);
- _                 if (look == EOF) return terminate(ERROR_EOF_S);
- _             }
- _             else
- _                 // A plain '/'. Store only the slash: the character after
- _                 // it is still in the lookahead and must go through the
- _                 // loop, since it may be ']', '[', a quote, newline or EOF.
- _                 put_in_buf('/');
- _         }
- _         else // just a character to save
- _             buf_one();
- _     }
- _
- _     eat_one(); // eat the ']'.
- _     return terminate(LBRACK_S);
- _ }
- _
- _
- _ // Skip a block comment. On entry the lookahead is the '*' of the opening
- _ // "/*". Returns with the lookahead just past the closing "*/", or with
- _ // look == EOF if the comment is never closed (callers test for that).
- _ // Newlines inside the comment are counted through eoln().
- _ void Clex::
- _ block_comment(Clex_mode m)
- _ {
- _ eat_one(); // eat the '*'
- _ // The condition advances past a '*' as a side effect and then tests for
- _ // '/'; on failure the char after the '*' is left in look for the body.
- _ while (! (look == '*' && (eat_one(), look == '/')) )
- _ {
- _ if (look == EOF) return;
- _ if (look == '\n') { eat_one(); eoln(m|CL_COMMENT); }
- _ // leave a '*' in place so the loop condition can examine it
- _ else if (look != '*') eat_one();
- _ }
- _ eat_one(); // eat the '/'
- _ }
- _
- _ // Skip the rest of a "//" comment. Always consumes at least one
- _ // character, then stops with the newline (or EOF) in the lookahead.
- _ void Clex::
- _ line_comment()
- _ {
- _     for (;;)
- _     {
- _         eat_one();
- _         if (look == '\n' || look == EOF)
- _             break;
- _     }
- _ }
- _
- _ // eat_return() consumes one more input character and then returns s
- _ // unchanged. It is intended to save space in Clex::next() -- the
- _ // inline function eat_one() produces quite a lot of code.
- _ Clex_sym Clex::
- _ eat_return(Clex_sym s)
- _ { eat_one(); return s; }
- _
- _ // Scan and return the next token. Blanks, control characters, newlines
- _ // and comments are skipped (newlines go through eoln(), which counts
- _ // lines and handles '#' lines). Tokens that carry text (identifiers,
- _ // numbers, quoted text, and bracket contents when block_brack is set)
- _ // leave that text in buf. Returns EOF_S at end of input,
- _ // ERROR_UNKN_S for a character that starts no token, and ERROR_EOF_S
- _ // for an unterminated block comment.
- _ Clex_sym Clex::
- _ next()
- _ {
- _     short val;
- _     // take the lookahead as the current character and refill the lookahead
- _     while (val = look, eat_one(), val != EOF)
- _     {
- _         char ch = char(val);
- _         switch (ch)
- _         {
- _         case ' ' : continue;
- _
- _         case '_' :
- _         case '$' : return ident(ch);
- _
- _         case '0' : case '1' : case '2' : case '3' : case '4' :
- _         case '5' : case '6' : case '7' : case '8' : case '9' :
- _             return num(ch);
- _
- _         case ',' : return COMMA_S;
- _         case ';' : return SEMI_S;
- _         case '[' : if (block_brack) return lbrack(CL_NONE);
- _                    else return LBRACK_S;
- _         case ']' : return RBRACK_S;
- _         case '{' : return LBRACE_S;
- _         case '}' : return RBRACE_S;
- _         case '(' : return LPAR_S;
- _         case ')' : return RPAR_S;
- _         case '~' : return TILDE_S;
- _         case '?' : return QUEST_S;
- _         case '"' : return quote(ch, QUOTE_S, CL_NONE);
- _         case '\'': return quote(ch, APOS_S, CL_NONE);
- _
- _         case '=' : // '=', '=='
- _             if (look != '=') return AS_S;
- _             else return eat_return(EQ_S);
- _
- _         case ':' : // ":", "::"
- _             if (look != ':') return COLON_S;
- _             else return eat_return(SCOPE_S);
- _
- _         case '!' : // "!", "!="
- _             if (look != '=') return BANG_S;
- _             else return eat_return(NE_S);
- _
- _         case '^' : // "^", "^="
- _             if (look != '=') return CARET_S;
- _             else return eat_return(XORAS_S);
- _
- _         case '*' : // '*', '*='
- _             if (look != '=') return STAR_S;
- _             else return eat_return(MULAS_S);
- _
- _         case '%' : // '%', '%='
- _             if (look != '=') return MOD_S;
- _             else return eat_return(MODAS_S);
- _
- _         case '|' : // "|=", "||", "|"
- _             if (look == '|') return eat_return(LOR_S);
- _             else if (look == '=') return eat_return(ORAS_S);
- _             else return VBAR_S;
- _
- _         case '&' : // "&", "&=", "&&"
- _             if (look == '&') return eat_return(LAND_S);
- _             else if (look == '=') return eat_return(ANDAS_S);
- _             else return AMPER_S;
- _
- _         case '+' : // '+', '++', '+='
- _             if (look == '+') return eat_return(INCRE_S);
- _             else if (look == '=') return eat_return(ADDAS_S);
- _             else return PLUS_S;
- _
- _         case '-' : // '--', '-=', '->', '-',
- _             if (look == '-') return eat_return(DECRE_S);
- _             else if (look == '=') return eat_return(SUBAS_S);
- _             else if (look == '>') return eat_return(DEREF_S);
- _             else return MINUS_S;
- _
- _         case '/' : // '/*', '//', '/=', '/'
- _             if (look == '*')
- _             {
- _                 block_comment(CL_NONE);
- _                 if (look == EOF) // almost certainly a mistake:
- _                     return ERROR_EOF_S;
- _                 else continue;
- _             }
- _             else if (look == '/')
- _                 { line_comment(); continue; }
- _             else if (look == '=') return eat_return(DIVAS_S);
- _             else return SLASH_S;
- _
- _         case '.' : // ".", "..."
- _             if (isdigit(look)) return num(ch);
- _             else if (look == '.')
- _             {
- _                 eat_one(); // check for "..", undefined.
- _                 if (look != '.') return ERROR_UNKN_S;
- _                 else return eat_return(ELLIP_S);
- _             }
- _             else return DOT_S;
- _
- _         case '<' : // '<=', '<', '<<', '<<='
- _             if (look == '=') return eat_return(LE_S);
- _             else if (look == '<')
- _             {
- _                 eat_one();
- _                 if (look != '=') return SHL_S;
- _                 else return eat_return(SHLAS_S);
- _             }
- _             else return LT_S;
- _
- _         case '>' : // '>=', '>', '>>', '>>='
- _             if (look == '=') return eat_return(GE_S);
- _             else if (look == '>')
- _             {
- _                 eat_one();
- _                 if (look != '=') return SHR_S;
- _                 else return eat_return(SHRAS_S);
- _             }
- _             else return GT_S;
- _
- _         default:
- _             // ch may be negative where plain char is signed; the <ctype.h>
- _             // functions require an unsigned char value (or EOF).
- _             if (isalpha((unsigned char) ch))
- _                 return ident(ch);
- _             if (ch == '\n')
- _                 eoln(CL_NONE); // then fall out of the switch and keep scanning
- _             else if (iscntrl((unsigned char) ch))
- _                 continue;
- _             else
- _                 return ERROR_UNKN_S;
- _         }
- _     }
- _
- _     return EOF_S;
- _ }
- _
- _ // Fixed-capacity character accumulator; eoln() uses one to collect the
- _ // text of a '#' line.
- _ struct Quickbuf
- _ {
- _ short len; // number of characters stored in line
- _ char line[10240];
- _ // append c; silently drops c when only the terminator slot is left
- _ void put_in(char c) { if (len < sizeof(line)-1) line[len++] = c; }
- _ void terminate() { line[len] = '\0'; }
- _ Quickbuf() { len = 0; }
- _ };
- _
- _ // Called after a newline has been consumed. Counts the line and, unless
- _ // the scanner is inside a quote, a comment or a '#' continuation, checks
- _ // whether the new line starts with '#'. If so, the rest of that line
- _ // (with continuations joined and comments removed) is collected and
- _ // passed to the virtual handler pound(). The terminating newline is left
- _ // in the lookahead. An empty '#' line yields an empty string and does
- _ // not consume the newline.
- _ void Clex::
- _ eoln(Clex_mode m)
- _ {
- _     // assume NL character already eaten.
- _     ++line_num;
- _     // don't process '#' lines in quotes, comments, or '#' continuations.
- _     if (m & (CL_QUOTE|CL_POUND|CL_COMMENT))
- _         return;
- _
- _     // eat whitespace
- _     while (look != EOF && look != '\n')
- _     {
- _         if (look == ' ' || iscntrl((unsigned char) look)) eat_one();
- _         else break;
- _     }
- _     if (look != '#')
- _         return;
- _
- _     // eat the '#' and subsequent whitespace
- _     do { eat_one(); if (look == EOF || look == '\n') break; }
- _     while (look == ' ' || iscntrl((unsigned char) look));
- _
- _     // collect the '#' line; the test comes first so that a newline or
- _     // EOF already in the lookahead is never stored or consumed.
- _     Quickbuf b;
- _     while (look != '\n' && look != EOF)
- _     {
- _         if (look == '\\') // check for continuation line
- _         {
- _             eat_one();
- _             if (look == '\n') { eat_one(); eoln(m|CL_POUND); }
- _             else { b.put_in('\\'); }
- _         }
- _         else if (look == '/') // check for comment in '#' line
- _         {
- _             eat_one();
- _             if (look == '*')
- _             {
- _                 block_comment(m|CL_POUND);
- _                 if (look == EOF) break;
- _             }
- _             else if (look == '/') line_comment();
- _             else { b.put_in('/'); }
- _         }
- _         else
- _         {
- _             // control characters (e.g. tabs) are recorded as blanks
- _             if (iscntrl((unsigned char) look)) b.put_in(' ');
- _             else b.put_in(look);
- _             eat_one();
- _         }
- _     }
- _     b.terminate();
- _
- _     (void) pound(m, b.line, b.len); // call virtual handler
- _ }
- _
- _ // Default handler for the text of a '#' line (the '#' and any blanks
- _ // after it already removed). Recognizes
- _ //     # <line> "<filename>"   and   #line <line> "<filename>"
- _ // and updates line_num and, when a quoted name is present, filename.
- _ // A name longer than the filename buffer is truncated to fit.
- _ // Returns TRUE if the line was recognized, FALSE otherwise.
- _ Boolean Clex::
- _ pound (Clex_mode m, char* line, short len)
- _ {
- _     (void) m; // unused here; available to derived handlers
- _     char* cp = line;
- _     if (!isdigit((unsigned char) *cp))
- _     {
- _         if (len < 5) return FALSE;
- _         if (strncmp(cp, "line ", 5) != 0)
- _             return FALSE; // don't know what it is
- _         cp += 4;
- _         while (*cp == ' ') ++cp;
- _         if (!isdigit((unsigned char) *cp))
- _             return FALSE;
- _     }
- _
- _     line_num = atoi(cp) - 1; // will be incremented by eoln() later
- _     while (isdigit((unsigned char) *cp)) ++cp;
- _     while (*cp == ' ') ++cp;
- _     if (*cp == '"')
- _     {
- _         char* cpq = cp;
- _         do { ++cpq; }
- _         while (*cpq != '"' && *cpq != '\0');
- _
- _         // the directive text can be far longer than filename[]
- _         long n = cpq - cp - 1;
- _         if (n > (long)(sizeof(filename) - 1))
- _             n = sizeof(filename) - 1;
- _         strncpy(filename, cp+1, n);
- _         filename[n] = '\0';
- _     }
- _
- _     return TRUE;
- _ }
- _
- _ // Printable name of a symbol: the keyword spelling for symbols at or
- _ // above KEYWORD_S, otherwise the entry in sym_str[].
- _ const char* Clex::
- _ debug (Clex_sym s)
- _ {
- _     if (s >= KEYWORD_S)
- _         return keywords[s - KEYWORD_S];
- _     return sym_str[s];
- _ }
- %%%EOF%%%
- echo "x - kwhash.c"
- sed -e 's/^_ //' >kwhash.c <<'%%%EOF%%%'
- _
- _ /* this is a C program */
- _
- _ #include <stdio.h>
- _ #include <stdlib.h>
- _ #include <string.h>
- _
- _ static char *keywords[] =
- _ {
- _ "asm",
- _ "auto",
- _ "break",
- _ "case",
- _ "char",
- _ "class",
- _ "const",
- _ "continue",
- _ "default",
- _ "delete",
- _ "do",
- _ "double",
- _ "else",
- _ "enum",
- _ "extern",
- _ "float",
- _ "for",
- _ "friend",
- _ "goto",
- _ "if",
- _ "inline",
- _ "int",
- _ "long",
- _ "new",
- _ "operator",
- _ "overload",
- _ "private",
- _ "protected",
- _ "public",
- _ "register",
- _ "return",
- _ "short",
- _ "signed",
- _ "sizeof",
- _ "static",
- _ "struct",
- _ "switch",
- _ "this",
- _ "typedef",
- _ "union",
- _ "unsigned",
- _ "virtual",
- _ "volatile",
- _ "void",
- _ "while"
- _ };
- _
- _ #define KW_NUMKEYS (sizeof(keywords)/sizeof(*keywords))
- _
- _ unsigned int hashsize = 137;
- _ char** kwhash;
- _ typedef unsigned short u_short;
- _
- _ /* Trial hash: mixes the first, second and last characters of cp and the
- _    length len, using shift amounts a, b and c, reduced modulo hashsize.
- _    cp must hold at least two characters. */
- _ u_short
- _ hash(cp, len, a, b, c)
- _ unsigned char* cp;
- _ u_short len;
- _ u_short a, b, c;
- _ {
- _     int mix = (u_short)cp[0];
- _
- _     mix ^= (u_short)cp[1] << a;
- _     mix ^= (u_short)cp[len-1] << b;
- _     mix ^= len << c;
- _     return mix % hashsize;
- _ }
- _
- _ /* Try to place keyword cp in the trial table using shifts a, b, c.
- _    Returns 1 if its slot was free (and is now taken), 0 on a collision.
- _    The arguments to hash() are cast explicitly because the call is
- _    unprototyped: the callee expects an unsigned char pointer and a
- _    u_short length, not a char pointer and whatever strlen() returns. */
- _ int
- _ insert(cp, a, b, c)
- _ char *cp;
- _ u_short a, b, c;
- _ {
- _     short h;
- _
- _     h = hash((unsigned char *) cp, (u_short) strlen(cp), a, b, c);
- _     if (kwhash[h] != NULL)
- _         return 0;
- _     kwhash[h] = cp;
- _     return 1;
- _ }
- _
- _ /* Clear the trial table, insert every keyword using shifts a, b, c,
- _    and return how many of them collided. */
- _ int
- _ try(a, b, c)
- _ short a, b, c;
- _ {
- _     short int i;
- _     int collisions = 0;
- _
- _     i = 0;
- _     while (i < hashsize)
- _     {
- _         kwhash[i] = NULL;
- _         ++i;
- _     }
- _     i = 0;
- _     while (i < KW_NUMKEYS)
- _     {
- _         if (insert(keywords[i], a, b, c) == 0)
- _             collisions = collisions + 1;
- _         ++i;
- _     }
- _     return collisions;
- _ }
- _
- _ /* Usage: kwhash <hash_size>
- _    Tries every shift triple (a, b, c) in 0..10 for the given table size
- _    and prints each triple that does at least as well as the best so far;
- _    a '*' marks a collision-free triple. Exits with -1 on a missing or
- _    too-small size, or if the table cannot be allocated. */
- _ int
- _ main(argc, argv)
- _ int argc;
- _ char **argv;
- _ {
- _     int min_collisions;
- _     int size;
- _     short a, b, c;
- _
- _     if (argc <= 1)
- _     {
- _         printf("usage: %s <hash_size>\n\t<hash_size> should be prime.\n",
- _                argv[0]);
- _         exit(-1);
- _     }
- _
- _     /* compare as int first so a negative argument is rejected too */
- _     size = atoi(argv[1]);
- _     if (size < (int) KW_NUMKEYS)
- _     {
- _         printf("Hash table is too small.\n");
- _         exit(-1);
- _     }
- _     hashsize = size;
- _
- _     kwhash = (char**) malloc(hashsize * sizeof(char*));
- _     if (kwhash == NULL)
- _     {
- _         printf("Out of memory.\n");
- _         exit(-1);
- _     }
- _
- _     min_collisions = hashsize + 1;
- _     for (a = 0; a <= 10; ++a)
- _     {
- _         for (b = 0; b <= 10; ++b)
- _         {
- _             for (c = 0; c <= 10; ++c)
- _             {
- _                 int collisions;
- _
- _                 collisions = try(a, b, c);
- _                 if (collisions <= min_collisions)
- _                 {
- _                     printf("abc: %03x Collisions: %2d ",
- _                            ((a<<8)|(b<<4)|c), collisions);
- _                     min_collisions = collisions;
- _                     if (collisions == 0) putchar('*');
- _                     putchar('\n');
- _                 }
- _             }
- _         }
- _     }
- _
- _     free(kwhash);
- _     return 0;
- _ }
- %%%EOF%%%
- echo "x - clex_test.c"
- sed -e 's/^_ //' >clex_test.c <<'%%%EOF%%%'
- _
- _ // clex_test -- test code for clex.o
- _
- _ #include "clex.h"
- _
- _ // Read C or C++ source from standard input and print one line per
- _ // token: the current line number, the token name, and the token text
- _ // for tokens that carry text. Stops after the first EOF or error token.
- _ int main()
- _ {
- _     Clex cl = Clex(stdin, TRUE);
- _     Clex_sym s;
- _     do {
- _         s = cl.next();
- _         printf("%5ld ", cl.line_no()); // line_no() returns long
- _         if (s >= KEYWORD_S)
- _             printf(" %s\n", cl.debug(s));
- _         else if (s == IDENT_S ||
- _                  s == NUM_S ||
- _                  s == FLOATNUM_S ||
- _                  s == LBRACK_S ||
- _                  s == APOS_S ||
- _                  s == QUOTE_S )
- _             printf( " %s \"%s\"\n", cl.debug(s), cl.str());
- _         else
- _             printf( " %s\n", cl.debug(s));
- _     } while (s > EOF_S);
- _
- _     return 0;
- _ }
- %%%EOF%%%
- echo "Done."
- exit 0
-
-
-