home *** CD-ROM | disk | FTP | other *** search
/ Usenet 1994 October / usenetsourcesnewsgroupsinfomagicoctober1994disk2.iso / net / c++.scanner < prev    next >
Internet Message Format  |  1987-01-16  |  32KB

  1. From nmyers@mntgfx.MENTOR.COM Wed Jan 14 17:35:26 1987
  2. Path: beno!seismo!husc6!panda!genrad!decvax!tektronix!sequent!mntgfx!nmyers
  3. From: nmyers@mntgfx.MENTOR.COM (Nathan Myers)
  4. Newsgroups: net.sources
  5. Subject: C++ Lexical Scanner (in C++)
  6. Keywords: object parse compiler cfront
  7. Message-ID: <431@mntgfx.MENTOR.COM>
  8. Date: 14 Jan 87 22:35:26 GMT
  9. Organization: Mentor Graphics, Beaverton OR
  10. Lines: 1054
  11.  
  12. C++ Lexical Scanner (in C++)
  13.  
  14. Following this note is a shar-format file containing a
  15. lexical scanner for C++.  Use it however you like.  It
  16. should be adequate to drop into a full compiler.
  17. For such a use, error reporting would probably need 
  18. some refinement.  Also, if the compiler were to have
  19. a built-in preprocessor (i.e. not using cpp), the handler
  20. for "#" lines would need a bit of work.
  21.  
  22. I will be interested in bug reports and significant improvements.
  23. Nathan Myers   tektronix!sequent!mntgfx!nmyers
  24.  
  25. Share and enjoy!
  26. -------------------- cut here -----------------------------
  27. #! /bin/sh
  28. #  This is a shar format file.  Extract with sh, not csh.
  29. #
  30. echo "x - read.me"
  31. sed -e 's/^_ //' >read.me <<'%%%EOF%%%'
  32. _ The following source files are included:
  33. _     clex.c
  34. _     clex.h
  35. _     clex_sym.h
  36. _     clex_test.c
  37. _     kwhash.c
  38. _     Makefile
  39. _ They implement a self-contained lexical scanner class for C++.
  40. _ It is extensible by derivation primarily in the area of
  41. _ processing "#" compiler directives (currently, it only
  42. _ interprets the "#line" construct).
  43. _ It has one other degree of flexibility, in its handling
  44. _ of bracket-enclosed expressions "[]".  These may be treated
  45. _ as a normal sequence of tokens or as delimited strings;
  46. _ the former is of greater use in a traditional parser, while
  47. _ the latter is favored for extraction of declarations by
  48. _ a code browser.
  49. _ To allay some confusion, I should point out here that
  50. _ clex_sym.h is used in an unusual way: it is included
  51. _ twice in the module clex.c: once for the declaration
  52. _ part, once for the (static) definition part.  It is
  53. _ built this way to keep all knowledge of keywords
  54. _ in a single place.
  55. _ The file kwhash.c is a standard C standalone program used
  56. _ to arrive at the collision-free hash function used to
  57. _ recognize C++ keywords.  Any new keyword stands about
  58. _ one chance in 3 of colliding with an existing keyword,
  59. _ thus requiring that a new hash function be generated.
  60. _ The file clex_test.c compiles to a program which reads
  61. _ C or C++ code from standard input and emits token names
  62. _ on the standard output.  Try it with different values
  63. _ in the constructor's second argument.
  64. %%%EOF%%%
  65. echo "x - Makefile"
  66. sed -e 's/^_ //' >Makefile <<'%%%EOF%%%'
  67. _ # Makefile for clex/cparse
  68. _ INCLUDE=/usr/local/include/CC
  69. _ CCC=/user/local/CC
  70. _ CCOPTS= -g -O
  71. _ # -g: include debug info  -O: optimize
  72. _ all: kwhash clex_test
  73. _ kwhash: kwhash.c /usr/include/stdio.h
  74. _         /bin/cc ${CCOPTS} -o kwhash kwhash.c
  75. _ clex_test: clex_test.o clex.o
  76. _         ${CCC} -g -o clex_test clex_test.o clex.o
  77. _ .c.o:
  78. _         /usr/lib/cpp -I${INCLUDE} -Dc_plusplus=1 $*.c >$*.cpp
  79. _         /user/mentor/local/cfront +L +f$*.c <$*.cpp >$*..c && /bin/rm $*.cpp
  80. _         /bin/cc -c ${CCOPTS} $*..c && /bin/mv $*..o $*.o
  81. _ clex.o: ${INCLUDE}/stdio.h \
  82. _          ${INCLUDE}/stream.h \
  83. _          ${INCLUDE}/string.h \
  84. _          ${INCLUDE}/stdlib.h \
  85. _          ${INCLUDE}/ctype.h \
  86. _          ${INCLUDE}/assert.h \
  87. _          clex.h clex_sym.h
  88. _ clex_test.o:    ${INCLUDE}/stdio.h \
  89. _                 clex.h clex_sym.h
  90. %%%EOF%%%
  91. echo "x - clex.h"
  92. sed -e 's/^_ //' >clex.h <<'%%%EOF%%%'
  93. _ #ifndef INCLUDED_CLEX
  94. _ #define INCLUDED_CLEX 1
  95. _ #ifndef INCLUDED_STDIO
  96. _ #include <stdio.h>
  97. _ #endif
  98. _ enum Boolean { FALSE, TRUE };
  99. _ #include "clex_sym.h"
  100. _ class Clex             // lexical scanner: reads chars from a FILE*, hands out tokens
  101. _     { 
  102. _  friend class Cparse;
  103. _     enum Clex_mode      // bit flags, OR-ed together, naming the enclosing context(s)
  104. _         { CL_NONE=0, CL_COMMENT=1, CL_QUOTE=2, CL_POUND=4, CL_BRACK=8 };
  105. _  protected:
  106. _     short   look;           // a one-char lookahead
  107. _     FILE*   fp;             // input stream; read one char at a time by eat_one()
  108. _     Boolean block_brack;    // if TRUE, treat contents of "[]" as a string
  109. _     long    line_num;       // line number in original source file
  110. _     char    filename[256];  // name of original source file
  111. _     short   bufsiz;         // number of chars currently in buf
  112. _     char    buf[256];       // text of the current token; see str()
  113. _     void eat_one()          { look = short(getc(fp)); }   // advance the lookahead
  114. _     void put_in_buf(char c) { if (bufsiz < sizeof(buf)-1) buf[bufsiz++] = c; }  // drops c when full
  115. _     void buf_one()          { put_in_buf(look); eat_one(); }  // save lookahead, then advance
  116. _     Clex_sym terminate(Clex_sym s)  { buf[bufsiz] = '\0'; return s; }  // NUL-end buf, pass s through
  117. _     Clex_sym eat_return(Clex_sym);      // advance the lookahead, return the argument
  118. _     Clex_sym num(char);                 // scan a number; argument is its first char
  119. _     Clex_sym ident(char);               // scan an identifier or keyword
  120. _     Clex_sym lbrack(Clex_mode);         // collect a "[...]" block into buf
  121. _     Clex_sym quote(char, Clex_sym, Clex_mode);  // collect quoted text up to the given quote char
  122. _     void block_comment(Clex_mode);      // skip a /* */ comment
  123. _     void line_comment();                // skip a // comment
  124. _     void eoln(Clex_mode);               // count a newline; collect a following '#' line
  125. _     virtual Boolean pound(Clex_mode, char*, short len);  // '#' line handler, called by eoln()
  126. _  public:
  127. _     Clex_sym    next();                 // scan and return the next token
  128. _     const char* str()           { return buf; }       // text of the current token
  129. _     short       strlen()        { return bufsiz; }    // length of that text
  130. _     long        line_no()       { return line_num; }  // current source line number
  131. _     const char* fname()         { return filename; }  // source file name, if a '#' line gave one
  132. _     const char* debug(Clex_sym);        // printable name of a symbol
  133. _     Clex(FILE*, Boolean block_brack);
  134. _     };
  135. _ #endif
  136. %%%EOF%%%
  137. echo "x - clex_sym.h"
  138. sed -e 's/^_ //' >clex_sym.h <<'%%%EOF%%%'
  139. _ /* clex_sym.h:
  140. _ // This file defines both an enum {} type named "Clex_sym", and
  141. _ //  the arrays sym_str[] and keywords[] holding the string forms
  142. _ //  of the symbols.  It is intended to maintain an exact
  143. _ //  correspondence between array entries and symbol values.
  144. _ */
  145. _ /*
  146. _     This file is #include'd twice: once for the enum
  147. _     (with CLEX_IMPLEMENTATION turned off) and once for
  148. _     the array initialization (with it turned on).  The
  149. _     lower-numbered symbols have uppercase name strings,
  150. _     but the keyword symbol strings are stored separately.
  151. _     If a keyword is to be added, add it first to the
  152. _     standalone program kwhash.c and generate a new
  153. _     perfect hash function for the new set.  Then add
  154. _     it to both places below and modify the hash function
  155. _     and table size in clex.c.
  156. _ */
  157. _ #ifndef CLEX_IMPLEMENTATION
  158. _ #define CLEX_S(sym) sym
  159. _ #define CLEX_S2(sym1,sym2) sym1
  160. _ enum Clex_sym          // first pass: each entry below becomes an enumerator
  161. _     {
  162. _ #else /* CLEX_IMPLEMENTATION */
  163. _ #undef  CLEX_S
  164. _ #undef  CLEX_S2
  165. _ #define CLEX_S(sym) "sym"
  166. _ #define CLEX_S2(sym1,sym2) sym2
  167. _ static char* sym_str[] =   // second pass: name strings, indexed by Clex_sym value
  168. _     {                      // NOTE(review): "sym" relies on the preprocessor substituting inside string literals -- confirm for your cpp
  169. _ #endif /* CLEX_IMPLEMENTATION */
  170. _     CLEX_S(NONE_S = 0),    /* should never get this */
  171. _     CLEX_S(ERROR_S),
  172. _     CLEX_S(  ERROR_EOLN_S),
  173. _     CLEX_S(  ERROR_EOF_S),
  174. _     CLEX_S(  ERROR_UNKN_S),
  175. _ #ifndef CLEX_IMPLEMENTATION
  176. _     CLEX_S(ERROR_MAX = ERROR_UNKN_S),   // alias of the last error code; has no sym_str[] entry
  177. _ #endif
  178. _     CLEX_S(EOF_S),          // end of input
  179. _     CLEX_S(EOLN_S),         // \n
  180. _     CLEX_S(BANG_S),         // !
  181. _     CLEX_S(  NE_S),         // !=
  182. _     CLEX_S(QUOTE_S),        // "
  183. _     CLEX_S(POUND_S),        // #
  184. _     CLEX_S(MOD_S),          // %
  185. _     CLEX_S(  MODAS_S),      // %=
  186. _     CLEX_S(AMPER_S),        // &
  187. _     CLEX_S(  LAND_S),       // &&
  188. _     CLEX_S(  ANDAS_S),      // &=
  189. _     CLEX_S(APOS_S),         // '
  190. _     CLEX_S(LPAR_S),         // (
  191. _     CLEX_S(RPAR_S),         // )
  192. _     CLEX_S(STAR_S),         // *
  193. _     CLEX_S(  MULAS_S),      // *=
  194. _     CLEX_S(PLUS_S),         // +
  195. _     CLEX_S(  INCRE_S),      // ++
  196. _     CLEX_S(  ADDAS_S),      // +=
  197. _     CLEX_S(COMMA_S),        // ,
  198. _     CLEX_S(MINUS_S),        // -
  199. _     CLEX_S(  DECRE_S),      // --
  200. _     CLEX_S(  SUBAS_S),      // -=
  201. _     CLEX_S(  DEREF_S),      // ->
  202. _     CLEX_S(DOT_S),          // .
  203. _     CLEX_S(  ELLIP_S),      // ...
  204. _     CLEX_S(SLASH_S),        // /
  205. _     CLEX_S(  DIVAS_S),      // /=
  206. _     CLEX_S(COLON_S),        // :
  207. _     CLEX_S(  SCOPE_S),      // ::
  208. _     CLEX_S(SEMI_S),         // ;
  209. _     CLEX_S(LT_S),           // <
  210. _     CLEX_S(  LE_S),         // <=
  211. _     CLEX_S(  SHL_S),        // <<
  212. _     CLEX_S(  SHLAS_S),      // <<=
  213. _     CLEX_S(AS_S),           // =
  214. _     CLEX_S(  EQ_S),         // ==
  215. _     CLEX_S(GT_S),           // >
  216. _     CLEX_S(  GE_S),         // >=
  217. _     CLEX_S(  SHR_S),        // >>
  218. _     CLEX_S(  SHRAS_S),      // >>=
  219. _     CLEX_S(QUEST_S),        // ?
  220. _     CLEX_S(AT_S),           // @ (undefined)
  221. _     CLEX_S(LBRACK_S),       // [
  222. _     CLEX_S(BSLASH_S),       // \ (backslash)
  223. _     CLEX_S(RBRACK_S),       // ]
  224. _     CLEX_S(CARET_S),        // ^
  225. _     CLEX_S(  XORAS_S),      // ^=
  226. _     CLEX_S(GRAVE_S),        // ` (undefined)
  227. _     CLEX_S(LBRACE_S),       // {
  228. _     CLEX_S(VBAR_S),         // |
  229. _     CLEX_S(  LOR_S),        // ||
  230. _     CLEX_S(  ORAS_S),       // |=
  231. _     CLEX_S(RBRACE_S),       // }
  232. _     CLEX_S(TILDE_S),        // ~
  233. _     CLEX_S(IDENT_S),        // a name, or string that could be a name
  234. _     CLEX_S(NUM_S),          // a numeric string
  235. _     CLEX_S(FLOATNUM_S)      // a recognizably floating-point num
  236. _ #ifndef CLEX_IMPLEMENTATION
  237. _     , CLEX_S(KEYWORD_S),    // value of the first keyword; ASM_S below is set equal to it
  238. _ #else
  239. _     };
  240. _  static char *keywords[] =  // keyword spellings, indexed by (symbol - KEYWORD_S)
  241. _     {
  242. _ #endif
  243. _     CLEX_S2(ASM_S = KEYWORD_S, "asm"),
  244. _     CLEX_S2(AUTO_S,     "auto"),
  245. _     CLEX_S2(BREAK_S,    "break"),
  246. _     CLEX_S2(CASE_S,     "case"),
  247. _     CLEX_S2(CHAR_S,     "char"),
  248. _     CLEX_S2(CLASS_S,    "class"),
  249. _     CLEX_S2(CONST_S,    "const"),
  250. _     CLEX_S2(CONTINUE_S, "continue"),
  251. _     CLEX_S2(DEFAULT_S,  "default"),
  252. _     CLEX_S2(DELETE_S,   "delete"),
  253. _     CLEX_S2(DO_S,       "do"),
  254. _     CLEX_S2(DOUBLE_S,   "double"),
  255. _     CLEX_S2(ELSE_S,     "else"),
  256. _     CLEX_S2(ENUM_S,     "enum"),
  257. _     CLEX_S2(EXTERN_S,   "extern"),
  258. _     CLEX_S2(FLOAT_S,    "float"),
  259. _     CLEX_S2(FOR_S,      "for"),
  260. _     CLEX_S2(FRIEND_S,   "friend"),
  261. _     CLEX_S2(GOTO_S,     "goto"),
  262. _     CLEX_S2(IF_S,       "if"),
  263. _     CLEX_S2(INLINE_S,   "inline"),
  264. _     CLEX_S2(INT_S,      "int"),
  265. _     CLEX_S2(LONG_S,     "long"),
  266. _     CLEX_S2(NEW_S,      "new"),
  267. _     CLEX_S2(OPERATOR_S, "operator"),
  268. _     CLEX_S2(OVERLOAD_S, "overload"),
  269. _     CLEX_S2(PRIVATE_S,  "private"),
  270. _     CLEX_S2(PROTECTED_S,"protected"),
  271. _     CLEX_S2(PUBLIC_S,   "public"),
  272. _     CLEX_S2(REGISTER_S, "register"),
  273. _     CLEX_S2(RETURN_S,   "return"),
  274. _     CLEX_S2(SHORT_S,    "short"),
  275. _     CLEX_S2(SIGNED_S,   "signed"),
  276. _     CLEX_S2(SIZEOF_S,   "sizeof"),
  277. _     CLEX_S2(STATIC_S,   "static"),
  278. _     CLEX_S2(STRUCT_S,   "struct"),
  279. _     CLEX_S2(SWITCH_S,   "switch"),
  280. _     CLEX_S2(THIS_S,     "this"),
  281. _     CLEX_S2(TYPEDEF_S,  "typedef"),
  282. _     CLEX_S2(UNION_S,    "union"),
  283. _     CLEX_S2(UNSIGNED_S, "unsigned"),
  284. _     CLEX_S2(VIRTUAL_S,  "virtual"),
  285. _     CLEX_S2(VOLATILE_S, "volatile"),
  286. _     CLEX_S2(VOID_S,     "void"),
  287. _     CLEX_S2(WHILE_S,    "while"),
  288. _     CLEX_S2(END_OF_SYMBOLS_S, NULL)     // sentinel: one past the last keyword
  289. _     };
  290. _ #ifndef CLEX_IMPLEMENTATION
  291. _ const CLEX_NUMKEYS = (END_OF_SYMBOLS_S - KEYWORD_S);   // number of keywords in the list above
  292. _ #endif
  293. %%%EOF%%%
  294. echo "x - clex.c"
  295. sed -e 's/^_ //' >clex.c <<'%%%EOF%%%'
  296. _ #ifndef INCLUDED_STREAM
  297. _ #include <stream.h>
  298. _ #endif
  299. _ #ifndef INCLUDED_STRING
  300. _ #include <string.h>
  301. _ #endif
  302. _ #ifndef INCLUDED_STDLIB
  303. _ #include <stdlib.h>
  304. _ #endif
  305. _ #ifndef INCLUDED_ASSERT
  306. _ #include <assert.h>
  307. _ #endif
  308. _ #ifndef INCLUDED_CTYPE
  309. _ #include <ctype.h>
  310. _ #endif
  311. _ #include "clex.h"
  312. _ // get string value tables, sym_str[] and keywords[] :
  313. _ #define CLEX_IMPLEMENTATION 1
  314. _ #include "clex_sym.h"
  315. _ /******************************************************************************
  316. _ *                                                                             *
  317. _ *  KWTABLE -- keyword hash table (internal use only)                          *
  318. _ *     KWtable implements a collision-free hash table of C++ keywords.  The    *
  319. _ *     table size and hash function are computed by use of a standalone C      *
  320. _ *     program, kwhash.c, included in this directory.                          *
  321. _ *                                                                             *
  322. _ ******************************************************************************/
  323. _ #define U_short unsigned short
  324. _ #define U_char  unsigned char
  325. _ struct KWtable         // hash table mapping keyword spellings to their symbols
  326. _     {
  327. _     enum { HASHSIZE = 131 };  // as computed by kwhash.c, for a=9,b=2,c=2
  328. _     struct  {
  329. _             char* kwp;          // keyword spelling; NULL marks an unused slot
  330. _             Clex_sym sym;       // symbol returned for that keyword
  331. _             } kwhash[HASHSIZE];
  332. _     KWtable(char**);            // builds the table from a list of keyword strings
  333. _     U_short hash(const U_char*, U_short len);   // slot index for a string of length len
  334. _     void insert(char*, Clex_sym);               // enter one keyword; asserts its slot is free
  335. _     Clex_sym lookup(char*, short len);          // keyword symbol, or IDENT_S if not a keyword
  336. _     };
  337. _ static KWtable kwt = KWtable(keywords); // keywords[] defined in clex_sym.h
  338. _ KWtable::
  339. _ KWtable (char** kwl)
  340. _     {
  341. _     // Empty the table first: a NULL kwp marks an unused slot.
  342. _     short int slot = 0;
  343. _     while (slot < HASHSIZE)
  344. _         kwhash[slot++].kwp = NULL;
  345. _     // Enter each keyword; kwl[k] is paired with the symbol KEYWORD_S + k.
  346. _     // insert() asserts that no two keywords land in the same slot.
  347. _     for (short int k = 0; k < CLEX_NUMKEYS; ++k) insert(kwl[k], KEYWORD_S + k);
  348. _     }
  349. _ // hash() folds the first, second and last chars of cp, plus len, into
  350. _ // a slot index below HASHSIZE.  The shift amounts and HASHSIZE were
  351. _ // determined with the standalone C program kwhash.c, to ensure that no collisions occur.
  352. _ inline
  353. _ U_short KWtable::
  354. _ hash (const U_char* cp, U_short len)
  355. _     {
  356. _     return (((U_short)cp[0]         ) ^     // first char
  357. _             ((U_short)cp[1]     << 9) ^     // second char
  358. _             ((U_short)cp[len-1] << 2) ^     // last char
  359. _             (len                << 2) ) % HASHSIZE;
  360. _     }
  361. _ void KWtable::     // enter keyword cp, with symbol s, into this table
  362. _ insert (char* cp, Clex_sym s)
  363. _     {
  364. _     U_short h = hash((const U_char*)cp, (U_short)strlen(cp));
  365. _     assert(kwhash[h].kwp == NULL);  // collisions not permitted.
  366. _     kwhash[h].kwp = cp;     // fill our own slots, not the global kwt's, so a table
  367. _     kwhash[h].sym = s;      //  built as a temporary and then copied is not left empty
  368. _     }
  369. _ Clex_sym KWtable::     // symbol for the len-char string cp, or IDENT_S if not a keyword
  370. _ lookup (char* cp, short len)
  371. _     {
  372. _     if (len < 2 || len > 9) return (IDENT_S);   // keywords are 2 to 9 chars long
  373. _     U_short h = hash((const U_char*)cp, (U_short)len);
  374. _     if (kwhash[h].kwp == NULL) return (IDENT_S);        // unused slot
  375. _     if (strcmp(kwhash[h].kwp, cp)) return (IDENT_S);    // slot holds a different word
  376. _     return (kwhash[h].sym);     // searches this table rather than the global kwt
  377. _     }
  378. _ /******************************************************************************
  379. _ *                                                                             *
  380. _ *  CLEX -- c++ lexical scanner                                               *
  381. _ *                                                                             *
  382. _ ******************************************************************************/
  383. _ // Clex constructor: scan tokens from the stream f.
  384. _ //   When b (block_brack) is TRUE, next() hands back everything between
  385. _ //   "[" and "]" as one string in the token buffer; when FALSE, "[" and
  386. _ //   "]" come back as the plain tokens LBRACK_S and RBRACK_S.
  387. _ Clex::
  388. _ Clex (FILE* f, Boolean b)
  389. _     {
  390. _     // Pretend a newline was just read, before line 1: the first next()
  391. _     //  then runs eoln(), which counts the line and checks for a '#'.
  392. _     look = '\n';
  393. _     line_num = 0;
  394. _     buf[0] = filename[0] = '\0';
  395. _     bufsiz = 0;
  396. _     block_brack = b; fp = f;
  397. _     }
  398. _ Clex_sym Clex::
  399. _ num (char c)
  400. _     {
  401. _     Clex_sym s = NUM_S;
  402. _     bufsiz = 0;
  403. _     put_in_buf(c);
  404. _     while (isdigit(look))
  405. _         buf_one();
  406. _     // hexadecimal
  407. _     if (bufsiz == 1 && *buf == '0' && (look == 'x' || look == 'X'))
  408. _         {
  409. _         do { buf_one(); }
  410. _             while (isxdigit(look));
  411. _         if (look == 'L' || look == 'l' || look == 'U' || look == 'u')
  412. _             buf_one();
  413. _         return terminate(s);
  414. _         }
  415. _     // long or unsigned
  416. _     if (look == 'L' || look == 'l' || look == 'U' || look == 'u')
  417. _         { buf_one(); return terminate(NUM_S); }
  418. _     // floating point
  419. _     else if (look == '.')
  420. _         {
  421. _         s = FLOATNUM_S;
  422. _         do { buf_one(); }
  423. _             while (isdigit(look));
  424. _         }
  425. _     // scientific notation
  426. _     if (look == 'e' || look == 'E')
  427. _          {
  428. _          s = FLOATNUM_S;
  429. _          do { buf_one(); }
  430. _             while (isdigit(look));
  431. _          }
  432. _     else
  433. _         return terminate(s);
  434. _     if (look == '+' || look == '-')
  435. _          do { buf_one(); }
  436. _             while (isdigit(look));
  437. _     return terminate(s);
  438. _     }
  439. _ Clex_sym Clex::
  440. _ ident (char first)
  441. _     {
  442. _     register Boolean maybe_kw = TRUE;
  443. _     register short bs = 0;
  444. _     buf[bs++] = first;
  445. _     while (isalnum(look) || look == '_' || look == '$')
  446. _         {
  447. _         // note: this function accounts for 30% of the total scan time
  448. _         if (maybe_kw && (isupper(look) || look == '_' ))
  449. _             maybe_kw = FALSE;
  450. _         buf[bs++] = look;       // don't worry about overflow
  451. _         eat_one();
  452. _         }
  453. _     buf[bs] = '\0';
  454. _     bufsiz = bs;
  455. _     if (maybe_kw)
  456. _         return kwt.lookup(buf, bufsiz);
  457. _     return IDENT_S;
  458. _     }
  459. _ Clex_sym Clex::
  460. _ quote (char c, Clex_sym s, Clex_mode m)
  461. _     {
  462. _     if (m == CL_NONE)
  463. _         bufsiz = 0;
  464. _     while (look != c)
  465. _         {
  466. _         if (look == EOF)
  467. _             { return terminate(ERROR_EOF_S); }
  468. _         else if (look == '\n')
  469. _             { return terminate(ERROR_EOLN_S); }
  470. _         else if (look == '\\')
  471. _             {
  472. _             eat_one();
  473. _             if (look == '\n')
  474. _                 { eat_one(); eoln(m|CL_QUOTE); continue; }
  475. _             else if (look == EOF)
  476. _                 { return terminate(ERROR_EOF_S); }
  477. _             else
  478. _                 put_in_buf('\\');   // this handles \' and \" too.
  479. _             }
  480. _         buf_one();
  481. _         }
  482. _     eat_one();  // eat the closing quote
  483. _     return terminate(s);
  484. _     }
  485. _ // lbrack() accumulates the contents between "[" and "]" into
  486. _ //  the string buffer, handling syntactically quoted strings,
  487. _ //  comments, and nested brackets.  Note that lbrack() is
  488. _ //  called recursively in the case of nested brackets.
  489. _ Clex_sym Clex::
  490. _ lbrack (Clex_mode m)
  491. _     {
  492. _     if (m == CL_NONE)
  493. _         bufsiz = 0;
  494. _     while (look != ']')
  495. _         {
  496. _         if (look == EOF)
  497. _             return terminate(ERROR_EOF_S);
  498. _         else if (look == '\n')
  499. _             { eat_one(); eoln(m|CL_BRACK); }
  500. _         else if (look == '[')
  501. _             {
  502. _             buf_one();
  503. _             if (lbrack(m|CL_BRACK) == ERROR_EOF_S)
  504. _                 return ERROR_EOF_S;     // already cleaned up.
  505. _             else put_in_buf(']');
  506. _             }
  507. _         else if (look == '\'' || look == '"')
  508. _             {
  509. _             char c = look;
  510. _             buf_one();
  511. _             (void) quote(c, NONE_S, m|CL_BRACK);
  512. _             put_in_buf(c);
  513. _             }
  514. _         else if (look == '/')           // maybe a comment
  515. _             {
  516. _             eat_one();
  517. _             if (look == '/')
  518. _                 line_comment();
  519. _             else if (look == '*')
  520. _                 {
  521. _                 block_comment(m|CL_BRACK);
  522. _                 if (look == EOF) return terminate(ERROR_EOF_S);
  523. _                 }
  524. _             else                        // stash the '/' and the char after
  525. _                 { put_in_buf('/'); buf_one(); }
  526. _             }
  527. _         else                            // just a character to save
  528. _             buf_one();
  529. _         }
  530. _     eat_one(); // eat the ']'.
  531. _     return terminate(LBRACK_S);
  532. _     }
  533. _ void Clex::     // skip a /* ... */ comment; m is the caller's context, passed on to eoln()
  534. _ block_comment(Clex_mode m)
  535. _     {
  536. _     eat_one(); // eat the '*'
  537. _     while (! (look == '*' && (eat_one(), look == '/')) )   // the test itself consumes a '*'
  538. _         {
  539. _         if (look == EOF) return;    // unterminated: look stays EOF for the caller to test
  540. _         if (look == '\n') { eat_one(); eoln(m|CL_COMMENT); }   // CL_COMMENT: count the line only
  541. _         else if (look != '*') eat_one();    // a '*' is left for the loop test to pair with '/'
  542. _         }
  543. _     eat_one(); // eat the '/'
  544. _     }
  545. _ void Clex::     // skip a "//" comment, stopping at (not past) the '\n' or EOF
  546. _ line_comment()
  547. _     {
  548. _     eat_one();      // the current char is always discarded
  549. _     while (!(look == '\n' || look == EOF)) eat_one();
  550. _     }
  551. _ // eat_return() consumes the lookahead char and returns s unchanged.
  552. _ //  It is intended to save space in Clex::next(): inline eat_one() is bulky.
  553. _ Clex_sym Clex::
  554. _ eat_return(Clex_sym s)
  555. _     { eat_one(); return s; }
  556. _ Clex_sym Clex::
  557. _ next()
  558. _     {
  559. _     short val;
  560. _     while (val = look, eat_one(), val != EOF)
  561. _         {
  562. _         char ch = char(val);
  563. _         switch (ch)
  564. _             {
  565. _         case ' ' : continue;
  566. _         case '_' :
  567. _         case '$' : return ident(ch);
  568. _         case '0' : case '1' : case '2' : case '3' : case '4' :
  569. _         case '5' : case '6' : case '7' : case '8' : case '9' :
  570. _                    return num(ch);
  571. _         case ',' : return COMMA_S;
  572. _         case ';' : return SEMI_S;
  573. _         case '[' : if (block_brack) return lbrack(CL_NONE);
  574. _                    else             return LBRACK_S;
  575. _         case ']' : return RBRACK_S;
  576. _         case '{' : return LBRACE_S;
  577. _         case '}' : return RBRACE_S;
  578. _         case '(' : return LPAR_S;
  579. _         case ')' : return RPAR_S;
  580. _         case '~' : return TILDE_S;
  581. _         case '?' : return QUEST_S;
  582. _         case '"' : return quote(ch, QUOTE_S, CL_NONE);
  583. _         case '\'': return quote(ch, APOS_S, CL_NONE);
  584. _         case '=' :                              // '=', '=='
  585. _             if (look != '=') return AS_S;
  586. _             else  return eat_return(EQ_S);
  587. _         case ':' :                              // ":", "::"
  588. _             if (look != ':') return COLON_S;
  589. _             else  return eat_return(SCOPE_S);
  590. _         case '!' :                              // "!", "!="
  591. _             if (look != '=') return BANG_S;
  592. _             else  return eat_return(NE_S);
  593. _         case '^' :                              // "^", "^="
  594. _             if (look != '=') return CARET_S;
  595. _             else  return eat_return(XORAS_S);
  596. _         case '*' :                              // '*', '*='
  597. _             if (look != '=') return STAR_S;
  598. _             else  return eat_return(MULAS_S);
  599. _         case '%' :                              // '%', '%='
  600. _             if (look != '=') return MOD_S;
  601. _             else  return eat_return(MODAS_S);
  602. _         case '|' :                              //  "|=", "||", "|"
  603. _             if      (look == '|') return eat_return(LOR_S);
  604. _             else if (look == '=') return eat_return(ORAS_S);
  605. _             else                             return VBAR_S;
  606. _         case '&' :                              // "&", "&=", "&&"
  607. _             if      (look == '&') return eat_return(LAND_S);
  608. _             else if (look == '=') return eat_return(ANDAS_S);
  609. _             else                             return AMPER_S;
  610. _         case '+' :                              // '+', '++', '+='
  611. _             if      (look == '+') return eat_return(INCRE_S);
  612. _             else if (look == '=') return eat_return(ADDAS_S);
  613. _             else                             return PLUS_S;
  614. _         case '-' :                              // '--', '-=', '->', '-', 
  615. _             if      (look == '-') return eat_return(DECRE_S);
  616. _             else if (look == '=') return eat_return(SUBAS_S);
  617. _             else if (look == '>') return eat_return(DEREF_S);
  618. _             else                             return MINUS_S;
  619. _         case '/' :                              // '/*', '//', '/=', '/'
  620. _             if (look == '*')
  621. _                 {
  622. _                 block_comment(CL_NONE);
  623. _                 if (look == EOF)       // almost certainly a mistake:
  624. _                     return ERROR_EOF_S;
  625. _                 else continue;
  626. _                 }
  627. _             else if (look == '/')
  628. _                 { line_comment(); continue; }
  629. _             else if (look == '=') return eat_return(DIVAS_S);
  630. _             else                             return SLASH_S;
  631. _         case '.' :                              // ".", "..."
  632. _             if (isdigit(look))      return num(ch);
  633. _             else if (look == '.')
  634. _                 {
  635. _                 eat_one();          // check for "..", undefined.
  636. _                 if (look != '.')    return ERROR_UNKN_S;
  637. _                 else    return  eat_return(ELLIP_S);
  638. _                 }
  639. _             else                    return DOT_S;
  640. _         case '<' :                              // '<=', '<', '<<', '<<='
  641. _             if      (look == '=')   return eat_return(LE_S);
  642. _             else if (look == '<')
  643. _                 {
  644. _                 eat_one();
  645. _                 if  (look != '=')   return SHL_S;
  646. _                 else     return eat_return(SHLAS_S);
  647. _                 }
  648. _             else                    return LT_S;
  649. _         case '>' :                              // '>=', '>', '>>', '>>='
  650. _             if      (look == '=')   return eat_return(GE_S);
  651. _             else if (look == '>')
  652. _                 {
  653. _                 eat_one();
  654. _                 if  (look != '=')   return SHR_S;
  655. _                 else     return eat_return(SHRAS_S);
  656. _                 }
  657. _             else                    return GT_S;
  658. _         default:
  659. _             if (isalpha(ch))
  660. _                 return ident(ch);
  661. _             if (ch == '\n')
  662. _                 eoln(CL_NONE);
  663. _             else if (iscntrl(ch))
  664. _                 continue;
  665. _             else
  666. _                 return ERROR_UNKN_S;
  667. _             }
  668. _         }
  669. _     return EOF_S;
  670. _     }
  671. _ struct Quickbuf         // fixed-size accumulator for the text of one '#' line
  672. _     {
  673. _     char line[10240];   // the text; put_in() always leaves room for the '\0'
  674. _     short len;          // number of chars stored in line so far
  675. _     Quickbuf()          { len = 0; }
  676. _     void terminate()    { line[len] = '\0'; }
  677. _     void put_in(char c) { if (len < sizeof(line)-1) { line[len] = c; ++len; } }  // excess is dropped
  678. _     };
  679. _ void Clex::
  680. _ eoln(Clex_mode m)
  681. _     {
  682. _     // assume NL character already eaten.
  683. _     ++line_num;
  684. _     // don't process '#' lines in quotes, comments, or '#' continuations.
  685. _     if (m & (CL_QUOTE|CL_POUND|CL_COMMENT))
  686. _         return;
  687. _     // eat whitespace
  688. _     while (look != EOF && look != '\n')
  689. _         {
  690. _         if (look == ' ' || iscntrl(char(look))) eat_one();
  691. _         else break;
  692. _         }
  693. _     if (look != '#')
  694. _         return;
  695. _     // eat the '#' and subsequent whitespace
  696. _     do { eat_one(); if (look == EOF || look == '\n') break; }
  697. _        while (look == ' ' || iscntrl(char(look)));
  698. _     // collect the '#' line
  699. _     Quickbuf b;
  700. _     do  {   // record line
  701. _         if (look == '\\')       // check for continuation line
  702. _             {
  703. _             eat_one();
  704. _             if (look == '\n') { eat_one(); eoln(m|CL_POUND); }
  705. _             else { b.put_in('\\'); }
  706. _             }
  707. _         else if (look == '/')   // check for comment in '#' line
  708. _             {
  709. _             eat_one();
  710. _             if (look == '*')
  711. _                 {
  712. _                 block_comment(m|CL_POUND);
  713. _                 if (look == EOF) break;
  714. _                 }
  715. _             else if (look == '/') line_comment();
  716. _             else { b.put_in('/'); }
  717. _             }
  718. _         else
  719. _             {
  720. _             if (iscntrl(char(look))) look = ' ';
  721. _             b.put_in(look);
  722. _             eat_one();
  723. _             }
  724. _  
  725. _         } while (look != '\n' && look != EOF);
  726. _     b.terminate();
  727. _     (void) pound(m, b.line, b.len);     // call virtual handler
  728. _     }
  729. _ Boolean Clex::
  730. _ pound (Clex_mode m, char* line, short len)
  731. _     {
  732. _     void(m);                // to keep cfront blissful
  733. _     char* cp = line;
  734. _     if (!isdigit(*cp))
  735. _         {
  736. _         if (len < 5) return FALSE;
  737. _         if (strncmp(cp, "line ", 5) != 0)
  738. _             return FALSE;   // don't know what it is
  739. _         cp += 4;
  740. _         while (*cp == ' ') ++cp;
  741. _         if (!isdigit(*cp))
  742. _             return FALSE;
  743. _         }
  744. _     // # <line> "<filename>"   or    #line <line> "<filename>"
  745. _     line_num = atoi(cp) - 1;    // will be incremented by eoln() later
  746. _     while (isdigit(*cp)) ++cp;
  747. _     while (*cp == ' ')   ++cp;
  748. _     if (*cp == '"')
  749. _         {
  750. _         char* cpq = cp;
  751. _         do { ++cpq; }
  752. _            while (*cpq != '"' && *cpq != '\0');
  753. _         strncpy(filename, cp+1, cpq - cp - 1);
  754. _         filename[cpq - cp - 1] = '\0';
  755. _         }
  756. _     return TRUE;
  757. _     }
  758. _ const char* Clex::
  759. _ debug (Clex_sym s)
  760. _     {
  761. _     return (s >= KEYWORD_S) ? keywords[s - KEYWORD_S] : sym_str[s] ;
  762. _     }
  763. %%%EOF%%%
  764. echo "x - kwhash.c"
  765. sed -e 's/^_ //' >kwhash.c <<'%%%EOF%%%'
  766. _ /* this is a C program */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  768. _ static char *keywords[] =
  769. _     {
  770. _     "asm",
  771. _     "auto",
  772. _     "break",
  773. _     "case",
  774. _     "char",
  775. _     "class",
  776. _     "const",
  777. _     "continue",
  778. _     "default",
  779. _     "delete",
  780. _     "do",
  781. _     "double",
  782. _     "else",
  783. _     "enum",
  784. _     "extern",
  785. _     "float",
  786. _     "for",
  787. _     "friend",
  788. _     "goto",
  789. _     "if",
  790. _     "inline",
  791. _     "int",
  792. _     "long",
  793. _     "new",
  794. _     "operator",
  795. _     "overload",
  796. _     "private",
  797. _     "protected",
  798. _     "public",
  799. _     "register",
  800. _     "return",
  801. _     "short",
  802. _     "signed",
  803. _     "sizeof",
  804. _     "static",
  805. _     "struct",
  806. _     "switch",
  807. _     "this",
  808. _     "typedef",
  809. _     "union",
  810. _     "unsigned",
  811. _     "virtual",
  812. _     "volatile",
  813. _     "void",
  814. _     "while"
  815. _     };
  816. _ #define KW_NUMKEYS (sizeof(keywords)/sizeof(*keywords))
  817. _ unsigned int hashsize = 137;
  818. _ char** kwhash;
  819. _ typedef unsigned short u_short;
  820. _ u_short
  821. _ hash(cp, len, a, b, c)
  822. _     unsigned char* cp;
  823. _     u_short len;
  824. _     u_short a, b, c;
  825. _     {
  826. _     return (((u_short)cp[0]         ) ^
  827. _             ((u_short)cp[1]     << a) ^
  828. _             ((u_short)cp[len-1] << b) ^
  829. _              (len               << c)  ) % hashsize;
  830. _     }
  831. _ int
  832. _ insert(cp, a, b, c)
  833. _     char *cp;
  834. _     u_short a, b, c;
  835. _     {
  836. _     short h;
  837. _     h = hash(cp, strlen(cp), a, b, c);
  838. _     if (kwhash[h] != NULL)
  839. _         {
  840. _ /*
  841. _         printf("Keyword hash collision: %s, %s\n", kwhash[h], cp);
  842. _ */
  843. _         return 0;
  844. _         }
  845. _     else
  846. _         kwhash[h] = cp;
  847. _     return 1;
  848. _     }
  849. _ int
  850. _ try(a, b, c)
  851. _     short a, b, c;
  852. _     {
  853. _     short int i;
  854. _     int collisions;
  855. _     collisions = 0;
  856. _     for (i = 0; i < hashsize; ++i)
  857. _         kwhash[i] = NULL;
  858. _     for (i = 0; i < KW_NUMKEYS; ++i)
  859. _         if (!insert(keywords[i], a, b, c))
  860. _             ++collisions;
  861. _     return collisions;
  862. _     }
  863. _ main(argc, argv)
  864. _     int argc;
  865. _     char **argv;
  866. _     {
  867. _     int min_collisions;
  868. _     int min_abc = 0;
  869. _     short a, b, c;
  870. _     if (argc > 1) hashsize = atoi(argv[1]);
  871. _     else
  872. _         {
  873. _         printf("usage: %s <hash_size>\n\t<hash_size> should be prime.\n",
  874. _                 argv[0]);
  875. _         exit(-1);
  876. _         }
  877. _     if (hashsize < KW_NUMKEYS)
  878. _         {
  879. _         printf("Hash table is too small.\n");
  880. _         exit(-1);
  881. _         }
  882. _     kwhash = (char**) malloc(hashsize * sizeof(char*));
  883. _     min_collisions = hashsize + 1;
  884. _     for (a = 0; a <= 10; ++a)
  885. _         {
  886. _         for (b = 0; b <= 10; ++b)
  887. _             {
  888. _             for (c = 0; c <= 10; ++c)
  889. _                 {
  890. _                 int collisions;
  891. _                 collisions = try(a, b, c);
  892. _                 if (collisions <= min_collisions)
  893. _                     {
  894. _                     printf("abc: %03x  Collisions: %2d ",
  895. _                            ((a<<8)|(b<<4)|c), collisions);
  896. _                     min_collisions = collisions;
  897. _                     if (collisions == 0) putchar('*');
  898. _                     putchar('\n');
  899. _                     }
  900. _                 }
  901. _             }
  902. _         }
  903. _     }
  904. %%%EOF%%%
  905. echo "x - clex_test.c"
  906. sed -e 's/^_ //' >clex_test.c <<'%%%EOF%%%'
  907. _ // clex_test -- test code for clex.o
  908. _ #include "clex.h"
  909. _ main()
  910. _     {
  911. _     Clex cl = Clex(stdin, TRUE);
  912. _     Clex_sym s;
  913. _     do  {
  914. _         s = cl.next();
  915. _         printf("%5D ", cl.line_no());
  916. _         if (s >= KEYWORD_S)
  917. _             printf(" %s\n", cl.debug(s));
  918. _         else if (s == IDENT_S ||
  919. _                  s == NUM_S ||
  920. _                  s == FLOATNUM_S ||
  921. _                  s == LBRACK_S ||
  922. _                  s == APOS_S ||
  923. _                  s == QUOTE_S )
  924. _             printf( "      %s \"%s\"\n", cl.debug(s), cl.str());
  925. _         else
  926. _             printf( "      %s\n", cl.debug(s));
  927. _         } while (s > EOF_S);
  928. _     exit(0);
  929. _     }
  930. %%%EOF%%%
  931. echo "Done."
  932. exit 0
  933.  
  934.  
  935.