ftp.cs.arizona.edu

home *** CD-ROM | disk | FTP | other *** search

/ ftp.cs.arizona.edu / ftp.cs.arizona.edu.tar / ftp.cs.arizona.edu / icon / historic / v941.tgz / icon.v941src.tar / icon.v941src / ipl / packs / ibpag2 / ibreader.icn < prev next >

Wrap

Text File | 2000-07-29 | 17KB | 516 lines

############################################################################
#
#	Name:	 ibreader.icn
#
#	Title:	 reader for Ibpag2 source files
#
#	Author:	 Richard L. Goerwitz
#
#	Version: 1.29
#
############################################################################
#
#  This file contains a collection of procedures that 1) read in an
#  Ibpag2 source file, 2) output token defines, 3) emit action code,
#  and finally 4) pass a start symbol, list of productions, and token
#  table back to the calling procedure.  Described formally:
#
#      ibreader:  file x file x string x string  -> ib_grammar record
#                 (in, out, module, source_fname) -> grammar
#
#  In is the input stream; out is the output stream; module is an
#  optional string that distinguishes this grammar from others that
#  might also be running simultaneously; source_fname is the optional
#  name of the grammar file, used in the $line directives written to
#  out.  Grammar is an ib_grammar record containing the start symbol
#  in its first field and the production list in its second.  Its
#  third field contains a table used to map integers to actual token
#  names or character literals, i.e. its keys are things like -1, 0,
#  etc. and its values are things like "error," "EOF," etc.
#
#  Note that if a module argument is supplied to ibreader(), one must
#  also be supplied to ibwriter().  See ibwriter.icn.
#
#  The format of the input file is highly reminiscent of YACC.  It
#  consists of three basic sections, the first two of which are
#  followed by %%.  See the main documentation to Ibpag2 for
#  specifics.  Major differences between Ibpag2 and YACC input format
#  include:
#
#      1) "$$ = x" constructs are replaced by "return x" (e.g. "$$ =
#         $1 + $3" -> "return $1 + $3")
#
#      2) all variables within a given action are, by default, local
#         to that action; i.e. they cannot be accessed by other
#         actions unless you declare them global elsewhere (e.g. in
#         the pass-through part of the declarations section %{ ... %})
#
#      3) the %union declaration is not needed by Ibpag2 (ibreader
#         reports error 45 if it encounters one)
#
#      4) tokens and symbols are separated from each other by a comma
#         (e.g. %token '+', '-' and S : NP, VP)
#
#      5) epsilon is indicated by the keyword "epsilon" (e.g. REL :
#         epsilon)
#
#      6) both epsilon and error *may* be declared as %tokens for
#         reasons of precedence, although they retain hard-coded
#         internal values (-2 and -1, respectively)
#
#      7) all actions must follow the last RHS symbol of the rule they
#         apply to (preceded by an optional %prec directive); to
#         achieve S : NP { action1 }, VP { action2 }, insert a dummy
#         rule: S : NP, dummy, VP { action2 }; dummy : epsilon {
#         action1 } ;
#
#      8) YYERROR, YYACCEPT, yyclearin, and yyerrok are the same,
#         except they are written IIERROR, IIACCEPT, iiclearin, and
#         iierrok (i.e. "ii" replaces "yy")
#
#      9) Ibpag2's input files are tokenized like modified Icon files,
#         and, as a consequence, Icon's reserved words must not be
#         used as symbols (e.g. "if : if, then" is no go)
#
############################################################################
#
#  Links: itokens, escape
#
#  See also: ibwriter
#
############################################################################

#link itokens, escape
link escape

# ib_grammar: what ibreader() returns.
#     start - start symbol (name of a LHS, or the %start identifier)
#     rules - list of production records, in order of appearance
#     tbl   - table mapping token numbers back to printable names
record ib_grammar(start, rules, tbl)

# tokstats: one entry of the declared-token table.
#     str   - token name, or the integer character code of a literal
#     no    - token number
#     prec  - precedence level (order of the declaring directive)
#     assoc - "l", "r", "n", or &null
record tokstats(str, no, prec, assoc)

# Declared in itokens.icn:
# global line_number

#
# ibreader:  file x file x string x string  -> ib_grammar record
#            (in, out, module, source_fname) -> grammar
#
#     Where in is an input stream, out is an output stream, module is
#     some string uniquely identifying this module (optional), and
#     where grammar is an ib_grammar record containing the start
#     symbol in its first field and a list of production records in
#     its second.  Source_fname is the string name of Ibpag2's input
#     grammar file.  Defaults to "source file."
#
#     Besides returning grammar, ibreader writes to out: the %{ ... %}
#     pass-through code, one $define per named token, one procedure
#     per rule action (see read_RHS), and the Icon code that follows
#     the rules section.  Errors are reported through iohno().
#
procedure ibreader(in, out, module, source_fname)

    local tmp, grammar, toktbl, next_token, next_token_no_nl,
        token, LHS, t

    /source_fname := "source file"
    grammar := ib_grammar(&null, list(), table())
    toktbl := table()

    # next_token yields every token produced by itokens();
    # next_token_no_nl filters that stream, passing along only the
    # tokens whose sym field is non-null.
    next_token := create itokens(in, 1)
    next_token_no_nl := create 1(tmp := |@next_token, \tmp.sym)
    token := @next_token_no_nl | iohno(4)

    # Do the %{ %} and %token stuff, i.e. everything up to %%
    # (NEWSECT).
    #
    until token.sym == "NEWSECT" do {
        case token.sym of {
            default   : {
                iohno(48, "token "||image(token.str) ||"; line "|| line_number)
            }
            "SEMICOL" : {
                # Skip semicolon.  Get another token while we're at it.
                token := @next_token_no_nl | iohno(47, "line "||line_number)
            }
            "BEGGLOB" : {
                write(out, "\n$line ", line_number, " ", image(source_fname))
                # Copy token values to out until we reach "%}" (ENDGLOB).
                # copy_icon_stuff is a generator; the comparison keeps
                # resuming it until the ENDGLOB token is suspended.
                (token := copy_icon_stuff(next_token, out)).sym == "ENDGLOB"
                # Like every other fetch in this file, treat an exhausted
                # token stream as an unexpected EOF.
                token := @next_token_no_nl | iohno(4)
            }
            "MOD"     : {
                (token := @next_token_no_nl).sym == "IDENT" |
                    iohno(30, "line " || line_number)
                #
                # Read in token declarations, set associativity and
                # precedences, and enter the tokens into toktbl.  Each
                # branch yields the first token following the directive.
                #
                token := {
                    case token.str of {
                        default   : iohno(30, "line " || line_number)
                        "token"   : read_decl(next_token_no_nl, toktbl, &null)
                        "right"   : read_decl(next_token_no_nl, toktbl, "r")
                        "left"    : read_decl(next_token_no_nl, toktbl, "l")
                        "nonassoc": read_decl(next_token_no_nl, toktbl, "n")
                        "union"   : iohno(45, "line "|| line_number)
                        "start"   : {
                            (token := @next_token_no_nl).sym == "IDENT" |
                                iohno(31, "line " || line_number)
                            # Only one %start is allowed.
                            /grammar.start := token.str |
                                iohno(32, "line " || line_number)
                            @next_token_no_nl | iohno(4)
                        }
                    }
                }
            }
        }
    }

    # Skip past %% (NEWSECT) and semicolon (if present).  A second %%
    # right away means the rules section is empty.
    token := @next_token_no_nl | iohno(47, "line "|| line_number)
    (token := token | @next_token_no_nl | iohno(4)).sym ~== "SEMICOL"
    token.sym == "NEWSECT" & iohno(47, "line "|| line_number)

    #
    # Fetch start symbol if it wasn't defined above via %start; by
    # default the start symbol is the LHS of rule 1.
    #
    /grammar.start := token.str

    # Having reached the end of the declarations section, we can now
    # copy out a define for each token number, not counting character
    # literals (which are stored as integers).  While we're at it,
    # create a table that maps token numbers back to character
    # literals and strings (for use in later verbose and debugging
    # displays).
    #
    write(out, "\n")
    every t := !toktbl do {
        if type(t.str) == "integer"
        then insert(grammar.tbl, t.no, image(char(t.str)))
        else {
            insert(grammar.tbl, t.no, t.str)
            write(out, "$define ", t.str, "\t", t.no)
        }
    }

    # Now, finally, read in rules up until we reach EOF or %% (i.e.
    # NEWSECT).  EOF is signaled below by failure of read_RHS().
    #
    until token.sym == "NEWSECT" do {
        token.sym == "IDENT" | iohno(33, token.str ||" line "|| line_number)
        LHS := token.str
        token := @next_token_no_nl | iohno(4)
        token.sym == "COLON" | iohno(34, token.str ||" line "|| line_number)

        #
        # Read in RHS, then the action (if any) then the prec (if
        # any).  If we see a BAR, then repeat, re-using the same
        # left-hand side symbol.
        #
        while token :=
            read_RHS(next_token, next_token_no_nl, out, toktbl, LHS,
                     grammar, module, source_fname) |
            # if read_RHS fails, we're at EOF
            break break
        do token.sym == "BAR" | break
    }

    # Copy the remainder of the file to out as Icon code.
    write(out, "\n$line ", line_number, " ", image(source_fname))
    every copy_icon_stuff(next_token, out, "EOFX")

    # Do final setup on the reverse token table.  This table will be
    # used later to map integers to their original names in verbose or
    # debugging displays.
    #
    insert(grammar.tbl, 0, "$")

    return grammar

end


#
# copy_icon_stuff:  coexpression x file x string  -> ib_TOK records
#                   (next_token,   out,    except) -> token records
#
#     Copy Icon code to output stream, also suspending as we go.
#     Insert separators between tokens where needed.  Do not output
#     any token whose sym field matches except.
#     The point in suspending tokens as we go is to enable the calling
#     procedure to look for signal tokens that indicate insertion or
#     termination points.
#
#     Tokens with a null sym field are written but never suspended.
#     A token is suspended before it is written, so a caller that
#     stops resuming copy_icon_stuff keeps that token out of the
#     output.  When next_token is exhausted, iohno(4) (unexpected EOF)
#     is called unless except is "EOFX".
#
procedure copy_icon_stuff(next_token, out, except)

    local separator, T
    static word_chars
    initial word_chars := &digits ++ &letters ++ '_.'

    separator := ""
    while T := @next_token do {
        if \T.sym then suspend T
        if \T.sym == \except then next
        # Write the pending separator only if this token starts with a
        # word character (and is not DOT); otherwise the two tokens can
        # safely abut.
        if any(word_chars, \T.str, 1, 2) & \T.sym ~== "DOT"
        then writes(out, separator)
        writes(out, T.str)
        # Remember whether this token ends in a word character, so the
        # next word-like token gets separated from it by a space.
        if any(word_chars, \T.str, -1, 0) & \T.sym ~== "DOT"
        then separator := " " else separator := ""
    }

    # unexpected EOF error
    (except === "EOFX") | iohno(4)

end


#
# read_decl:  coexpression x table x string      -> ib_TOK
#             (next_token_no_nl, toktbl, assoc)  -> token
#
#     Read in token declarations, assigning them the correct
#     precedence and associativity.  Number the tokens for later
#     $define preprocessor directives.  When done, return the last
#     token processed.  Toktbl is the table that holds the stats for
#     each declared token.
#
#     Assoc must be "n", "r", "l", or &null (error 5 otherwise).
#     Named tokens are numbered from 257 upward; character literals
#     are keyed and numbered by their character code; "error" and
#     "epsilon" always get -1 and -2.  Token numbers and precedence
#     levels are kept in statics, so they carry over between calls.
#
procedure read_decl(next_token_no_nl, toktbl, assoc)

    local token, c
    static token_no, prec
    initial {
        token_no := 256
        prec := 0
    }

    # All tokens in this list have the same prec and assoc.
    # Precedence is determined by order.  Associativity is determined
    # by keyword in the calling procedure, and is passed as arg 3.
    #
    prec +:= 1
    assoc === ("n"|"r"|"l"|&null) | iohno(5, image(assoc))

    # As long as we find commas and token names, keep on adding tokens
    # to the token table.  Return the unused token when done.  If we
    # reach EOF, there's been an error.
    #
    repeat {
        token := @next_token_no_nl | iohno(4)
        case token.sym of {
            default   : iohno(31, token.str ||" line "|| line_number)
            "CSETLIT" | "STRING": {
                # Enter character literals as integers.  The literal
                # (quotes stripped, escapes resolved) must be exactly
                # one character long.
                *escape(token.str[2:-1]) = 1 | iohno(49, token.str)
                c := ord(escape(token.str[2:-1]))
                toktbl[c] := tokstats(c, c, prec, assoc)
            }
            "IDENT"   : {
                case token.str of {
                    "error"  :
                        toktbl[token.str] := tokstats("error", -1, prec, assoc)
                    "epsilon":
                        toktbl[token.str] := tokstats("epsilon",-2,prec, assoc)
                    default  : {
                        # Enter TOKENs as string-keyed records in toktbl.
                        token_no +:= 1
                        toktbl[token.str] :=
                            tokstats(token.str, token_no, prec, assoc)
                    }
                }
            }
        }
        # As long as we're seeing commas, go back for more tokens.
        token := @next_token_no_nl | iohno(4)
        token.sym == "COMMA" | break
    }

    # Skip past semicolon, if present (as set up now, it shouldn't be).
    (token := token | @next_token_no_nl | iohno(4)).sym ~== "SEMICOL"
    return token

end


#
# read_RHS:  coexpression x coexpression x file x table x
#            string x ib_grammar record x string x string -> token
#
#     Read_RHS goes through the RHS of rule definitions, inserting the
#     resulting productions into a master rule list.  At the same
#     time, it outputs the actions corresponding to those productions
#     as procedures that are given names corresponding to the numbers
#     of the productions.  I.e. production 1, if endowed with an {
#     action }, will correspond to procedure _1_.  Prec and assoc are
#     automatically set to that of the last RHS terminal (i.e. the
#     last RHS symbol found in toktbl), but this may be changed
#     explicitly by the %prec keyword, as in YACC.  Source_fname is
#     the name of the source grammar file we're processing (caller
#     will give us some reasonable default if we're reading &input).
#
#     Returns the first token following the rule (after any %prec,
#     action, and semicolon).  Fails on EOF.
#
procedure read_RHS(next_token, next_token_no_nl, out, toktbl, LHS,
                   grammar, module, source_fname)

    local token, rule, c
    static rule_no
    initial rule_no := 0

    rule_no +:= 1
    #                  LHS  RHS     POS    LOOK   no       prec   assoc
    rule := production(LHS, list(), &null, &null, rule_no, &null, &null)
    put(grammar.rules, rule)

    # Read in RHS symbols.
    #
    repeat {
        token := @next_token_no_nl | iohno(4)
        case token.sym of {
            default   :
                iohno(35, "token "|| image(token.str)||"; line "|| line_number)
            "CSETLIT" | "STRING": {
                *escape(token.str[2:-1]) = 1 | iohno(49, token.str)
                c := ord(escape(token.str[2:-1]))
                if \toktbl[c] then {
                    rule.prec := toktbl[c].prec
                    rule.assoc := toktbl[c].assoc
                }
                # literals not declared earlier will get caught here
                else insert(grammar.tbl, c, image(char(c)))
                put(rule.RHS, c)
            }
            "IDENT"   : {
                # If it's a terminal (i.e. a declared token), assign
                # this rule its precedence and associativity.  If it's
                # not in toktbl, then it's not a declared token....
                if \toktbl[token.str] then {
                    rule.prec := toktbl[token.str].prec
                    rule.assoc := toktbl[token.str].assoc
                    put(rule.RHS, toktbl[token.str].no)
                    if toktbl[token.str].no = -2 then {
                        # epsilon must be the only RHS symbol
                        *rule.RHS > 1 & iohno(44, "line " || line_number)
                        rule.POS := 2
                    }
                }
                # ...undeclared stuff.  Could be a nonterminal.  If
                # error and/or epsilon weren't declared as tokens,
                # they will get caught here, too.
                else {
                    case token.str of {
                        &null     : stop("What is going on here?")
                        default   : put(rule.RHS, token.str)
                        "error"   : {
                            put(rule.RHS, -1)
                            insert(grammar.tbl, -1, "error")
                        }
                        "epsilon" : {
                            # epsilon must be the only RHS symbol
                            if *put(rule.RHS, -2) > 1
                            then iohno(44, "line " || line_number)
                            else rule.POS := 2
                            insert(grammar.tbl, -2, "epsilon")
                        }
                    }
                }
            }
        }
        # Comma means:  Go back for another RHS symbol.
        token := @next_token_no_nl | fail
        token.sym == "COMMA" | break
    }

    # Skip semicolon token, if present.
    (token := token | @next_token_no_nl | fail).sym ~== "SEMICOL"

    # Read and set (optional) precedence.
    #
    if token.sym == "MOD" then {
        token := @next_token_no_nl | iohno(4)
        (token.sym == "IDENT" & token.str == "prec") |
            iohno(43, token.str || " line " || line_number)
        token := @next_token_no_nl | iohno(4)
        case token.sym of {
            "CSETLIT" | "STRING" : {
                *escape(token.str[2:-1]) = 1 | iohno(49, token.str)
                c := ord(escape(token.str[2:-1]))
                # %prec may only name a declared token; report an
                # undeclared literal the same way as an undeclared
                # identifier instead of dereferencing &null.
                \toktbl[c] |
                    iohno(43, token.str || " line " || line_number)
                rule.prec := toktbl[c].prec
                rule.assoc := toktbl[c].assoc
            }
            "IDENT"   : {
                \toktbl[token.str] |
                    iohno(43, token.str || " line " || line_number)
                rule.prec := toktbl[token.str].prec
                rule.assoc := toktbl[token.str].assoc
            }
            default   : &fail            # deliberate failure
        } | iohno(43, "line " || line_number)
        token := @next_token_no_nl | fail
    }

    # Skip semicolon token, if present.
    (token := token | @next_token_no_nl | fail).sym ~== "SEMICOL"

    # Read in (optional) action.
    #
    if token.sym == "LBRACE" then {
        write_action_as_procedure(next_token, out, rule,
                                  module, source_fname)
        token := @next_token_no_nl | fail
    }

    # Skip semicolon token, if present.
    (token := token | @next_token_no_nl | fail).sym ~== "SEMICOL"
    return token

end


#
# write_action_as_procedure:
#     coexpression x file x production record x string x string -> token
#     (next_token, out, rule, module, source_fname)             -> token
#
#     Called after the opening brace of an action has been read.
#     Writes the action to out as
#
#         procedure _<rule.no>_<module>(arg1,...,argN)
#             ...action code...
#         end
#
#     where N is the length of rule's RHS (zero for an epsilon
#     production).  Within the action, $n (0 < n <= N) is rewritten as
#     argn, and $0, $-1, etc. as references into value_stack<module>.
#     Returns the RBRACE token that closes the action; that brace is
#     not itself written.  Module defaults to "".
#
procedure write_action_as_procedure(next_token, out, rule,
                                    module, source_fname)

    local argstr, bracelevel, token, i, neg

    /module := ""
    argstr := ""
    #
    # Decide the number of arguments based on the length of the RHS of
    # rule.  Exception:  Epsilon productions are empty, and pop nothing
    # off the stack, so take zero args.
    #
    if rule.RHS[1] ~=== -2 then {
        every argstr ||:= "arg" || (1 to *rule.RHS) || ","
        argstr := trim(argstr, ',')
    }
    write(out, "procedure _", rule.no, "_", module, "(", argstr, ")")
    write(out, "\n$line ", line_number, " ", image(source_fname))

    # The opening brace has already been consumed, hence level 1.
    bracelevel := 1
    until bracelevel = 0 do {
        every token := copy_icon_stuff(next_token, out, "RHSARG") do {
            case token.sym of {
                default  : next
                "LBRACE" : bracelevel +:= 1
                "RBRACE" : bracelevel -:= 1
                "RHSARG" : {
                    # Copy through any null-sym tokens that follow the $.
                    until \ (token := @next_token).sym do
                        writes(out, token.str)
                    if neg := (token.sym == "MINUS") then
                        until \ (token := @next_token).sym do
                            writes(out, token.str)
                    else neg := &null
                    token.sym == "INTLIT" | iohno(37, "$"||token.str)
                    if /neg & token.str ~== "0" then {
                        token.str <= *rule.RHS | iohno(38, "$"||token.str)
                        writes(out, " arg", token.str, " ")
                    } else {
                        # Code for $0, $-1, etc.
                        #
                        # Warning!  If the name of the stack is changed
                        # in iiparse.lib, it has to be changed here, too.
                        #
                        i := abs(token.str)+1
                        writes(out, " value_stack", module, "[", i, "] ")
                    }
                }
            }
            if bracelevel = 0 then {
                write(out, "\nend\n")
                return token
            }
        }
    }

    iohno(39, "line "|| line_number)

end