home *** CD-ROM | disk | FTP | other *** search
- ; "prec.scm", dynamically extensible parser/tokenizer -*-scheme-*-
- ; Copyright 1989, 1990, 1991, 1992, 1993, 1995, 1997 Aubrey Jaffer.
- ;
- ;Permission to copy this software, to redistribute it, and to use it
- ;for any purpose is granted, subject to the following restrictions and
- ;understandings.
- ;
- ;1. Any copy made of this software must include this copyright notice
- ;in full.
- ;
- ;2. I have made no warrantee or representation that the operation of
- ;this software will be error-free, and I am under no obligation to
- ;provide any services, by way of maintenance, update, or otherwise.
- ;
- ;3. In conjunction with products arising from the use of this
- ;material, there shall be no use of my name in any advertising,
- ;promotional, or sales literature without prior written consent in
- ;each case.
-
- ; This file implements:
- ; * a Pratt style parser.
- ; * a tokenizer which congeals tokens according to assigned classes of
- ; constituent characters.
- ;
- ; This module is a significant improvement because grammar can be
- ; changed dynamically from rulesets which don't need compilation.
- ; Theoretically, all possibilities of bad input are handled and return
- ; as much structure as was parsed when the error occured; The symbol
- ; `?' is substituted for missing input.
-
- ; References for the parser are:
-
- ; Pratt, V. R.
- ; Top Down Operator Precendence.
- ; SIGACT/SIGPLAN
- ; Symposium on Principles of Programming Languages,
- ; Boston, 1973, 41-51
-
- ; WORKING PAPER 121
- ; CGOL - an Alternative External Representation For LISP users
- ; Vaughan R. Pratt
- ; MIT Artificial Intelligence Lab.
- ; March 1976
-
- ; Mathlab Group,
- ; MACSYMA Reference Manual, Version Ten,
- ; Laboratory for Computer Science, MIT, 1983
-
- (require 'fluid-let)
- (require 'string-search)
- (require 'string-port)
- (require 'delay)
-
- (define *syn-defs* #f)
- (define *syn-rules* #f) ;Dynamically bound
- (define *prec:port* #f) ;Dynamically bound
-
- ;; keeps track of input column so we can generate useful error displays.
- (define tok:column 0)
- (define (tok:peek-char) (peek-char *prec:port*))
- (define (tok:read-char)
- (let ((c (read-char *prec:port*)))
- (if (or (eqv? c #\newline) (eof-object? c))
- (set! tok:column 0)
- (set! tok:column (+ 1 tok:column)))
- c))
- (define (tok:bump-column pos . ports)
- ((lambda (thunk)
- (cond ((null? ports) (thunk))
- (else (fluid-let ((*prec:port* (car ports))) (thunk)))))
- (lambda ()
- (cond ((eqv? #\newline (tok:peek-char))
- (tok:read-char))) ;to do newline
- (set! tok:column (+ tok:column pos)))))
- (define (prec:warn . msgs)
- (do ((j (+ -1 tok:column) (+ -8 j)))
- ((> 8 j)
- (do ((i j (+ -1 i)))
- ((>= 0 i))
- (display #\ )))
- (display slib:tab))
- (display "^ ")
- (newline)
- (for-each (lambda (x) (write x) (display #\ )) msgs)
- (newline))
-
- ;; Structure of lexical records.
- (define tok:make-rec cons)
- (define tok:cc car)
- (define tok:sfp cdr)
-
- (define (tok:lookup alist char)
- (if (eof-object? char)
- #f
- (let ((pair (assv char alist)))
- (and pair (cdr pair)))))
-
- (define (tok:char-group group chars chars-proc)
- (map (lambda (token)
- ;;; (let ((oldlexrec (tok:lookup *syn-defs* token)))
- ;;; (cond ((or (not oldlexrec) (eqv? (tok:cc oldlexrec) group)))
- ;;; (else (math:warn 'cc-of token 'redefined-to- group))))
- (cons token (tok:make-rec group chars-proc)))
- (cond ((string? chars) (string->list chars))
- ((char? chars) (list chars))
- (else chars))))
-
- (define (tokenize)
- (let* ((char (tok:read-char))
- (rec (tok:lookup *syn-rules* char))
- (proc (and rec (tok:cc rec)))
- (clist (list char)))
- (cond
- ((not proc) char)
- ((procedure? proc)
- (do ((cl clist (begin (set-cdr! cl (list (tok:read-char))) (cdr cl))))
- ((proc (tok:peek-char))
- ((or (tok:sfp rec) list->string) clist))))
- ((eqv? 0 proc) (tokenize))
- (else
- (do ((cl clist (begin (set-cdr! cl (list (tok:read-char))) (cdr cl))))
- ((not (let* ((prec (tok:lookup *syn-rules* (tok:peek-char)))
- (cclass (and prec (tok:cc prec))))
- (or (eqv? cclass proc)
- (eqv? cclass (+ -1 proc)))))
- ((tok:sfp rec) clist)))))))
-
- ;;; PREC:NUD is the null denotation (function and arguments to call when no
- ;;; unclaimed tokens).
- ;;; PREC:LED is the left denotation (function and arguments to call when
- ;;; unclaimed token is on left).
- ;;; PREC:LBP is the left binding power of this LED. It is the first
- ;;; argument position of PREC:LED
-
- (define (prec:nudf alist self)
- (let ((pair (assoc (cons 'nud self) alist)))
- (and pair (cdr pair))))
- (define (prec:ledf alist self)
- (let ((pair (assoc (cons 'led self) alist)))
- (and pair (cdr pair))))
- (define (prec:lbp alist self)
- (let ((pair (assoc (cons 'led self) alist)))
- (and pair (cadr pair))))
-
- (define (prec:call-or-list proc . args)
- (prec:apply-or-cons proc args))
- (define (prec:apply-or-cons proc args)
- (if (procedure? proc) (apply proc args) (cons (or proc '?) args)))
-
- ;;; PREC:SYMBOLFY and PREC:DE-SYMBOLFY are not exact inverses.
- (define (prec:symbolfy obj)
- (cond ((symbol? obj) obj)
- ((string? obj) (string->symbol obj))
- ((char? obj) (string->symbol (string obj)))
- (else obj)))
-
- (define (prec:de-symbolfy obj)
- (cond ((symbol? obj) (symbol->string obj))
- (else obj)))
-
- ;;;Calls to set up tables.
-
- (define (prec:define-grammar . synlsts)
- (set! *syn-defs* (append (apply append synlsts) *syn-defs*)))
-
- (define (prec:make-led toks . args)
- (map (lambda (tok)
- (cons (cons 'led (prec:de-symbolfy tok))
- args))
- (if (pair? toks) toks (list toks))))
- (define (prec:make-nud toks . args)
- (map (lambda (tok)
- (cons (cons 'nud (prec:de-symbolfy tok))
- args))
- (if (pair? toks) toks (list toks))))
-
- ;;; Produce dynamically augmented grammars.
- (define (prec:process-binds binds rules)
- (if (and #f (not (null? binds)) (eq? #t (car binds)))
- (cdr binds)
- (append binds rules)))
-
- ;;(define (prec:replace-rules) some-sort-of-magic-cookie)
-
- ;;; Here are the procedures to define high-level grammar, along with
- ;;; utility functions called during parsing. The utility functions
- ;;; (prec:parse-*) could be incorportated into the defining commands,
- ;;; but tracing these functions is useful for debugging.
-
- (define (prec:delim tk)
- (prec:make-led tk 0 #f))
-
- (define (prec:nofix tk sop . binds)
- (prec:make-nud tk prec:parse-nofix sop (apply append binds)))
- (define (prec:parse-nofix self sop binds)
- (set! *syn-rules* (prec:process-binds binds *syn-rules*))
- (prec:call-or-list (or sop (prec:symbolfy self))))
-
- (define (prec:prefix tk sop bp . binds)
- (prec:make-nud tk prec:parse-prefix sop bp (apply append binds)))
- (define (prec:parse-prefix self sop bp binds)
- (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
- (prec:call-or-list (or sop (prec:symbolfy self)) (prec:parse1 bp))))
-
- (define (prec:infix tk sop lbp bp . binds)
- (prec:make-led tk lbp prec:parse-infix sop bp (apply append binds)))
- (define (prec:parse-infix left self lbp sop bp binds)
- (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
- (prec:call-or-list (or sop (prec:symbolfy self)) left (prec:parse1 bp))))
-
- (define (prec:nary tk sop bp)
- (prec:make-led tk bp prec:parse-nary sop bp))
- (define (prec:parse-nary left self lbp sop bp)
- (prec:apply-or-cons (or sop (prec:symbolfy self))
- (cons left (prec:parse-list self bp))))
-
- (define (prec:postfix tk sop lbp . binds)
- (prec:make-led tk lbp prec:parse-postfix sop (apply append binds)))
- (define (prec:parse-postfix left self lbp sop binds)
- (set! *syn-rules* (prec:process-binds binds *syn-rules*))
- (prec:call-or-list (or sop (prec:symbolfy self)) left))
-
- (define (prec:prestfix tk sop bp . binds)
- (prec:make-nud tk prec:parse-rest sop bp (apply append binds)))
- (define (prec:parse-rest self sop bp binds)
- (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
- (prec:apply-or-cons (or sop (prec:symbolfy self)) (prec:parse-list #f bp))))
-
- (define (prec:commentfix tk stp match . binds)
- (append
- (prec:make-nud tk prec:parse-nudcomment stp match (apply append binds))
- (prec:make-led tk 220 prec:parse-ledcomment stp match (apply append binds))))
- (define (prec:parse-nudcomment self stp match binds)
- (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
- (tok:read-through-comment stp match)
- (prec:advance)
- (cond ((prec:delim? (force prec:token)) #f)
- (else (prec:parse1 prec:bp)))))
- (define (prec:parse-ledcomment left lbp self stp match binds)
- (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
- (tok:read-through-comment stp match)
- (prec:advance)
- left))
- (define (tok:read-through-comment stp match)
- (set! match (if (char? match)
- (string match)
- (prec:de-symbolfy match)))
- (cond ((procedure? stp)
- (let* ((len #f)
- (str (call-with-output-string
- (lambda (sp)
- (set! len (find-string-from-port?
- match *prec:port*
- (lambda (c) (display c sp) #f)))))))
- (stp (and len (substring str 0 (- len (string-length match)))))))
- (else (find-string-from-port? match *prec:port*))))
-
- (define (prec:matchfix tk sop sep match . binds)
- (define sep-lbp 0)
- (prec:make-nud tk prec:parse-matchfix
- sop sep-lbp sep match
- (apply append (prec:delim match) binds)))
- (define (prec:parse-matchfix self sop sep-lbp sep match binds)
- (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
- (cond (sop (prec:apply-or-cons
- sop (prec:parse-delimited sep sep-lbp match)))
- ((equal? (force prec:token) match)
- (prec:warn 'expression-missing)
- (prec:advance)
- '?)
- (else (let ((ans (prec:parse1 0))) ;just parenthesized expression
- (cond ((equal? (force prec:token) match)
- (prec:advance))
- ((prec:delim? (force prec:token))
- (prec:warn 'mismatched-delimiter (force prec:token)
- 'not match)
- (prec:advance))
- (else (prec:warn 'delimiter-expected--ignoring-rest
- (force prec:token) 'expected match
- 'or-delimiter)
- (do () ((prec:delim? (force prec:token)))
- (prec:parse1 0))))
- ans)))))
-
- (define (prec:inmatchfix tk sop sep match lbp . binds)
- (define sep-lbp 0)
- (prec:make-led tk lbp prec:parse-inmatchfix
- sop sep-lbp sep match
- (apply append (prec:delim match) binds)))
- (define (prec:parse-inmatchfix left self lbp sop sep-lbp sep match binds)
- (fluid-let ((*syn-rules* (prec:process-binds binds *syn-rules*)))
- (prec:apply-or-cons
- sop (cons left (prec:parse-delimited sep sep-lbp match)))))
-
- ;;;; Here is the code which actually parses.
-
- (define prec:bp #f) ;dynamically bound
- (define prec:token #f)
- (define (prec:advance)
- (set! prec:token (delay (tokenize))))
- (define (prec:advance-return-last)
- (let ((last (and prec:token (force prec:token))))
- (prec:advance)
- last))
-
- (define (prec:nudcall self)
- (let ((pob (prec:nudf *syn-rules* self)))
- (cond
- (pob (let ((proc (car pob)))
- (cond ((procedure? proc) (apply proc self (cdr pob)))
- (proc (cons proc (cdr pob)))
- (else '?))))
- ((char? self) (prec:warn 'extra-separator)
- (prec:advance)
- (prec:nudcall (force prec:token)))
- ((string? self) (string->symbol self))
- (else self))))
-
- (define (prec:ledcall left self)
- (let* ((pob (prec:ledf *syn-rules* self)))
- (apply (cadr pob) left self (cdr pob))))
-
- ;;; PREC:PARSE1 is the heart.
- (define (prec:parse1 bp)
- (fluid-let ((prec:bp bp))
- (do ((left (prec:nudcall (prec:advance-return-last))
- (prec:ledcall left (prec:advance-return-last))))
- ((or (>= bp 200) ;to avoid unneccesary lookahead
- (>= bp (or (prec:lbp *syn-rules* (force prec:token)) 0))
- (not left))
- left))))
-
- (define (prec:delim? token)
- (or (eof-object? token) (<= (or (prec:lbp *syn-rules* token) 220) 0)))
-
- (define (prec:parse-list sep bp)
- (cond ((prec:delim? (force prec:token))
- (prec:warn 'expression-missing)
- '(?))
- (else
- (let ((f (prec:parse1 bp)))
- (cons f (cond ((equal? (force prec:token) sep)
- (prec:advance)
- (cond ((equal? (force prec:token) sep)
- (prec:warn 'expression-missing)
- (prec:advance)
- (cons '? (prec:parse-list sep bp)))
- ((prec:delim? (force prec:token))
- (prec:warn 'expression-missing)
- '(?))
- (else (prec:parse-list sep bp))))
- ((prec:delim? (force prec:token)) '())
- ((not sep) (prec:parse-list sep bp))
- ((prec:delim? sep) (prec:warn 'separator-missing)
- (prec:parse-list sep bp))
- (else '())))))))
-
- (define (prec:parse-delimited sep bp delim)
- (cond ((equal? (force prec:token) sep)
- (prec:warn 'expression-missing)
- (prec:advance)
- (cons '? (prec:parse-delimited sep delim)))
- ((prec:delim? (force prec:token))
- (if (not (equal? (force prec:token) delim))
- (prec:warn 'mismatched-delimiter (force prec:token)
- 'expected delim))
- (if (not sep) (prec:warn 'expression-missing))
- (prec:advance)
- (if sep '() '(?)))
- (else (let ((ans (prec:parse-list sep bp)))
- (cond ((equal? (force prec:token) delim))
- ((prec:delim? (force prec:token))
- (prec:warn 'mismatched-delimiter (force prec:token)
- 'expecting delim))
- (else (prec:warn 'delimiter-expected--ignoring-rest
- (force prec:token) '...)
- (do () ((prec:delim? (force prec:token)))
- (prec:parse1 bp))))
- (prec:advance)
- ans))))
-
- (define (prec:parse grammar delim . port)
- (set! delim (prec:de-symbolfy delim))
- (fluid-let ((*syn-rules* (append (prec:delim delim) grammar))
- (*prec:port* (if (null? port) (current-input-port) (car port))))
- (prec:advance) ; setup prec:token with first token
- (cond ((eof-object? (force prec:token)) (force prec:token))
- ((equal? (force prec:token) delim) #f)
- (else
- (let ((ans (prec:parse1 0)))
- (cond ((eof-object? (force prec:token)))
- ((equal? (force prec:token) delim))
- (else (prec:warn 'delimiter-expected--ignoring-rest
- (force prec:token) 'not delim)
- (do () ((or (equal? (force prec:token) delim)
- (eof-object? (force prec:token))))
- (prec:advance))))
- ans)))))
-
- (define tok:decimal-digits "0123456789")
- (define tok:upper-case "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
- (define tok:lower-case "abcdefghijklmnopqrstuvwxyz")
- (define tok:whitespaces
- (do ((i (+ -1 (min 256 char-code-limit)) (+ -1 i))
- (ws "" (if (char-whitespace? (integer->char i))
- (string-append ws (string (integer->char i)))
- ws)))
- ((negative? i) ws)))
-
- ;;;;The parse tables.
- ;;; Definitions accumulate in top-level variable *SYN-DEFS*.
- (set! *syn-defs* '()) ;Make sure *SYN-DEFS* is empty.
-
- ;;; Ignore Whitespace characters.
- (prec:define-grammar (tok:char-group 0 tok:whitespaces #f))
-
- ;;; On MS-DOS systems, <ctrl>-Z (26) needs to be ignored in order to
- ;;; avoid problems at end of files.
- (case (software-type)
- ((MSDOS)
- (if (not (char-whitespace? (integer->char 26)))
- (prec:define-grammar (tok:char-group 0 (integer->char 26) #f))
- )))
-
- ;;; Save these convenient definitions.
- (define *syn-ignore-whitespace* *syn-defs*)
- (set! *syn-defs* '())
-
- (define (prec:trace)
- (require 'trace)
- (trace prec:parse prec:parse1
- prec:parse-delimited prec:parse-list
- prec:call-or-list prec:apply-or-cons
- ;;tokenize prec:advance-return-last prec:advance
- prec:nudcall prec:ledcall
- prec:parse-nudcomment prec:parse-ledcomment
- prec:parse-delimited prec:parse-list
- prec:parse-nary prec:parse-rest
- prec:parse-matchfix prec:parse-inmatchfix
- prec:parse-prefix prec:parse-infix prec:parse-postfix
- ;;prec:delim?
- ;;prec:ledf prec:nudf prec:lbp
- )
- (set! *qp-width* 333))
-
- ;;(begin (trace-all "prec.scm") (set! *qp-width* 333))
- ;;(pretty-print (grammar-read-tab (get-grammar 'standard)))
- ;;(prec:trace)
-