home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
OS/2 Shareware BBS: 10 Tools
/
10-Tools.zip
/
OL.LZH
/
PROCS.LZH
/
FINDRE.ICN
< prev
next >
Wrap
Text File
|
1991-09-05
|
21KB
|
707 lines
########################################################################
#
# Name: findre.icn
#
# Title: "Find" Regular Expression
#
# Author: Richard L. Goerwitz
#
# Version: 1.14
#
# Date: June 1, 1991
#
########################################################################
#
# findre() is like the Icon builtin function find(),
# except that it takes, as its first argument, a regular expression
# pretty much like the ones the Unix egrep command uses (the few
# minor differences are listed below). Its syntax is the same as
# find's (i.e. findre(s1,s2,i,j)), with the exception that a no-
# argument invocation wipes out all static structures utilized by
# findre, and then forces a garbage collection.
#
# (For those not familiar with regular expressions and the Unix egrep
# command: findre() offers a simple and compact wildcard-based search
# system. If you do a lot of searches through text files, or write
# programs which do searches based on user input, then findre is a
# utility you might want to look over.)
#
# Important differences between find and findre: As noted above,
# findre() is just a find() function that takes a regular expression
# as its first argument. One major problem with this setup is that
# it leaves the user with no easy way to tab past a matched
# substring, as with
#
# s ? write(tab(find("hello")+5))
#
# In order to remedy this intrinsic deficiency, findre() sets the
# global variable __endpoint to the first position after any given
# match occurs. Use this variable with great care, preferably
# assigning its value to some other variable immediately after the
# match (for example, findre("hello [.?!]*",s) & tmp := __endpoint).
# Otherwise, you will certainly run into trouble. (See the example
# below for an illustration of how __endpoint is used).
#
# Important differences between egrep and findre: findre utilizes
# the same basic language as egrep. The only big difference is that
# findre uses intrinsic Icon data structures and escaping conven-
# tions rather than those of any particular Unix variant. Be care-
# ful! If you put findre("\(hello\)",s) into your source file,
# findre will treat it just like findre("(hello)",s). If, however,
# you enter '\(hello\)' at run-time (via, say, findre(!&input,s)),
# what Icon receives will depend on your operating system (most
# likely, a trace will show "\\(hello\\)").
#
# Bugs: Space has essentially been conserved at the expense of time
# in the automata produced by findre(). The algorithm, in other
# words, will produce the equivalent of a pushdown automaton under
# certain circumstances, rather than strive (at the expense of space)
# for full determinism. I tried to make up a nfa -> dfa converter
# that would only create that portion of the dfa it needed to accept
# or reject a string, but the resulting automaton was actually quite
# slow (if anyone can think of a way to do this in Icon, and keep it
# small and fast, please let us all know about it). Note that under
# version 8 of Icon, findre takes up negligible storage space, due to
# the much improved hashing algorithm. I have not tested it under
# version 7, but I would expect it to use up quite a bit more space
# in that environment.
#
# Important note: findre takes a shortest-possible-match approach
# to regular expressions. In other words, if you look for "a*",
# findre will not even bother looking for an "a." It will just match
# the empty string. Without this feature, findre would perform a bit
# more slowly. The problem with such an approach is that often the
# user will want to tab past the longest possible string of matched
# characters (say tab((findre("a*|b*"), __endpoint)). In circumstan-
# ces like this, please just use something like:
#
# s ? {
# tab(find("a")) & # or use Arb() from the IPL (patterns.icn)
# tab(many('a'))
# tab(many('b'))
# }
#
# or else use some combination of findre and the above.
#
########################################################################
#
# Regular expression syntax: Regular expression syntax is complex,
# and yet simple. It is simple in the sense that most of its power
# is concentrated in about a dozen easy-to-learn symbols. It is
# complex in the sense that, by combining these symbols with
# characters, you can represent very intricate patterns.
#
# I make no pretense here of offering a full explanation of regular
# expressions, their usage, and the deeper nuances of their syntax.
# As noted above, this should be gleaned from a Unix manual. For
# quick reference, however, I have included a brief summary of all
# the special symbols used, accompanied by an explanation of what
# they mean, and, in some cases, of how they are used (most of this
# is taken from the comments prepended to Jerry Nowlin's Icon-grep
# command, as posted a couple of years ago):
#
# ^ - matches if the following pattern is at the beginning
# of a line (i.e. ^# matches lines beginning with "#")
# $ - matches if the preceding pattern is at the end of a line
# . - matches any single character
# + - matches from 1 to any number of occurrences of the
# previous expression (i.e. a character, or set of paren-
# thesized/bracketed characters)
# * - matches from 0 to any number of occurrences of the previous
# expression
# \ - removes the special meaning of any special characters
# recognized by this program (i.e if you want to match lines
# beginning with a "[", write ^\[, and not ^[)
# | - matches either the pattern before it, or the one after
# it (i.e. abc|cde matches either abc or cde)
# [] - matches any member of the enclosed character set, or,
# if ^ is the first character, any nonmember of the
# enclosed character set (i.e. [^ab] matches any character
# _except_ a and b).
# () - used for grouping (e.g. ^(abc|cde)$ matches lines consist-
# ing of either "abc" or "cde," while ^abc|cde$ matches
# lines either beginning with "abc" or ending in "cde")
#
#########################################################################
#
# Example program:
#
# procedure main(a)
# while line := !&input do {
# token_list := tokenize_line(line,a[1])
# every write(!token_list)
# }
# end
#
# procedure tokenize_line(s,sep)
# tmp_lst := []
# s ? {
# while field := tab(findre(sep)|0) &
# mark := __endpoint
# do {
# put(tmp_lst,"" ~== field)
# if pos(0) then break
# else tab(mark)
# }
# }
# return tmp_lst
# end
#
# The above program would be compiled with findre (e.g. "icont
# test_prg.icn findre.icn") to produce a single executable which
# tokenizes each line of input based on a user-specified delimiter.
# Note how __endpoint is set soon after findre() succeeds. Note
# also how empty fields are excluded with "" ~==, etc. Finally, note
# that the temporary list, tmp_lst, is not needed. It is included
# here merely to illustrate one way in which tokens might be stored.
#
# Tokenizing is, of course, only one of many uses one might put
# findre to. It is very helpful in allowing the user to construct
# automata at run-time. If, say, you want to write a program that
# searches text files for patterns given by the user, findre would be
# a perfect utility to use. Findre in general permits more compact
# expression of patterns than one can obtain using intrinsic Icon
# scanning facilities. Its near complete compatibility with the Unix
# regexp library, moreover, makes for greater ease of porting,
# especially in cases where Icon is being used to prototype C code.
#
#########################################################################
global state_table, parends_present, slash_present
global biggest_nonmeta_str, __endpoint
record o_a_s(op,arg,state)
procedure findre(re, s, i, j)
local p, x, nonmeta_len
static FSTN_table, STRING_table
initial {
FSTN_table := table()
STRING_table := table()
}
if /re then {
FSTN_table := table()
STRING_table := table()
collect() # do it *now*
return
}
/s := &subject
if \i then {
if i < 1 then
i := *s + (i+1)
}
else i := \&pos | 1
if \j then {
if j < 1 then
j := *s + (j+1)
}
else j := *s+1
if /FSTN_table[re] then {
# If we haven't seen this re before, then...
if \STRING_table[re] then {
# ...if it's in the STRING_table, use plain find()
every p := find(STRING_table[re],s,i,j)
do { __endpoint := p + *STRING_table[re]; suspend p }
fail
}
else {
# However, if it's not in the string table, we have to
# tokenize it and check for metacharacters. If it has
# metas, we create an FSTN, and put that into FSTN_table;
# otherwise, we just put it into the STRING_table.
tokenized_re := tokenize(re)
if 0 > !tokenized_re then {
# if at least one element is < 0, re has metas
MakeFSTN(tokenized_re) | err_out(re,2)
# both biggest_nonmeta_str and state_table are global
/FSTN_table[re] := [.biggest_nonmeta_str, copy(state_table)]
}
else {
# re has no metas; put the input string into STRING_table
# for future reference, and execute find() at once
tmp := ""; every tmp ||:= char(!tokenized_re)
insert(STRING_table,re,tmp)
every p := find(STRING_table[re],s,i,j)
do { __endpoint := p + *STRING_table[re]; suspend p }
fail
}
}
}
if nonmeta_len := (1 < *FSTN_table[re][1]) then {
# If the biggest non-meta string in the original re
# was more than 1, then put in a check for it...
s[1:j] ? {
tab(x := i to j - nonmeta_len) &
(find(FSTN_table[re][1]) | fail) \ 1 &
(__endpoint := apply_FSTN(&null,FSTN_table[re][2])) &
(suspend x)
}
}
else {
#...otherwise it's not worth worrying about the biggest nonmeta str
s[1:j] ? {
tab(x := i to j) &
(__endpoint := apply_FSTN(&null,FSTN_table[re][2])) &
(suspend x)
}
}
end
procedure apply_FSTN(ini,tbl)
static s_tbl
local POS, tmp, fin
/ini := 1 & s_tbl := tbl & biggest_pos := 1
if ini = 0 then {
return &pos
}
POS := &pos
fin := 0
repeat {
if tmp := !s_tbl[ini] &
tab(tmp.op(tmp.arg))
then {
if tmp.state = fin
then return &pos
else ini := tmp.state
}
else (&pos := POS, fail)
}
end
procedure tokenize(s)
local chr, tmp
token_list := list()
s ? {
tab(many('*+?|'))
while chr := move(1) do {
if chr == "\\"
# it can't be a metacharacter; remove the \ and "put"
# the integer value of the next chr into token_list
then put(token_list,ord(move(1))) | err_out(s,2,chr)
else if any('*+()|?.$^',chr)
then {
# Yuck! Egrep compatibility stuff.
case chr of {
"*" : {
tab(many('*+?'))
put(token_list,-ord("*"))
}
"+" : {
tmp := tab(many('*?+')) | &null
if upto('*?',\tmp)
then put(token_list,-ord("*"))
else put(token_list,-ord("+"))
}
"?" : {
tmp := tab(many('*?+')) | &null
if upto('*+',\tmp)
then put(token_list,-ord("*"))
else put(token_list,-ord("?"))
}
"(" : {
tab(many('*+?'))
put(token_list,-ord("("))
}
default: {
put(token_list,-ord(chr))
}
}
}
else {
case chr of {
# More egrep compatibility stuff.
"[" : {
b_loc := find("[") | *&subject+1
every next_one := find("]",,,b_loc)
\next_one ~= &pos | err_out(s,2,chr)
put(token_list,-ord(chr))
}
"]" : {
if &pos = (\next_one+1)
then put(token_list,-ord(chr)) &
next_one := &null
else put(token_list,ord(chr))
}
default: put(token_list,ord(chr))
}
}
}
}
token_list := UnMetaBrackets(token_list)
fixed_length_token_list := list(*token_list)
every i := 1 to *token_list
do fixed_length_token_list[i] := token_list[i]
return fixed_length_token_list
end
procedure UnMetaBrackets(l)
# Since brackets delineate a cset, it doesn't make
# any sense to have metacharacters inside of them.
# UnMetaBrackets makes sure there are no metacharac-
# ters inside of the braces.
local tmplst, i, Lb, Rb
tmplst := list(); i := 0
Lb := -ord("[")
Rb := -ord("]")
while (i +:= 1) <= *l do {
if l[i] = Lb then {
put(tmplst,l[i])
until l[i +:= 1] = Rb
do put(tmplst,abs(l[i]))
put(tmplst,l[i])
}
else put(tmplst,l[i])
}
return tmplst
end
procedure MakeFSTN(l,INI,FIN)
# MakeFSTN recursively descends through the tree structure
# implied by the tokenized string, l, recording in (global)
# fstn_table a list of operations to be performed, and the
# initial and final states which apply to them.
# global biggest_nonmeta_str, slash_present, parends_present
static Lp, Rp, Sl, Lb, Rb, Caret_inside, Dot, Dollar, Caret_outside
local i, inter, inter2, tmp
initial {
Lp := -ord("("); Rp := -ord(")")
Sl := -ord("|")
Lb := -ord("["); Rb := -ord("]"); Caret_inside := ord("^")
Dot := -ord("."); Dollar := -ord("$"); Caret_outside := -ord("^")
}
/INI := 1 & state_table := table() &
NextState("new") & biggest_nonmeta_str := ""
/FIN := 0
# I haven't bothered to test for empty lists everywhere.
if *l = 0 then {
/state_table[INI] := []
put(state_table[INI],o_a_s(zSucceed,&null,FIN))
return
}
# HUNT DOWN THE SLASH (ALTERNATION OPERATOR)
every i := 1 to *l do {
if l[i] = Sl & tab_bal(l,Lp,Rp) = i then {
if i = 1 then err_out(l,2,char(abs(l[i]))) else {
/slash_present := "yes"
inter := NextState()
inter2:= NextState()
MakeFSTN(l[1:i],inter2,FIN)
MakeFSTN(l[i+1:0],inter,FIN)
/state_table[INI] := []
put(state_table[INI],o_a_s(apply_FSTN,inter2,0))
put(state_table[INI],o_a_s(apply_FSTN,inter,0))
return
}
}
}
# HUNT DOWN PARENTHESES
if l[1] = Lp then {
i := tab_bal(l,Lp,Rp) | err_out(l,2,"(")
inter := NextState()
if any('*+?',char(abs(0 > l[i+1]))) then {
case l[i+1] of {
-ord("*") : {
/state_table[INI] := []
put(state_table[INI],o_a_s(apply_FSTN,inter,0))
MakeFSTN(l[2:i],INI,INI)
MakeFSTN(l[i+2:0],inter,FIN)
return
}
-ord("+") : {
inter2 := NextState()
/state_table[inter2] := []
MakeFSTN(l[2:i],INI,inter2)
put(state_table[inter2],o_a_s(apply_FSTN,inter,0))
MakeFSTN(l[2:i],inter2,inter2)
MakeFSTN(l[i+2:0],inter,FIN)
return
}
-ord("?") : {
/state_table[INI] := []
put(state_table[INI],o_a_s(apply_FSTN,inter,0))
MakeFSTN(l[2:i],INI,inter)
MakeFSTN(l[i+2:0],inter,FIN)
return
}
}
}
else {
MakeFSTN(l[2:i],INI,inter)
MakeFSTN(l[i+1:0],inter,FIN)
return
}
}
else { # I.E. l[1] NOT = Lp (left parenthesis as -ord("("))
every i := 1 to *l do {
case l[i] of {
Lp : {
inter := NextState()
MakeFSTN(l[1:i],INI,inter)
/parends_present := "yes"
MakeFSTN(l[i:0],inter,FIN)
return
}
Rp : err_out(l,2,")")
}
}
}
# NOW, HUNT DOWN BRACKETS
if l[1] = Lb then {
i := tab_bal(l,Lb,Rb) | err_out(l,2,"[")
inter := NextState()
tmp := ""; every tmp ||:= char(l[2 to i-1])
if Caret_inside = l[2]
then tmp := ~cset(Expand(tmp[2:0]))
else tmp := cset(Expand(tmp))
if any('*+?',char(abs(0 > l[i+1]))) then {
case l[i+1] of {
-ord("*") : {
/state_table[INI] := []
put(state_table[INI],o_a_s(apply_FSTN,inter,0))
put(state_table[INI],o_a_s(any,tmp,INI))
MakeFSTN(l[i+2:0],inter,FIN)
return
}
-ord("+") : {
inter2 := NextState()
/state_table[INI] := []
put(state_table[INI],o_a_s(any,tmp,inter2))
/state_table[inter2] := []
put(state_table[inter2],o_a_s(apply_FSTN,inter,0))
put(state_table[inter2],o_a_s(any,tmp,inter2))
MakeFSTN(l[i+2:0],inter,FIN)
return
}
-ord("?") : {
/state_table[INI] := []
put(state_table[INI],o_a_s(apply_FSTN,inter,0))
put(state_table[INI],o_a_s(any,tmp,inter))
MakeFSTN(l[i+2:0],inter,FIN)
return
}
}
}
else {
/state_table[INI] := []
put(state_table[INI],o_a_s(any,tmp,inter))
MakeFSTN(l[i+1:0],inter,FIN)
return
}
}
else { # I.E. l[1] not = Lb
every i := 1 to *l do {
case l[i] of {
Lb : {
inter := NextState()
MakeFSTN(l[1:i],INI,inter)
MakeFSTN(l[i:0],inter,FIN)
return
}
Rb : err_out(l,2,"]")
}
}
}
# FIND INITIAL SEQUENCES OF POSITIVE INTEGERS, CONCATENATE THEM
if i := match_positive_ints(l) then {
inter := NextState()
tmp := Ints2String(l[1:i])
# if a slash has been encountered already, forget optimizing
# in this way; if parends are present, too, then forget it,
# unless we are at the beginning or end of the input string
if INI = 1 | FIN = 2 | /parends_present &
/slash_present & *tmp > *biggest_nonmeta_str
then biggest_nonmeta_str := tmp
/state_table[INI] := []
put(state_table[INI],o_a_s(match,tmp,inter))
MakeFSTN(l[i:0],inter,FIN)
return
}
# OKAY, CLEAN UP ALL THE JUNK THAT'S LEFT
i := 0
while (i +:= 1) <= *l do {
case l[i] of {
Dot : { Op := any; Arg := &cset }
Dollar : { Op := pos; Arg := 0 }
Caret_outside: { Op := pos; Arg := 1 }
default : { Op := match; Arg := char(0 < l[i]) }
} | err_out(l,2,char(abs(l[i])))
inter := NextState()
if any('*+?',char(abs(0 > l[i+1]))) then {
case l[i+1] of {
-ord("*") : {
/state_table[INI] := []
put(state_table[INI],o_a_s(apply_FSTN,inter,0))
put(state_table[INI],o_a_s(Op,Arg,INI))
MakeFSTN(l[i+2:0],inter,FIN)
return
}
-ord("+") : {
inter2 := NextState()
/state_table[INI] := []
put(state_table[INI],o_a_s(Op,Arg,inter2))
/state_table[inter2] := []
put(state_table[inter2],o_a_s(apply_FSTN,inter,0))
put(state_table[inter2],o_a_s(Op,Arg,inter2))
MakeFSTN(l[i+2:0],inter,FIN)
return
}
-ord("?") : {
/state_table[INI] := []
put(state_table[INI],o_a_s(apply_FSTN,inter,0))
put(state_table[INI],o_a_s(Op,Arg,inter))
MakeFSTN(l[i+2:0],inter,FIN)
return
}
}
}
else {
/state_table[INI] := []
put(state_table[INI],o_a_s(Op,Arg,inter))
MakeFSTN(l[i+1:0],inter,FIN)
return
}
}
# WE SHOULD NOW BE DONE INSERTING EVERYTHING INTO state_table
# IF WE GET TO HERE, WE'VE PARSED INCORRECTLY!
err_out(l,4)
end
procedure NextState(new)
static nextstate
if \new then nextstate := 1
else nextstate +:= 1
return nextstate
end
procedure err_out(x,i,elem)
writes(&errout,"Error number ",i," parsing ",image(x)," at ")
if \elem
then write(&errout,image(elem),".")
else write(&errout,"(?).")
exit(i)
end
procedure zSucceed()
return .&pos
end
procedure Expand(s)
s2 := ""
s ? {
s2 ||:= ="^"
s2 ||:= ="-"
while s2 ||:= tab(find("-")-1) do {
if (c1 := move(1), ="-",
c2 := move(1),
c1 << c2)
then every s2 ||:= char(ord(c1) to ord(c2))
else s2 ||:= 1(move(2), not(pos(0))) | err_out(s,2,"-")
}
s2 ||:= tab(0)
}
return s2
end
procedure tab_bal(l,i1,i2)
i := 0
i1_count := 0; i2_count := 0
while (i +:= 1) <= *l do {
case l[i] of {
i1 : i1_count +:= 1
i2 : i2_count +:= 1
}
if i1_count = i2_count
then suspend i
}
end
procedure match_positive_ints(l)
# Matches the longest sequence of positive integers in l,
# beginning at l[1], which neither contains, nor is fol-
# lowed by a negative integer. Returns the first position
# after the match. Hence, given [55, 55, 55, -42, 55],
# match_positive_ints will return 3. [55, -42] will cause
# it to fail rather than return 1 (NOTE WELL!).
every i := 1 to *l do {
if l[i] < 0
then return (3 < i) - 1 | fail
}
return *l + 1
end
procedure Ints2String(l)
tmp := ""
every tmp ||:= char(!l)
return tmp
end
procedure StripChar(s,s2)
if find(s2,s) then {
tmp := ""
s ? {
while tmp ||:= tab(find("s2"))
do tab(many(cset(s2)))
tmp ||:= tab(0)
}
}
return \tmp | s
end