The Best of Select: Windows 95 Special 1

home *** CD-ROM | disk | FTP | other *** search

/ The Best of Select: Windows 95 Special 1 / WINDOWS95_1.ISO / utils / w32-rex / regina / srccode / src / parsing.c < prev next >

Wrap

Text File | 1995-06-30 | 23KB | 630 lines

#ifndef lint static char *RCSid = "$Id: parsing.c,v 1.13 1993/05/10 06:11:00 anders Exp anders $"; #endif /* * The Regina Rexx Interpreter * Copyright (C) 1992-1994 Anders Christensen <anders@pvv.unit.no> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* This code modified for Win32 port by Ataman Software, Inc. June 29, 1995. */ /* * This implementation of the REXX parse template is quite near to * what Cowlishaw specified in his book, but there is one very * important difference. Cowlishaw sees a template as patterns and * variables, with patterns in each end (and some of the patterns * and/or the variables might be empty). * * The concept here is any sequence of variables and patterns, and * the system is parsed into two levels and interpreted as such. * First there is the level of patterns, which is anchored to * particular locations in the string to be parsed. * * When the anchor-points are determined, the variables are filled * in with the parts of the string that comes between each anchor. * Here is an example: * * parse value 'That is a parse string!' with a b 'is' c d 'i' e * * In this example, there are four anchors in the string, the two * patterns 'is' and 'i', and the two implicit anchors start-of-string * and end-of-string. They anchor particular locations in the string * to the lists of variables: (a b), (c d) and (e). At the first level * the anchors are found and the text between them are deternmined * (SOS=Start-Of-String, EOS=End-Of-String): * * That is a parse string! * anchors (SOS)-----is------------i---(EOS) * strings in-between <a b> <<<<c d>>>>> <e> * * Now, we have a set of substrings, and to each substring, there is * possibly empty set of variable which are to receive its value from * the contents of that substring. Doing that, we get: * * (a b) = 'That ' ==> a='That' b=' ' * (c d) = ' a parse str' ==> c='a' c=' parse str' * (e) = 'ng!' ==> e='ng!' * * To some extent, one might say that there are one anchor between * each variable, since these match a sequence of blank characters. * However, these have less priority than the explicit patterns. * * This file makes available three functions: * * doparse() parses a single string into a single template * parseargtree() parses a set of strings into a set of templates * bmstrstr() implementation of Boyer-Moore string search * * Only one external variable are referenced in the code, 'currlevel' * in order to determine the currently tracing mode, and to cache the * value for the duration of the tracing of one template. */ #include "rexx.h" #include <ctype.h> #include <string.h> #include <stdio.h> /* Stupid SunOS acc uses stderr in assert(), yuk! */ #include <assert.h> #include "strings.h" /* * Using the 'tracestat' field of 'currlevel' in order to determine * the current tracing mode. */ extern proclevel currlevel ; /* * This file-scope variable is use to cache the value of the setting * of the current trace mode. It is initialized at the entrypoint of * each parse statement. However, if functionality is expanded, so that * expression can occur inside a template, then this variable must be * checked in order to ensure that it is up-to-date after the expression * has been evaluated. */ static int traceparse = (-1) ; /* * We have to seek for the nullstring rather often, so we make it a * file-scope variable, in order to optimize a bit. This way, we have * to neither allocate space for it, or deallocate it. */ static streng nullstring={ 0, 0, "" } ; /* * This is an implementation of the strstr() function, for the 'streng' * datatype. The algorithm is a straight forward implementation of the * Boyer-Moore string search algorithm. * * The input parameters are first the string to search in, then the start * position of the search in that string, and then the string to search * for. If the whole of the input string is to be searched, then the * start position is set to zero. If start position is set to one, then * a match will not be found if it includes the first character of the * the string 'heystack' as the first character of the match. * * The returned value is an integer, which will be the index of the * first character of 'heystack->value' that match. If no match is * found, -1 is returned, if a match is found in the start of 'heystack' * then 0 is returned. This means that both the 'start' parameter and * the return value are zero-based. */ int bmstrstr( streng *heystack, int Offset, streng *needle ) { unsigned char *TmpPtr ; int NeedLen, HeyLen ; unsigned char *NeedPtr, *HeyPtr, *HeyBase ; unsigned int NextChr[256] ; int Tmp ; unsigned char *eptr ; NeedPtr = (unsigned char*)needle->value ; NeedLen = needle->len ; HeyBase = heystack->value ; HeyLen = heystack->len - Offset ; HeyPtr = HeyBase + Offset ; /* * First, sometimes we want to search for one character only. Although * Boyer-Moore works for that case, it is hardly efficient. So, if the * search pattern has length one, we use the ANSI C memchr() to find * the string. That function is likely to to be more efficient, maybe * even written in hand-optimized assembly. */ if (NeedLen==1) { TmpPtr = memchr( HeyPtr, *NeedPtr, HeyLen ) ; if (TmpPtr) return (TmpPtr-HeyBase) ; else return -1 ; } /* * First we test that the pattern isn't longer than the string to * search in. If the pattern is long, builing the 'next' table will * take a lot of time, so we really want to avoid any action in that * case. */ else if (HeyLen < NeedLen) return -1 ; /* * OK, here is the 'real' search. Basically, it consists of two * phases: first a table (next) is built up, and then the string * is searched using the table to decide how far to hop if a match * was not found. * * Let's recount some of the theory behind Boyer-Moore search. If you * got a pattern, P, and string to search in, S, then the first match * may be S[1..len(P)]. To verify whether that is actually a match, * we have to iterate through all the characters, i.e. over len(P). * * If that is a match, then return the correct position, else we must * continue. Here comes the 'tricky' part. Suppose P is 'abcccd' and * that S is 'gddeebfgghhhiiijjj'. Now we check character S[len(P)] * and depending on its value we can determine something about the * possibiltiy of having a match starting somewhere in the area from * S[1] to S[len(P)]. Clearly, if S[len(P)] is a character not in P, * then no match can occur in this area. Else, determine the first, * of the next possible match, and move forward to check that. */ else { /* * First, we have to set up the table to use during the search. * Initially, we fill the whole table with the 'default' value, and * then we patch the values which should have other values in the * loop following the call to memset(). */ for (Tmp=0; Tmp<256; NextChr[Tmp++]=NeedLen) ; TmpPtr = NeedPtr ; for (Tmp=NeedLen; Tmp; ) NextChr[*(TmpPtr++)] = --Tmp ; eptr = HeyPtr + HeyLen - NeedLen-- ; while (1) { if (HeyPtr>eptr) return -1 ; HeyPtr += Tmp = NextChr[*(HeyPtr+NeedLen)] ; if (!Tmp) { Tmp=NeedLen ; TmpPtr = HeyPtr++ ; while (1) { if (--Tmp<0) return (TmpPtr - HeyBase ) ; if (TmpPtr[Tmp] != NeedPtr[Tmp]) break ; } } } } } static streng *handle_var( nodeptr this ) { if (this->type == X_HEAD_SYMBOL) return fix_compound( this, NULL ) ; else return shortcut( this ) ; } /* * This parses a part of the source string, determined by (start) * and (stop) into the variables of the (this) tree. Only variables * are handled, not patterns. It will be called by doparse() to fit a * string into a set of variables and placeholder. * * Start points to the first character to be parsed, while stop points * to the last character to be parsed. If start==stop, only one char * is to be parsed into variables and placeholders. * * There is no returnvalue from this function, it is only called to * achieve the special effects of setting variables and tracing * variables and placeholders. Actually, this routine used to be * tailrecursive, but I changed it into a goto. * * The variables and placeholders to parse ares stored in a chained * of pointers, chased through p[0]. 'start' is a ptr to the first * characters to be parsed by this function. 'stop' is a ptr to the * last character to be parsed by this function. Note that is points * to the last char to be parsed, not to the character after the last * character to be parsed. */ static void doparse3( streng *string, nodeptr this, char *start, char *stop ) { char *rstart ; streng *tptr ; recurse: /* * Since we are going to put the next word of the input string * into a variable, we must first find that value. The if tests * whether we are the last word before an 'anchor' to be parsed. * if so, use the rest of the string. If not, scan forwards to * identify a word. */ if (this->p[0]) { /* * We shall only fetch out one word. First skip leading spaces, * then find the end of the next word. */ for (;(start<=stop)&&(isspace(*start));start++) ; for (rstart=start;(rstart<=stop)&&(!isspace(*rstart));rstart++) ; } else /* * We are last word, use rest of string as value. */ rstart = stop + 1 ; /* * We have found the word to be parsed into something. Now we have * to decide what to parse it into. There are two possibilities. * It might be a variable, or just a placeholder (dot). The two are * handled in each part of the if-statement below. The setting of * 'tptr' could be lifted out of the if-statement to save space, * but at the cost of more CPU. */ tptr = Str_ncre( start, rstart-start<0 ? 0 : rstart-start ) ; if (this->type==X_TPL_SYMBOL) { if (traceparse) tracevalue(tptr,'>') ; if ( this->p[1]->type == X_HEAD_SYMBOL) fix_compound( this->p[1], tptr ) ; else setshortcut( this->p[1], tptr ) ; } else if (traceparse && (rstart-start)>=0) { /* * It's a placeholder, actually, we skip this if we arn't * tracing. No harm is done if tracevalue() is called unnecessary, * but this way we save three function calls whenever we're not * doing TRACE INT. * * The three operations below do: 1) get the value to be traced, * 2) then output the trace information, 3) and then free the * temporary storage. */ tracevalue(tptr,'.') ; Free_string( tptr ) ; } /* * Now, this should actually be a tail recursion, but since be don't * trust compilers, we are optimizeing it ourselves. */ if ((this = this->p[0])) { start = rstart + 1 ; goto recurse ; } } /* * This routine parses a string (source) into the template that is * specified by the structure in the (this) tree. (start) is the * start postion into the (source) string. It handles find the next * template, and handles the aread between two templates. * * It calls it self recursively to handle a sequence of templates. * Well, actually, it used to call it self recuseively, but the * tail recursion has been removed to improve efficiency. * * A sequence of patterns must be chained together using p[1], while * each pattern can contain the vars/placeholders as a chain linked * in at p[0]. * * 'source' is the string to be parsed, 'this' it a ptr to the top * of the parsetree that describes how 'source' after start'th * position is to be parsed. 'start' is a ptr to the first char in * 'source' to be parsed by this part of the template. 'point' is a * ptr to the current positional point in 'source', i.e. if we find * a positional pattern, we use point, else we use start. */ void doparse( streng *source, nodeptr this, int start, int point ) { int length, end, nextstart, solid ; streng *pattern, *xtmp ; char tch ; nextstart = 0 ; /* too keep gcc from complaining about uninitialized */ tch = currlevel->tracestat ; traceparse = ((tch=='I') || (tch=='R')) ; recurse: /* * Cache the length of source, to avoid chasing ponters later. * Then make pattern default to the nullstring. The nullstring is * so muched used, that we don't want to allocate and deallocate * that all the time. */ length = source->len ; pattern = &nullstring ; /* * There are two main cases, either this is the last pattern, in * which case we use the rest of the string. Or either there is * another pattern further out, in which case we have to find it. * */ if (this->p[1]) { /* * We are not the last pattern, so first find the next pattern. * First cache the type, so we don't chase pointers. There are * two main choises: either seek for a string of some sort, or * use an offset of some sort. */ solid = this->p[1]->type ; if ((solid==X_TPL_MVE)||(solid==X_TPL_VAR)) { /* * The pattern to search for is either a literal string, or it * is the value hold in a variable, set pattern to whatever * it might be. Pattern previous points to a statically * allocated variable, so don't bother to deallocate. */ if (solid==X_TPL_MVE) pattern = this->p[1]->name ; else pattern = handle_var( this->p[1]->p[0] ) ; /* * Then we must find where in the source string pattern occurs. * If it don't occur there, we use the rest of the string, else * we use the string up to, but not including, the first char * that matched pattern. The 'bmstrstr' returns -1 for not * found, so correct that to rest-of-string. Also note that if * the pattern is the nullstring, it should match the end of * the string. */ if (Str_len(pattern)) { end = bmstrstr( source, start, pattern ) ; if (end<0) { point = end = length ; nextstart = end ; } else { nextstart = end + Str_len(pattern) ; point = end ; } } else { nextstart = point = end = length ; } /* * While 'end' marks how much to stuff into variables, nextstart * marks where to start the search for the next pattern (if * any). Remember that patterns "eat" up the part of the * parse string that they match. */ /* nextstart = end + Str_len(pattern) ; */ } else { /* * The next pattern to match is not a string to match, but a * positional movement, which will always be numeric, and if * it contains a sign, that should have been stripped off during * parsing, so check that it is non-negative. */ if (this->p[1]->name) xtmp = this->p[1]->name ; else xtmp = handle_var( this->p[1]->p[0] ) ; end = atozpos( xtmp ) ; assert( end >= 0 ) ; /* * Depending on what sort of positional movement, do the right * thing. */ if (solid==X_NEG_OFFS) { /* * If it is a movement backwards, the concept goes something * like move-it-foreward-to-a-backwards-position. That is, * the string to be parsed continues forwards to the end of * the sting and stops there, while the nextposition wraps * round to the start again. * * Anyway, parse all the rest of the sting in this parse, and * start on the specified position for the next parse. */ start = point ; nextstart = point - end ; end = length ; if (nextstart < 0) nextstart = 0 ; point = nextstart ; } else if (solid==X_POS_OFFS) { /* * If the movement is forward, it is simpler, just move the * position of both the end of this, and the start of next * to the right point. */ start = point ; nextstart = point + end ; if (nextstart > length) nextstart = length ; end = nextstart ; if (end<=start) end = length ; point = nextstart ; } else if (solid==X_ABS_OFFS) { /* * Same applies if the position is absolute, just move it. */ if ((end--)==0) exiterror( ERR_INVALID_INTEGER ) ; if (end>length) end = length ; point = nextstart = end ; if (end <= start) end = length ; } } } else /* * We are last pattern to match, set the end of the string to * be parsed to the rest of the string available. */ end = nextstart = length ; /* * Make sure that we didn't do anything illegal when we pushed * around on the value of end and nextstart. These should have been * set correctly in the statements above. */ assert((0<=nextstart) && (nextstart<=length)) ; /* * Then handle end. It must be _after_ the last character in * the pattern, while it must not be larger than length. */ assert((start <= end) && (end <= length)) ; /* * Now we have marked off an area to be parsed, so call 'doparse3' to * put values into the variables. Note that end is decremented, * since doparse3 expects ptr to last char to use, not ptr to char * after last char to use. */ if (this->p[0]) doparse3( source, this->p[0],source->value+start,source->value+(--end)); /* * Then make a tailrecursive call, or rather, simulate one. This * operation will take care of the next set of variables to be * parsed values into. */ if ((this=this->p[2])) { start = nextstart ; goto recurse ; } } /* * A single parse clause can parse multiple strings into multiple * patterns. Normally, this is only done using 'parse arg'. The * following piece of code parses multiples strings in an structure * of arguments into the matching pattern. * * There are no limits on the number of arguments to be parsed, * other than memory. */ void parseargtree( paramboxptr argbox, nodeptr this, int upper ) { streng *source ; /* * All templates in a list of template are connected though the * next field of the template. */ for (; this; this=this->next) { assert(this->type==X_TPL_SOLID) ; /* * Else, it is a tempate into which a string is to be parsed. * That string is an argument, to first get that argument, * if it exist, else use the nullstring. Never bother about * deallocating this string; either it is part of an arguemnt * which is deallocated somewhere else, or it is the statically * allocated nullstring. */ if ((argbox)&&(argbox->value)) source = argbox->value ; else source = &nullstring ; /* * If we are using ARG, or PARSE UPPER ARG, we have to make * sure that the arguments are in upper case. Only one way to * do that, and unfortunately, we have to copy the string * for the time being. Upper mode will therefore be slower * than normal mode. */ if (upper) source = upcase( Str_dup( source )) ; /* * Now we have isolated a string in the right case, and a * ptr to the parse tree of the pattern, so we just have to * call the code to actually parse the string into the pattern. */ doparse( source, this, 0, 0 ) ; /* * If the mode was upper, then we must remember to deallocate * the temporary variable in which we stored the upper case * version of the string to parse. */ if (upper) Free_string( source ) ; /* * And then, at last, we have to increment the pointer in the * chain of agruments, but only if there are more arguments to * be parsed. */ if (argbox) argbox = argbox->next ; } }