home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
OS/2 Shareware BBS: 22 gnu
/
22-gnu.zip
/
gnurx.zip
/
rx
/
rxsuper.h
< prev
next >
Wrap
C/C++ Source or Header
|
1995-12-31
|
16KB
|
448 lines
/* classes: h_files */
#ifndef RXSUPERH
#define RXSUPERH
/***********************************************************
Copyright 1995 by Tom Lord
All Rights Reserved
Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the name of the copyright holder not be
used in advertising or publicity pertaining to distribution of the
software without specific, written prior permission.
Tom Lord DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
EVENT SHALL TOM LORD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
******************************************************************/
/* lord Sun May 7 12:40:17 1995 */
#include "rxnfa.h"
/* This begins the description of the superstate NFA.
*
* The superstate NFA corresponds to the NFA in these ways:
*
* Superstate states correspond to sets of NFA states (nfa_states(SUPER)),
*
* Superstate edges correspond to NFA paths.
*
* The superstate has no epsilon transitions;
* every edge has a character label, and a (possibly empty) side
* effect label. The side effect label corresponds to a list of
* side effects that occur in the NFA. These parts are referred
* to as: superedge_character(EDGE) and superedge_sides(EDGE).
*
* For a superstate edge EDGE starting in some superstate SUPER,
* the following is true (in pseudo-notation :-):
*
* exists DEST in nfa_states s.t.
* exists nfaEDGE in nfa_edges s.t.
* origin (nfaEDGE) == DEST
* && origin (nfaEDGE) is a member of nfa_states(SUPER)
* && exists PF in possible_futures(dest(nfaEDGE)) s.t.
* sides_of_possible_future (PF) == superedge_sides (EDGE)
*
* also:
*
* let SUPER2 := superedge_destination(EDGE)
* nfa_states(SUPER2)
* == union of all nfa state sets S s.t.
* exists PF in possible_futures(dest(nfaEDGE)) s.t.
* sides_of_possible_future (PF) == superedge_sides (EDGE)
* && S == dests_of_possible_future (PF) }
*
* Or in english, every superstate is a set of nfa states. A given
* character and a superstate implies many transitions in the NFA --
* those that begin with an edge labeled with that character from a
* state in the set corresponding to the superstate.
*
* The destinations of those transitions each have a set of possible
* futures. A possible future is a list of side effects and a set of
* destination NFA states. Two sets of possible futures can be
* `merged' by combining all pairs of possible futures that have the
* same side effects. A pair is combined by creating a new future
* with the same side effect but the union of the two destination sets.
* In this way, all the possible futures suggested by a superstate
* and a character can be merged into a set of possible futures where
* no two elements of the set have the same set of side effects.
*
* The destination of a possible future, being a set of NFA states,
* corresponds to a supernfa state. So, the merged set of possible
* futures we just created can serve as a set of edges in the
* supernfa.
*
* The representation of the superstate nfa and the nfa is critical.
* The nfa has to be compact, but has to facilitate the rapid
* computation of missing superstates. The superstate nfa has to
* be fast to interpret, lazilly constructed, and bounded in space.
*
* To facilitate interpretation, the superstate data structures are
* peppered with `instruction frames'. There is an instruction set
* defined below which matchers using the supernfa must be able to
* interpret.
*
* We'd like to make it possible but not mandatory to use code
* addresses to represent instructions (c.f. gcc's computed goto).
* Therefore, we define an enumerated type of opcodes, and when
* writing one of these instructions into a data structure, use
* the opcode as an index into a table of instruction values.
*
* Below are the opcodes that occur in the superstate nfa.
*
* The descriptions of the opcodes refer to data structures that are
* described further below.
*/
enum rx_opcode
{
/*
* BACKTRACK_POINT is invoked when a character transition in
* a superstate leads to more than one edge. In that case,
* the edges have to be explored independently using a backtracking
* strategy.
*
* A BACKTRACK_POINT instruction is stored in a superstate's
* transition table for some character when it is known that that
* character crosses more than one edge. On encountering this
* instruction, the matcher saves enough state to backtrack to this
* point later in the match.
*/
rx_backtrack_point = 0, /* data is (struct transition_class *) */
/*
* RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
* There is one occurence of this instruction per rx_distinct_future.
* This instruction is skipped if a rx_distinct_future has no side effects.
*/
rx_do_side_effects = rx_backtrack_point + 1,
/* data is (struct rx_distinct_future *) */
/*
* RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
* destination superstate has been reclaimed (or was never built).
* It recomputes the destination superstate.
* RX_CACHE_MISS is also stored in a superstate transition table before
* any of its edges have been built.
*/
rx_cache_miss = rx_do_side_effects + 1,
/* data is (struct rx_distinct_future *) */
/*
* RX_NEXT_CHAR is called to consume the next character and take the
* corresponding transition. This is the only instruction that uses
* the DATA field of the instruction frame instead of DATA_2.
* The comments about rx_inx explain this further.
*/
rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */
/* RX_BACKTRACK indicates that a transition fails. Don't
* confuse this with rx_backtrack_point.
*/
rx_backtrack = rx_next_char + 1, /* no data */
/*
* RX_ERROR_INX is stored only in places that should never be executed.
*/
rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */
rx_num_instructions = rx_error_inx + 1
};
/* The heart of the matcher is a `word-code-interpreter'
* (like a byte-code interpreter, except that instructions
* are a full word wide).
*
* Instructions are not stored in a vector of code, instead,
* they are scattered throughout the data structures built
* by the regexp compiler and the matcher. One word-code instruction,
* together with the arguments to that instruction, constitute
* an instruction frame (struct rx_inx).
*
* This structure type is padded by hand to a power of 2 because
* in one of the dominant cases, we dispatch by indexing a table
* of instruction frames. If that indexing can be accomplished
* by just a shift of the index, we're happy.
*
* Instructions take at most one argument, but there are two
* slots in an instruction frame that might hold that argument.
* These are called data and data_2. The data slot is only
* used for one instruction (RX_NEXT_CHAR). For all other
* instructions, data should be set to 0.
*
* RX_NEXT_CHAR is the most important instruction by far.
* By reserving the data field for its exclusive use,
* instruction dispatch is sped up in that case. There is
* no need to fetch both the instruction and the data,
* only the data is needed. In other words, a `cycle' begins
* by fetching the field data. If that is non-0, then it must
* be the destination state of a next_char transition, so
* make that value the current state, advance the match position
* by one character, and start a new cycle. On the other hand,
* if data is 0, fetch the instruction and do a more complicated
* dispatch on that.
*/
struct rx_inx
{
void * data;
void * data_2;
void * inx;
void * fnord;
};
#ifndef RX_TAIL_ARRAY
#define RX_TAIL_ARRAY 1
#endif
/* A superstate corresponds to a set of nfa states. Those sets are
* represented by STRUCT RX_SUPERSET. The constructors
* guarantee that only one (shared) structure is created for a given set.
*/
struct rx_superset
{
int is_final;
int refs; /* This is a reference counted structure. */
/* We keep these sets in a cache because (in an unpredictable way),
* the same set is often created again and again. But that is also
* problematic -- compatibility with POSIX and GNU regex requires
* that we not be able to tell when a program discards a particular
* NFA (thus invalidating the supersets created from it).
*
* But when a cache hit appears to occur, we will have in hand the
* nfa for which it may have happened. That is why every nfa is given
* its own sequence number. On a cache hit, the cache is validated
* by comparing the nfa sequence number to this field:
*/
int id;
struct rx_nfa_state * car; /* May or may not be a valid addr. */
struct rx_superset * cdr;
/* If the corresponding superstate exists: */
struct rx_superstate * superstate;
/* There is another bookkeeping problem. It is expensive to
* compute the starting nfa state set for an nfa. So, once computed,
* it is cached in the `struct rx'.
*
* But, the state set can be flushed from the superstate cache.
* When that happens, we can't know if the corresponding `struct rx'
* is still alive or if it has been freed or re-used by the program.
* So, the cached pointer to this set in a struct rx might be invalid
* and we need a way to validate it.
*
* Fortunately, even if this set is flushed from the cache, it is
* not freed. It just goes on the free-list of supersets.
* So we can still examine it.
*
* So to validate a starting set memo, check to see if the
* starts_for field still points back to the struct rx in question,
* and if the ID matches the rx sequence number.
*/
struct rx * starts_for;
/* This is used to link into a hash bucket so these objects can
* be `hash-consed'.
*/
struct rx_hash_item hash_item;
};
#define rx_protect_superset(RX,CON) (++(CON)->refs)
/* The terminology may be confusing (rename this structure?).
* Every character occurs in at most one rx_super_edge per super-state.
* But, that structure might have more than one option, indicating a point
* of non-determinism.
*
* In other words, this structure holds a list of superstate edges
* sharing a common starting state and character label. The edges
* are in the field OPTIONS. All superstate edges sharing the same
* starting state and character are in this list.
*/
struct rx_super_edge
{
struct rx_super_edge *next;
struct rx_inx rx_backtrack_frame;
int cset_size;
rx_Bitset cset;
struct rx_distinct_future *options;
};
/* A superstate is a set of nfa states (RX_SUPERSET) along
* with a transition table. Superstates are built on demand and reclaimed
* without warning. To protect a superstate from this ghastly fate,
* use LOCK_SUPERSTATE.
*/
struct rx_superstate
{
int rx_id; /* c.f. the id field of rx_superset */
int locks; /* protection from reclamation */
/* Within a superstate cache, all the superstates are kept in a big
* queue. The tail of the queue is the state most likely to be
* reclaimed. The *recyclable fields hold the queue position of
* this state.
*/
struct rx_superstate * next_recyclable;
struct rx_superstate * prev_recyclable;
/* The supernfa edges that exist in the cache and that have
* this state as their destination are kept in this list:
*/
struct rx_distinct_future * transition_refs;
/* The list of nfa states corresponding to this superstate: */
struct rx_superset * contents;
/* The list of edges in the cache beginning from this state. */
struct rx_super_edge * edges;
/* A tail of the recyclable queue is marked as semifree. A semifree
* state has no incoming next_char transitions -- any transition
* into a semifree state causes a complex dispatch with the side
* effect of rescuing the state from its semifree state into a
* fully free state at the head of the queue.
*
* An alternative to this might be to make next_char more expensive,
* and to move a state to the head of the recyclable queue whenever
* it is entered. That way, popular states would never be recycled.
*
* But unilaterally making next_char more expensive actually loses.
* So, incoming transitions are only made expensive for states near
* the tail of the recyclable queue. The more cache contention
* there is, the more frequently a state will have to prove itself
* and be moved back to the front of the queue. If there is less
* contention, then popular states just aggregate in the front of
* the queue and stay there.
*/
int is_semifree;
/* This keeps track of the size of the transition table for this
* state. There is a half-hearted attempt to support variable sized
* superstates.
*/
int trans_size;
/* Indexed by characters... */
struct rx_inx transitions[RX_TAIL_ARRAY];
};
/* A list of distinct futures define the edges that leave from a
* given superstate on a given character. c.f. rx_super_edge.
*/
struct rx_distinct_future
{
struct rx_distinct_future * next_same_super_edge[2];
struct rx_distinct_future * next_same_dest;
struct rx_distinct_future * prev_same_dest;
struct rx_superstate * present; /* source state */
struct rx_superstate * future; /* destination state */
struct rx_super_edge * edge;
/* The future_frame holds the instruction that should be executed
* after all the side effects are done, when it is time to complete
* the transition to the next state.
*
* Normally this is a next_char instruction, but it may be a
* cache_miss instruction as well, depending on whether or not
* the superstate is in the cache and semifree.
*
* If this is the only future for a given superstate/char, and
* if there are no side effects to be performed, this frame is
* not used (directly) at all. Instead, its contents are copied
* into the transition table of the starting state of this dist. future
* (a sort of goto elimination).
*/
struct rx_inx future_frame;
struct rx_inx side_effects_frame;
struct rx_se_list * effects;
};
#define rx_lock_superstate(R,S) ((S)->locks++)
#define rx_unlock_superstate(R,S) (--(S)->locks)
struct rx_cache;
#ifdef __STDC__
typedef void (*rx_morecore_fn)(struct rx_cache *);
#else
typedef void (*rx_morecore_fn)();
#endif
struct rx_cache
{
struct rx_hash_rules superset_hash_rules;
struct rx_superstate * lru_superstate;
struct rx_superstate * semifree_superstate;
struct rx_superset * empty_superset;
int superstates;
int semifree_superstates;
int hits;
int misses;
int bytes_allowed;
int bytes_used;
int local_cset_size;
void ** instruction_table;
struct rx_hash superset_table;
};
extern struct rx_cache * rx_default_cache;
#ifdef __STDC__
extern void rx_release_superset (struct rx *rx,
struct rx_superset *set);
extern struct rx_superset * rx_superset_cons (struct rx * rx,
struct rx_nfa_state *car, struct rx_superset *cdr);
extern struct rx_superset * rx_superstate_eclosure_union
(struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl) ;
extern struct rx_superstate * rx_superstate (struct rx *rx,
struct rx_superset *set);
extern struct rx_inx * rx_handle_cache_miss
(struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data) ;
#else /* STDC */
extern void rx_release_superset ();
extern struct rx_superset * rx_superset_cons ();
extern struct rx_superset * rx_superstate_eclosure_union
extern struct rx_superstate * rx_superstate ();
extern struct rx_inx * rx_handle_cache_miss
#endif /* STDC */
#endif /* RXSUPERH */