home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
OS/2 Shareware BBS: 22 gnu
/
22-gnu.zip
/
gnurx.zip
/
rx
/
rxgnucomp.c
< prev
next >
Wrap
C/C++ Source or Header
|
1995-12-31
|
40KB
|
1,464 lines
/* Copyright (C) 1992, 1993, 1994, 1995 Free Software Foundation, Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Library General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Library General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this software; see the file COPYING. If not, write to
* the Free Software Foundation, 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include "rxall.h"
#include "rxgnucomp.h"
/* {A Syntax Table}
*/
/* Define the syntax basics for \<, \>, etc.
*/
#ifndef emacs
#define CHARBITS 8
#define CHAR_SET_SIZE (1 << CHARBITS)
#define Sword 1
#define SYNTAX(c) re_syntax_table[c]
char re_syntax_table[CHAR_SET_SIZE];
#ifdef __STDC__
static void
init_syntax_once (void)
#else
static void
init_syntax_once ()
#endif
{
register int c;
static int done = 0;
if (done)
return;
bzero (re_syntax_table, sizeof re_syntax_table);
for (c = 'a'; c <= 'z'; c++)
re_syntax_table[c] = Sword;
for (c = 'A'; c <= 'Z'; c++)
re_syntax_table[c] = Sword;
for (c = '0'; c <= '9'; c++)
re_syntax_table[c] = Sword;
re_syntax_table['_'] = Sword;
done = 1;
}
#endif /* not emacs */
const char *rx_error_msg[] =
{
0, /* REG_NOUT */
"No match", /* REG_NOMATCH */
"Invalid regular expression", /* REG_BADPAT */
"Invalid collation character", /* REG_ECOLLATE */
"Invalid character class name", /* REG_ECTYPE */
"Trailing backslash", /* REG_EESCAPE */
"Invalid back reference", /* REG_ESUBREG */
"Unmatched [ or [^", /* REG_EBRACK */
"Unmatched ( or \\(", /* REG_EPAREN */
"Unmatched \\{", /* REG_EBRACE */
"Invalid content of \\{\\}", /* REG_BADBR */
"Invalid range end", /* REG_ERANGE */
"Memory exhausted", /* REG_ESPACE */
"Invalid preceding regular expression", /* REG_BADRPT */
"Premature end of regular expression", /* REG_EEND */
"Regular expression too big", /* REG_ESIZE */
"Unmatched ) or \\)", /* REG_ERPAREN */
};
/*
* Macros used while compiling patterns.
*
* By convention, PEND points just past the end of the uncompiled pattern,
* P points to the read position in the pattern. `translate' is the name
* of the translation table (`TRANSLATE' is the name of a macro that looks
* things up in `translate').
*/
/*
* Fetch the next character in the uncompiled pattern---translating it
* if necessary. *Also cast from a signed character in the constant
* string passed to us by the user to an unsigned char that we can use
* as an array index (in, e.g., `translate').
*/
#define PATFETCH(c) \
do {if (p == pend) return REG_EEND; \
c = (unsigned char) *p++; \
c = translate[c]; \
} while (0)
/*
* Fetch the next character in the uncompiled pattern, with no
* translation.
*/
#define PATFETCH_RAW(c) \
do {if (p == pend) return REG_EEND; \
c = (unsigned char) *p++; \
} while (0)
/* Go backwards one character in the pattern. */
#define PATUNFETCH p--
#define TRANSLATE(d) translate[(unsigned char) (d)]
typedef unsigned regnum_t;
/* Since offsets can go either forwards or backwards, this type needs to
* be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.
*/
typedef int pattern_offset_t;
typedef struct
{
struct rexp_node ** top_expression; /* was begalt */
struct rexp_node ** last_expression; /* was laststart */
pattern_offset_t inner_group_offset;
regnum_t regnum;
} compile_stack_elt_t;
typedef struct
{
compile_stack_elt_t *stack;
unsigned size;
unsigned avail; /* Offset of next open position. */
} compile_stack_type;
#define INIT_COMPILE_STACK_SIZE 32
#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
/* The next available element. */
#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
/* Set the bit for character C in a list. */
#define SET_LIST_BIT(c) \
(b[((unsigned char) (c)) / CHARBITS] \
|= 1 << (((unsigned char) c) % CHARBITS))
/* Get the next unsigned number in the uncompiled pattern. */
#define GET_UNSIGNED_NUMBER(num) \
{ if (p != pend) \
{ \
PATFETCH (c); \
while (isdigit (c)) \
{ \
if (num < 0) \
num = 0; \
num = num * 10 + c - '0'; \
if (p == pend) \
break; \
PATFETCH (c); \
} \
} \
}
#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
#define IS_CHAR_CLASS(string) \
(!strcmp (string, "alpha") || !strcmp (string, "upper") \
|| !strcmp (string, "lower") || !strcmp (string, "digit") \
|| !strcmp (string, "alnum") || !strcmp (string, "xdigit") \
|| !strcmp (string, "space") || !strcmp (string, "print") \
|| !strcmp (string, "punct") || !strcmp (string, "graph") \
|| !strcmp (string, "cntrl") || !strcmp (string, "blank"))
/* These predicates are used in regex_compile. */
/* P points to just after a ^ in PATTERN. Return true if that ^ comes
* after an alternative or a begin-subexpression. We assume there is at
* least one character before the ^.
*/
#ifdef __STDC__
static int
at_begline_loc_p (const char *pattern, const char * p, unsigned long syntax)
#else
static int
at_begline_loc_p (pattern, p, syntax)
const char *pattern;
const char * p;
unsigned long syntax;
#endif
{
const char *prev = p - 2;
int prev_prev_backslash = ((prev > pattern) && (prev[-1] == '\\'));
return
(/* After a subexpression? */
((*prev == '(') && ((syntax & RE_NO_BK_PARENS) || prev_prev_backslash))
||
/* After an alternative? */
((*prev == '|') && ((syntax & RE_NO_BK_VBAR) || prev_prev_backslash))
);
}
/* The dual of at_begline_loc_p. This one is for $. We assume there is
* at least one character after the $, i.e., `P < PEND'.
*/
#ifdef __STDC__
static int
at_endline_loc_p (const char *p, const char *pend, int syntax)
#else
static int
at_endline_loc_p (p, pend, syntax)
const char *p;
const char *pend;
int syntax;
#endif
{
const char *next = p;
int next_backslash = (*next == '\\');
const char *next_next = (p + 1 < pend) ? (p + 1) : 0;
return
(
/* Before a subexpression? */
((syntax & RE_NO_BK_PARENS)
? (*next == ')')
: (next_backslash && next_next && (*next_next == ')')))
||
/* Before an alternative? */
((syntax & RE_NO_BK_VBAR)
? (*next == '|')
: (next_backslash && next_next && (*next_next == '|')))
);
}
unsigned char rx_id_translation[256] =
{
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
250, 251, 252, 253, 254, 255
};
/* The compiler keeps an inverted translation table.
* This looks up/inititalize elements.
* VALID is an array of booleans that validate CACHE.
*/
#ifdef __STDC__
static rx_Bitset
inverse_translation (int cset_size, char * valid, rx_Bitset cache, unsigned char * translate, int c)
#else
static rx_Bitset
inverse_translation (cset_size, valid, cache, translate, c)
int cset_size;
char * valid;
rx_Bitset cache;
unsigned char * translate;
int c;
#endif
{
rx_Bitset cs;
cs = cache + c * rx_bitset_numb_subsets (cset_size);
if (!valid[c])
{
int x;
int c_tr = TRANSLATE(c);
rx_bitset_null (cset_size, cs);
for (x = 0; x < 256; ++x) /* &&&& 13.37 */
if (TRANSLATE(x) == c_tr)
RX_bitset_enjoin (cs, x);
valid[c] = 1;
}
return cs;
}
/* More subroutine declarations and macros for regex_compile. */
/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
* false if it's not.
*/
#ifdef __STDC__
static int
group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
#else
static int
group_in_compile_stack (compile_stack, regnum)
compile_stack_type compile_stack;
regnum_t regnum;
#endif
{
int this_element;
for (this_element = compile_stack.avail - 1;
this_element >= 0;
this_element--)
if (compile_stack.stack[this_element].regnum == regnum)
return 1;
return 0;
}
/*
* Read the ending character of a range (in a bracket expression) from the
* uncompiled pattern *P_PTR (which ends at PEND). We assume the
* starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
* Then we set the translation of all bits between the starting and
* ending characters (inclusive) in the compiled pattern B.
*
* Return an error code.
*
* We use these short variable names so we can use the same macros as
* `regex_compile' itself.
*/
#ifdef __STDC__
static reg_errcode_t
compile_range (int cset_size, rx_Bitset cs, const char ** p_ptr, const char * pend, unsigned char * translate, unsigned long syntax, rx_Bitset inv_tr, char * valid_inv_tr)
#else
static reg_errcode_t
compile_range (cset_size, cs, p_ptr, pend, translate, syntax, inv_tr, valid_inv_tr)
int cset_size;
rx_Bitset cs;
const char ** p_ptr;
const char * pend;
unsigned char * translate;
unsigned long syntax;
rx_Bitset inv_tr;
char * valid_inv_tr;
#endif
{
unsigned this_char;
const char *p = *p_ptr;
unsigned char range_end;
unsigned char range_start = TRANSLATE(p[-2]);
if (p == pend)
return REG_ERANGE;
PATFETCH (range_end);
(*p_ptr)++;
if (range_start > range_end)
return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
for (this_char = range_start; this_char <= range_end; this_char++)
{
rx_Bitset it =
inverse_translation (cset_size, valid_inv_tr, inv_tr, translate, this_char);
rx_bitset_union (cset_size, cs, it);
}
return REG_NOERROR;
}
#ifdef __STDC__
static int
pointless_if_repeated (struct rexp_node * node)
#else
static int
pointless_if_repeated (node)
struct rexp_node * node;
#endif
{
if (!node)
return 1;
switch (node->type)
{
case r_cset:
return 0;
case r_concat:
case r_alternate:
return (pointless_if_repeated (node->params.pair.left)
&& pointless_if_repeated (node->params.pair.right));
case r_opt:
case r_star:
case r_interval:
case r_parens:
return pointless_if_repeated (node->params.pair.left);
case r_context:
switch (node->params.intval)
{
case '=':
case '<':
case '^':
case 'b':
case 'B':
case '`':
case '\'':
return 1;
default:
return 0;
}
default:
return 0;
}
}
#define isa_blank(C) (((C) == ' ') || ((C) != '\t'))
#ifdef __STDC__
reg_errcode_t
rx_parse (struct rexp_node ** rexp_p,
const char *pattern,
int size,
unsigned long syntax,
int cset_size,
unsigned char *translate)
#else
reg_errcode_t
rx_parse (rexp_p, pattern, size, syntax, cset_size, translate)
struct rexp_node ** rexp_p;
const char *pattern;
int size;
unsigned long syntax;
int cset_size;
unsigned char *translate;
#endif
{
reg_errcode_t compile_error;
RX_subset
inverse_translate [CHAR_SET_SIZE * rx_bitset_numb_subsets(CHAR_SET_SIZE)];
char
validate_inv_tr [CHAR_SET_SIZE * rx_bitset_numb_subsets(CHAR_SET_SIZE)];
/* We fetch characters from PATTERN here. Even though PATTERN is
`char *' (i.e., signed), we declare these variables as unsigned, so
they can be reliably used as array indices. */
register unsigned char c;
register unsigned char c1;
/* A random tempory spot in PATTERN. */
const char *p1;
/* Keeps track of unclosed groups. */
compile_stack_type compile_stack;
/* Points to the current (ending) position in the pattern. */
const char *p;
const char *pend;
/* When parsing is done, this will hold the expression tree. */
struct rexp_node * rexp;
/* This and top_expression are saved on the compile stack. */
struct rexp_node ** top_expression;
struct rexp_node ** last_expression;
/* Parameter to `goto append_node' */
struct rexp_node * append;
/* Counts open-groups as they are encountered. This is the index of the
* innermost group being compiled.
*/
regnum_t regnum;
/* Place in the uncompiled pattern (i.e., the {) to
* which to go back if the interval is invalid.
*/
const char *beg_interval;
int side;
if (!translate)
translate = rx_id_translation;
/* Points to the current (ending) position in the pattern. */
p = pattern;
pend = pattern + size;
/* When parsing is done, this will hold the expression tree. */
rexp = 0;
/* In the midst of compilation, this holds onto the regexp
* first parst while rexp goes on to aquire additional constructs.
*/
top_expression = &rexp;
last_expression = top_expression;
/* Counts open-groups as they are encountered. This is the index of the
* innermost group being compiled.
*/
regnum = 0;
bzero (validate_inv_tr, sizeof (validate_inv_tr));
/* Initialize the compile stack. */
compile_stack.stack = (( compile_stack_elt_t *) malloc ((INIT_COMPILE_STACK_SIZE) * sizeof ( compile_stack_elt_t)));
if (compile_stack.stack == 0)
return REG_ESPACE;
compile_stack.size = INIT_COMPILE_STACK_SIZE;
compile_stack.avail = 0;
#if !defined (emacs) && !defined (SYNTAX_TABLE)
/* Initialize the syntax table. */
init_syntax_once ();
#endif
/* Loop through the uncompiled pattern until we're at the end. */
while (p != pend)
{
PATFETCH (c);
switch (c)
{
case '^':
{
if ( /* If at start of pattern, it's an operator. */
p == pattern + 1
/* If context independent, it's an operator. */
|| syntax & RE_CONTEXT_INDEP_ANCHORS
/* Otherwise, depends on what's come before. */
|| at_begline_loc_p (pattern, p, syntax))
{
struct rexp_node * n
= rx_mk_r_int (r_context, '^');
if (!n)
goto space_error;
append = n;
goto append_node;
}
else
goto normal_char;
}
break;
case '$':
{
if ( /* If at end of pattern, it's an operator. */
p == pend
/* If context independent, it's an operator. */
|| syntax & RE_CONTEXT_INDEP_ANCHORS
/* Otherwise, depends on what's next. */
|| at_endline_loc_p (p, pend, syntax))
{
struct rexp_node * n
= rx_mk_r_int (r_context, '$');
if (!n)
goto space_error;
append = n;
goto append_node;
}
else
goto normal_char;
}
break;
case '+':
case '?':
if ((syntax & RE_BK_PLUS_QM)
|| (syntax & RE_LIMITED_OPS))
goto normal_char;
handle_plus:
case '*':
/* If there is no previous pattern... */
if (pointless_if_repeated (*last_expression))
{
if (syntax & RE_CONTEXT_INVALID_OPS)
{
compile_error = REG_BADRPT;
goto error_return;
}
else if (!(syntax & RE_CONTEXT_INDEP_OPS))
goto normal_char;
}
{
/* 1 means zero (many) matches is allowed. */
char zero_times_ok = 0, many_times_ok = 0;
/* If there is a sequence of repetition chars, collapse it
down to just one (the right one). We can't combine
interval operators with these because of, e.g., `a{2}*',
which should only match an even number of `a's. */
for (;;)
{
zero_times_ok |= c != '+';
many_times_ok |= c != '?';
if (p == pend)
break;
PATFETCH (c);
if (c == '*'
|| (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
;
else if (syntax & RE_BK_PLUS_QM && c == '\\')
{
if (p == pend)
{
compile_error = REG_EESCAPE;
goto error_return;
}
PATFETCH (c1);
if (!(c1 == '+' || c1 == '?'))
{
PATUNFETCH;
PATUNFETCH;
break;
}
c = c1;
}
else
{
PATUNFETCH;
break;
}
/* If we get here, we found another repeat character. */
}
/* Now we know whether or not zero matches is allowed
* and also whether or not two or more matches is allowed.
*/
{
struct rexp_node * inner_exp;
struct rexp_node * star;
inner_exp = *last_expression;
star = rx_mk_r_monop ((many_times_ok
? (zero_times_ok ? r_star : r_plus)
: r_opt),
inner_exp);
if (!star)
goto space_error;
*last_expression = star;
}
}
break;
case '.':
{
rx_Bitset cs;
struct rexp_node * n;
cs = rx_cset (cset_size);
if (!cs)
goto space_error;
n = rx_mk_r_cset (r_cset, cset_size, cs);
if (!n)
{
rx_free_cset (cs);
goto space_error;
}
rx_bitset_universe (cset_size, cs);
if (!(syntax & RE_DOT_NEWLINE))
RX_bitset_remove (cs, '\n');
if (!(syntax & RE_DOT_NOT_NULL))
RX_bitset_remove (cs, 0);
append = n;
goto append_node;
break;
}
case '[':
if (p == pend)
{
compile_error = REG_EBRACK;
goto error_return;
}
{
int had_char_class;
rx_Bitset cs;
struct rexp_node * node;
int is_inverted;
had_char_class = 0;
is_inverted = *p == '^';
cs = rx_cset (cset_size);
if (!cs)
goto space_error;
node = rx_mk_r_cset (r_cset, cset_size ,cs);
if (!node)
{
rx_free_cset (cs);
goto space_error;
}
/* This branch of the switch is normally exited with
*`goto append_node'
*/
append = node;
if (is_inverted)
p++;
/* Remember the first position in the bracket expression. */
p1 = p;
/* Read in characters and ranges, setting map bits. */
for (;;)
{
if (p == pend)
{
compile_error = REG_EBRACK;
goto error_return;
}
PATFETCH (c);
/* \ might escape characters inside [...] and [^...]. */
if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
{
if (p == pend)
{
compile_error = REG_EESCAPE;
goto error_return;
}
PATFETCH (c1);
{
rx_Bitset it = inverse_translation (cset_size,
validate_inv_tr,
inverse_translate,
translate,
c1);
rx_bitset_union (cset_size, cs, it);
}
continue;
}
/* Could be the end of the bracket expression. If it's
not (i.e., when the bracket expression is `[]' so
far), the ']' character bit gets set way below. */
if (c == ']' && p != p1 + 1)
goto finalize_class_and_append;
/* Look ahead to see if it's a range when the last thing
was a character class. */
if (had_char_class && c == '-' && *p != ']')
{
compile_error = REG_ERANGE;
goto error_return;
}
/* Look ahead to see if it's a range when the last thing
was a character: if this is a hyphen not at the
beginning or the end of a list, then it's the range
operator. */
if (c == '-'
&& !(p - 2 >= pattern && p[-2] == '[')
&& !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
&& *p != ']')
{
reg_errcode_t ret
= compile_range (cset_size, cs, &p, pend, translate, syntax,
inverse_translate, validate_inv_tr);
if (ret != REG_NOERROR)
{
compile_error = ret;
goto error_return;
}
}
else if (p[0] == '-' && p[1] != ']')
{ /* This handles ranges made up of characters only. */
reg_errcode_t ret;
/* Move past the `-'. */
PATFETCH (c1);
ret = compile_range (cset_size, cs, &p, pend, translate, syntax,
inverse_translate, validate_inv_tr);
if (ret != REG_NOERROR)
{
compile_error = ret;
goto error_return;
}
}
/* See if we're at the beginning of a possible character
class. */
else if ((syntax & RE_CHAR_CLASSES)
&& (c == '[') && (*p == ':'))
{
char str[CHAR_CLASS_MAX_LENGTH + 1];
PATFETCH (c);
c1 = 0;
/* If pattern is `[[:'. */
if (p == pend)
{
compile_error = REG_EBRACK;
goto error_return;
}
for (;;)
{
PATFETCH (c);
if (c == ':' || c == ']' || p == pend
|| c1 == CHAR_CLASS_MAX_LENGTH)
break;
str[c1++] = c;
}
str[c1] = '\0';
/* If isn't a word bracketed by `[:' and:`]':
undo the ending character, the letters, and leave
the leading `:' and `[' (but set bits for them). */
if (c == ':' && *p == ']')
{
int ch;
int is_alnum = !strcmp (str, "alnum");
int is_alpha = !strcmp (str, "alpha");
int is_blank = !strcmp (str, "blank");
int is_cntrl = !strcmp (str, "cntrl");
int is_digit = !strcmp (str, "digit");
int is_graph = !strcmp (str, "graph");
int is_lower = !strcmp (str, "lower");
int is_print = !strcmp (str, "print");
int is_punct = !strcmp (str, "punct");
int is_space = !strcmp (str, "space");
int is_upper = !strcmp (str, "upper");
int is_xdigit = !strcmp (str, "xdigit");
if (!IS_CHAR_CLASS (str))
{
compile_error = REG_ECTYPE;
goto error_return;
}
/* Throw away the ] at the end of the character
class. */
PATFETCH (c);
if (p == pend) { compile_error = REG_EBRACK; goto error_return; }
for (ch = 0; ch < 1 << CHARBITS; ch++)
{
if ( (is_alnum && isalnum (ch))
|| (is_alpha && isalpha (ch))
|| (is_blank && isa_blank (ch))
|| (is_cntrl && iscntrl (ch))
|| (is_digit && isdigit (ch))
|| (is_graph && isgraph (ch))
|| (is_lower && islower (ch))
|| (is_print && isprint (ch))
|| (is_punct && ispunct (ch))
|| (is_space && isspace (ch))
|| (is_upper && isupper (ch))
|| (is_xdigit && isxdigit (ch)))
{
rx_Bitset it =
inverse_translation (cset_size,
validate_inv_tr,
inverse_translate,
translate,
ch);
rx_bitset_union (cset_size,
cs, it);
}
}
had_char_class = 1;
}
else
{
c1++;
while (c1--)
PATUNFETCH;
{
rx_Bitset it =
inverse_translation (cset_size,
validate_inv_tr,
inverse_translate,
translate,
'[');
rx_bitset_union (cset_size,
cs, it);
}
{
rx_Bitset it =
inverse_translation (cset_size,
validate_inv_tr,
inverse_translate,
translate,
':');
rx_bitset_union (cset_size,
cs, it);
}
had_char_class = 0;
}
}
else
{
had_char_class = 0;
{
rx_Bitset it = inverse_translation (cset_size,
validate_inv_tr,
inverse_translate,
translate,
c);
rx_bitset_union (cset_size, cs, it);
}
}
}
finalize_class_and_append:
if (is_inverted)
{
rx_bitset_complement (cset_size, cs);
if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
RX_bitset_remove (cs, '\n');
}
goto append_node;
}
break;
case '(':
if (syntax & RE_NO_BK_PARENS)
goto handle_open;
else
goto normal_char;
case ')':
if (syntax & RE_NO_BK_PARENS)
goto handle_close;
else
goto normal_char;
case '\n':
if (syntax & RE_NEWLINE_ALT)
goto handle_alt;
else
goto normal_char;
case '|':
if (syntax & RE_NO_BK_VBAR)
goto handle_alt;
else
goto normal_char;
case '{':
if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
goto handle_interval;
else
goto normal_char;
case '\\':
if (p == pend) { compile_error = REG_EESCAPE; goto error_return; }
/* Do not translate the character after the \, so that we can
distinguish, e.g., \B from \b, even if we normally would
translate, e.g., B to b. */
PATFETCH_RAW (c);
switch (c)
{
case '(':
if (syntax & RE_NO_BK_PARENS)
goto normal_backslash;
handle_open:
regnum++;
if (COMPILE_STACK_FULL)
{
compile_stack.stack
= ((compile_stack_elt_t *)
realloc (compile_stack.stack,
(compile_stack.size << 1) * sizeof (compile_stack_elt_t)));
if (compile_stack.stack == 0)
goto space_error;
compile_stack.size <<= 1;
}
if (*last_expression)
{
struct rexp_node * concat;
concat = rx_mk_r_binop (r_concat, *last_expression, 0);
if (!concat)
goto space_error;
*last_expression = concat;
last_expression = &concat->params.pair.right;
}
/*
* These are the values to restore when we hit end of this
* group.
*/
COMPILE_STACK_TOP.top_expression = top_expression;
COMPILE_STACK_TOP.last_expression = last_expression;
COMPILE_STACK_TOP.regnum = regnum;
compile_stack.avail++;
top_expression = last_expression;
break;
case ')':
if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
handle_close:
/* See similar code for backslashed left paren above. */
if (COMPILE_STACK_EMPTY)
if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
goto normal_char;
else
{ compile_error = REG_ERPAREN; goto error_return; }
/* Since we just checked for an empty stack above, this
* ``can't happen''.
*/
{
/* We don't just want to restore into `regnum', because
* later groups should continue to be numbered higher,
* as in `(ab)c(de)' -- the second group is #2.
*/
regnum_t this_group_regnum;
struct rexp_node ** inner;
struct rexp_node * parens;
inner = top_expression;
compile_stack.avail--;
top_expression = COMPILE_STACK_TOP.top_expression;
last_expression = COMPILE_STACK_TOP.last_expression;
this_group_regnum = COMPILE_STACK_TOP.regnum;
parens = rx_mk_r_monop (r_parens, *inner);
if (!parens)
goto space_error;
parens->params.intval = this_group_regnum;
*inner = parens;
break;
}
case '|': /* `\|'. */
if ((syntax & RE_LIMITED_OPS) || (syntax & RE_NO_BK_VBAR))
goto normal_backslash;
handle_alt:
if (syntax & RE_LIMITED_OPS)
goto normal_char;
{
struct rexp_node * alt;
alt = rx_mk_r_binop (r_alternate, *top_expression, 0);
if (!alt)
goto space_error;
*top_expression = alt;
last_expression = &alt->params.pair.right;
}
break;
case '{':
/* If \{ is a literal. */
if (!(syntax & RE_INTERVALS)
/* If we're at `\{' and it's not the open-interval
operator. */
|| ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
|| (p - 2 == pattern && p == pend))
goto normal_backslash;
handle_interval:
{
/* If got here, then the syntax allows intervals.
*/
/* At least (most) this many matches must be made.
*/
int lower_bound;
int upper_bound;
lower_bound = -1;
upper_bound = -1;
/* We're about to parse the bounds of the interval.
* It may turn out that this isn't an interval after
* all, in which case these same characters will have
* to be reparsed as literals. This remembers
* the backtrack point in the parse:
*/
beg_interval = p - 1;
if (p == pend)
{
if (syntax & RE_NO_BK_BRACES)
goto unfetch_interval;
else
{ compile_error = REG_EBRACE; goto error_return; }
}
GET_UNSIGNED_NUMBER (lower_bound);
if (c == ',')
{
GET_UNSIGNED_NUMBER (upper_bound);
if (upper_bound < 0) upper_bound = RE_DUP_MAX;
}
else
/* Interval such as `{n}' => match exactly n times.
*/
upper_bound = lower_bound;
if (lower_bound < 0
|| upper_bound > RE_DUP_MAX
|| lower_bound > upper_bound)
{
if (syntax & RE_NO_BK_BRACES)
goto unfetch_interval;
else
{ compile_error = REG_BADBR; goto error_return; }
}
if (!(syntax & RE_NO_BK_BRACES))
{
if (c != '\\') { compile_error = REG_EBRACE; goto error_return; }
PATFETCH (c);
}
if (c != '}')
{
if (syntax & RE_NO_BK_BRACES)
goto unfetch_interval;
else
{ compile_error = REG_BADBR; goto error_return; }
}
/* We just parsed a valid interval.
* lower_bound and upper_bound are set.
*/
/* If it's invalid to have no preceding re.
*/
if (pointless_if_repeated (*last_expression))
{
if (syntax & RE_CONTEXT_INVALID_OPS)
{ compile_error = REG_BADRPT; goto error_return; }
else if (!(syntax & RE_CONTEXT_INDEP_OPS))
/* treat the interval spec as literal chars. */
goto unfetch_interval;
}
{
struct rexp_node * interval;
interval = rx_mk_r_monop (r_interval, *last_expression);
if (!interval)
goto space_error;
interval->params.intval = lower_bound;
interval->params.intval2 = upper_bound;
*last_expression = interval;
}
beg_interval = 0;
}
break;
unfetch_interval:
/* If an invalid interval, match the characters as literals. */
p = beg_interval;
beg_interval = 0;
/* normal_char and normal_backslash need `c'. */
PATFETCH (c);
if (!(syntax & RE_NO_BK_BRACES))
{
if ((p > pattern) && (p[-1] == '\\'))
goto normal_backslash;
}
goto normal_char;
#ifdef emacs
/* There is no way to specify the before_dot and after_dot
* operators. rms says this is ok. --karl
*/
case '=':
side = '=';
goto add_side_effect;
break;
case 's':
case 'S':
{
rx_Bitset cs;
struct rexp_node * set;
cs = rx_cset (&cset_size);
if (!cs)
goto space_error;
set = rx_mk_r_cset (r_cset, cset_size, cs);
if (!set)
{
rx_free_cset (cs);
goto space_error;
}
if (c == 'S')
rx_bitset_universe (cset_size, cs);
PATFETCH (c);
{
int x;
enum syntaxcode code = syntax_spec_code [c];
for (x = 0; x < 256; ++x)
{
if (SYNTAX (x) == code)
{
rx_Bitset it =
inverse_translation (cset_size, validate_inv_tr,
inverse_translate,
translate, x);
rx_bitset_xor (cset_size, cs, it);
}
}
}
append = set;
goto append_node;
}
break;
#endif /* emacs */
case 'w':
case 'W':
{
rx_Bitset cs;
struct rexp_node * n;
cs = rx_cset (cset_size);
n = (cs ? rx_mk_r_cset (r_cset, cset_size, cs) : 0);
if (!(cs && n))
{
if (cs)
rx_free_cset (cs);
goto space_error;
}
if (c == 'W')
rx_bitset_universe (cset_size ,cs);
{
int x;
for (x = cset_size - 1; x > 0; --x)
if (SYNTAX(x) & Sword)
RX_bitset_toggle (cs, x);
}
append = n;
goto append_node;
}
break;
case '<':
side = '<';
goto add_side_effect;
break;
case '>':
side = '>';
goto add_side_effect;
break;
case 'b':
side = 'b';
goto add_side_effect;
break;
case 'B':
side = 'B';
goto add_side_effect;
break;
case '`':
side = '`';
goto add_side_effect;
break;
case '\'':
side = '\'';
goto add_side_effect;
break;
add_side_effect:
{
struct rexp_node * se;
se = rx_mk_r_int (r_context, side);
if (!se)
goto space_error;
append = se;
goto append_node;
}
break;
case '1': case '2': case '3': case '4': case '5':
case '6': case '7': case '8': case '9':
if (syntax & RE_NO_BK_REFS)
goto normal_char;
c1 = c - '0';
if (c1 > regnum)
{ compile_error = REG_ESUBREG; goto error_return; }
side = c;
goto add_side_effect;
break;
case '+':
case '?':
if (syntax & RE_BK_PLUS_QM)
goto handle_plus;
else
goto normal_backslash;
default:
normal_backslash:
/* You might think it would be useful for \ to mean
* not to translate; but if we don't translate it
* it will never match anything.
*/
c = TRANSLATE (c);
goto normal_char;
}
break;
default:
/* Expects the character in `c'. */
normal_char:
{
rx_Bitset cs;
struct rexp_node * match;
rx_Bitset it;
cs = rx_cset (cset_size);
match = (cs ? rx_mk_r_cset (r_cset, cset_size, cs) : 0);
if (!(cs && match))
{
if (cs)
rx_free_cset (cs);
goto space_error;
}
it = inverse_translation (cset_size, validate_inv_tr,
inverse_translate, translate, c);
rx_bitset_union (CHAR_SET_SIZE, cs, it);
append = match;
append_node:
/* This genericly appends the rexp APPEND to *LAST_EXPRESSION
* and then parses the next character normally.
*/
if (*last_expression)
{
struct rexp_node * concat;
concat = rx_mk_r_binop (r_concat,
*last_expression, append);
if (!concat)
goto space_error;
*last_expression = concat;
last_expression = &concat->params.pair.right;
}
else
*last_expression = append;
}
} /* switch (c) */
} /* while p != pend */
/* Through the pattern now. */
if (!COMPILE_STACK_EMPTY)
{ compile_error = REG_EPAREN; goto error_return; }
free (compile_stack.stack);
*rexp_p = rexp;
return REG_NOERROR;
space_error:
compile_error = REG_ESPACE;
error_return:
free (compile_stack.stack);
/* Free expressions pushed onto the compile stack! */
if (rexp)
rx_free_rexp (rexp);
return compile_error;
}