OS/2 Shareware BBS: 22 gnu

home *** CD-ROM | disk | FTP | other *** search

/ OS/2 Shareware BBS: 22 gnu / 22-gnu.zip / gnurx.zip / rx / rxgnucomp.c < prev next >

Wrap

C/C++ Source or Header | 1995-12-31 | 40KB | 1,464 lines

/* Copyright (C) 1992, 1993, 1994, 1995 Free Software Foundation, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Library General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this software; see the file COPYING. If not, write to * the Free Software Foundation, 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. */ #include "rxall.h" #include "rxgnucomp.h" /* {A Syntax Table} */ /* Define the syntax basics for \<, \>, etc. */ #ifndef emacs #define CHARBITS 8 #define CHAR_SET_SIZE (1 << CHARBITS) #define Sword 1 #define SYNTAX(c) re_syntax_table[c] char re_syntax_table[CHAR_SET_SIZE]; #ifdef __STDC__ static void init_syntax_once (void) #else static void init_syntax_once () #endif { register int c; static int done = 0; if (done) return; bzero (re_syntax_table, sizeof re_syntax_table); for (c = 'a'; c <= 'z'; c++) re_syntax_table[c] = Sword; for (c = 'A'; c <= 'Z'; c++) re_syntax_table[c] = Sword; for (c = '0'; c <= '9'; c++) re_syntax_table[c] = Sword; re_syntax_table['_'] = Sword; done = 1; } #endif /* not emacs */ const char *rx_error_msg[] = { 0, /* REG_NOUT */ "No match", /* REG_NOMATCH */ "Invalid regular expression", /* REG_BADPAT */ "Invalid collation character", /* REG_ECOLLATE */ "Invalid character class name", /* REG_ECTYPE */ "Trailing backslash", /* REG_EESCAPE */ "Invalid back reference", /* REG_ESUBREG */ "Unmatched [ or [^", /* REG_EBRACK */ "Unmatched ( or \$", /* REG_EPAREN */ "Unmatched \\{", /* REG_EBRACE */ "Invalid content of \\{\\}", /* REG_BADBR */ "Invalid range end", /* REG_ERANGE */ "Memory exhausted", /* REG_ESPACE */ "Invalid preceding regular expression", /* REG_BADRPT */ "Premature end of regular expression", /* REG_EEND */ "Regular expression too big", /* REG_ESIZE */ "Unmatched ) or \$", /* REG_ERPAREN */ }; /* * Macros used while compiling patterns. * * By convention, PEND points just past the end of the uncompiled pattern, * P points to the read position in the pattern. `translate' is the name * of the translation table (`TRANSLATE' is the name of a macro that looks * things up in `translate'). */ /* * Fetch the next character in the uncompiled pattern---translating it * if necessary. *Also cast from a signed character in the constant * string passed to us by the user to an unsigned char that we can use * as an array index (in, e.g., `translate'). */ #define PATFETCH(c) \ do {if (p == pend) return REG_EEND; \ c = (unsigned char) *p++; \ c = translate[c]; \ } while (0) /* * Fetch the next character in the uncompiled pattern, with no * translation. */ #define PATFETCH_RAW(c) \ do {if (p == pend) return REG_EEND; \ c = (unsigned char) *p++; \ } while (0) /* Go backwards one character in the pattern. */ #define PATUNFETCH p-- #define TRANSLATE(d) translate[(unsigned char) (d)] typedef unsigned regnum_t; /* Since offsets can go either forwards or backwards, this type needs to * be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ typedef int pattern_offset_t; typedef struct { struct rexp_node ** top_expression; /* was begalt */ struct rexp_node ** last_expression; /* was laststart */ pattern_offset_t inner_group_offset; regnum_t regnum; } compile_stack_elt_t; typedef struct { compile_stack_elt_t *stack; unsigned size; unsigned avail; /* Offset of next open position. */ } compile_stack_type; #define INIT_COMPILE_STACK_SIZE 32 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) /* The next available element. */ #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) /* Set the bit for character C in a list. */ #define SET_LIST_BIT(c) \ (b[((unsigned char) (c)) / CHARBITS] \ |= 1 << (((unsigned char) c) % CHARBITS)) /* Get the next unsigned number in the uncompiled pattern. */ #define GET_UNSIGNED_NUMBER(num) \ { if (p != pend) \ { \ PATFETCH (c); \ while (isdigit (c)) \ { \ if (num < 0) \ num = 0; \ num = num * 10 + c - '0'; \ if (p == pend) \ break; \ PATFETCH (c); \ } \ } \ } #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ #define IS_CHAR_CLASS(string) \ (!strcmp (string, "alpha") || !strcmp (string, "upper") \ || !strcmp (string, "lower") || !strcmp (string, "digit") \ || !strcmp (string, "alnum") || !strcmp (string, "xdigit") \ || !strcmp (string, "space") || !strcmp (string, "print") \ || !strcmp (string, "punct") || !strcmp (string, "graph") \ || !strcmp (string, "cntrl") || !strcmp (string, "blank")) /* These predicates are used in regex_compile. */ /* P points to just after a ^ in PATTERN. Return true if that ^ comes * after an alternative or a begin-subexpression. We assume there is at * least one character before the ^. */ #ifdef __STDC__ static int at_begline_loc_p (const char *pattern, const char * p, unsigned long syntax) #else static int at_begline_loc_p (pattern, p, syntax) const char *pattern; const char * p; unsigned long syntax; #endif { const char *prev = p - 2; int prev_prev_backslash = ((prev > pattern) && (prev[-1] == '\\')); return (/* After a subexpression? */ ((*prev == '(') && ((syntax & RE_NO_BK_PARENS) || prev_prev_backslash)) || /* After an alternative? */ ((*prev == '|') && ((syntax & RE_NO_BK_VBAR) || prev_prev_backslash)) ); } /* The dual of at_begline_loc_p. This one is for $. We assume there is * at least one character after the $, i.e., `P < PEND'. */ #ifdef __STDC__ static int at_endline_loc_p (const char *p, const char *pend, int syntax) #else static int at_endline_loc_p (p, pend, syntax) const char *p; const char *pend; int syntax; #endif { const char *next = p; int next_backslash = (*next == '\\'); const char *next_next = (p + 1 < pend) ? (p + 1) : 0; return ( /* Before a subexpression? */ ((syntax & RE_NO_BK_PARENS) ? (*next == ')') : (next_backslash && next_next && (*next_next == ')'))) || /* Before an alternative? */ ((syntax & RE_NO_BK_VBAR) ? (*next == '|') : (next_backslash && next_next && (*next_next == '|'))) ); } unsigned char rx_id_translation[256] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 }; /* The compiler keeps an inverted translation table. * This looks up/inititalize elements. * VALID is an array of booleans that validate CACHE. */ #ifdef __STDC__ static rx_Bitset inverse_translation (int cset_size, char * valid, rx_Bitset cache, unsigned char * translate, int c) #else static rx_Bitset inverse_translation (cset_size, valid, cache, translate, c) int cset_size; char * valid; rx_Bitset cache; unsigned char * translate; int c; #endif { rx_Bitset cs; cs = cache + c * rx_bitset_numb_subsets (cset_size); if (!valid[c]) { int x; int c_tr = TRANSLATE(c); rx_bitset_null (cset_size, cs); for (x = 0; x < 256; ++x) /* &&&& 13.37 */ if (TRANSLATE(x) == c_tr) RX_bitset_enjoin (cs, x); valid[c] = 1; } return cs; } /* More subroutine declarations and macros for regex_compile. */ /* Returns true if REGNUM is in one of COMPILE_STACK's elements and * false if it's not. */ #ifdef __STDC__ static int group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum) #else static int group_in_compile_stack (compile_stack, regnum) compile_stack_type compile_stack; regnum_t regnum; #endif { int this_element; for (this_element = compile_stack.avail - 1; this_element >= 0; this_element--) if (compile_stack.stack[this_element].regnum == regnum) return 1; return 0; } /* * Read the ending character of a range (in a bracket expression) from the * uncompiled pattern *P_PTR (which ends at PEND). We assume the * starting character is in `P[-2]'. (`P[-1]' is the character `-'.) * Then we set the translation of all bits between the starting and * ending characters (inclusive) in the compiled pattern B. * * Return an error code. * * We use these short variable names so we can use the same macros as * `regex_compile' itself. */ #ifdef __STDC__ static reg_errcode_t compile_range (int cset_size, rx_Bitset cs, const char ** p_ptr, const char * pend, unsigned char * translate, unsigned long syntax, rx_Bitset inv_tr, char * valid_inv_tr) #else static reg_errcode_t compile_range (cset_size, cs, p_ptr, pend, translate, syntax, inv_tr, valid_inv_tr) int cset_size; rx_Bitset cs; const char ** p_ptr; const char * pend; unsigned char * translate; unsigned long syntax; rx_Bitset inv_tr; char * valid_inv_tr; #endif { unsigned this_char; const char *p = *p_ptr; unsigned char range_end; unsigned char range_start = TRANSLATE(p[-2]); if (p == pend) return REG_ERANGE; PATFETCH (range_end); (*p_ptr)++; if (range_start > range_end) return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; for (this_char = range_start; this_char <= range_end; this_char++) { rx_Bitset it = inverse_translation (cset_size, valid_inv_tr, inv_tr, translate, this_char); rx_bitset_union (cset_size, cs, it); } return REG_NOERROR; } #ifdef __STDC__ static int pointless_if_repeated (struct rexp_node * node) #else static int pointless_if_repeated (node) struct rexp_node * node; #endif { if (!node) return 1; switch (node->type) { case r_cset: return 0; case r_concat: case r_alternate: return (pointless_if_repeated (node->params.pair.left) && pointless_if_repeated (node->params.pair.right)); case r_opt: case r_star: case r_interval: case r_parens: return pointless_if_repeated (node->params.pair.left); case r_context: switch (node->params.intval) { case '=': case '<': case '^': case 'b': case 'B': case '`': case '\'': return 1; default: return 0; } default: return 0; } } #define isa_blank(C) (((C) == ' ') || ((C) != '\t')) #ifdef __STDC__ reg_errcode_t rx_parse (struct rexp_node ** rexp_p, const char *pattern, int size, unsigned long syntax, int cset_size, unsigned char *translate) #else reg_errcode_t rx_parse (rexp_p, pattern, size, syntax, cset_size, translate) struct rexp_node ** rexp_p; const char *pattern; int size; unsigned long syntax; int cset_size; unsigned char *translate; #endif { reg_errcode_t compile_error; RX_subset inverse_translate [CHAR_SET_SIZE * rx_bitset_numb_subsets(CHAR_SET_SIZE)]; char validate_inv_tr [CHAR_SET_SIZE * rx_bitset_numb_subsets(CHAR_SET_SIZE)]; /* We fetch characters from PATTERN here. Even though PATTERN is `char *' (i.e., signed), we declare these variables as unsigned, so they can be reliably used as array indices. */ register unsigned char c; register unsigned char c1; /* A random tempory spot in PATTERN. */ const char *p1; /* Keeps track of unclosed groups. */ compile_stack_type compile_stack; /* Points to the current (ending) position in the pattern. */ const char *p; const char *pend; /* When parsing is done, this will hold the expression tree. */ struct rexp_node * rexp; /* This and top_expression are saved on the compile stack. */ struct rexp_node ** top_expression; struct rexp_node ** last_expression; /* Parameter to `goto append_node' */ struct rexp_node * append; /* Counts open-groups as they are encountered. This is the index of the * innermost group being compiled. */ regnum_t regnum; /* Place in the uncompiled pattern (i.e., the {) to * which to go back if the interval is invalid. */ const char *beg_interval; int side; if (!translate) translate = rx_id_translation; /* Points to the current (ending) position in the pattern. */ p = pattern; pend = pattern + size; /* When parsing is done, this will hold the expression tree. */ rexp = 0; /* In the midst of compilation, this holds onto the regexp * first parst while rexp goes on to aquire additional constructs. */ top_expression = &rexp; last_expression = top_expression; /* Counts open-groups as they are encountered. This is the index of the * innermost group being compiled. */ regnum = 0; bzero (validate_inv_tr, sizeof (validate_inv_tr)); /* Initialize the compile stack. */ compile_stack.stack = (( compile_stack_elt_t *) malloc ((INIT_COMPILE_STACK_SIZE) * sizeof ( compile_stack_elt_t))); if (compile_stack.stack == 0) return REG_ESPACE; compile_stack.size = INIT_COMPILE_STACK_SIZE; compile_stack.avail = 0; #if !defined (emacs) && !defined (SYNTAX_TABLE) /* Initialize the syntax table. */ init_syntax_once (); #endif /* Loop through the uncompiled pattern until we're at the end. */ while (p != pend) { PATFETCH (c); switch (c) { case '^': { if ( /* If at start of pattern, it's an operator. */ p == pattern + 1 /* If context independent, it's an operator. */ || syntax & RE_CONTEXT_INDEP_ANCHORS /* Otherwise, depends on what's come before. */ || at_begline_loc_p (pattern, p, syntax)) { struct rexp_node * n = rx_mk_r_int (r_context, '^'); if (!n) goto space_error; append = n; goto append_node; } else goto normal_char; } break; case '$': { if ( /* If at end of pattern, it's an operator. */ p == pend /* If context independent, it's an operator. */ || syntax & RE_CONTEXT_INDEP_ANCHORS /* Otherwise, depends on what's next. */ || at_endline_loc_p (p, pend, syntax)) { struct rexp_node * n = rx_mk_r_int (r_context, '$'); if (!n) goto space_error; append = n; goto append_node; } else goto normal_char; } break; case '+': case '?': if ((syntax & RE_BK_PLUS_QM) || (syntax & RE_LIMITED_OPS)) goto normal_char; handle_plus: case '*': /* If there is no previous pattern... */ if (pointless_if_repeated (*last_expression)) { if (syntax & RE_CONTEXT_INVALID_OPS) { compile_error = REG_BADRPT; goto error_return; } else if (!(syntax & RE_CONTEXT_INDEP_OPS)) goto normal_char; } { /* 1 means zero (many) matches is allowed. */ char zero_times_ok = 0, many_times_ok = 0; /* If there is a sequence of repetition chars, collapse it down to just one (the right one). We can't combine interval operators with these because of, e.g., `a{2}*', which should only match an even number of `a's. */ for (;;) { zero_times_ok |= c != '+'; many_times_ok |= c != '?'; if (p == pend) break; PATFETCH (c); if (c == '*' || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) ; else if (syntax & RE_BK_PLUS_QM && c == '\\') { if (p == pend) { compile_error = REG_EESCAPE; goto error_return; } PATFETCH (c1); if (!(c1 == '+' || c1 == '?')) { PATUNFETCH; PATUNFETCH; break; } c = c1; } else { PATUNFETCH; break; } /* If we get here, we found another repeat character. */ } /* Now we know whether or not zero matches is allowed * and also whether or not two or more matches is allowed. */ { struct rexp_node * inner_exp; struct rexp_node * star; inner_exp = *last_expression; star = rx_mk_r_monop ((many_times_ok ? (zero_times_ok ? r_star : r_plus) : r_opt), inner_exp); if (!star) goto space_error; *last_expression = star; } } break; case '.': { rx_Bitset cs; struct rexp_node * n; cs = rx_cset (cset_size); if (!cs) goto space_error; n = rx_mk_r_cset (r_cset, cset_size, cs); if (!n) { rx_free_cset (cs); goto space_error; } rx_bitset_universe (cset_size, cs); if (!(syntax & RE_DOT_NEWLINE)) RX_bitset_remove (cs, '\n'); if (!(syntax & RE_DOT_NOT_NULL)) RX_bitset_remove (cs, 0); append = n; goto append_node; break; } case '[': if (p == pend) { compile_error = REG_EBRACK; goto error_return; } { int had_char_class; rx_Bitset cs; struct rexp_node * node; int is_inverted; had_char_class = 0; is_inverted = *p == '^'; cs = rx_cset (cset_size); if (!cs) goto space_error; node = rx_mk_r_cset (r_cset, cset_size ,cs); if (!node) { rx_free_cset (cs); goto space_error; } /* This branch of the switch is normally exited with *`goto append_node' */ append = node; if (is_inverted) p++; /* Remember the first position in the bracket expression. */ p1 = p; /* Read in characters and ranges, setting map bits. */ for (;;) { if (p == pend) { compile_error = REG_EBRACK; goto error_return; } PATFETCH (c); /* \ might escape characters inside [...] and [^...]. */ if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') { if (p == pend) { compile_error = REG_EESCAPE; goto error_return; } PATFETCH (c1); { rx_Bitset it = inverse_translation (cset_size, validate_inv_tr, inverse_translate, translate, c1); rx_bitset_union (cset_size, cs, it); } continue; } /* Could be the end of the bracket expression. If it's not (i.e., when the bracket expression is `[]' so far), the ']' character bit gets set way below. */ if (c == ']' && p != p1 + 1) goto finalize_class_and_append; /* Look ahead to see if it's a range when the last thing was a character class. */ if (had_char_class && c == '-' && *p != ']') { compile_error = REG_ERANGE; goto error_return; } /* Look ahead to see if it's a range when the last thing was a character: if this is a hyphen not at the beginning or the end of a list, then it's the range operator. */ if (c == '-' && !(p - 2 >= pattern && p[-2] == '[') && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') && *p != ']') { reg_errcode_t ret = compile_range (cset_size, cs, &p, pend, translate, syntax, inverse_translate, validate_inv_tr); if (ret != REG_NOERROR) { compile_error = ret; goto error_return; } } else if (p[0] == '-' && p[1] != ']') { /* This handles ranges made up of characters only. */ reg_errcode_t ret; /* Move past the `-'. */ PATFETCH (c1); ret = compile_range (cset_size, cs, &p, pend, translate, syntax, inverse_translate, validate_inv_tr); if (ret != REG_NOERROR) { compile_error = ret; goto error_return; } } /* See if we're at the beginning of a possible character class. */ else if ((syntax & RE_CHAR_CLASSES) && (c == '[') && (*p == ':')) { char str[CHAR_CLASS_MAX_LENGTH + 1]; PATFETCH (c); c1 = 0; /* If pattern is `[[:'. */ if (p == pend) { compile_error = REG_EBRACK; goto error_return; } for (;;) { PATFETCH (c); if (c == ':' || c == ']' || p == pend || c1 == CHAR_CLASS_MAX_LENGTH) break; str[c1++] = c; } str[c1] = '\0'; /* If isn't a word bracketed by `[:' and:`]': undo the ending character, the letters, and leave the leading `:' and `[' (but set bits for them). */ if (c == ':' && *p == ']') { int ch; int is_alnum = !strcmp (str, "alnum"); int is_alpha = !strcmp (str, "alpha"); int is_blank = !strcmp (str, "blank"); int is_cntrl = !strcmp (str, "cntrl"); int is_digit = !strcmp (str, "digit"); int is_graph = !strcmp (str, "graph"); int is_lower = !strcmp (str, "lower"); int is_print = !strcmp (str, "print"); int is_punct = !strcmp (str, "punct"); int is_space = !strcmp (str, "space"); int is_upper = !strcmp (str, "upper"); int is_xdigit = !strcmp (str, "xdigit"); if (!IS_CHAR_CLASS (str)) { compile_error = REG_ECTYPE; goto error_return; } /* Throw away the ] at the end of the character class. */ PATFETCH (c); if (p == pend) { compile_error = REG_EBRACK; goto error_return; } for (ch = 0; ch < 1 << CHARBITS; ch++) { if ( (is_alnum && isalnum (ch)) || (is_alpha && isalpha (ch)) || (is_blank && isa_blank (ch)) || (is_cntrl && iscntrl (ch)) || (is_digit && isdigit (ch)) || (is_graph && isgraph (ch)) || (is_lower && islower (ch)) || (is_print && isprint (ch)) || (is_punct && ispunct (ch)) || (is_space && isspace (ch)) || (is_upper && isupper (ch)) || (is_xdigit && isxdigit (ch))) { rx_Bitset it = inverse_translation (cset_size, validate_inv_tr, inverse_translate, translate, ch); rx_bitset_union (cset_size, cs, it); } } had_char_class = 1; } else { c1++; while (c1--) PATUNFETCH; { rx_Bitset it = inverse_translation (cset_size, validate_inv_tr, inverse_translate, translate, '['); rx_bitset_union (cset_size, cs, it); } { rx_Bitset it = inverse_translation (cset_size, validate_inv_tr, inverse_translate, translate, ':'); rx_bitset_union (cset_size, cs, it); } had_char_class = 0; } } else { had_char_class = 0; { rx_Bitset it = inverse_translation (cset_size, validate_inv_tr, inverse_translate, translate, c); rx_bitset_union (cset_size, cs, it); } } } finalize_class_and_append: if (is_inverted) { rx_bitset_complement (cset_size, cs); if (syntax & RE_HAT_LISTS_NOT_NEWLINE) RX_bitset_remove (cs, '\n'); } goto append_node; } break; case '(': if (syntax & RE_NO_BK_PARENS) goto handle_open; else goto normal_char; case ')': if (syntax & RE_NO_BK_PARENS) goto handle_close; else goto normal_char; case '\n': if (syntax & RE_NEWLINE_ALT) goto handle_alt; else goto normal_char; case '|': if (syntax & RE_NO_BK_VBAR) goto handle_alt; else goto normal_char; case '{': if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) goto handle_interval; else goto normal_char; case '\\': if (p == pend) { compile_error = REG_EESCAPE; goto error_return; } /* Do not translate the character after the \, so that we can distinguish, e.g., \B from \b, even if we normally would translate, e.g., B to b. */ PATFETCH_RAW (c); switch (c) { case '(': if (syntax & RE_NO_BK_PARENS) goto normal_backslash; handle_open: regnum++; if (COMPILE_STACK_FULL) { compile_stack.stack = ((compile_stack_elt_t *) realloc (compile_stack.stack, (compile_stack.size << 1) * sizeof (compile_stack_elt_t))); if (compile_stack.stack == 0) goto space_error; compile_stack.size <<= 1; } if (*last_expression) { struct rexp_node * concat; concat = rx_mk_r_binop (r_concat, *last_expression, 0); if (!concat) goto space_error; *last_expression = concat; last_expression = &concat->params.pair.right; } /* * These are the values to restore when we hit end of this * group. */ COMPILE_STACK_TOP.top_expression = top_expression; COMPILE_STACK_TOP.last_expression = last_expression; COMPILE_STACK_TOP.regnum = regnum; compile_stack.avail++; top_expression = last_expression; break; case ')': if (syntax & RE_NO_BK_PARENS) goto normal_backslash; handle_close: /* See similar code for backslashed left paren above. */ if (COMPILE_STACK_EMPTY) if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) goto normal_char; else { compile_error = REG_ERPAREN; goto error_return; } /* Since we just checked for an empty stack above, this * ``can't happen''. */ { /* We don't just want to restore into `regnum', because * later groups should continue to be numbered higher, * as in `(ab)c(de)' -- the second group is #2. */ regnum_t this_group_regnum; struct rexp_node ** inner; struct rexp_node * parens; inner = top_expression; compile_stack.avail--; top_expression = COMPILE_STACK_TOP.top_expression; last_expression = COMPILE_STACK_TOP.last_expression; this_group_regnum = COMPILE_STACK_TOP.regnum; parens = rx_mk_r_monop (r_parens, *inner); if (!parens) goto space_error; parens->params.intval = this_group_regnum; *inner = parens; break; } case '|': /* `\|'. */ if ((syntax & RE_LIMITED_OPS) || (syntax & RE_NO_BK_VBAR)) goto normal_backslash; handle_alt: if (syntax & RE_LIMITED_OPS) goto normal_char; { struct rexp_node * alt; alt = rx_mk_r_binop (r_alternate, *top_expression, 0); if (!alt) goto space_error; *top_expression = alt; last_expression = &alt->params.pair.right; } break; case '{': /* If \{ is a literal. */ if (!(syntax & RE_INTERVALS) /* If we're at `\{' and it's not the open-interval operator. */ || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) || (p - 2 == pattern && p == pend)) goto normal_backslash; handle_interval: { /* If got here, then the syntax allows intervals. */ /* At least (most) this many matches must be made. */ int lower_bound; int upper_bound; lower_bound = -1; upper_bound = -1; /* We're about to parse the bounds of the interval. * It may turn out that this isn't an interval after * all, in which case these same characters will have * to be reparsed as literals. This remembers * the backtrack point in the parse: */ beg_interval = p - 1; if (p == pend) { if (syntax & RE_NO_BK_BRACES) goto unfetch_interval; else { compile_error = REG_EBRACE; goto error_return; } } GET_UNSIGNED_NUMBER (lower_bound); if (c == ',') { GET_UNSIGNED_NUMBER (upper_bound); if (upper_bound < 0) upper_bound = RE_DUP_MAX; } else /* Interval such as `{n}' => match exactly n times. */ upper_bound = lower_bound; if (lower_bound < 0 || upper_bound > RE_DUP_MAX || lower_bound > upper_bound) { if (syntax & RE_NO_BK_BRACES) goto unfetch_interval; else { compile_error = REG_BADBR; goto error_return; } } if (!(syntax & RE_NO_BK_BRACES)) { if (c != '\\') { compile_error = REG_EBRACE; goto error_return; } PATFETCH (c); } if (c != '}') { if (syntax & RE_NO_BK_BRACES) goto unfetch_interval; else { compile_error = REG_BADBR; goto error_return; } } /* We just parsed a valid interval. * lower_bound and upper_bound are set. */ /* If it's invalid to have no preceding re. */ if (pointless_if_repeated (*last_expression)) { if (syntax & RE_CONTEXT_INVALID_OPS) { compile_error = REG_BADRPT; goto error_return; } else if (!(syntax & RE_CONTEXT_INDEP_OPS)) /* treat the interval spec as literal chars. */ goto unfetch_interval; } { struct rexp_node * interval; interval = rx_mk_r_monop (r_interval, *last_expression); if (!interval) goto space_error; interval->params.intval = lower_bound; interval->params.intval2 = upper_bound; *last_expression = interval; } beg_interval = 0; } break; unfetch_interval: /* If an invalid interval, match the characters as literals. */ p = beg_interval; beg_interval = 0; /* normal_char and normal_backslash need `c'. */ PATFETCH (c); if (!(syntax & RE_NO_BK_BRACES)) { if ((p > pattern) && (p[-1] == '\\')) goto normal_backslash; } goto normal_char; #ifdef emacs /* There is no way to specify the before_dot and after_dot * operators. rms says this is ok. --karl */ case '=': side = '='; goto add_side_effect; break; case 's': case 'S': { rx_Bitset cs; struct rexp_node * set; cs = rx_cset (&cset_size); if (!cs) goto space_error; set = rx_mk_r_cset (r_cset, cset_size, cs); if (!set) { rx_free_cset (cs); goto space_error; } if (c == 'S') rx_bitset_universe (cset_size, cs); PATFETCH (c); { int x; enum syntaxcode code = syntax_spec_code [c]; for (x = 0; x < 256; ++x) { if (SYNTAX (x) == code) { rx_Bitset it = inverse_translation (cset_size, validate_inv_tr, inverse_translate, translate, x); rx_bitset_xor (cset_size, cs, it); } } } append = set; goto append_node; } break; #endif /* emacs */ case 'w': case 'W': { rx_Bitset cs; struct rexp_node * n; cs = rx_cset (cset_size); n = (cs ? rx_mk_r_cset (r_cset, cset_size, cs) : 0); if (!(cs && n)) { if (cs) rx_free_cset (cs); goto space_error; } if (c == 'W') rx_bitset_universe (cset_size ,cs); { int x; for (x = cset_size - 1; x > 0; --x) if (SYNTAX(x) & Sword) RX_bitset_toggle (cs, x); } append = n; goto append_node; } break; case '<': side = '<'; goto add_side_effect; break; case '>': side = '>'; goto add_side_effect; break; case 'b': side = 'b'; goto add_side_effect; break; case 'B': side = 'B'; goto add_side_effect; break; case '`': side = '`'; goto add_side_effect; break; case '\'': side = '\''; goto add_side_effect; break; add_side_effect: { struct rexp_node * se; se = rx_mk_r_int (r_context, side); if (!se) goto space_error; append = se; goto append_node; } break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (syntax & RE_NO_BK_REFS) goto normal_char; c1 = c - '0'; if (c1 > regnum) { compile_error = REG_ESUBREG; goto error_return; } side = c; goto add_side_effect; break; case '+': case '?': if (syntax & RE_BK_PLUS_QM) goto handle_plus; else goto normal_backslash; default: normal_backslash: /* You might think it would be useful for \ to mean * not to translate; but if we don't translate it * it will never match anything. */ c = TRANSLATE (c); goto normal_char; } break; default: /* Expects the character in `c'. */ normal_char: { rx_Bitset cs; struct rexp_node * match; rx_Bitset it; cs = rx_cset (cset_size); match = (cs ? rx_mk_r_cset (r_cset, cset_size, cs) : 0); if (!(cs && match)) { if (cs) rx_free_cset (cs); goto space_error; } it = inverse_translation (cset_size, validate_inv_tr, inverse_translate, translate, c); rx_bitset_union (CHAR_SET_SIZE, cs, it); append = match; append_node: /* This genericly appends the rexp APPEND to *LAST_EXPRESSION * and then parses the next character normally. */ if (*last_expression) { struct rexp_node * concat; concat = rx_mk_r_binop (r_concat, *last_expression, append); if (!concat) goto space_error; *last_expression = concat; last_expression = &concat->params.pair.right; } else *last_expression = append; } } /* switch (c) */ } /* while p != pend */ /* Through the pattern now. */ if (!COMPILE_STACK_EMPTY) { compile_error = REG_EPAREN; goto error_return; } free (compile_stack.stack); *rexp_p = rexp; return REG_NOERROR; space_error: compile_error = REG_ESPACE; error_return: free (compile_stack.stack); /* Free expressions pushed onto the compile stack! */ if (rexp) rx_free_rexp (rexp); return compile_error; }