home *** CD-ROM | disk | FTP | other *** search
/ Language/OS - Multiplatform Resource Library / LANGUAGE OS.iso / icon / contrib / dev.lha / dev / regex.c < prev   
Encoding:
C/C++ Source or Header  |  1992-12-17  |  13.0 KB  |  404 lines

  1. /* @(#)regex.c  4.1 (Berkeley) 12/21/80 */
  2. #
  3.  
  4. /*
  5.  * routines to do regular expression matching
  6.  *
  7.  * Entry points:
  8.  *
  9.  *      re_comp(s)
  10.  *              char *s;
  11.  *       ... returns 0 if the string s was compiled successfully,
  12.  *                   a pointer to an error message otherwise.
  13.  *           If passed 0 or a null string returns without changing
  14.  *           the currently compiled re (see note 11 below).
  15.  *
  16.  *      re_exec(s)
  17.  *              char *s;
  18.  *       ... returns 1 if the string s matches the last compiled regular
  19.  *                     expression, 
  20.  *                   0 if the string s failed to match the last compiled
  21.  *                     regular expression, and
  22.  *                  -1 if the compiled regular expression was invalid 
  23.  *                     (indicating an internal error).
  24.  *
  25.  * The strings passed to both re_comp and re_exec may have trailing or
  26.  * embedded newline characters; they are terminated by nulls.
  27.  *
  28.  * The identity of the author of these routines is lost in antiquity;
  29.  * this is essentially the same as the re code in the original V6 ed.
  30.  *
  31.  * The regular expressions recognized are described below. This description
  32.  * is essentially the same as that for ed.
  33.  *
  34.  *      A regular expression specifies a set of strings of characters.
  35.  *      A member of this set of strings is said to be matched by
  36.  *      the regular expression.  In the following specification for
  37.  *      regular expressions the word `character' means any character but NUL.
  38.  *
  39.  *      1.  Any character except a special character matches itself.
  40.  *          Special characters are the regular expression delimiter plus
  41.  *          \ [ . and sometimes ^ * $.
  42.  *      2.  A . matches any character.
  43.  *      3.  A \ followed by any character except a digit or ( )
  44.  *          matches that character.
  45.  *      4.  A nonempty string s bracketed [s] (or [^s]) matches any
  46.  *          character in (or not in) s. In s, \ has no special meaning,
  47.  *          and ] may only appear as the first letter. A substring 
  48.  *          a-b, with a and b in ascending ASCII order, stands for
  49.  *          the inclusive range of ASCII characters.
  50.  *      5.  A regular expression of form 1-4 followed by * matches a
  51.  *          sequence of 0 or more matches of the regular expression.
  52.  *      6.  A regular expression, x, of form 1-8, bracketed \(x\)
  53.  *          matches what x matches.
  54.  *      7.  A \ followed by a digit n matches a copy of the string that the
  55.  *          bracketed regular expression beginning with the nth \( matched.
  56.  *      8.  A regular expression of form 1-8, x, followed by a regular
  57.  *          expression of form 1-7, y matches a match for x followed by
  58.  *          a match for y, with the x match being as long as possible
  59.  *          while still permitting a y match.
  60.  *      9.  A regular expression of form 1-8 preceded by ^ (or followed
  61.  *          by $), is constrained to matches that begin at the left
  62.  *          (or end at the right) end of a line.
  63.  *      10. A regular expression of form 1-9 picks out the longest among
  64.  *          the leftmost matches in a line.
  65.  *      11. An empty regular expression stands for a copy of the last
  66.  *          regular expression encountered.
  67.  */
  68.  
  69. /*
  70.  * constants for re's
  71.  */
  72. #define CBRA    1
  73. #define CCHR    2
  74. #define CDOT    4
  75. #define CCL     6
  76. #define NCCL    8
  77. #define CEOF    11
  78. #define CKET    12
  79. #define CBACK   18
  80.  
  81. #define CSTAR   01
  82.  
  83. #define ESIZE   512
  84. #define NBRA    9
  85.  
  86. static char *names[] = { "?", "\\(", "char", "char*", ".",
  87.                 ".*", "[", "[*", "[^", "[^*",
  88.                 "?", "$", "\\)", "?", "?", "?", "?", "?", "\\" };
  89.  
  90. char    expbuf[ESIZE], *braslist[NBRA], *braelist[NBRA];
  91. static  char    circf = 1;
  92.  
  93. /*
  94.  * compile the regular expression argument into a dfa
  95.  */
  96. char *
  97. re_comp(sp)
  98.         register char   *sp;
  99. {
  100.     char *alloc();
  101.         register int    c;
  102.         register char   *ep = expbuf;
  103.         int     cclcnt, numbra = 0;
  104.         char    *lastep = 0;
  105.         char    bracket[NBRA];
  106.         char    *bracketp = &bracket[0];
  107.         static  char    *retoolong = "Regular expression too long";
  108.  
  109. #define comerr(msg) {expbuf[0] = 0; numbra = 0; printf(msg); return(0); }
  110.  
  111.         if (sp == 0 || *sp == '\0') {
  112.                 if (*ep == 0)
  113.                         comerr("No previous regular expression\n");
  114.                 return(0);
  115.         }
  116.         /*      if (*sp == '^') {
  117.          *        circf = 1;
  118.          *        sp++;
  119.          *         }
  120.          *    else circf = 0;
  121.      */
  122.         for (;;) {
  123.                 if (ep >= &expbuf[ESIZE])
  124.                         comerr(retoolong);
  125.                 if ((c = *sp++) == '\0') {
  126.                         if (bracketp != bracket)
  127.                                 comerr("unmatched \\(");
  128.                         *ep++ = CEOF;
  129.                         *ep++ = 0;
  130.             sp = alloc(ep - expbuf);
  131.             for (numbra = 0; numbra < ep - expbuf; numbra++)
  132.                 {
  133.                 sp[numbra] = expbuf[numbra];
  134.                 }
  135.             return sp;
  136.             }
  137.                 if (c != '*')
  138.                         lastep = ep;
  139.                 switch (c) {
  140.  
  141.                 case '.':
  142.                         *ep++ = CDOT;
  143.                         continue;
  144.  
  145.                 case '*':
  146.                         if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
  147.                                 goto defchar;
  148.                         *lastep |= CSTAR;
  149.                         continue;
  150.  
  151.                 case '[':
  152.                         *ep++ = CCL;
  153.                         *ep++ = 0;
  154.                         cclcnt = 1;
  155.                         if ((c = *sp++) == '^') {
  156.                                 c = *sp++;
  157.                                 ep[-2] = NCCL;
  158.                         }
  159.                         do {
  160.                                 if (c == '\0')
  161.                                         comerr("missing ]");
  162.                                 if (c == '-' && ep [-1] != 0) {
  163.                                         if ((c = *sp++) == ']') {
  164.                                                 *ep++ = '-';
  165.                                                 cclcnt++;
  166.                                                 break;
  167.                                         }
  168.                                         while (ep[-1] < c) {
  169.                                                 *ep = ep[-1] + 1;
  170.                                                 ep++;
  171.                                                 cclcnt++;
  172.                                                 if (ep >= &expbuf[ESIZE])
  173.                                                         comerr(retoolong);
  174.                                         }
  175.                                 }
  176.                                 *ep++ = c;
  177.                                 cclcnt++;
  178.                                 if (ep >= &expbuf[ESIZE])
  179.                                         comerr(retoolong);
  180.                         } while ((c = *sp++) != ']');
  181.                         lastep[1] = cclcnt;
  182.                         continue;
  183.  
  184.                 case '\\':
  185.                         if ((c = *sp++) == '(') {
  186.                                 if (numbra >= NBRA)
  187.                                         comerr("too many \\(\\) pairs");
  188.                                 *bracketp++ = numbra;
  189.                                 *ep++ = CBRA;
  190.                                 *ep++ = numbra;
  191.                 numbra++;
  192.                                 continue;
  193.                         }
  194.                         if (c == ')') {
  195.                                 if (bracketp <= bracket)
  196.                                         comerr("unmatched \\)");
  197.                                 *ep++ = CKET;
  198.                                 *ep++ = *--bracketp;
  199.                                 continue;
  200.                         }
  201.                         if (c >= '1' && c < ('1' + NBRA)) {
  202.                                 *ep++ = CBACK;
  203.                                 *ep++ = c - '1';
  204.                                 continue;
  205.                         }
  206.                         *ep++ = CCHR;
  207.                         *ep++ = c;
  208.                         continue;
  209.  
  210.                 defchar:
  211.                 default:
  212.                         *ep++ = CCHR;
  213.                         *ep++ = c;
  214.                 }
  215.         }
  216. }
  217.  
  218.  
  219. /* 
  220.  * match the argument string against the compiled re
  221.  */
  222. int
  223. re_exec(subject, pattern, fromline)
  224.         register char   *subject;
  225.         register char   *pattern;
  226. {
  227.         register int    c;
  228.         int     rv;
  229.  
  230.         for (c = 0; c < NBRA; c++) {
  231.                 braslist[c] = 0;
  232.                 braelist[c] = 0;
  233.         }
  234.         if (circf) {
  235.                 c = (advance(subject, pattern));
  236.         /* printf("return %d from %d\n", c, fromline); */
  237.         return c;
  238.         }
  239.         /*
  240.          * fast check for first character
  241.          */
  242.         if (*pattern == CCHR) {
  243.                 c = pattern[1];
  244.                 do {
  245.                         if (*subject != c)
  246.                                 continue;
  247.                         if (rv = advance(subject, pattern)) {
  248.                 /* printf("return %d from %d\n", rv, fromline); */
  249.                                 return(rv);
  250.                 }
  251.                 } while (*subject++);
  252.         /* printf("return 0 from %d\n", fromline); */
  253.                 return(0);
  254.         }
  255.         /*
  256.          * regular algorithm
  257.          */
  258.         do
  259.                 if (rv = advance(subject, pattern)) {
  260.             /* printf("return %d from %d\n", rv, fromline); */
  261.                         return(rv);
  262.             }
  263.         while (*subject++);
  264.     /* printf("return 0 from %d\n", fromline); */
  265.         return(0);
  266. }
  267.  
  268. /* 
  269.  * try to match the next thing in the dfa
  270.  */
  271. static  int
  272. advance(lp, ep)
  273.         register char   *lp, *ep;
  274. {
  275.         register char   *curlp;
  276.         int     ct, i;
  277.         int     rv;
  278.  
  279.         for (;;)
  280.                 switch (*ep++) {
  281.  
  282.                 case CCHR:
  283.                         if (*ep++ == *lp++)
  284.                                 continue;
  285.                         return(0);
  286.  
  287.                 case CDOT:
  288.                         if (*lp++)
  289.                                 continue;
  290.                         return(0);
  291.  
  292.                 case CEOF:
  293.                         if (*lp == 0) return(1);
  294.                         return(0);
  295.  
  296.                 case CCL:
  297.                         if (cclass(ep, *lp++, 1)) {
  298.                                 ep += *ep;
  299.                                 continue;
  300.                         }
  301.                         return(0);
  302.  
  303.                 case NCCL:
  304.                         if (cclass(ep, *lp++, 0)) {
  305.                                 ep += *ep;
  306.                                 continue;
  307.                         }
  308.                         return(0);
  309.  
  310.                 case CBRA:
  311.                         braslist[*ep++] = lp;
  312.                         continue;
  313.  
  314.                 case CKET:
  315.                         braelist[*ep++] = lp;
  316.                         continue;
  317.  
  318.                 case CBACK:
  319.                         if (braelist[i = *ep++] == 0)
  320.                                 return(-1);
  321.                         if (backref(i, lp)) {
  322.                                 lp += braelist[i] - braslist[i];
  323.                                 continue;
  324.                         }
  325.                         return(0);
  326.  
  327.                 case CBACK|CSTAR:
  328.                         if (braelist[i = *ep++] == 0)
  329.                                 return(-1);
  330.                         curlp = lp;
  331.                         ct = braelist[i] - braslist[i];
  332.                         while (backref(i, lp))
  333.                                 lp += ct;
  334.                         while (lp >= curlp) {
  335.                                 if (rv = advance(lp, ep))
  336.                                         return(rv);
  337.                                 lp -= ct;
  338.                         }
  339.                         continue;
  340.  
  341.                 case CDOT|CSTAR:
  342.                         curlp = lp;
  343.                         while (*lp++)
  344.                                 ;
  345.                         goto star;
  346.  
  347.                 case CCHR|CSTAR:
  348.                         curlp = lp;
  349.                         while (*lp++ == *ep)
  350.                                 ;
  351.                         ep++;
  352.                         goto star;
  353.  
  354.                 case CCL|CSTAR:
  355.                 case NCCL|CSTAR:
  356.                         curlp = lp;
  357.                         while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
  358.                                 ;
  359.                         ep += *ep;
  360.                         goto star;
  361.  
  362.                 star:
  363.                         do {
  364.                                 lp--;
  365.                                 if (rv = advance(lp, ep))
  366.                                         return(rv);
  367.                         } while (lp > curlp);
  368.                         return(0);
  369.  
  370.                 default:
  371.             printf("oops %d\n", *(ep - 1));
  372.                         return(-1);
  373.                 }
  374. }
  375.  
  376. backref(i, lp)
  377.         register int    i;
  378.         register char   *lp;
  379. {
  380.         register char   *bp;
  381.  
  382.         bp = braslist[i];
  383.         while (*bp++ == *lp++)
  384.                 if (bp >= braelist[i])
  385.                         return(1);
  386.         return(0);
  387. }
  388.  
  389. int
  390. cclass(set, c, af)
  391.         register char   *set, c;
  392.         int     af;
  393. {
  394.         register int    n;
  395.  
  396.         if (c == 0)
  397.                 return(0);
  398.         n = *set++;
  399.         while (--n)
  400.                 if (*set++ == c)
  401.                         return(af);
  402.         return(! af);
  403. }
  404.