home *** CD-ROM | disk | FTP | other *** search
- {$nested-comments}
- {
- Regular expression matching and replacement
-
- The RegEx unit provides routines to match strings against regular
- expressions and perform substitutions using matched subexpressions.
-
- To use the RegEx unit, you will need the rx library which can be
- found in ftp://agnes.dida.physik.uni-essen.de/gnu-pascal/libs/ .
-
- Regular expressions are strings with some characters having special
- meanings. They describe (match) a class of strings. They are similar
- to wild cards used in file name matching, but much more powerful.
-
- There are two kinds of regular expressions supported by this unit,
- basic and extended regular expressions. The difference between them
- is not functionality, but only syntax. The following is a short
- overview of regular expressions. For a more thorough explanation see
- the literature, or the documentation of the rx library, or man pages
- of programs like grep(1) and sed(1).
-
- Basic Extended Meaning
- `.' `.' matches any single character
- `[aei-z]' `[aei-z]' matches either `a', `e', or any
- character from `i' to `z'
- `[^aei-z]' `[^aei-z]' matches any character but `a', `e',
- or `i' .. `z'
- To include in such a list the the
- characters `]', `^', or `-', put
- them first, anywhere but first, or
- first or last, resp.
- `[[:alnum:]]' `[[:alnum:]]' matches any alphanumeric character
- `[^[:digit:]]' `[^[:digit:]]' matches anything but a digit
- `[a[:space:]]' `[a[:space:]]' matches the letter `a' or a space
- character (space, tab)
- ... (there are more classes available)
- `\w' `\w' = [[:alnum:]]
- `\W' `\W' = [^[:alnum:]]
- `^' `^' matches the empty string at the
- beginning of a line
- `$' `$' matches the empty string at the end
- of a line
- `*' `*' matches zero or more occurences of
- the preceding expression
- `\+' `+' matches one or more occurences of
- the preceding expression
- `\?' `?' matches zero or one occurence of the
- preceding expression
- `\{N\}' `{N}' matches exactly N occurences of the
- preceding expression (N is an
- integer number)
- `\{M,N\}' `{M,N}' matches M to N occurences of the
- preceding expression (M and N are
- integer numbers, M <= N)
- `A B' `A B' matches A followed by B (A and B are
- regular expressions)
- `A\|B' `A|B' matches A or B (A and B are regular
- expressions)
- `\( \)' `( )' forms a subexpression, to override
- precedence, and for subexpression
- references
- `\7' `\7' matches the 7'th parenthesized
- subexpression (counted by their
- start in the regex), where 7 is a
- number from 1 to 9 ;-).
- *Please note:* using this feature
- can be *very* slow or take very much
- memory (exponential time and space
- in the worst case, if you know what
- that means...).
- `\' `\' quotes the following character if
- it's special (i.e. listed above)
- rest rest any other character matches itself
-
- Precedence, from highest to lowest:
- * parentheses (`()')
- * repetition (`*', `+', `?', `{}')
- * concatenation
- * alternation (`|')
-
- When performing substitutions using matched subexpressions of a
- regular expression (see `ReplaceSubExpressionReferences'), the
- replacement string can reference the whole matched expression with
- `&' or `\0', the 7th subexpression with `\7' (just like in the regex
- itself, but using it in replacements is not slow), and the 7th
- subexpression converted to upper/lower case with `\u7' or `\l7',
- resp. (which also works for the whole matched expression with `\u0'
- or `\l0'). A verbatim `&' or `\' can be specified with `\&' or `\\',
- resp.
-
- Copyright (C) 1998-2001 Free Software Foundation, Inc.
-
- Author: Frank Heckenbach <frank@pascal.gnu.de>
-
- This file is part of GNU Pascal.
-
- GNU Pascal is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
- any later version.
-
- GNU Pascal is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with GNU Pascal; see the file COPYING. If not, write to the
- Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
- 02111-1307, USA.
-
- As a special exception, if you link this file with files compiled
- with a GNU compiler to produce an executable, this does not cause
- the resulting executable to be covered by the GNU General Public
- License. This exception does not however invalidate any other
- reasons why the executable file might be covered by the GNU General
- Public License.
-
- Please also note the license of the rx library.
- }
-
- {$gnu-pascal,B-,I-}
- {$if __GPC_RELEASE__ < 20000412}
- {$error This unit requires GPC release 20000412 or newer.}
- {$endif}
-
- unit RegEx;
-
- interface
-
- uses GPC;
-
- const
- { `BasicRegExSpecialChars' contains all characters that have
- special meanings in basic regular expressions.
- `ExtRegExSpecialChars' contains those that have special meanings
- in extended regular expressions. }
- BasicRegExSpecialChars = ['.', '[', ']', '^', '$', '*', '\'];
- ExtRegExSpecialChars = ['.', '[', ']', '^', '$', '*', '+', '?', '{', '}', '|', '(', ')', '\'];
-
- type
- { The type used by the routines of the `RegEx' unit to store
- regular expressions in an internal format. The fields RegEx,
- RegMatch, ErrorInternal, From and Length are only used
- internally. SubExpressions can be read after `NewRegEx' and will
- contain the number of parenthesized subexpressions. Error should
- be checked after `NewRegEx'. It will be `nil' when it succeeded,
- and contain an error message otherwise. }
- RegExType = record
- RegEx, RegMatch : Pointer; { Internal }
- ErrorInternal : CString; { Internal }
- From, Length : Integer; { Internal }
- SubExpressions : Integer;
- Error : PString
- end;
-
- { Simple interface to regular expression mathing. Matches a regular
- expression against a string starting from a specified position.
- Returns the position of the first match, or 0 if it does not
- match, or the regular expression is invalid. }
- function RegExPosFrom (const Expression : String; ExtendedRegEx, CaseInsensitive : Boolean; const s : String; From : Integer) : Integer; asmname '_p_regexposfrom';
-
- { Creates the internal format of a regular expression. If
- ExtendedRegEx is True, Expression is assumed to denote an extended
- regular expression, otherwise a basic regular expression.
- CaseInsensitive determines if the case of letters will be ignored
- when mathing the expression. If NewLines is True, `NewLine'
- characters in a string matched against the expression will be
- treated as dividing the string in multiple lines, so that `$' can
- match before the NewLine and `^' can match after. Also, `.' and
- `[^...]' will not match a NewLine then.}
- procedure NewRegEx (var RegEx : RegExType; const Expression : String; ExtendedRegEx, CaseInsensitive, NewLines : Boolean); asmname '_p_newregex';
-
- { Disposes of a regular expression created with `NewRegEx'. *Must*
- be used after `NewRegEx' before the RegEx variable becomes invalid
- (i.e., goes out of scope or a pointer pointing to it is Dispose'd
- of). }
- procedure DisposeRegEx (var RegEx : RegExType); asmname '_p_dispose_regex';
-
- { Matches a regular expression created with `NewRegEx' against a
- string. }
- function MatchRegEx (var RegEx : RegExType; const s : String; NotBeginningOfLine, NotEndOfLine : Boolean) : Boolean; asmname '_p_matchregex';
-
- { Matches a regular expression created with `NewRegEx' against a
- string, starting from a specified position. }
- function MatchRegExFrom (var RegEx : RegExType; const s : String; NotBeginningOfLine, NotEndOfLine : Boolean; From : Integer) : Boolean; asmname '_p_matchregexfrom';
-
- { Finds out where the regular expression matched, if `MatchRegEx' or
- `MatchRegExFrom' were successful. If n = 0, it returns the
- position of the whole match, otherwise the position of the n'th
- parenthesized subexpression. MatchPosition and MatchLength will
- contain the position (counted from 1) and length of the match, or
- 0 if it didn't match. (Note: MatchLength can also be 0 for a
- successful empty match, so check MatchPosition for 0 to find out
- if it matched at all.) MatchPosition or MatchLength may be null
- and is ignored then. }
- procedure GetMatchRegEx (var RegEx : RegExType; n : Integer; var MatchPosition, MatchLength : Integer); asmname '_p_getmatch_regex';
-
- { Checks if the string s contains any quoted characters or
- (sub)expression references to the regular expression RegEx created
- with `NewRegEx'. These are `&' or `\0' for the whole matched
- expression (if OnlySub is not set) and `\1' .. `\9' for the n'th
- parenthesized subexpression. Returns 0 if it does not contain any,
- and the number of references and quoted characters if it does. If
- an invalid reference (i.e. a number bigger than the number of
- subexpressions in RegEx) is found, it returns the negative value
- of the (first) invalid reference. }
- function FindSubExpressionReferences (var RegEx : RegExType; const s : String; OnlySub : Boolean) : Integer; asmname '_p_find_subexpressionreferences_regex';
-
- { Replaces (sub)expression references in ReplaceStr by the actual
- (sub)expressions and unquotes quoted characters. To be used after
- the regular expression RegEx created with `NewRegEx' was matched
- against s successfully with `MatchRegEx' or `MatchRegExFrom'. }
- function ReplaceSubExpressionReferences (var RegEx : RegExType; const s, ReplaceStr : String) : TString; asmname '_p_replace_subexpressionreferences_regex';
-
- { Returns the string for a regular expression that matches exactly
- one character out of the given set. It can be combined with the
- usual operators to form more complex expressions. }
- function CharSet2RegEx (const Characters : CharSet) : TString; asmname '_p_CharSet2RegEx';
-
- implementation
-
- {$L rx, regexc.c}
-
- procedure CNewRegEx (var RegEx : RegExType; Expression : CString; ExpressionLength : Integer; ExtendedRegEx, CaseInsensitive, NewLines : Boolean); asmname '_p_new_regex';
- function CMatchRegExFrom (var RegEx : RegExType; aString : CString; StrLength : Integer; NotBeginningOfLine, NotEndOfLine : Boolean; From : Integer) : Boolean; asmname '_p_match_regex_from';
-
- procedure NewRegEx (var RegEx : RegExType; const Expression : String; ExtendedRegEx, CaseInsensitive, NewLines : Boolean);
- begin
- CNewRegEx (RegEx, Expression, Length (Expression), ExtendedRegEx, CaseInsensitive, NewLines);
- if RegEx.ErrorInternal = nil then
- RegEx.Error := nil
- else
- RegEx.Error := NewString (CString2String (RegEx.ErrorInternal))
- end;
-
- function MatchRegEx (var RegEx : RegExType; const s : String; NotBeginningOfLine, NotEndOfLine : Boolean) : Boolean;
- begin
- MatchRegEx := CMatchRegExFrom (RegEx, s, Length (s), NotBeginningOfLine, NotEndOfLine, 1)
- end;
-
- function MatchRegExFrom (var RegEx : RegExType; const s : String; NotBeginningOfLine, NotEndOfLine : Boolean; From : Integer) : Boolean;
- begin
- MatchRegExFrom := CMatchRegExFrom (RegEx, s, Length (s), (From <> 1) or NotBeginningOfLine, NotEndOfLine, From)
- end;
-
- function RegExPosFrom (const Expression : String; ExtendedRegEx, CaseInsensitive : Boolean; const s : String; From : Integer) = MatchPosition : Integer;
- var RegEx : RegExType;
- begin
- MatchPosition := 0;
- NewRegEx (RegEx, Expression, ExtendedRegEx, CaseInsensitive, False);
- if (RegEx.Error = nil) and MatchRegExFrom (RegEx, s, False, False, From) then
- GetMatchRegEx (RegEx, 0, MatchPosition, null);
- DisposeRegEx (RegEx)
- end;
-
- function FindSubExpressionReferences (var RegEx : RegExType; const s : String; OnlySub : Boolean) : Integer;
- var
- i, j : Integer;
- ch : Char;
- begin
- j := 0;
- i := 1;
- while i <= Length (s) do
- begin
- case s [i] of
- '&' : if not OnlySub then Inc (j);
- '\' : if i = Length (s) then
- Inc (j)
- else
- begin
- ch := s [i + 1];
- if (ch in ['u', 'l']) and (i + 1 < Length (s)) then
- begin
- Inc (i);
- ch := s [i + 1]
- end;
- if (ch in ['0' .. '9']) and
- (Ord (ch) - Ord ('0') > RegEx.SubExpressions) then
- return - (Ord (ch) - Ord ('0'))
- else
- begin
- if not OnlySub or (ch <> '0') then Inc (j);
- Inc (i)
- end
- end;
- end;
- Inc (i)
- end;
- FindSubExpressionReferences := j
- end;
-
- function ReplaceSubExpressionReferences (var RegEx : RegExType; const s, ReplaceStr : String) = Res : TString;
- var i : Integer;
-
- procedure DoReplace (l, n : Integer; CaseMode : Char);
- var
- MatchPosition, MatchLength : Integer;
- st : TString;
- begin
- GetMatchRegEx (RegEx, n, MatchPosition, MatchLength);
- Delete (Res, i, l);
- if (n <= RegEx.SubExpressions) and (MatchPosition > 0) (*@@rx-1.5 bug*)and(MatchPosition + MatchLength - 1 <= Length (s)) then
- begin
- st := Copy (s, MatchPosition, MatchLength);
- case CaseMode of
- 'u' : UpCaseString (st);
- 'l' : LoCaseString (st);
- end;
- Insert (st, Res, i);
- Inc (i, MatchLength)
- end;
- Dec (i)
- end;
-
- begin
- Res := ReplaceStr;
- i := 1;
- while i <= Length (Res) do
- begin
- case Res [i] of
- '&' : DoReplace (1, 0, #0);
- '\' : if (i < Length (Res)) and (Res [i + 1] in ['0' .. '9']) then
- DoReplace (2, Ord (Res [i + 1]) - Ord ('0'), #0)
- else if (i + 1 < Length (Res)) and (Res [i + 1] in ['u', 'l']) and (Res [i + 2] in ['0' .. '9']) then
- DoReplace (3, Ord (Res [i + 2]) - Ord ('0'), Res [i + 1])
- else
- Delete (Res, i, 1);
- end;
- Inc (i)
- end
- end;
-
- function CharSet2RegEx (const Characters : CharSet) = s : TString;
- var
- i : Integer;
- c, c2 : Char;
- s2, s3 : String (1) = '';
- begin
- if Characters = [] then return '[^' + Low (Char) + '-' + High (Char) + ']';
- if Characters = ['^'] then return '\^'; { A `^' alone cannot be handled within `[]'! }
- if Characters = ['^', '-'] then return '[-^]';
- s := '';
- i := Ord (Low (Char));
- { we cannot use a Char for the loop, because it would overflow }
- while i <= Ord (High (Char)) do
- begin
- c := Chr (i);
- if c in Characters then
- case c of
- ']' : s := c + s; { `]' must come first }
- '^' : s2 := c; { `^' must not come first }
- '-' : s3 := c; { `-' must come last (or first) }
- else
- c2 := c;
- while (c2 < High (c)) and (Succ (c2) in Characters) do Inc (c2);
- if c2 = ']' then Dec (c2); { `x-]' would be interpreted as a closing bracket }
- if c2 <= Succ (c) then
- s := s + c
- else
- begin
- s := s + c + '-' + c2;
- c := c2
- end
- end;
- i := Ord (c) + 1
- end;
- s := '[' + s + s2 + s3 + ']'
- end;
-
- end.
-