home *** CD-ROM | disk | FTP | other *** search
- unit FParse;
- { Copyright 1996 Kevin L. Boylan }
-
- {
- TFParse will parse words (as defined by the developer or user) from a file, s string, or
- a PChar. For performance purposes, all parsing is done in memory. If you are parsing from
- a file and the file is > 64K, it will be buffered in 64K chunks. If you are developing in
- the Delphi 2 environment, the file will be buffered in 500,000 byte chunks.
-
- What is to be parsed is determined by setting one of the following three properties.
- Setting one of these properties will override any previous settings of one of the three:
-
- FileToParse: The path and filename of a file that you wish to parsed.
- StringToParse: A String variable that you wish to parse.
- PCharToParse: A PChar variable that you wish to parse.
-
- What constitutes a word is defined through these properties:
-
- NormalCharacters: The set of characters that can go into a word.
- SignificantCharacters: The set of characters that can go into a word only if surrounded
- by NormalCharacters.
- InsignificantCharacters:The set of characters that will be ignored and removed from between
- two NormalCharacters.
- MIN_WORD_LENGTH: The minimum length of words that will be accepted.
- MAX_WORD_LENGTH: The maximum length of words returned. Longer words are truncated.
- CommonWords: This list of words will be ingored, not returned. If you don't want
- to make use of this property, you can set the property CmnWrdsActive
- to False.
-
- Once these properties are set, you then repeatedly call GetNext, which returns a string
- containing the next word from the file, string, or PChar. When GetNext returns a null
- string ('') then all words have been parsed.
-
- 5/12/96 Released version 1.0
- 5/15/96 Modified CreateCharSets, changing for loops to while loops because
- the counter was being modified inside the loops with inc(i,2). This
- problem kept the component from installing under Delphi 2.0 This
- modification resulted in version 1.1.
- 6/3/96 Fixed bugs which kept strings and PChars from being parsed. Also fixed
- a problem where the PercentDone property could be greater then 100% by
- adding the MinLongInt function. version 1.2
- 7/22/96 Added HTML parsing capabilites (minus special character handling) and condensed
- CreateCharSets member function by consolidating code into an internal procedure;
- 7/23/96 Added special character (i.e. ")handling capabilites to HTML parsing.
- 8/12/96 Added OnHTMLTag event
- 8/13/96 Added AutoDetect capability
- }
-
- interface
-
- uses
- SysUtils, WinTypes, WinProcs, Messages, Classes, Graphics, Controls,
- Forms, Dialogs;
-
- type
- EDocType = (dtAscii, dtHTML);
- TCharSet = Set of Char;
- EOFileException = class(Exception);
- THTMLTagEvent = procedure(HTMLTag: String) of object;
-
- TFParse = class(TComponent)
- private
- { Private declarations }
- FFileToParse: String;
- theFile: TFileStream;
- ByteCount: LongInt;
- TotalSize: LongInt;
- FBuffer,CurrChar,EOBuffer: PChar;
- BuffLen: LongInt;
- NormChar,SigChar,InSigChar: TCharSet;
- FNormalCharacters: String;
- FSignificantCharacters: String;
- FInsignificantChars: String;
- FCommonWords: TStrings;
- FCmnWrdsActive: Boolean;
- FMIN_WORD_LENGTH: Integer;
- FMAX_WORD_LENGTH: Integer;
- FDocumentType: EDocType;
- FOnHTMLTag: THTMLTagEvent;
- FAutoDetect: Boolean;
- protected
- { Protected declarations }
- procedure SetFileToParse( FName: String );
- procedure SetStringToParse( theStr: String );
- procedure SetPCharToParse( thePChar: PChar );
- function GetPercentDone: LongInt;
- function NextChar: Boolean;
- procedure Loaded; override;
- procedure CreateCharSets;
- procedure SetCommonWords(Value: TStrings);
- procedure Init;
- function DetermineType: EDocType;
- public
- { Public declarations }
- constructor Create( AOwner: TComponent ); override;
- destructor Destroy; override;
-
- property StringToParse: String write SetStringToParse;
- property PCharToParse: PChar write SetPCharToParse;
- function GetNext: String;
- property PercentDone: LongInt read GetPercentDone;
- published
- { Published properties }
- property FileToParse: String read FFileToParse write SetFileToParse;
- property MIN_WORD_LENGTH: Integer read FMIN_WORD_LENGTH write FMIN_WORD_LENGTH default 1;
- property MAX_WORD_LENGTH: Integer read FMAX_WORD_LENGTH write FMAX_WORD_LENGTH default 20;
- property NormalCharacters: String read FNormalCharacters write FNormalCharacters;
- property SignificantCharacters: String read FSignificantCharacters
- write FSignificantCharacters;
- property InsignificantChars: String read FInsignificantChars write FInsignificantChars;
- property CommonWords: TStrings read FCommonWords write SetCommonWords;
- property CmnWrdsActive: Boolean read FCmnWrdsActive write FCmnWrdsActive;
- property DocumentType: EDocType read FDocumentType write FDocumentType default dtAscii;
- property AutoDetect: Boolean read FAutoDetect write FAutoDetect default False;
- { Published events }
- property OnHTMLTag: THTMLTagEvent read FOnHTMLTag write FOnHTMLTag;
- end;
-
-
- procedure Register;
- function MinLongInt( Long1, Long2: LongInt ): LongInt;
-
- implementation
-
- const
- {$IFDEF WIN32}
- MAX_WORD = 500000;
- {$ELSE}
- MAX_WORD = 65526;
- {$ENDIF}
-
- constructor TFParse.Create( AOwner: TComponent );
- begin
- Inherited Create( AOwner );
- MIN_WORD_LENGTH := 1; { Default minimum word length }
- MAX_WORD_LENGTH := 20; { Default maximum word length }
- FCmnWrdsActive := False; { By default, CommonWords is not active }
- FAutoDetect := False; { By default, AutoDetect is false }
- FDocumentType := dtAscii; { Plain text file is default }
- If (csDesigning In ComponentState) then
- begin
- NormalCharacters := '0-9A-Za-z'; { Default chars that make words }
- SignificantCharacters := './'; { Default chars that can be inside words only }
- InsignificantChars := ','; { Default chars that will be stripped from words }
- end;
- FCommonWords := TStringList.Create;
- end;
-
- destructor TFParse.Destroy;
- begin
- theFile.Free;
- StrDispose( FBuffer );
- FCommonWords.Free;
- Inherited Destroy;
- end;
-
- procedure TFParse.Loaded;
- begin
- Inherited Loaded;
- if not (csDesigning In ComponentState) then
- CreateCharSets; { Create the Sets of Chars }
- end;
-
- procedure TFParse.SetCommonWords(Value: TStrings);
- begin
- FCommonWords.Assign(Value);
- end;
-
-
- procedure TFParse.CreateCharSets;
- { Converts from the string representation of the character sets to real
- sets of char }
- procedure CreateSet( var CharSet: TCharSet; SetDef: String );
- var
- i: Integer;
- j: Char;
- begin
- CharSet := [];
- i := 1;
- While i <= Length(SetDef) do { Read each character }
- begin
- If (i < Length(SetDef)-1) and (SetDef[i+1] = '-') then
- begin { Handle ranges }
- For j := SetDef[i] to SetDef[i+2] do
- CharSet := CharSet + [j]; { Add each in range to set }
- Inc(i,2);
- end
- else
- begin
- CharSet := CharSet + [SetDef[i]]; { Add to set }
- end;
- Inc(i,1);
- end;
- end; { procedure CreateSet }
-
- begin
- If NormalCharacters = '' then
- NormalCharacters := '0-9A-Za-z'; { If none set, then use the default }
- CreateSet( NormChar, NormalCharacters ); {Create Normal Character Set }
- CreateSet( SigChar, SignificantCharacters ); {Create Significant Character Set }
- CreateSet( InSigChar, InsignificantChars ); {Create InSignificant Character Set }
- end;
-
- procedure TFParse.Init;
- { Initializes variables prior to any parsing }
- begin
- CurrChar := FBuffer-1; { Will start out incrementing CurrChar }
- ByteCount := 1;
- EOBuffer := FBuffer + BuffLen;
- EOBuffer^ := #0;
- end;
-
- function TFParse.DetermineType: EDocType;
- { Determines what type of document (i.e. Ascii or HTML) has been loaded in
- the buffer. }
- const
- DetectStringSize = 50;
- var
- DetectStr: String[DetectStringSize];
- i: PChar;
- begin
- Result := dtAscii; { assume ascii }
- i := FBuffer;
- DetectStr := '';
- While ((i<EOBuffer) and (i<FBuffer+DetectStringSize)) do
- begin
- DetectStr := DetectStr + i^;
- Inc(i);
- end;
- DetectStr := LowerCase(DetectStr);
- { --- Check for HTML }
- If (ExtractFileExt( FFileToParse ) = '.htm' )
- or ((Pos('<', DetectStr) > 0) and (Pos('html', DetectStr) > 0)) then
- Result := dtHTML;
- end;
-
- procedure TFParse.SetFileToParse( FName: String );
- { Opens the file to be parsed. Maximum block size to be read in at one time is
- MAX_WORD which should be close to 64K for 16 bit and much greater for 32 bit }
- begin
- theFile.Free;
- try
- theFile := TFileStream.Create( FName, fmOpenRead );
- If (theFile.Size > MAX_WORD) then
- BuffLen := MAX_WORD
- else
- BuffLen := theFile.Size+1;
- TotalSize := theFile.Size;
- StrDispose(FBuffer);
- FBuffer := StrAlloc(BuffLen);
- Init;
- theFile.Read( FBuffer^, BuffLen-1 );
- If AutoDetect then
- FDocumentType := DetermineType;
- except
- On EFOpenError do
- raise EFOpenError.CreateFmt( 'File %s could not be found', [FName] );
- end; { except }
- end;
-
- procedure TFParse.SetStringToParse( theStr: String );
- { Sets the buffer to be parsed from a String instead of from a file }
- begin
- FFileToParse := '';
- TotalSize := Length(theStr);
- BuffLen := TotalSize + 1;
- StrDispose(FBuffer);
- FBuffer := StrAlloc( BuffLen );
- StrPCopy( FBuffer, theStr );
- Init;
- If AutoDetect then
- FDocumentType := DetermineType;
- end;
-
- procedure TFParse.SetPCharToParse( thePChar: PChar );
- { Sets the buffer to be parsed from a PChar instead of from a file }
- begin
- FFileToParse := '';
- TotalSize := StrLen( thePChar );
- BuffLen := TotalSize + 1;
- StrDispose(FBuffer);
- FBuffer := StrAlloc( BuffLen );
- StrCopy( FBuffer, thePChar );
- Init;
- If AutoDetect then
- FDocumentType := DetermineType;
- end;
-
- function TFParse.GetNext: String;
- { returns the next word from the file }
- var
- tmpWord: String[255];
- tmpChar: PChar;
-
- procedure GetHTMLLiteral;
- { It is assumed that tmpChar is pointing at a '&' }
- const
- LiteralStringSize = 3;
- var
- AStr: String[LiteralStringSize];
- begin
- AStr := '';
- tmpChar^ := #0;
- Inc(tmpChar);
- If tmpChar^ = '#' then { its an ascii representation }
- begin
- tmpChar^ := #0;
- Inc(tmpChar);
- While (tmpChar^ <> ';') and (Length(AStr) <= LiteralStringSize) do
- begin
- AStr := AStr + tmpChar^;
- tmpChar^ := #0;
- Inc(tmpChar);
- end;
- tmpChar^ := Chr( StrToInt(AStr) );
- end
- Else { its an entity code-word }
- begin
- AStr := tmpChar^;
- tmpChar^ := #0;
- Inc(tmpChar);
- While (tmpChar^ <> ';') do
- begin
- AStr := AStr + tmpChar^;
- tmpChar^ := #0;
- Inc(tmpChar);
- end;
- AStr := LowerCase(AStr);
- If AStr = 'amp' then
- tmpChar^ := '&'
- Else If AStr = 'gt' then
- tmpChar^ := '>'
- Else If AStr = 'lt' then
- tmpChar^ := '<'
- Else If AStr = 'quot' then
- tmpChar^ := '"'
- Else
- tmpChar^ := #0; { we don't handle any others at this time }
- end;
- end;
-
- procedure ClearHTML;
- { Zero's out irrelevant HTML code so that parsing will ignore }
- var
- HTMLTag: String;
- begin
- tmpChar := CurrChar;
- HTMLTag := '';
- Repeat
- If tmpChar^ = '<' then
- begin
- While True do
- begin
- If (tmpChar^ = '>') then { End on the '>' character }
- begin
- if Assigned(FOnHTMLTag) then { Handle the OnHTMLTag event }
- begin
- HTMLTag := HTMLTag + tmpChar^;
- FOnHTMLTag( HTMLTag );
- HTMLTag := '';
- end;
- tmpChar^ := #0; { Zero out so will be ignored }
- Inc(tmpChar);
- Break;
- end;
- If Assigned(FOnHTMLTag) then { Collect tag contents if OnHTMLTag is set }
- HTMLTag := HTMLTag + tmpChar^;
- tmpChar^ := #0;
- Inc(tmpChar);
- end;
- end
- Else
- GetHTMLLiteral; {It's a literal, not a Tag }
- Until (tmpChar^ <> '<') and (tmpChar^ <> '&')
- end;
-
- begin
- try { EOFileException is raised when end of buffer is reached }
- tmpChar := nil;
- Repeat
- Repeat
- Result := '';
- tmpWord := '';
- While (NextChar) do { Find next NormChar to start a word }
- begin { First check to see if it's HTML stuff to skip }
- { Begin HTML Check ***** }
- If ((DocumentType = dtHTML) and (tmpChar < CurrChar)) and ((CurrChar^ = '<') or (CurrChar^ = '&')) then
- If Length(tmpWord) > 0 then
- begin
- Dec(CurrChar);
- Dec(ByteCount);
- Break;
- end
- Else
- ClearHTML;
- { End HTML Check ***** }
-
- If (CurrChar^ = #0) or (CurrChar ^ in InSigChar) then
- Continue;
- If (CurrChar^ in NormChar) or
- ((CurrChar^ in SigChar) and ((CurrChar+1)^ in NormChar)) then
- begin
- If Length(tmpWord) < 255 then
- tmpWord := tmpWord + CurrChar^;
- end
- Else
- Break;
- end;
- Until Length( tmpWord ) >= MIN_WORD_LENGTH;
-
- Result := Copy(LowerCase(tmpWord),1,MAX_WORD_LENGTH);
- Until ( (not CmnWrdsActive) or (FCommonWords.IndexOf( Result ) = -1) )
-
- except
- on EOFileException do { End of buffer has been reached }
- begin
- StrDispose( FBuffer );
- FBuffer := nil;
- exit;
- end;
- end;
-
- end;
-
- function TFParse.NextChar: Boolean;
- { points at the next character in the buffer and reacts according to what's there }
- var
- AmountToRead,
- AmountRead: LongInt;
- begin
- Result := True;
- Inc(CurrChar);
- Inc(ByteCount);
- If CurrChar < EOBuffer then
- exit;
- If ByteCount > TotalSize+1 then
- raise EOFileException.Create('End of File');
- CurrChar := FBuffer;
- AmountToRead := BuffLen - 1;
- FBuffer[0] := #0;
- AmountRead := theFile.Read( (FBuffer+StrLen(FBuffer))^, AmountToRead );
- If AmountRead < AmountToRead then
- begin
- EOBuffer := CurrChar + AmountRead;
- EOBuffer^ := #0;
- end;
- end;
-
- function TFParse.GetPercentDone: LongInt;
- { returns the percentage of the file that has been parsed }
- begin
- Result := MinLongInt(((ByteCount * 100) div TotalSize), 100 );
- end;
-
- function MinLongInt( Long1, Long2: LongInt ): LongInt;
- { returns the smallest of two long integers }
- begin
- If Long1 < Long2 then
- result := Long1
- Else
- result := Long2;
- end;
-
- procedure Register;
- { Registers the component }
- begin
- RegisterComponents('Samples', [TFParse]);
- end;
-
- end.
-