CD Shareware Magazine 1996 December

home *** CD-ROM | disk | FTP | other *** search

/ CD Shareware Magazine 1996 December / CD_shareware_12-96.iso / WIN / Programa / FPARSE20.ZIP / FPARSE.PAS < prev next >

Wrap

Pascal/Delphi Source File | 1996-08-13 | 14.7 KB | 471 lines

unit FParse; { Copyright 1996 Kevin L. Boylan } { TFParse will parse words (as defined by the developer or user) from a file, s string, or a PChar. For performance purposes, all parsing is done in memory. If you are parsing from a file and the file is > 64K, it will be buffered in 64K chunks. If you are developing in the Delphi 2 environment, the file will be buffered in 500,000 byte chunks. What is to be parsed is determined by setting one of the following three properties. Setting one of these properties will override any previous settings of one of the three: FileToParse: The path and filename of a file that you wish to parsed. StringToParse: A String variable that you wish to parse. PCharToParse: A PChar variable that you wish to parse. What constitutes a word is defined through these properties: NormalCharacters: The set of characters that can go into a word. SignificantCharacters: The set of characters that can go into a word only if surrounded by NormalCharacters. InsignificantCharacters:The set of characters that will be ignored and removed from between two NormalCharacters. MIN_WORD_LENGTH: The minimum length of words that will be accepted. MAX_WORD_LENGTH: The maximum length of words returned. Longer words are truncated. CommonWords: This list of words will be ingored, not returned. If you don't want to make use of this property, you can set the property CmnWrdsActive to False. Once these properties are set, you then repeatedly call GetNext, which returns a string containing the next word from the file, string, or PChar. When GetNext returns a null string ('') then all words have been parsed. 5/12/96 Released version 1.0 5/15/96 Modified CreateCharSets, changing for loops to while loops because the counter was being modified inside the loops with inc(i,2). This problem kept the component from installing under Delphi 2.0 This modification resulted in version 1.1. 6/3/96 Fixed bugs which kept strings and PChars from being parsed. Also fixed a problem where the PercentDone property could be greater then 100% by adding the MinLongInt function. version 1.2 7/22/96 Added HTML parsing capabilites (minus special character handling) and condensed CreateCharSets member function by consolidating code into an internal procedure; 7/23/96 Added special character (i.e. ")handling capabilites to HTML parsing. 8/12/96 Added OnHTMLTag event 8/13/96 Added AutoDetect capability } interface uses SysUtils, WinTypes, WinProcs, Messages, Classes, Graphics, Controls, Forms, Dialogs; type EDocType = (dtAscii, dtHTML); TCharSet = Set of Char; EOFileException = class(Exception); THTMLTagEvent = procedure(HTMLTag: String) of object; TFParse = class(TComponent) private { Private declarations } FFileToParse: String; theFile: TFileStream; ByteCount: LongInt; TotalSize: LongInt; FBuffer,CurrChar,EOBuffer: PChar; BuffLen: LongInt; NormChar,SigChar,InSigChar: TCharSet; FNormalCharacters: String; FSignificantCharacters: String; FInsignificantChars: String; FCommonWords: TStrings; FCmnWrdsActive: Boolean; FMIN_WORD_LENGTH: Integer; FMAX_WORD_LENGTH: Integer; FDocumentType: EDocType; FOnHTMLTag: THTMLTagEvent; FAutoDetect: Boolean; protected { Protected declarations } procedure SetFileToParse( FName: String ); procedure SetStringToParse( theStr: String ); procedure SetPCharToParse( thePChar: PChar ); function GetPercentDone: LongInt; function NextChar: Boolean; procedure Loaded; override; procedure CreateCharSets; procedure SetCommonWords(Value: TStrings); procedure Init; function DetermineType: EDocType; public { Public declarations } constructor Create( AOwner: TComponent ); override; destructor Destroy; override; property StringToParse: String write SetStringToParse; property PCharToParse: PChar write SetPCharToParse; function GetNext: String; property PercentDone: LongInt read GetPercentDone; published { Published properties } property FileToParse: String read FFileToParse write SetFileToParse; property MIN_WORD_LENGTH: Integer read FMIN_WORD_LENGTH write FMIN_WORD_LENGTH default 1; property MAX_WORD_LENGTH: Integer read FMAX_WORD_LENGTH write FMAX_WORD_LENGTH default 20; property NormalCharacters: String read FNormalCharacters write FNormalCharacters; property SignificantCharacters: String read FSignificantCharacters write FSignificantCharacters; property InsignificantChars: String read FInsignificantChars write FInsignificantChars; property CommonWords: TStrings read FCommonWords write SetCommonWords; property CmnWrdsActive: Boolean read FCmnWrdsActive write FCmnWrdsActive; property DocumentType: EDocType read FDocumentType write FDocumentType default dtAscii; property AutoDetect: Boolean read FAutoDetect write FAutoDetect default False; { Published events } property OnHTMLTag: THTMLTagEvent read FOnHTMLTag write FOnHTMLTag; end; procedure Register; function MinLongInt( Long1, Long2: LongInt ): LongInt; implementation const {$IFDEF WIN32} MAX_WORD = 500000; {$ELSE} MAX_WORD = 65526; {$ENDIF} constructor TFParse.Create( AOwner: TComponent ); begin Inherited Create( AOwner ); MIN_WORD_LENGTH := 1; { Default minimum word length } MAX_WORD_LENGTH := 20; { Default maximum word length } FCmnWrdsActive := False; { By default, CommonWords is not active } FAutoDetect := False; { By default, AutoDetect is false } FDocumentType := dtAscii; { Plain text file is default } If (csDesigning In ComponentState) then begin NormalCharacters := '0-9A-Za-z'; { Default chars that make words } SignificantCharacters := './'; { Default chars that can be inside words only } InsignificantChars := ','; { Default chars that will be stripped from words } end; FCommonWords := TStringList.Create; end; destructor TFParse.Destroy; begin theFile.Free; StrDispose( FBuffer ); FCommonWords.Free; Inherited Destroy; end; procedure TFParse.Loaded; begin Inherited Loaded; if not (csDesigning In ComponentState) then CreateCharSets; { Create the Sets of Chars } end; procedure TFParse.SetCommonWords(Value: TStrings); begin FCommonWords.Assign(Value); end; procedure TFParse.CreateCharSets; { Converts from the string representation of the character sets to real sets of char } procedure CreateSet( var CharSet: TCharSet; SetDef: String ); var i: Integer; j: Char; begin CharSet := []; i := 1; While i <= Length(SetDef) do { Read each character } begin If (i < Length(SetDef)-1) and (SetDef[i+1] = '-') then begin { Handle ranges } For j := SetDef[i] to SetDef[i+2] do CharSet := CharSet + [j]; { Add each in range to set } Inc(i,2); end else begin CharSet := CharSet + [SetDef[i]]; { Add to set } end; Inc(i,1); end; end; { procedure CreateSet } begin If NormalCharacters = '' then NormalCharacters := '0-9A-Za-z'; { If none set, then use the default } CreateSet( NormChar, NormalCharacters ); {Create Normal Character Set } CreateSet( SigChar, SignificantCharacters ); {Create Significant Character Set } CreateSet( InSigChar, InsignificantChars ); {Create InSignificant Character Set } end; procedure TFParse.Init; { Initializes variables prior to any parsing } begin CurrChar := FBuffer-1; { Will start out incrementing CurrChar } ByteCount := 1; EOBuffer := FBuffer + BuffLen; EOBuffer^ := #0; end; function TFParse.DetermineType: EDocType; { Determines what type of document (i.e. Ascii or HTML) has been loaded in the buffer. } const DetectStringSize = 50; var DetectStr: String[DetectStringSize]; i: PChar; begin Result := dtAscii; { assume ascii } i := FBuffer; DetectStr := ''; While ((i<EOBuffer) and (i<FBuffer+DetectStringSize)) do begin DetectStr := DetectStr + i^; Inc(i); end; DetectStr := LowerCase(DetectStr); { --- Check for HTML } If (ExtractFileExt( FFileToParse ) = '.htm' ) or ((Pos('<', DetectStr) > 0) and (Pos('html', DetectStr) > 0)) then Result := dtHTML; end; procedure TFParse.SetFileToParse( FName: String ); { Opens the file to be parsed. Maximum block size to be read in at one time is MAX_WORD which should be close to 64K for 16 bit and much greater for 32 bit } begin theFile.Free; try theFile := TFileStream.Create( FName, fmOpenRead ); If (theFile.Size > MAX_WORD) then BuffLen := MAX_WORD else BuffLen := theFile.Size+1; TotalSize := theFile.Size; StrDispose(FBuffer); FBuffer := StrAlloc(BuffLen); Init; theFile.Read( FBuffer^, BuffLen-1 ); If AutoDetect then FDocumentType := DetermineType; except On EFOpenError do raise EFOpenError.CreateFmt( 'File %s could not be found', [FName] ); end; { except } end; procedure TFParse.SetStringToParse( theStr: String ); { Sets the buffer to be parsed from a String instead of from a file } begin FFileToParse := ''; TotalSize := Length(theStr); BuffLen := TotalSize + 1; StrDispose(FBuffer); FBuffer := StrAlloc( BuffLen ); StrPCopy( FBuffer, theStr ); Init; If AutoDetect then FDocumentType := DetermineType; end; procedure TFParse.SetPCharToParse( thePChar: PChar ); { Sets the buffer to be parsed from a PChar instead of from a file } begin FFileToParse := ''; TotalSize := StrLen( thePChar ); BuffLen := TotalSize + 1; StrDispose(FBuffer); FBuffer := StrAlloc( BuffLen ); StrCopy( FBuffer, thePChar ); Init; If AutoDetect then FDocumentType := DetermineType; end; function TFParse.GetNext: String; { returns the next word from the file } var tmpWord: String[255]; tmpChar: PChar; procedure GetHTMLLiteral; { It is assumed that tmpChar is pointing at a '&' } const LiteralStringSize = 3; var AStr: String[LiteralStringSize]; begin AStr := ''; tmpChar^ := #0; Inc(tmpChar); If tmpChar^ = '#' then { its an ascii representation } begin tmpChar^ := #0; Inc(tmpChar); While (tmpChar^ <> ';') and (Length(AStr) <= LiteralStringSize) do begin AStr := AStr + tmpChar^; tmpChar^ := #0; Inc(tmpChar); end; tmpChar^ := Chr( StrToInt(AStr) ); end Else { its an entity code-word } begin AStr := tmpChar^; tmpChar^ := #0; Inc(tmpChar); While (tmpChar^ <> ';') do begin AStr := AStr + tmpChar^; tmpChar^ := #0; Inc(tmpChar); end; AStr := LowerCase(AStr); If AStr = 'amp' then tmpChar^ := '&' Else If AStr = 'gt' then tmpChar^ := '>' Else If AStr = 'lt' then tmpChar^ := '<' Else If AStr = 'quot' then tmpChar^ := '"' Else tmpChar^ := #0; { we don't handle any others at this time } end; end; procedure ClearHTML; { Zero's out irrelevant HTML code so that parsing will ignore } var HTMLTag: String; begin tmpChar := CurrChar; HTMLTag := ''; Repeat If tmpChar^ = '<' then begin While True do begin If (tmpChar^ = '>') then { End on the '>' character } begin if Assigned(FOnHTMLTag) then { Handle the OnHTMLTag event } begin HTMLTag := HTMLTag + tmpChar^; FOnHTMLTag( HTMLTag ); HTMLTag := ''; end; tmpChar^ := #0; { Zero out so will be ignored } Inc(tmpChar); Break; end; If Assigned(FOnHTMLTag) then { Collect tag contents if OnHTMLTag is set } HTMLTag := HTMLTag + tmpChar^; tmpChar^ := #0; Inc(tmpChar); end; end Else GetHTMLLiteral; {It's a literal, not a Tag } Until (tmpChar^ <> '<') and (tmpChar^ <> '&') end; begin try { EOFileException is raised when end of buffer is reached } tmpChar := nil; Repeat Repeat Result := ''; tmpWord := ''; While (NextChar) do { Find next NormChar to start a word } begin { First check to see if it's HTML stuff to skip } { Begin HTML Check ***** } If ((DocumentType = dtHTML) and (tmpChar < CurrChar)) and ((CurrChar^ = '<') or (CurrChar^ = '&')) then If Length(tmpWord) > 0 then begin Dec(CurrChar); Dec(ByteCount); Break; end Else ClearHTML; { End HTML Check ***** } If (CurrChar^ = #0) or (CurrChar ^ in InSigChar) then Continue; If (CurrChar^ in NormChar) or ((CurrChar^ in SigChar) and ((CurrChar+1)^ in NormChar)) then begin If Length(tmpWord) < 255 then tmpWord := tmpWord + CurrChar^; end Else Break; end; Until Length( tmpWord ) >= MIN_WORD_LENGTH; Result := Copy(LowerCase(tmpWord),1,MAX_WORD_LENGTH); Until ( (not CmnWrdsActive) or (FCommonWords.IndexOf( Result ) = -1) ) except on EOFileException do { End of buffer has been reached } begin StrDispose( FBuffer ); FBuffer := nil; exit; end; end; end; function TFParse.NextChar: Boolean; { points at the next character in the buffer and reacts according to what's there } var AmountToRead, AmountRead: LongInt; begin Result := True; Inc(CurrChar); Inc(ByteCount); If CurrChar < EOBuffer then exit; If ByteCount > TotalSize+1 then raise EOFileException.Create('End of File'); CurrChar := FBuffer; AmountToRead := BuffLen - 1; FBuffer[0] := #0; AmountRead := theFile.Read( (FBuffer+StrLen(FBuffer))^, AmountToRead ); If AmountRead < AmountToRead then begin EOBuffer := CurrChar + AmountRead; EOBuffer^ := #0; end; end; function TFParse.GetPercentDone: LongInt; { returns the percentage of the file that has been parsed } begin Result := MinLongInt(((ByteCount * 100) div TotalSize), 100 ); end; function MinLongInt( Long1, Long2: LongInt ): LongInt; { returns the smallest of two long integers } begin If Long1 < Long2 then result := Long1 Else result := Long2; end; procedure Register; { Registers the component } begin RegisterComponents('Samples', [TFParse]); end; end.