OS/2 Shareware BBS: 10 Tools

home *** CD-ROM | disk | FTP | other *** search

/ OS/2 Shareware BBS: 10 Tools / 10-Tools.zip / pascal.zip / lexical / lexical.c < prev next >

Wrap

Text File | 1995-10-29 | 21KB | 615 lines

/* * C . A . P . L E X I C A L A N A L Y Z E R * * C O R E P R O C E D U R E S * * Stéphane Charette @ C.A.P. Services * * Last modified: Stéphane Charette, 1995 October 29 * ***************************************************************************** * * Project: BILL * Group: lexical analyzer * File: lexical\lexical.c * Version: 0.1.4 * * This file contains all of the source code that makes up the lexical * analyzer portion of the interpreter BILL. */ /* * Versions: * * 0.0.1 - design of structure and implementation, Stéphane Charette, 94Feb26-Mar1 * 0.1.0 - first working version, SC, 94Mar2 * 0.1.1 - added debug information, SC, 94Mar2-3 * 0.1.2 - added set of illegal characters, SC, 94Mar24 * 0.1.2 - added more comments, optimized debug, SC, 94Mar27-Apr10 * 0.1.3 - fixed bug while detecting comments, SC, 94Mar13 * 0.1.3 - fixed bug while detecting not-equal "<>" symbol, SC, 94Mar14 * 0.1.4 - ported to OS/2, SC, 94Apr23-27 * 0.1.4 - changed text formatting, SC, 95Oct29 */ #define _LEX_VERSION "Lexical analyzer v0.1.4, Stéphane Charette, 95Oct29\n" /* * Includes */ #include "..\lexical\lexical.hi" // internal lexical analyzer include file /* * FUNCTION: InitLex */ ERR InitLex( UCHAR path[] ) { ERR RC = ERR_OK; #if( _LEX_DEBUG ) Lex_Debug.ScreenOutput = FALSE; // pre-initialize debug value Lex_Debug.SourceOutput = FALSE; // pre-initialize debug value Lex_Debug.TokenOutput = FALSE; // pre-initialize debug value Lex_Debug.MixOutput = FALSE; // pre-initialize debug value #endif Lex_State.InComment = FALSE; // are we in a comment? Lex_State.InString = FALSE; // are we in a string? Lex_State.LineNumber = 0; // current line number Lex_State.CharOffset = 0; // character offset into current line Lex_State.Error = ERR_OK; // last error Lex_State.TokenID = TOK_NO_ID; // ID of found token Lex_State.LexemeID = LEX_NO_ID; // Actual lexeme if ID is generic Lex_State.VarID = 0; // variable reference strcpy( Lex_State.Token, "" ); // Actual token // open the source file if( strlen( path ) > LEX_MAX_PATH_LEN ) LexError( ERR_LEX_PATH_TOO_LONG ); // is the path valid? strcpy( FileName, path ); // save the path of file if( ( FilePtr = fopen( FileName, "r" ) ) == NULL ) LexError( ERR_LEX_CANNOT_OPEN_FILE ); // open the file // initialize the look-ahead buffer strcpy( Buffer, " \n" ); // set the previous and current character to <SPACE><NEWLINE> if( ! fread( &( Buffer[ 2 ] ), 1, LEX_MAX_BUFFER_LEN - 2, FilePtr ) ) LexError( ERR_LEX_CANNOT_READ_FILE ); // read file return RC; } /* * FUNCTION: ResetLex */ ERR ResetLex( ) { ERR RC = ERR_OK; if( fclose( FilePtr ) ) RC = ERR_LEX_CANNOT_CLOSE_FILE; return RC; } /* * FUNCTION: LexError */ void LexError( ERR error_code ) { Lex_State.Error = error_code; printf( "\nError encountered in lex: %i\n\a", Lex_State.Error ); ResetLex( ); // perform shutdown procedures ExitCode( error_code ); // return to operating system } /* * FUNCTION: GetNextCharacter */ ERR GetNextCharacter( ) { ERR RC = ERR_OK; strncpy( Buffer, &( Buffer[ 1 ] ), ( LEX_MAX_BUFFER_LEN - 1 ) ); // shift the buffer down by one fread( &( Buffer[ LEX_MAX_BUFFER_LEN - 1 ] ), 1, 1, FilePtr ); // read the next character // find out if an EOF occurred if( feof( FilePtr ) ) { // could not read character, so use a space instead Buffer[ LEX_MAX_BUFFER_LEN - 1 ] = ' '; RC = Lex_State.Error = ERR_LEX_END_OF_FILE; // indicate that the eof was reached } // find out if a file error occurred if( ferror( FilePtr ) ) { // could not read character, so use a space instead Buffer[ LEX_MAX_BUFFER_LEN - 1 ] = ' '; RC = Lex_State.Error = ERR_LEX_CANNOT_READ_FILE; // indicate that an actual file read error occurred } if( ( Buffer[ 0 ] == '\r' ) || ( Buffer[ 0 ] == '\n' ) ) { // if the previous character read was an EOL character... Lex_State.LineNumber ++; // ...then increase the line counter by one, Lex_State.CharOffset = 1; // ...and reset the character counter to the start of the line #if( _LEX_DEBUG ) LexDebugWrite( "\n(%3i) ", Lex_State.LineNumber ); #endif } else { // ...else... Lex_State.CharOffset ++; // ...increase the character offset into this line #if( _LEX_DEBUG ) if( Lex_Debug.ScreenOutput ) printf( "%c", Buffer[ 0 ] ); if( Lex_Debug.SourceOutput ) fprintf( LexDebugSourceFilePtr, "%c", Buffer[ 0 ] ); if( Lex_Debug.MixOutput ) fprintf( LexDebugMixFilePtr, "%c", Buffer[ 0 ] ); #endif } return RC; } /* * FUNCTION: GetNextToken */ ERR GetNextToken( ) { ERR RC = ERR_OK; Lex_State.TokenID = TOK_NO_ID; Lex_State.LexemeID = LEX_NO_ID; Lex_State.VarID = SYM_NO_ID; strcpy( Lex_State.Token, "" ); // skip the white space - if any - at the current position in Buffer if( isspace( Buffer[ 1 ] ) ) RC = SkipWhiteSpace( ); while( Lex_State.TokenID == TOK_NO_ID ) { if( ( ( ( Lex_State.InString == FALSE ) && ( Lex_State.InComment == FALSE ) ) && ( strchr( SET_OF_TOKEN_DELIMITERS, Buffer[ 1 ] ) || strchr( SET_OF_ILLEGAL_CHARS, Buffer[ 1 ] ) ) ) || ( ( Lex_State.InComment == TRUE ) && ( Buffer[ 1 ] == '}' ) ) || ( ( Lex_State.InString == TRUE ) && ( Buffer[ 1 ] == '\'' ) ) ) { // token delimiter found - now try and recognize the token // is this a single character delimiter, or does this indicate the end of a previous token? if( Lex_State.Token[ 0 ] == '\0' ) { // this indicates we've just read a single character delimiter switch( Buffer[ 1 ] ) { case '{': // don't do anything with these - the next step will case '}': // take care of handling them correctly break; case ';': Lex_State.TokenID = TOK_SEMICOLON_ID; // semicolon recognized strcpy( Lex_State.Token, ";" ); // make a copy of the token RC = GetNextCharacter( ); // get next character break; case ',': Lex_State.TokenID = TOK_COMMA_ID; // comma recognized strcpy( Lex_State.Token, "," ); // make a copy of the token RC = GetNextCharacter( ); // get next character break; case '.': Lex_State.TokenID = TOK_PERIOD_ID; // period recognized strcpy( Lex_State.Token, "." ); // make a copy of the token RC = GetNextCharacter( ); // get next character if( RC = ERR_LEX_END_OF_FILE ) RC = ERR_OK; // give the guy a break... :) break; case '+': Lex_State.TokenID = TOK_PLUS_ID; // plus recognized strcpy( Lex_State.Token, "+" ); // make a copy of the token RC = GetNextCharacter( ); // get next character break; case '-': Lex_State.TokenID = TOK_MINUS_ID; // minus recognized strcpy( Lex_State.Token, "-" ); // make a copy of the token RC = GetNextCharacter( ); // get next character break; case '*': Lex_State.TokenID = TOK_MULT_ID; // multiply recognized strcpy( Lex_State.Token, "*" ); // make a copy of the token RC = GetNextCharacter( ); // get next character break; case '/': Lex_State.TokenID = TOK_DIV_ID; // divide recognized strcpy( Lex_State.Token, "/" ); // make a copy of the token RC = GetNextCharacter( ); // get next character break; case '(': Lex_State.TokenID = TOK_PO_ID; // parenteses open recognized strcpy( Lex_State.Token, "(" ); // make a copy of the token RC = GetNextCharacter( ); // get next character break; case ')': Lex_State.TokenID = TOK_PC_ID; // parenteses close recognized strcpy( Lex_State.Token, ")" ); // make a copy of the token RC = GetNextCharacter( ); // get next character break; case '\'': break; // pass on the single quotes case '<': if( Buffer[ 2 ] == '=' ) { Lex_State.TokenID = TOK_RELOP_ID; // relational operator recognized Lex_State.LexemeID = LEX_LE_ID; // less_than_or_equal_to lexeme recognized strcpy( Lex_State.Token, "<=" ); // make a copy of the token // move ahead two characters since this is a two-char token if( ( RC = GetNextCharacter( ) ) == ERR_OK ) RC = GetNextCharacter( ); } else { if( Buffer[ 2 ] == '>' ) { Lex_State.TokenID = TOK_RELOP_ID; Lex_State.LexemeID = LEX_NE_ID; strcpy( Lex_State.Token, "<>" ); if( ( RC = GetNextCharacter( ) ) == ERR_OK ) RC = GetNextCharacter( ); } else { Lex_State.TokenID = TOK_RELOP_ID; // relational operator recognized Lex_State.LexemeID = LEX_LT_ID; // less_than lexeme recognized strcpy( Lex_State.Token, "<" ); // make a copy of the token RC = GetNextCharacter( ); // get next character } } break; case '>': if( Buffer[ 2 ] == '=' ) { Lex_State.TokenID = TOK_RELOP_ID; // relational operator recognized Lex_State.LexemeID = LEX_GE_ID; // greater_than_or_equal_to lexeme recognized strcpy( Lex_State.Token, ">=" ); // make a copy of the token // move ahead two characters since this is a two-char token if( ( RC = GetNextCharacter( ) ) == ERR_OK ) RC = GetNextCharacter( ); } else { Lex_State.TokenID = TOK_RELOP_ID; // relational operator recognized Lex_State.LexemeID = LEX_GT_ID; // greater_than lexeme recognized strcpy( Lex_State.Token, ">" ); // make a copy of the token RC = GetNextCharacter( ); // get next character } break; case ':': if( Buffer[ 2 ] == '=' ) { Lex_State.TokenID = TOK_ASSIGNMENT_ID; // assignment recognized strcpy( Lex_State.Token, ":=" ); // make a copy of the token // move ahead two characters since this is a two-char token if( ( RC = GetNextCharacter( ) ) == ERR_OK ) RC = GetNextCharacter( ); } else { RC = Lex_State.Error = ERR_LEX_UNRECOGNIZED_CHAR; Lex_State.TokenID = TOK_ERROR_ID; strcpy( Lex_State.Token, ":" ); // make a copy of the token } break; case '=': Lex_State.TokenID = TOK_RELOP_ID; // relational operator recognized Lex_State.LexemeID = LEX_EQ_ID; // equal lexeme recognized strcpy( Lex_State.Token, "=" ); // make a copy of the token RC = GetNextCharacter( ); // get next character break; default: RC = Lex_State.Error = ERR_LEX_UNRECOGNIZED_CHAR; // set an error flag Lex_State.TokenID = TOK_ERROR_ID; // set an error flag Lex_State.Token[ 0 ] = Buffer[ 1 ]; // make a copy of the unrecognized character Lex_State.Token[ 1 ] = '\0'; break; } } // was our token one of the previous single-character tokens? if( Lex_State.TokenID == TOK_NO_ID ) { // no... // ...then see if it's a special delimiter, such as ' (quote) or {} (curly braces) switch( Buffer[ 1 ] ) { case '\'': // string delimiter found... if( Lex_State.InString ) { Lex_State.InString = FALSE; // ...clear string flag Lex_State.TokenID = TOK_STRING_ID; // set the token to "string" Lex_State.LexemeID = LEX_VALUE_ID; // set the lexeme to "string" GetNextCharacter( ); // ...get next character break; } else { Lex_State.InString = TRUE; // ...set string flag GetNextCharacter( ); // ...get next character break; } case '{': // start of comment block found... Lex_State.InComment ++; // ...increase comment block counter break; case '}': // end of comment block found... Lex_State.InComment --; // ...decrease comment block counter if( Lex_State.InComment < 0 ) LexError( ERR_LEX_CLOSE_COMMENT_BLOCK ); // comment close without open // skip the white space - if any - after the end of the comment if( isspace( Buffer[ 2 ] ) ) { RC = GetNextCharacter( ); SkipWhiteSpace( ); } break; default: // since it wasn't a special token, now try and recongnize it as a reserved word if( strcmp( Lex_State.Token, "program" ) == 0 ) Lex_State.TokenID = TOK_PROGRAM_ID; if( strcmp( Lex_State.Token, "const" ) == 0 ) Lex_State.TokenID = TOK_CONST_ID; if( strcmp( Lex_State.Token, "var" ) == 0 ) Lex_State.TokenID = TOK_VAR_ID; if( strcmp( Lex_State.Token, "begin" ) == 0 ) Lex_State.TokenID = TOK_BEGIN_ID; if( strcmp( Lex_State.Token, "end" ) == 0 ) Lex_State.TokenID = TOK_END_ID; if( strcmp( Lex_State.Token, "if" ) == 0 ) Lex_State.TokenID = TOK_IF_ID; if( strcmp( Lex_State.Token, "then" ) == 0 ) Lex_State.TokenID = TOK_THEN_ID; if( strcmp( Lex_State.Token, "while" ) == 0 ) Lex_State.TokenID = TOK_WHILE_ID; if( strcmp( Lex_State.Token, "do" ) == 0 ) Lex_State.TokenID = TOK_DO_ID; if( strcmp( Lex_State.Token, "write" ) == 0 ) Lex_State.TokenID = TOK_WRITE_ID; if( strcmp( Lex_State.Token, "read" ) == 0 ) Lex_State.TokenID = TOK_READ_ID; if( strcmp( Lex_State.Token, "odd" ) == 0 ) Lex_State.TokenID = TOK_ODD_ID; if( Lex_State.TokenID == TOK_NO_ID ) { // token was not recognized as a reserved work if( isdigit( Lex_State.Token[ 0 ] ) ) { // token recognized as a numerical value Lex_State.TokenID = TOK_IDENTIFIER_ID; Lex_State.LexemeID = LEX_VALUE_ID; sscanf( Lex_State.Token, "%i", &( Lex_State.VarID ) ); } else { // token must be a variable name Lex_State.TokenID = TOK_IDENTIFIER_ID; Lex_State.LexemeID = LEX_VAR_ID; Lex_State.VarID = SYM_NO_ID; } } break; } } } else // ...no delimiter found yet... { if( ! Lex_State.InComment ) // if we aren't in the middle of a comment, then... { UCHAR tmpbuffer[ 2 ] = " "; // temporary buffer tmpbuffer[ 0 ] = Buffer[ 1 ]; // hold current character in temporary buffer // convert to lowercase if not currently analyzing string if( ! Lex_State.InString ) tmpbuffer[ 0 ] = (UCHAR)tolower( tmpbuffer[ 0 ] ); // append the current character to the token if( strlen( Lex_State.Token ) == LEX_MAX_TOKEN_LEN ) { // token is too long to fit in Lex_State.Token[] - generate error Lex_State.Error = ERR_LEX_TOKEN_TOO_LONG; break; } else { // add the new character to Lex_State.Token[] strcat( Lex_State.Token, tmpbuffer ); } } GetNextCharacter( ); // get a new character if( ( Lex_State.Error == ERR_LEX_END_OF_FILE ) && ( ( Lex_State.InComment && ( Buffer[ 1 ] != '}' ) ) || ( Lex_State.InString && ( Buffer[ 1 ] != '\'' ) ) ) ) { // this most likely means unbalanced comments or strings Lex_State.Error = ERR_LEX_UNEXPECTED_EOF; break; } } } // we've just found a token - return to calling programme #if( _LEX_DEBUG ) if( Lex_State.TokenID == TOK_ERROR_ID ) { // if there's an error, print the offending character if( Lex_Debug.ScreenOutput ) printf( "%c", Buffer[ 1 ] ); if( Lex_Debug.SourceOutput ) fprintf( LexDebugSourceFilePtr, "%c", Buffer[ 1 ] ); if( Lex_Debug.MixOutput ) fprintf( LexDebugMixFilePtr, "%c", Buffer[ 1 ] ); } if( ( Lex_Debug.TokenOutput ) || ( Lex_Debug.ScreenOutput ) ) { //if( Lex_Debug.ScreenOutput ) printf( "[T%i", Lex_State.TokenID ); if( Lex_Debug.TokenOutput ) fprintf( LexDebugTokenFilePtr, "[T%i", Lex_State.TokenID ); if( Lex_Debug.MixOutput ) fprintf( LexDebugMixFilePtr, "[T%i", Lex_State.TokenID ); if( Lex_State.TokenID == TOK_IDENTIFIER_ID ) { // token is a variable if( Lex_State.LexemeID == LEX_VAR_ID ) { // token is a symbol table entry //if( Lex_Debug.ScreenOutput ) printf( ":L%i:S%i]", Lex_State.LexemeID, Lex_State.VarID ); if( Lex_Debug.TokenOutput ) fprintf( LexDebugTokenFilePtr, ":L%i:S%i]", Lex_State.LexemeID, Lex_State.VarID ); if( Lex_Debug.MixOutput ) fprintf( LexDebugMixFilePtr, ":L%i:S%i]", Lex_State.LexemeID, Lex_State.VarID ); } else { // token is a numeric value //if( Lex_Debug.ScreenOutput ) printf( ":L%i:Val%i]", Lex_State.LexemeID, Lex_State.VarID ); if( Lex_Debug.TokenOutput ) fprintf( LexDebugTokenFilePtr, ":L%i:Val%i]", Lex_State.LexemeID, Lex_State.VarID ); if( Lex_Debug.MixOutput ) fprintf( LexDebugMixFilePtr, ":L%i:Val%i]", Lex_State.LexemeID, Lex_State.VarID ); } } else { // token is a string, reserved word or operator if( Lex_State.LexemeID ) { // token is an operator //if( Lex_Debug.ScreenOutput ) printf( ":L%i]", Lex_State.LexemeID ); if( Lex_Debug.TokenOutput ) fprintf( LexDebugTokenFilePtr, ":L%i]", Lex_State.LexemeID ); if( Lex_Debug.MixOutput ) fprintf( LexDebugMixFilePtr, ":L%i]", Lex_State.LexemeID ); } else { // token is a reserved word //if( Lex_Debug.ScreenOutput ) printf( "]" ); if( Lex_Debug.TokenOutput ) fprintf( LexDebugTokenFilePtr, "]" ); if( Lex_Debug.MixOutput ) fprintf( LexDebugMixFilePtr, "]" ); } } } #endif return RC; } /* * */ ERR SkipWhiteSpace( ) { ERR RC = ERR_OK; while( isspace( Buffer[ 1 ] ) ) { // find the next non-white character GetNextCharacter( ); if( Lex_State.Error ) { if( Lex_State.Error != ERR_LEX_END_OF_FILE ) { LexError( Lex_State.Error ); } else { // eof was reached, so stop looking for non-white characters Lex_State.TokenID = TOK_END_OF_FILE_ID; break; } } } return RC; } #if( _LEX_DEBUG ) /* * FUNCTIN: InitLexDebug */ ERR InitLexDebug( BOOL screen, BOOL source, BOOL token, BOOL mix ) { ERR RC = ERR_OK; Lex_Debug.ScreenOutput = screen; Lex_Debug.SourceOutput = source; Lex_Debug.TokenOutput = token; Lex_Debug.MixOutput = mix; if( Lex_Debug.SourceOutput ) { if( ! ( LexDebugSourceFilePtr = fopen( LEX_DEBUG_SOURCE_FILENAME, "wt" ) ) ) LexError( ERR_LEX_DEBUG_FILE_ERROR ); } if( Lex_Debug.TokenOutput ) { if( ! ( LexDebugTokenFilePtr = fopen( LEX_DEBUG_TOKEN_FILENAME, "wt" ) ) ) LexError( ERR_LEX_DEBUG_FILE_ERROR ); } if( Lex_Debug.MixOutput ) { if( ! ( LexDebugMixFilePtr = fopen( LEX_DEBUG_MIX_FILENAME, "wt" ) ) ) LexError( ERR_LEX_DEBUG_FILE_ERROR ); } LexDebugWrite( "\n%s\n", _LEX_VERSION ); if( Lex_Debug.ScreenOutput ) LexDebugWrite( "=> screen output enabled.\n" ); else LexDebugWrite( "=> screen output disabled.\n" ); if( Lex_Debug.SourceOutput ) LexDebugWrite( "=> source output to file \"%s\" enabled.\n", LEX_DEBUG_SOURCE_FILENAME ); else LexDebugWrite( "=> source output to file disabled.\n" ); if( Lex_Debug.TokenOutput ) LexDebugWrite( "=> token output to file \"%s\" enabled.\n", LEX_DEBUG_TOKEN_FILENAME ); else LexDebugWrite( "=> token output to file disabled.\n" ); if( Lex_Debug.MixOutput ) LexDebugWrite( "=> mix output to file \"%s\" enabled.\n", LEX_DEBUG_MIX_FILENAME ); else LexDebugWrite( "=> mix output to file disabled.\n" ); return RC; } #endif #if( _LEX_DEBUG ) /* * FUNCTION: ResetLexDebug */ ERR ResetLexDebug( ) { ERR RC = ERR_OK; if( Lex_State.Error != ERR_LEX_END_OF_FILE ) { LexDebugWrite( "\nSource file: %s\n", FileName ); LexDebugWrite( "Line number: %i\n", Lex_State.LineNumber ); LexDebugWrite( "Char offset: %i\n", Lex_State.CharOffset ); } else { LexDebugWrite( "\n\nFinished reading file %s\n", FileName ); } if( Lex_Debug.SourceOutput ) fclose( LexDebugSourceFilePtr ); if( Lex_Debug.TokenOutput ) fclose( LexDebugTokenFilePtr ); if( Lex_Debug.MixOutput ) fclose( LexDebugMixFilePtr ); return RC; } #endif #if( _LEX_DEBUG ) /* * FUNCTION: LexDebugWrite */ ERR LexDebugWrite( UCHAR *text, ... ) { ERR RC = ERR_OK; va_list ptr; /* va is a variable argument type implemented in newer revisions */ va_start( ptr, text ); /* of ANSI C. Support library includes vfprintf, vprintf, vsprintf */ // if screen debug output is enabled... if( Lex_Debug.ScreenOutput ) vprintf( text, ptr ); // ...then output to screen // if file debug source output is enabled... if( Lex_Debug.SourceOutput ) vfprintf( LexDebugSourceFilePtr, text, ptr ); // ...then output to source file // if file debug token output is enabled... if( Lex_Debug.TokenOutput ) vfprintf( LexDebugTokenFilePtr, text, ptr ); // ...then output to token file // if mix debug output is enabled... if( Lex_Debug.MixOutput ) vfprintf( LexDebugMixFilePtr, text, ptr ); // ...then ouput to mix file va_end( ptr ); return RC; } #endif