home *** CD-ROM | disk | FTP | other *** search
- <?php
- /* vim: set expandtab tabstop=4 shiftwidth=4: */
- // +----------------------------------------------------------------------+
- // | Copyright (c) 2002-2004 Brent Cook |
- // +----------------------------------------------------------------------+
- // | This library is free software; you can redistribute it and/or |
- // | modify it under the terms of the GNU Lesser General Public |
- // | License as published by the Free Software Foundation; either |
- // | version 2.1 of the License, or (at your option) any later version. |
- // | |
- // | This library is distributed in the hope that it will be useful, |
- // | but WITHOUT ANY WARRANTY; without even the implied warranty of |
- // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
- // | Lesser General Public License for more details. |
- // | |
- // | You should have received a copy of the GNU Lesser General Public |
- // | License along with this library; if not, write to the Free Software |
- // | Foundation, Inc., 59 Temple Place, Suite 330,Boston,MA 02111-1307 USA|
- // +----------------------------------------------------------------------+
- // | Authors: Brent Cook <busterbcook@yahoo.com> |
- // | Jason Pell <jasonpell@hotmail.com> |
- // +----------------------------------------------------------------------+
- //
- // $Id: Lexer.php,v 1.20 2004/05/07 12:33:35 busterb Exp $
- //
-
- include 'SQL/ctype.php';
-
- // {{{ token definitions
- // variables: 'ident', 'sys_var'
- // values: 'real_val', 'text_val', 'int_val', null
- // }}}
-
/**
 * A lexical analyser inspired by the msql lexer.
 *
 * The lexer walks the input string one byte at a time with a small state
 * machine (see nextToken()). Each call to lex() returns the type of the
 * next token and leaves its value in $tokText and the text that was
 * skipped in front of it (whitespace, comments) in $skipText.
 *
 * Token types returned by lex()/nextToken():
 *   - a lower-cased key of $symbols for recognised keywords
 *   - 'ident', 'sys_var', 'int_val', 'real_val', 'text_val'
 *   - the literal text itself for operators and unknown single characters
 *   - null at the end of the input
 *
 * @author  Brent Cook <busterbcook@yahoo.com>
 * @version 0.5
 * @access  public
 * @package SQL_Parser
 */
class Lexer
{
    // array of valid tokens for the lexer to recognize
    // format is 'token literal'=>TOKEN_VALUE
    // (only the lower-cased keys are consulted by the lexer itself)
    var $symbols = array();

    // {{{ instance variables
    // Index of the next character get()/skip() will read.
    var $tokPtr = 0;
    // Index of the first character of the token being scanned.
    var $tokStart = 0;
    // Number of characters read for the current token so far.
    var $tokLen = 0;
    // Value of the most recently returned token.
    var $tokText = '';
    // Number of line endings seen so far.
    var $lineNo = 0;
    // Value of $tokPtr recorded when the last line ending was seen.
    var $lineBegin = 0;
    // The input being tokenised and its length in bytes.
    var $string = '';
    var $stringLen = 0;

    // Will not be altered by skip()
    var $tokAbsStart = 0;
    // Text between the end of the previous token and the start of the
    // current one.
    var $skipText = '';

    // Provide lookahead capability.
    var $lookahead = 0;
    // Specify how many tokens to save in tokenStack, so the
    // token stream can be pushed back.
    var $tokenStack = array();
    var $stackPtr = 0;
    // }}}

    // {{{ incidental functions
    /**
     * Create a lexer for $string.
     *
     * @param string $string    text to tokenise
     * @param int    $lookahead number of tokens that can be pushed back
     *                          with pushBack(); 0 disables push-back
     */
    function __construct($string = '', $lookahead = 0)
    {
        $this->string = $string;
        $this->stringLen = strlen($string);
        $this->lookahead = $lookahead;
    }

    /**
     * PHP 4 style constructor, kept so existing code that calls
     * Lexer() explicitly (e.g. from a subclass) keeps working. PHP 8 no
     * longer treats this method as a constructor, which is why
     * __construct() exists.
     *
     * @param string $string    text to tokenise
     * @param int    $lookahead number of tokens that can be pushed back
     */
    function Lexer($string = '', $lookahead = 0)
    {
        $this->__construct($string, $lookahead);
    }

    /**
     * Read the next character as part of the current token.
     *
     * @return string|null the character, or null when the input is exhausted
     */
    function get()
    {
        ++$this->tokPtr;
        ++$this->tokLen;
        return ($this->tokPtr <= $this->stringLen)
            ? $this->string[$this->tokPtr - 1]
            : null;
    }

    /**
     * Undo the last get().
     */
    function unget()
    {
        --$this->tokPtr;
        --$this->tokLen;
    }

    /**
     * Read the next character and move the token start forward by one, so
     * the text read so far is not counted as part of the token.
     *
     * @return string the character, or '' when the input is exhausted
     */
    function skip()
    {
        ++$this->tokStart;
        return ($this->tokPtr < $this->stringLen)
            ? $this->string[$this->tokPtr++]
            : '';
    }

    /**
     * Rewind to the start of the current token.
     */
    function revert()
    {
        $this->tokPtr = $this->tokStart;
        $this->tokLen = 0;
    }

    /**
     * @param string|null $c a single character
     * @return bool whether $c can be part of a comparison operator
     */
    function isCompop($c)
    {
        return (($c == '<') || ($c == '>') || ($c == '=') || ($c == '!'));
    }

    /**
     * Look at an upcoming character without consuming it.
     *
     * @param int $offset 0 for the character the next get() would return
     * @return string the character, or '' past either end of the input
     */
    function _peek($offset = 0)
    {
        $pos = $this->tokPtr + $offset;
        return ($pos >= 0 && $pos < $this->stringLen) ? $this->string[$pos] : '';
    }

    /**
     * ASCII digit test that tolerates null and '' (end of input).
     */
    function _isDigit($c)
    {
        if (!is_string($c) || $c === '') {
            return false;
        }
        $code = ord($c);
        return $code >= 48 && $code <= 57;
    }

    /**
     * ASCII letter test that tolerates null and '' (end of input).
     */
    function _isAlpha($c)
    {
        if (!is_string($c) || $c === '') {
            return false;
        }
        $code = ord($c);
        return ($code >= 65 && $code <= 90) || ($code >= 97 && $code <= 122);
    }

    /**
     * ASCII letter-or-digit test that tolerates null and '' (end of input).
     */
    function _isAlnum($c)
    {
        return $this->_isAlpha($c) || $this->_isDigit($c);
    }

    /**
     * Finish the current token: record the text skipped in front of it,
     * make the next scan start right after it, and hand back $token.
     *
     * @param mixed $token token type to return to the caller
     * @return mixed $token, unchanged
     */
    function _accept($token)
    {
        $this->skipText = substr($this->string, $this->tokAbsStart,
                                 $this->tokStart - $this->tokAbsStart);
        $this->tokStart = $this->tokPtr;
        return $token;
    }
    // }}}

    // {{{ pushBack()
    /*
     * Push back a token, so the very next call to lex() will return that token.
     * Calls to this function will be ignored if there is no lookahead specified
     * to the constructor, or the pushBack() function has already been called the
     * maximum number of tokens that can be looked ahead.
     */
    function pushBack()
    {
        if ($this->lookahead > 0 && count($this->tokenStack) > 0 && $this->stackPtr > 0) {
            $this->stackPtr--;
        }
    }
    // }}}

    // {{{ lex()
    /**
     * Return the type of the next token, honouring pushed-back tokens.
     *
     * @return mixed token type (see class comment), null at end of input
     */
    function lex()
    {
        if ($this->lookahead <= 0) {
            return $this->nextToken();
        }

        // The stackPtr should always be the same as the count of elements
        // in the tokenStack; it points at the slot for the next new token.
        // After pushBack() it is smaller than the count, which means the
        // token has to be replayed from the stack instead of being lexed.
        if ($this->stackPtr < count($this->tokenStack)) {
            $saved = $this->tokenStack[$this->stackPtr];
            $this->tokText = $saved['tokText'];
            $this->skipText = $saved['skipText'];
            $this->stackPtr++;
            return $saved['token'];
        }

        // If the stack is full (equal to lookahead), drop the oldest entry
        // to make room for the new one.
        if ($this->stackPtr == $this->lookahead) {
            // Shift by hand so the numeric indexes stay contiguous.
            for ($i = 0; $i < (count($this->tokenStack) - 1); $i++) {
                $this->tokenStack[$i] = $this->tokenStack[$i + 1];
            }
            // The new token goes into the last slot.
            $this->stackPtr--;
        }

        $token = $this->nextToken();
        $this->tokenStack[$this->stackPtr] =
            array('token' => $token,
                  'tokText' => $this->tokText,
                  'skipText' => $this->skipText);
        $this->stackPtr++;
        return $token;
    }
    // }}}

    // {{{ nextToken()
    /**
     * Scan one token straight from the input, ignoring the push-back stack.
     *
     * @return mixed token type (see class comment), null at end of input
     *               or when the input string is empty
     */
    function nextToken()
    {
        if ($this->string == '') {
            return null;
        }
        $state = 0;
        $quote = '';
        $this->tokAbsStart = $this->tokStart;

        while (true) {
            switch ($state) {
            // {{{ State 0 : Start of token
            case 0:
                $this->tokPtr = $this->tokStart;
                $this->tokText = '';
                $this->tokLen = 0;
                $c = $this->get();

                if (is_null($c)) { // End Of Input
                    $state = 1000;
                    break;
                }

                while ($c === ' ' || $c === "\t" || $c === "\n" || $c === "\r") {
                    if ($c === "\n" || $c === "\r") {
                        // Handle MAC/Unix/Windows line endings. Only
                        // consume the "\n" of a DOS newline when it is
                        // really there: skip() also moves tokStart, so
                        // reading ahead and ungetting would leave the
                        // token start one character too far.
                        if ($c === "\r" && $this->_peek() === "\n") {
                            $this->skip();
                        }
                        ++$this->lineNo;
                        $this->lineBegin = $this->tokPtr;
                    }

                    $c = $this->skip();
                    $this->tokLen = 1;
                }

                if ($c === '') { // input ended while skipping whitespace
                    $state = 1000;
                    break;
                }

                // Escape quotes and backslashes
                if ($c === '\\') {
                    $t = $this->get();
                    if ($t === '\'' || $t === '\\' || $t === '"') {
                        $this->tokText = $t;
                        return $this->_accept($this->tokText);
                    }
                    $this->unget();

                    // Unknown token. Revert to single char
                    $state = 999;
                    break;
                }

                if ($c === '\'' || $c === '"') { // text string
                    $quote = $c;
                    $state = 12;
                    break;
                }

                if ($c === '_') { // system variable
                    $state = 18;
                    break;
                }

                if ($this->_isAlpha($c)) { // keyword or ident
                    $state = 1;
                    break;
                }

                if ($this->_isDigit($c)) { // real or int number
                    $state = 5;
                    break;
                }

                if ($c === '.') {
                    $t = $this->get();
                    if ($t === '.') { // ellipsis
                        if ($this->get() === '.') {
                            $this->tokText = '...';
                            return $this->_accept($this->tokText);
                        }
                        $state = 999;
                        break;
                    } else if ($this->_isDigit($t)) { // real number
                        $this->unget();
                        $state = 7;
                        break;
                    } else { // period
                        $this->unget();
                    }
                }

                if ($c === '#') { // Comments
                    $state = 14;
                    break;
                }

                if ($c === '-') {
                    $t = $this->get();
                    if ($t === '-') { // "--" comment
                        $state = 14;
                        break;
                    }
                    $this->unget();
                    // Only a '-' directly followed by a digit, or by '.'
                    // and a digit, starts a negative number. Any other
                    // '-' is returned as a single character token.
                    if ($this->_isDigit($t)
                        || ($t === '.' && $this->_isDigit($this->_peek(1)))) {
                        $state = 5;
                    } else {
                        $state = 999;
                    }
                    break;
                }

                if ($this->isCompop($c)) { // comparison operator
                    $state = 10;
                    break;
                }

                // Unknown token. Revert to single char
                $state = 999;
                break;
            // }}}

            // {{{ State 1 : Incomplete keyword or ident
            case 1:
                $c = $this->get();
                if ($this->_isAlnum($c) || $c === '_' || $c === '.') {
                    $state = 1;
                    break;
                }
                $state = 2;
                break;
            // }}}

            // {{{ State 2 : Complete keyword or ident
            case 2:
                $this->unget();
                $this->tokText = substr($this->string, $this->tokStart,
                                        $this->tokLen);

                // Keywords are matched case-insensitively and reported by
                // their lower-cased spelling; everything else is an ident.
                $testToken = strtolower($this->tokText);
                if (isset($this->symbols[$testToken])) {
                    return $this->_accept($testToken);
                }
                return $this->_accept('ident');
            // }}}

            // {{{ State 5: Incomplete real or int number
            case 5:
                $c = $this->get();
                if ($this->_isDigit($c)) {
                    $state = 5;
                    break;
                }
                if ($c === '.') {
                    $t = $this->get();
                    $this->unget(); // $t is only looked at, never consumed
                    if ($t === '.') {
                        // "1.." : the dots belong to an ellipsis, the
                        // number is complete (state 6 gives the '.' back).
                        $state = 6;
                    } else { // real number
                        $state = 7;
                    }
                    break;
                }
                if ($this->_isAlpha($c)) { // number must end with non-alpha character
                    $state = 999;
                    break;
                }
                // complete number
                $state = 6;
                break;
            // }}}

            // {{{ State 6: Complete integer number
            case 6:
                $this->unget();
                $this->tokText = intval(substr($this->string, $this->tokStart,
                                               $this->tokLen));
                return $this->_accept('int_val');
            // }}}

            // {{{ State 7: Incomplete real number
            case 7:
                $c = $this->get();

                if ($c === 'e' || $c === 'E') { // scientific notation
                    $state = 15;
                    break;
                }

                if ($this->_isDigit($c)) {
                    $state = 7;
                    break;
                }
                $state = 8;
                break;
            // }}}

            // {{{ State 8: Complete real number
            case 8:
                $this->unget();
                $this->tokText = floatval(substr($this->string, $this->tokStart,
                                                 $this->tokLen));
                return $this->_accept('real_val');
            // }}}

            // {{{ State 10: Incomplete comparison operator
            case 10:
                $c = $this->get();
                if ($this->isCompop($c)) {
                    $state = 10;
                    break;
                }
                $state = 11;
                break;
            // }}}

            // {{{ State 11: Complete comparison operator
            case 11:
                $this->unget();
                $this->tokText = substr($this->string, $this->tokStart,
                                        $this->tokLen);
                if ($this->tokText) {
                    return $this->_accept($this->tokText);
                }
                $state = 999;
                break;
            // }}}

            // {{{ State 12: Incomplete text string
            case 12:
                $closed = false;
                while (true) {
                    $ch = $this->get();
                    if ($ch === null) { // input ended inside the string
                        break;
                    }
                    if ($ch === '\\') {
                        // A backslash protects the next character, whatever
                        // it is (including '0'); only a missing character
                        // makes the string unterminated.
                        if ($this->get() === null) {
                            break;
                        }
                        continue;
                    }
                    if ($ch === $quote) {
                        $closed = true;
                        break;
                    }
                }

                if ($closed) {
                    // Drop the surrounding quotes and resolve escapes.
                    $this->tokText = stripslashes(substr($this->string,
                        ($this->tokStart + 1), ($this->tokLen - 2)));
                    $state = 13;
                    break;
                }
                $this->tokText = null;
                $state = 999;
                break;
            // }}}

            // {{{ State 13: Complete text string
            case 13:
                return $this->_accept('text_val');
            // }}}

            // {{{ State 14: Comment
            case 14:
                $c = $this->skip();
                if ($c === "\n" || $c === "\r" || $c === '') {
                    // Handle MAC/Unix/Windows line endings.
                    if ($c === "\r" && $this->_peek() === "\n") {
                        $this->skip();
                    }

                    if ($c !== '') {
                        ++$this->lineNo;
                        $this->lineBegin = $this->tokPtr;
                    }

                    // We need to skip all the text.
                    $this->tokStart = $this->tokPtr;
                    $state = 0;
                } else {
                    $state = 14;
                }
                break;
            // }}}

            // {{{ State 15: Exponent Sign in Scientific Notation
            case 15:
                $c = $this->get();
                if ($c === '-' || $c === '+') {
                    $state = 16;
                    break;
                }
                if ($this->_isDigit($c)) { // the sign is optional
                    $state = 17;
                    break;
                }
                $state = 999;
                break;
            // }}}

            // {{{ State 16: Exponent Value-first digit in Scientific Notation
            case 16:
                $c = $this->get();
                if ($this->_isDigit($c)) {
                    $state = 17;
                    break;
                }
                $state = 999; // if no digit, then token is unknown
                break;
            // }}}

            // {{{ State 17: Exponent Value in Scientific Notation
            case 17:
                $c = $this->get();
                if ($this->_isDigit($c)) {
                    $state = 17;
                    break;
                }
                $state = 8; // At least 1 exponent digit was required
                break;
            // }}}

            // {{{ State 18 : Incomplete System Variable
            case 18:
                $c = $this->get();
                if ($this->_isAlnum($c) || $c === '_') {
                    $state = 18;
                    break;
                }
                $state = 19;
                break;
            // }}}

            // {{{ State 19: Complete Sys Var
            case 19:
                $this->unget();
                $this->tokText = substr($this->string, $this->tokStart,
                                        $this->tokLen);
                return $this->_accept('sys_var');
            // }}}

            // {{{ State 999 : Unknown token. Revert to single char
            case 999:
                $this->revert();
                $this->tokText = $this->get();
                return $this->_accept($this->tokText);
            // }}}

            // {{{ State 1000 : End Of Input
            case 1000:
                $this->tokText = '*end of input*';
                return $this->_accept(null);
            // }}}
            }
        }
    }
    // }}}
}
- ?>
-