# Source code for pyExcelerator.ExcelFormulaLexer

#!/usr/bin/env python
# -*- coding: windows-1251 -*-

#  Copyright (C) 2005 Roman V. Kiseliov
#  All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#
#  1. Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#
#  2. Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#
#  3. All advertising materials mentioning features or use of this
#     software must display the following acknowledgment:
#     "This product includes software developed by
#      Roman V. Kiseliov <roman@kiseliov.ru>."
#
#  4. Redistributions of any form whatsoever must retain the following
#     acknowledgment:
#     "This product includes software developed by
#      Roman V. Kiseliov <roman@kiseliov.ru>."
#
#  THIS SOFTWARE IS PROVIDED BY Roman V. Kiseliov ``AS IS'' AND ANY
#  EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
#  PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL Roman V. Kiseliov OR
#  ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
#  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
#  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
#  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
#  OF THE POSSIBILITY OF SUCH DAMAGE.


__rev_id__ = """$Id: ExcelFormulaLexer.py,v 1.4 2005/08/14 06:40:23 rvk Exp $"""


import sys
from antlr import EOF, CommonToken as Tok, TokenStream, TokenStreamException
import struct
import ExcelFormulaParser
from re import compile as recompile, match, LOCALE, UNICODE, IGNORECASE


# Compiled regular expressions for the multi-character token classes.
# None of them is anchored; Lexer.match_pattern applies them with ``match``
# to the text that starts at the current lexer position.
#
# Integer literal: one or more decimal digits.
int_const_pattern = recompile(r"\d+")
# Float literal: optional integer digits, a mandatory '.', at least one
# fractional digit and an optional exponent ("67.8678", ".58e-678").
# A literal without a '.' (e.g. "1e5") is not matched by this pattern.
flt_const_pattern = recompile(r"\d*\.\d+(?:[Ee][+-]?\d+)?")
# String literal: text between double quotes; the body may not contain a
# double quote, so there is no escape syntax for embedded quotes here.
str_const_pattern = recompile(r'["][^"]*["]')
# Range matching is disabled; ':' is still emitted as its own COLON token.
# NOTE(review): presumably ranges are assembled by the parser from
# REF2D COLON REF2D -- confirm against ExcelFormulaParser.
#range2d_pattern   = recompile(r"\$?[A-I]?[A-Z]\$?\d+:\$?[A-I]?[A-Z]\$?\d+")
# Cell reference: optional '$', one or two column letters (first letter
# limited to A-I, i.e. columns A..IZ), optional '$', then the row digits.
# Upper-case letters only -- this pattern is compiled without IGNORECASE.
ref2d_pattern     = recompile(r"\$?[A-I]?[A-Z]\$?\d+")
# Boolean literals, matched case-insensitively.
true_pattern      = recompile(r"TRUE", IGNORECASE)
false_pattern     = recompile(r"FALSE", IGNORECASE)
# Name: a run of dots and word characters; with the LOCALE flag the meaning
# of \w follows the current locale.
name_pattern      = recompile(r"[\.\w]+", LOCALE)

# (compiled pattern, token type) pairs. Lexer.nextToken tries them in this
# order and returns the token for the first pattern that matches at the
# current position, so the order is significant:
#   * the float pattern precedes the integer pattern, otherwise "67.8678"
#     would first yield INT_CONST "67";
#   * name_pattern is last because [\.\w]+ would also match numbers,
#     '$'-free cell references and TRUE/FALSE.
pattern_type_tuples = (
    (flt_const_pattern, ExcelFormulaParser.NUM_CONST),
    (int_const_pattern, ExcelFormulaParser.INT_CONST),
    (str_const_pattern, ExcelFormulaParser.STR_CONST),
    # Disabled together with range2d_pattern.
#    (range2d_pattern  , ExcelFormulaParser.RANGE2D),
    (ref2d_pattern    , ExcelFormulaParser.REF2D),
    (true_pattern     , ExcelFormulaParser.TRUE_CONST),
    (false_pattern    , ExcelFormulaParser.FALSE_CONST),
    (name_pattern     , ExcelFormulaParser.NAME)
)


# (token type, literal text) pairs for operators and punctuation.
# Lexer.nextToken falls back to this table when no regex pattern matched and
# picks the first entry whose text is a prefix of the remaining input.
# The two-character operators therefore have to come before the
# one-character operators that are their prefixes ('<>' and '<=' before '<',
# '>=' before '>').
type_text_tuples = (
    (ExcelFormulaParser.NE, '<>'),
    (ExcelFormulaParser.LE, '<='),
    (ExcelFormulaParser.GE, '>='),
    (ExcelFormulaParser.EQ, '='),
    (ExcelFormulaParser.LT, '<'),
    (ExcelFormulaParser.GT, '>'),
    (ExcelFormulaParser.ADD, '+'),
    (ExcelFormulaParser.SUB, '-'),
    (ExcelFormulaParser.MUL, '*'),
    (ExcelFormulaParser.DIV, '/'),
    (ExcelFormulaParser.COLON, ':'),
    (ExcelFormulaParser.SEMICOLON, ';'),
    (ExcelFormulaParser.COMMA, ','),
    (ExcelFormulaParser.LP, '('),
    (ExcelFormulaParser.RP, ')'),
    (ExcelFormulaParser.CONCAT, '&'),
    (ExcelFormulaParser.PERCENT, '%'),
    (ExcelFormulaParser.POWER, '^')
)


class Lexer(TokenStream):
    """Hand-written tokenizer for Excel formula text.

    The lexer walks ``text`` from left to right.  At every position it skips
    whitespace, then tries the regular expressions in
    ``pattern_type_tuples`` (in order) and finally the literal operators in
    ``type_text_tuples`` (in order).  Every token carries the 1-based column
    of its first character in ``col``.
    """

    def __init__(self, text):
        """Create a lexer positioned at the start of ``text``."""
        self._text = text[:]
        self._pos = 0    # 0-based index of the next unread character
        self._line = 0   # initialised here; not updated by this class

    def isEOF(self):
        """Return True when every character of the text has been consumed."""
        return len(self._text) <= self._pos

    def rest(self):
        """Return the not-yet-consumed tail of the text (a new string)."""
        return self._text[self._pos:]

    def curr_ch(self):
        """Return the character at the current position.

        Raises IndexError when called at end of input.
        """
        return self._text[self._pos]

    def next_ch(self, n = 1):
        """Advance the current position by ``n`` characters."""
        self._pos += n

    def is_whitespace(self):
        """Return True if the current character is one of ``" \\t\\n\\r\\f\\v"``."""
        return self.curr_ch() in " \t\n\r\f\v"

    def match_pattern(self, pattern, toktype):
        """Try to match ``pattern`` at the current position.

        ``pattern`` is a compiled regular expression; it is applied with
        ``match`` to the unread tail of the text.  On success the matched
        text is consumed and a token of type ``toktype`` is returned whose
        ``col`` is the 1-based column of the first matched character.
        Returns None, without consuming anything, when the pattern does not
        match.
        """
        m = pattern.match(self.rest())
        if m is None:
            return None
        start_pos = self._pos + m.start(0)
        end_pos = self._pos + m.end(0)
        tt = self._text[start_pos:end_pos]
        self._pos = end_pos
        return Tok(type = toktype, text = tt, col = start_pos + 1)

    def nextToken(self):
        """Return the next token, or a token of type ``EOF`` at end of input.

        Raises TokenStreamException when the character at the current
        position starts neither a pattern token nor a known operator.
        """
        # skip whitespace
        while not self.isEOF() and self.is_whitespace():
            self.next_ch()
        if self.isEOF():
            return Tok(type = EOF)
        # first, try to match the token classes that span several characters
        for pattern, toktype in pattern_type_tuples:
            t = self.match_pattern(pattern, toktype)
            if t:
                return t
        # second, look for the short literal tokens (operators, punctuation)
        for ty, te in type_text_tuples:
            # startswith() with a start offset avoids copying the tail of the
            # text for every candidate operator.
            if self._text.startswith(te, self._pos):
                # Remember where the operator begins *before* advancing, so
                # that two-character operators ('<>', '<=', '>=') report the
                # column of their first character, like every other token.
                start_pos = self._pos
                self.next_ch(len(te))
                return Tok(type = ty, text = te, col = start_pos + 1)
        # nothing matched: report the offending character; the position in
        # the message is the 0-based offset into the text
        raise TokenStreamException("Unknown char %s at %u col." % (self.curr_ch(), self._pos))
if __name__ == '__main__':
    # Ad-hoc demo: tokenize a sample formula and print each token.
    import locale
    try:
        locale.setlocale(locale.LC_ALL, 'russian')
    except locale.Error:
        # 'russian' is not a valid locale name on every platform; keep the
        # current locale instead of aborting the demo before any lexing.
        sys.stderr.write("warning: locale 'russian' is not available\n")
    try:
        # Iterating the lexer relies on the iteration support of the
        # TokenStream base class (defined in antlr, not in this file).
        for t in Lexer('1+2+3+67.8678 + " @##$$$ klhkh kljhklhkl " + .58e-678*A1:B4 - 1lkjljlkjl3535порпор'):
            print(t)
    except TokenStreamException as e:
        print("error: %s" % (e,))