#=======================================================================
#
#   Python Lexical Analyser
#
#   Lexical Analyser Specification
#
#=======================================================================

from __future__ import absolute_import

import types

from . import Actions
from . import DFA
from . import Errors
from . import Machines
from . import Regexps

# debug_flags for Lexicon constructor
DUMP_NFA = 1
DUMP_DFA = 2


class State(object):
    """
    This class is used as part of a Plex.Lexicon specification to
    introduce a user-defined state.

    Constructor:

       State(name, token_specifications)
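
    For example, a state that skips the body of a C-style comment could be
    specified as follows (an illustrative sketch only; Str and AnyChar are
    assumed to be pattern constructors, and IGNORE and Begin actions,
    provided by the Plex module):

       State('comment', [
           (Str("*/"), Begin('')),
           (AnyChar,   IGNORE),
       ])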
"""
|
|
|
|
name = None
|
|
tokens = None
|
|
|
|
def __init__(self, name, tokens):
|
|
self.name = name
|
|
self.tokens = tokens
|
|
|
|
|
|
class Lexicon(object):
    """
    Lexicon(specification) builds a lexical analyser from the given
    |specification|. The specification consists of a list of
    specification items. Each specification item may be either:

       1) A token definition, which is a tuple:

             (pattern, action)

          The |pattern| is a regular expression built using the
          constructors defined in the Plex module.

          The |action| is the action to be performed when this pattern
          is recognised (see below).

       2) A state definition:

             State(name, tokens)

          where |name| is a character string naming the state,
          and |tokens| is a list of token definitions as
          above. The meaning and usage of states is described
          below.
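
    For example (an illustrative sketch; Str, Rep1, Range and Any are
    assumed to be pattern constructors exported by the Plex module, and
    IGNORE is the action described below):

       lexicon = Lexicon([
           (Rep1(Range("AZaz")), 'ident'),    # identifiers
           (Rep1(Range("09")),   'int'),      # integer literals
           (Str("("),            'lparen'),
           (Rep1(Any(" ")),      IGNORE),     # skip runs of blanks
       ])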

    Actions
    -------

    The |action| in a token specification may be one of three things:

       1) A function, which is called as follows:

             function(scanner, text)

          where |scanner| is the relevant Scanner instance, and |text|
          is the matched text. If the function returns anything
          other than None, that value is returned as the value of the
          token. If it returns None, scanning continues as if the IGNORE
          action were specified (see below).

       2) One of the following special actions:

          IGNORE means that the recognised characters will be treated as
                 white space and ignored. Scanning will continue until
                 the next non-ignored token is recognised before returning.

          TEXT   causes the scanned text itself to be returned as the
                 value of the token.

       3) Any other value, which is returned as the value of the token.
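
    For example, a function action that turns an integer literal into its
    numeric value might look like this (illustrative only, using the same
    assumed constructors as above):

       def str_to_int(scanner, text):
           return int(text)

       lexicon = Lexicon([
           (Rep1(Range("09")), str_to_int),
       ])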

    States
    ------

    At any given time, the scanner is in one of a number of states.
    Associated with each state is a set of possible tokens. When scanning,
    only tokens associated with the current state are recognised.

    There is a default state, whose name is the empty string. Token
    definitions which are not inside any State definition belong to
    the default state.

    The initial state of the scanner is the default state. The state can
    be changed in one of two ways:

       1) Using Begin(state_name) as the action of a token.

       2) Calling the begin(state_name) method of the Scanner.

    To change back to the default state, use '' as the state name.
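
    For example (an illustrative sketch), a token in the default state can
    switch the scanner into a user-defined 'comment' state:

       (Str("/*"), Begin('comment'))

    and a token inside that state can switch back with Begin('').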
"""
|
|
|
|
machine = None # Machine
|
|
tables = None # StateTableMachine
|
|
|
|
def __init__(self, specifications, debug=None, debug_flags=7, timings=None):
        if not isinstance(specifications, list):
            raise Errors.InvalidScanner("Scanner definition is not a list")
        if timings:
            from .Timing import time

            total_time = 0.0
            time1 = time()
        # Build an NFA from the specification items, then convert it to a DFA.
        nfa = Machines.Machine()
        default_initial_state = nfa.new_initial_state('')
        token_number = 1
        for spec in specifications:
            if isinstance(spec, State):
                user_initial_state = nfa.new_initial_state(spec.name)
                for token in spec.tokens:
                    self.add_token_to_machine(
                        nfa, user_initial_state, token, token_number)
                    token_number += 1
            elif isinstance(spec, tuple):
                self.add_token_to_machine(
                    nfa, default_initial_state, spec, token_number)
                token_number += 1
            else:
                raise Errors.InvalidToken(
                    token_number,
                    "Expected a token definition (tuple) or State instance")
        if timings:
            time2 = time()
            total_time = total_time + (time2 - time1)
            time3 = time()
        if debug and (debug_flags & 1):
            debug.write("\n============= NFA ===========\n")
            nfa.dump(debug)
        dfa = DFA.nfa_to_dfa(nfa, debug=(debug_flags & 3) == 3 and debug)
        if timings:
            time4 = time()
            total_time = total_time + (time4 - time3)
        if debug and (debug_flags & 2):
            debug.write("\n============= DFA ===========\n")
            dfa.dump(debug)
        if timings:
            timings.write("Constructing NFA : %5.2f\n" % (time2 - time1))
            timings.write("Converting to DFA: %5.2f\n" % (time4 - time3))
            timings.write("TOTAL            : %5.2f\n" % total_time)
        self.machine = dfa

    def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
        try:
            (re, action_spec) = self.parse_token_definition(token_spec)
            # Disabled this -- matching empty strings can be useful
            #if re.nullable:
            #    raise Errors.InvalidToken(
            #        token_number, "Pattern can match 0 input symbols")
            if isinstance(action_spec, Actions.Action):
                action = action_spec
            else:
                try:
                    action_spec.__call__
                except AttributeError:
                    action = Actions.Return(action_spec)
                else:
                    action = Actions.Call(action_spec)
            final_state = machine.new_state()
            re.build_machine(machine, initial_state, final_state,
                             match_bol=1, nocase=0)
            # Earlier tokens get a higher priority, so they win when two
            # patterns match the same text.
            final_state.set_action(action, priority=-token_number)
        except Errors.PlexError as e:
            raise e.__class__("Token number %d: %s" % (token_number, e))

    def parse_token_definition(self, token_spec):
        if not isinstance(token_spec, tuple):
            raise Errors.InvalidToken("Token definition is not a tuple")
        if len(token_spec) != 2:
            raise Errors.InvalidToken("Wrong number of items in token definition")
        pattern, action = token_spec
        if not isinstance(pattern, Regexps.RE):
            raise Errors.InvalidToken("Pattern is not an RE instance")
        return (pattern, action)

    def get_initial_state(self, name):
        return self.machine.get_initial_state(name)
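

# Illustrative usage sketch (comments only, not executed). It assumes the
# package's public interface re-exports the usual Plex names -- Lexicon,
# State, Scanner, the pattern constructors Str, Any, AnyChar, Rep1, Range
# and the actions IGNORE and Begin -- and that Scanner.read() returns
# (value, text) pairs, with a value of None at end of input:
#
#     lexicon = Lexicon([
#         (Rep1(Range("AZaz")), 'ident'),           # identifiers
#         (Rep1(Range("09")),   'int'),             # integer literals
#         (Str("/*"),           Begin('comment')),  # enter comment state
#         (Rep1(Any(" \t\n")),  IGNORE),            # skip whitespace
#         State('comment', [
#             (Str("*/"), Begin('')),               # back to default state
#             (AnyChar,   IGNORE),                  # swallow comment text
#         ]),
#     ])
#
#     with open("input.txt") as f:
#         scanner = Scanner(lexicon, f, "input.txt")
#         while True:
#             value, text = scanner.read()
#             if value is None:
#                 break
#             print(value, repr(text))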