diff options
author | Slávek Banko <slavek.banko@axis.cz> | 2021-03-26 13:52:33 +0100 |
---|---|---|
committer | Slávek Banko <slavek.banko@axis.cz> | 2021-03-26 13:52:33 +0100 |
commit | 0f27805eedcc40ae34009aa31a4dc08cb949f867 (patch) | |
tree | 8b1c8995d7fdab97acde4bd7c63f96d378c34d02 /debian/pyrex/pyrex-0.9.9/Pyrex/Plex | |
parent | bad411472a12b93f8bfca6b7ca52d89488a8d8ce (diff) | |
download | extra-dependencies-0f27805e.tar.gz extra-dependencies-0f27805e.zip |
DEB pyrex: Added to repository.
Signed-off-by: Slávek Banko <slavek.banko@axis.cz>
Diffstat (limited to 'debian/pyrex/pyrex-0.9.9/Pyrex/Plex')
-rwxr-xr-x | debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Actions.py | 109 | ||||
-rwxr-xr-x | debian/pyrex/pyrex-0.9.9/Pyrex/Plex/DFA.py | 156 | ||||
-rwxr-xr-x | debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Errors.py | 52 | ||||
-rwxr-xr-x | debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Lexicons.py | 192 | ||||
-rwxr-xr-x | debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Machines.py | 326 | ||||
-rwxr-xr-x | debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Regexps.py | 557 | ||||
-rwxr-xr-x | debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Scanners.py | 377 | ||||
-rwxr-xr-x | debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Timing.py | 22 | ||||
-rwxr-xr-x | debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Traditional.py | 154 | ||||
-rwxr-xr-x | debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Transitions.py | 253 | ||||
-rwxr-xr-x | debian/pyrex/pyrex-0.9.9/Pyrex/Plex/__init__.py | 40 | ||||
-rwxr-xr-x | debian/pyrex/pyrex-0.9.9/Pyrex/Plex/test_tm.py | 24 |
12 files changed, 2262 insertions, 0 deletions
diff --git a/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Actions.py b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Actions.py new file mode 100755 index 00000000..23253a90 --- /dev/null +++ b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Actions.py @@ -0,0 +1,109 @@ +#======================================================================= +# +# Python Lexical Analyser +# +# Actions for use in token specifications +# +#======================================================================= + +class Action: + + def same_as(self, other): + return self is other + + +class Return(Action): + """ + Internal Plex action which causes |value| to + be returned as the value of the associated token + """ + + value = None + + def __init__(self, value): + self.value = value + + def perform(self, token_stream, text): + return self.value + + def same_as(self, other): + return isinstance(other, Return) and self.value == other.value + + def __repr__(self): + return "Return(%s)" % repr(self.value) + + +class Call(Action): + """ + Internal Plex action which causes a function to be called. + """ + + function = None + + def __init__(self, function): + self.function = function + + def perform(self, token_stream, text): + return self.function(token_stream, text) + + def __repr__(self): + return "Call(%s)" % self.function.__name__ + + def same_as(self, other): + return isinstance(other, Call) and self.function is other.function + + +class Begin(Action): + """ + Begin(state_name) is a Plex action which causes the Scanner to + enter the state |state_name|. See the docstring of Plex.Lexicon + for more information. + """ + + state_name = None + + def __init__(self, state_name): + self.state_name = state_name + + def perform(self, token_stream, text): + token_stream.begin(self.state_name) + + def __repr__(self): + return "Begin(%s)" % self.state_name + + def same_as(self, other): + return isinstance(other, Begin) and self.state_name == other.state_name + + +class Ignore(Action): + """ + IGNORE is a Plex action which causes its associated token + to be ignored. See the docstring of Plex.Lexicon for more + information. + """ + def perform(self, token_stream, text): + return None + + def __repr__(self): + return "IGNORE" + +IGNORE = Ignore() +IGNORE.__doc__ = Ignore.__doc__ + +class Text(Action): + """ + TEXT is a Plex action which causes the text of a token to + be returned as the value of the token. See the docstring of + Plex.Lexicon for more information. + """ + + def perform(self, token_stream, text): + return text + + def __repr__(self): + return "TEXT" + +TEXT = Text() +TEXT.__doc__ = Text.__doc__ + + diff --git a/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/DFA.py b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/DFA.py new file mode 100755 index 00000000..2c0004b0 --- /dev/null +++ b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/DFA.py @@ -0,0 +1,156 @@ +#======================================================================= +# +# Python Lexical Analyser +# +# Converting NFA to DFA +# +#======================================================================= + +import Machines +from Machines import LOWEST_PRIORITY +from Transitions import TransitionMap + +def nfa_to_dfa(old_machine, debug = None): + """ + Given a nondeterministic Machine, return a new equivalent + Machine which is deterministic. + """ + # We build a new machine whose states correspond to sets of states + # in the old machine. Initially we add a new state corresponding to + # the epsilon-closure of each initial old state. Then we give transitions + # to each new state which are the union of all transitions out of any + # of the corresponding old states. The new state reached on a given + # character is the one corresponding to the set of states reachable + # on that character from any of the old states. As new combinations of + # old states are created, new states are added as needed until closure + # is reached. + new_machine = Machines.FastMachine() + state_map = StateMap(new_machine) + # Seed the process using the initial states of the old machine. + # Make the corresponding new states into initial states of the new + # machine with the same names. + for (key, old_state) in old_machine.initial_states.items(): + new_state = state_map.old_to_new(epsilon_closure(old_state)) + new_machine.make_initial_state(key, new_state) + # Tricky bit here: we add things to the end of this list while we're + # iterating over it. The iteration stops when closure is achieved. + for new_state in new_machine.states: + transitions = TransitionMap() + for old_state in state_map.new_to_old(new_state).keys(): + for event, old_target_states in old_state.transitions.items(): + if event and old_target_states: + transitions.add_set(event, set_epsilon_closure(old_target_states)) + for event, old_states in transitions.items(): + new_machine.add_transitions(new_state, event, state_map.old_to_new(old_states)) + if debug: + debug.write("\n===== State Mapping =====\n") + state_map.dump(debug) + return new_machine + +def set_epsilon_closure(state_set): + """ + Given a set of states, return the union of the epsilon + closures of its member states. + """ + result = {} + for state1 in state_set.keys(): + for state2 in epsilon_closure(state1).keys(): + result[state2] = 1 + return result + +def epsilon_closure(state): + """ + Return the set of states reachable from the given state + by epsilon moves. + """ + # Cache the result + result = state.epsilon_closure + if result is None: + result = {} + state.epsilon_closure = result + add_to_epsilon_closure(result, state) + return result + +def add_to_epsilon_closure(state_set, state): + """ + Recursively add to |state_set| states reachable from the given state + by epsilon moves. + """ + if not state_set.get(state, 0): + state_set[state] = 1 + state_set_2 = state.transitions.get_epsilon() + if state_set_2: + for state2 in state_set_2.keys(): + add_to_epsilon_closure(state_set, state2) + +class StateMap: + """ + Helper class used by nfa_to_dfa() to map back and forth between + sets of states from the old machine and states of the new machine. + """ + new_machine = None # Machine + old_to_new_dict = None # {(old_state,...) : new_state} + new_to_old_dict = None # {id(new_state) : old_state_set} + + def __init__(self, new_machine): + self.new_machine = new_machine + self.old_to_new_dict = {} + self.new_to_old_dict= {} + + def old_to_new(self, old_state_set): + """ + Return the state of the new machine corresponding to the + set of old machine states represented by |state_set|. A new + state will be created if necessary. If any of the old states + are accepting states, the new state will be an accepting state + with the highest priority action from the old states. + """ + key = self.make_key(old_state_set) + new_state = self.old_to_new_dict.get(key, None) + if not new_state: + action = self.highest_priority_action(old_state_set) + new_state = self.new_machine.new_state(action) + self.old_to_new_dict[key] = new_state + self.new_to_old_dict[id(new_state)] = old_state_set + #for old_state in old_state_set.keys(): + #new_state.merge_actions(old_state) + return new_state + + def highest_priority_action(self, state_set): + best_action = None + best_priority = LOWEST_PRIORITY + for state in state_set.keys(): + priority = state.action_priority + if priority > best_priority: + best_action = state.action + best_priority = priority + return best_action + +# def old_to_new_set(self, old_state_set): +# """ +# Return the new state corresponding to a set of old states as +# a singleton set. +# """ +# return {self.old_to_new(old_state_set):1} + + def new_to_old(self, new_state): + """Given a new state, return a set of corresponding old states.""" + return self.new_to_old_dict[id(new_state)] + + def make_key(self, state_set): + """ + Convert a set of states into a uniquified + sorted tuple suitable for use as a dictionary key. + """ + lst = state_set.keys() + lst.sort() + return tuple(lst) + + def dump(self, file): + from Transitions import state_set_str + for new_state in self.new_machine.states: + old_state_set = self.new_to_old_dict[id(new_state)] + file.write(" State %s <-- %s\n" % ( + new_state['number'], state_set_str(old_state_set))) + + diff --git a/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Errors.py b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Errors.py new file mode 100755 index 00000000..ae033672 --- /dev/null +++ b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Errors.py @@ -0,0 +1,52 @@ +#======================================================================= +# +# Python Lexical Analyser +# +# Exception classes +# +#======================================================================= + +import exceptions + +class PlexError(exceptions.Exception): + message = "" + +class PlexTypeError(PlexError, TypeError): + pass + +class PlexValueError(PlexError, ValueError): + pass + +class InvalidRegex(PlexError): + pass + +class InvalidToken(PlexError): + + def __init__(self, token_number, message): + PlexError.__init__(self, "Token number %d: %s" % (token_number, message)) + +class InvalidScanner(PlexError): + pass + +class AmbiguousAction(PlexError): + message = "Two tokens with different actions can match the same string" + + def __init__(self): + pass + +class UnrecognizedInput(PlexError): + scanner = None + position = None + state_name = None + + def __init__(self, scanner, state_name): + self.scanner = scanner + self.position = scanner.position() + self.state_name = state_name + + def __str__(self): + return ("'%s', line %d, char %d: Token not recognised in state %s" + % (self.position + (repr(self.state_name),))) + + + diff --git a/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Lexicons.py b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Lexicons.py new file mode 100755 index 00000000..32b12c43 --- /dev/null +++ b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Lexicons.py @@ -0,0 +1,192 @@ +#======================================================================= +# +# Python Lexical Analyser +# +# Lexical Analyser Specification +# +#======================================================================= + +import types + +import Actions +import DFA +import Errors +import Machines +import Regexps + +# debug_flags for Lexicon constructor +DUMP_NFA = 1 +DUMP_DFA = 2 + +class State: + """ + This class is used as part of a Plex.Lexicon specification to + introduce a user-defined state. + + Constructor: + + State(name, token_specifications) + """ + + name = None + tokens = None + + def __init__(self, name, tokens): + self.name = name + self.tokens = tokens + +class Lexicon: + """ + Lexicon(specification) builds a lexical analyser from the given + |specification|. The specification consists of a list of + specification items. Each specification item may be either: + + 1) A token definition, which is a tuple: + + (pattern, action) + + The |pattern| is a regular axpression built using the + constructors defined in the Plex module. + + The |action| is the action to be performed when this pattern + is recognised (see below). + + 2) A state definition: + + State(name, tokens) + + where |name| is a character string naming the state, + and |tokens| is a list of token definitions as + above. The meaning and usage of states is described + below. + + Actions + ------- + + The |action| in a token specication may be one of three things: + + 1) A function, which is called as follows: + + function(scanner, text) + + where |scanner| is the relevant Scanner instance, and |text| + is the matched text. If the function returns anything + other than None, that value is returned as the value of the + token. If it returns None, scanning continues as if the IGNORE + action were specified (see below). + + 2) One of the following special actions: + + IGNORE means that the recognised characters will be treated as + white space and ignored. Scanning will continue until + the next non-ignored token is recognised before returning. + + TEXT causes the scanned text itself to be returned as the + value of the token. + + 3) Any other value, which is returned as the value of the token. + + States + ------ + + At any given time, the scanner is in one of a number of states. + Associated with each state is a set of possible tokens. When scanning, + only tokens associated with the current state are recognised. + + There is a default state, whose name is the empty string. Token + definitions which are not inside any State definition belong to + the default state. + + The initial state of the scanner is the default state. The state can + be changed in one of two ways: + + 1) Using Begin(state_name) as the action of a token. + + 2) Calling the begin(state_name) method of the Scanner. + + To change back to the default state, use '' as the state name. + """ + + machine = None # Machine + tables = None # StateTableMachine + + def __init__(self, specifications, debug = None, debug_flags = 7, timings = None): + if type(specifications) <> types.ListType: + raise Errors.InvalidScanner("Scanner definition is not a list") + if timings: + from Timing import time + total_time = 0.0 + time1 = time() + nfa = Machines.Machine() + default_initial_state = nfa.new_initial_state('') + token_number = 1 + for spec in specifications: + if isinstance(spec, State): + user_initial_state = nfa.new_initial_state(spec.name) + for token in spec.tokens: + self.add_token_to_machine( + nfa, user_initial_state, token, token_number) + token_number = token_number + 1 + elif type(spec) == types.TupleType: + self.add_token_to_machine( + nfa, default_initial_state, spec, token_number) + token_number = token_number + 1 + else: + raise Errors.InvalidToken( + token_number, + "Expected a token definition (tuple) or State instance") + if timings: + time2 = time() + total_time = total_time + (time2 - time1) + time3 = time() + if debug and (debug_flags & 1): + debug.write("\n============= NFA ===========\n") + nfa.dump(debug) + dfa = DFA.nfa_to_dfa(nfa, debug = (debug_flags & 3) == 3 and debug) + if timings: + time4 = time() + total_time = total_time + (time4 - time3) + if debug and (debug_flags & 2): + debug.write("\n============= DFA ===========\n") + dfa.dump(debug) + if timings: + timings.write("Constructing NFA : %5.2f\n" % (time2 - time1)) + timings.write("Converting to DFA: %5.2f\n" % (time4 - time3)) + timings.write("TOTAL : %5.2f\n" % total_time) + self.machine = dfa + + def add_token_to_machine(self, machine, initial_state, token_spec, token_number): + try: + (re, action_spec) = self.parse_token_definition(token_spec) + # Disabled this -- matching empty strings can be useful + #if re.nullable: + # raise Errors.InvalidToken( + # token_number, "Pattern can match 0 input symbols") + if isinstance(action_spec, Actions.Action): + action = action_spec + elif callable(action_spec): + action = Actions.Call(action_spec) + else: + action = Actions.Return(action_spec) + final_state = machine.new_state() + re.build_machine(machine, initial_state, final_state, + match_bol = 1, nocase = 0) + final_state.set_action(action, priority = -token_number) + except Errors.PlexError, e: + raise e.__class__("Token number %d: %s" % (token_number, e)) + + def parse_token_definition(self, token_spec): + if type(token_spec) <> types.TupleType: + raise Errors.InvalidToken("Token definition is not a tuple") + if len(token_spec) <> 2: + raise Errors.InvalidToken("Wrong number of items in token definition") + pattern, action = token_spec + if not isinstance(pattern, Regexps.RE): + raise Errors.InvalidToken("Pattern is not an RE instance") + return (pattern, action) + + def get_initial_state(self, name): + return self.machine.get_initial_state(name) + + + diff --git a/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Machines.py b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Machines.py new file mode 100755 index 00000000..fb9ba717 --- /dev/null +++ b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Machines.py @@ -0,0 +1,326 @@ +#======================================================================= +# +# Python Lexical Analyser +# +# Classes for building NFAs and DFAs +# +#======================================================================= + +import string +import sys +from sys import maxint +from types import TupleType + +from Transitions import TransitionMap + +LOWEST_PRIORITY = -sys.maxint + +class Machine: + """A collection of Nodes representing an NFA or DFA.""" + states = None # [Node] + next_state_number = 1 + initial_states = None # {(name, bol): Node} + + def __init__(self): + self.states = [] + self.initial_states = {} + + def __del__(self): + #print "Destroying", self ### + for state in self.states: + state.destroy() + + def new_state(self): + """Add a new state to the machine and return it.""" + s = Node() + n = self.next_state_number + self.next_state_number = n + 1 + s.number = n + self.states.append(s) + return s + + def new_initial_state(self, name): + state = self.new_state() + self.make_initial_state(name, state) + return state + + def make_initial_state(self, name, state): + self.initial_states[name] = state + + def get_initial_state(self, name): + return self.initial_states[name] + + def dump(self, file): + file.write("Plex.Machine:\n") + if self.initial_states is not None: + file.write(" Initial states:\n") + for (name, state) in self.initial_states.items(): + file.write(" '%s': %d\n" % (name, state.number)) + for s in self.states: + s.dump(file) + +class Node: + """A state of an NFA or DFA.""" + transitions = None # TransitionMap + action = None # Action + action_priority = None # integer + number = 0 # for debug output + epsilon_closure = None # used by nfa_to_dfa() + + def __init__(self): + # Preinitialise the list of empty transitions, because + # the nfa-to-dfa algorithm needs it + #self.transitions = {'':[]} + self.transitions = TransitionMap() + self.action_priority = LOWEST_PRIORITY + + def destroy(self): + #print "Destroying", self ### + self.transitions = None + self.action = None + self.epsilon_closure = None + + def add_transition(self, event, new_state): + self.transitions.add(event, new_state) + + def link_to(self, state): + """Add an epsilon-move from this state to another state.""" + self.add_transition('', state) + + def set_action(self, action, priority): + """Make this an accepting state with the given action. If + there is already an action, choose the action with highest + priority.""" + if priority > self.action_priority: + self.action = action + self.action_priority = priority + + def get_action(self): + return self.action + + def get_action_priority(self): + return self.action_priority + +# def merge_actions(self, other_state): +# """Merge actions of other state into this state according +# to their priorities.""" +# action = other_state.get_action() +# priority = other_state.get_action_priority() +# self.set_action(action, priority) + + def is_accepting(self): + return self.action is not None + + def __str__(self): + return "State %d" % self.number + + def dump(self, file): + import string + # Header + file.write(" State %d:\n" % self.number) + # Transitions +# self.dump_transitions(file) + self.transitions.dump(file) + # Action + action = self.action + priority = self.action_priority + if action is not None: + file.write(" %s [priority %d]\n" % (action, priority)) + + +class FastMachine: + """ + FastMachine is a deterministic machine represented in a way that + allows fast scanning. + """ + initial_states = None # {state_name:state} + states = None # [state] + # where state = {event:state, 'else':state, 'action':Action} + next_number = 1 # for debugging + + new_state_template = { + '':None, 'bol':None, 'eol':None, 'eof':None, 'else':None + } + + def __init__(self, old_machine = None): + self.initial_states = initial_states = {} + self.states = [] + if old_machine: + self.old_to_new = old_to_new = {} + for old_state in old_machine.states: + new_state = self.new_state() + old_to_new[old_state] = new_state + for name, old_state in old_machine.initial_states.items(): + initial_states[name] = old_to_new[old_state] + for old_state in old_machine.states: + new_state = old_to_new[old_state] + for event, old_state_set in old_state.transitions.items(): + if old_state_set: + new_state[event] = old_to_new[old_state_set.keys()[0]] + else: + new_state[event] = None + new_state['action'] = old_state.action + + def __del__(self): + for state in self.states: + state.clear() + + def new_state(self, action = None): + number = self.next_number + self.next_number = number + 1 + result = self.new_state_template.copy() + result['number'] = number + result['action'] = action + self.states.append(result) + return result + + def make_initial_state(self, name, state): + self.initial_states[name] = state + + def add_transitions(self, state, event, new_state): + if type(event) == TupleType: + code0, code1 = event + if code0 == -maxint: + state['else'] = new_state + elif code1 <> maxint: + while code0 < code1: + state[chr(code0)] = new_state + code0 = code0 + 1 + else: + state[event] = new_state + + def get_initial_state(self, name): + return self.initial_states[name] + + def dump(self, file): + file.write("Plex.FastMachine:\n") + file.write(" Initial states:\n") + for name, state in self.initial_states.items(): + file.write(" %s: %s\n" % (repr(name), state['number'])) + for state in self.states: + self.dump_state(state, file) + + def dump_state(self, state, file): + import string + # Header + file.write(" State %d:\n" % state['number']) + # Transitions + self.dump_transitions(state, file) + # Action + action = state['action'] + if action is not None: + file.write(" %s\n" % action) + + def dump_transitions(self, state, file): + chars_leading_to_state = {} + special_to_state = {} + for (c, s) in state.items(): + if len(c) == 1: + chars = chars_leading_to_state.get(id(s), None) + if chars is None: + chars = [] + chars_leading_to_state[id(s)] = chars + chars.append(c) + elif len(c) <= 4: + special_to_state[c] = s + ranges_to_state = {} + for state in self.states: + char_list = chars_leading_to_state.get(id(state), None) + if char_list: + ranges = self.chars_to_ranges(char_list) + ranges_to_state[ranges] = state + ranges_list = ranges_to_state.keys() + ranges_list.sort() + for ranges in ranges_list: + key = self.ranges_to_string(ranges) + state = ranges_to_state[ranges] + file.write(" %s --> State %d\n" % (key, state['number'])) + for key in ('bol', 'eol', 'eof', 'else'): + state = special_to_state.get(key, None) + if state: + file.write(" %s --> State %d\n" % (key, state['number'])) + + def chars_to_ranges(self, char_list): + char_list.sort() + i = 0 + n = len(char_list) + result = [] + while i < n: + c1 = ord(char_list[i]) + c2 = c1 + i = i + 1 + while i < n and ord(char_list[i]) == c2 + 1: + i = i + 1 + c2 = c2 + 1 + result.append((chr(c1), chr(c2))) + return tuple(result) + + def ranges_to_string(self, range_list): + return string.join(map(self.range_to_string, range_list), ",") + + def range_to_string(self, (c1, c2)): + if c1 == c2: + return repr(c1) + else: + return "%s..%s" % (repr(c1), repr(c2)) +## +## (Superseded by Machines.FastMachine) +## +## class StateTableMachine: +## """ +## StateTableMachine is an alternative representation of a Machine +## that can be run more efficiently. +## """ +## initial_states = None # {state_name:state_index} +## states = None # [([state] indexed by char code, Action)] + +## special_map = {'bol':256, 'eol':257, 'eof':258} + +## def __init__(self, m): +## """ +## Initialise StateTableMachine from Machine |m|. +## """ +## initial_states = self.initial_states = {} +## states = self.states = [None] +## old_to_new = {} +## i = 1 +## for old_state in m.states: +## new_state = ([0] * 259, old_state.get_action()) +## states.append(new_state) +## old_to_new[old_state] = i # new_state +## i = i + 1 +## for name, old_state in m.initial_states.items(): +## initial_states[name] = old_to_new[old_state] +## for old_state in m.states: +## new_state_index = old_to_new[old_state] +## new_table = states[new_state_index][0] +## transitions = old_state.transitions +## for c, old_targets in transitions.items(): +## if old_targets: +## old_target = old_targets[0] +## new_target_index = old_to_new[old_target] +## if len(c) == 1: +## a = ord(c) +## else: +## a = self.special_map[c] +## new_table[a] = states[new_target_index] + +## def dump(self, f): +## f.write("Plex.StateTableMachine:\n") +## f.write(" Initial states:\n") +## for name, index in self.initial_states.items(): +## f.write(" %s: State %d\n" % ( +## repr(name), id(self.states[index]))) +## for i in xrange(1, len(self.states)): +## table, action = self.states[i] +## f.write(" State %d:" % i) +## if action: +## f.write("%s" % action) +## f.write("\n") +## f.write(" %s\n" % map(id,table)) + + + + + + diff --git a/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Regexps.py b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Regexps.py new file mode 100755 index 00000000..6164d3bd --- /dev/null +++ b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Regexps.py @@ -0,0 +1,557 @@ +#======================================================================= +# +# Python Lexical Analyser +# +# Regular Expressions +# +#======================================================================= + +import array +import string +import types +from sys import maxint + +import Errors + +# +# Constants +# + +BOL = 'bol' +EOL = 'eol' +EOF = 'eof' + +nl_code = ord('\n') + +# +# Helper functions +# + +def chars_to_ranges(s): + """ + Return a list of character codes consisting of pairs + [code1a, code1b, code2a, code2b,...] which cover all + the characters in |s|. + """ + char_list = list(s) + char_list.sort() + i = 0 + n = len(char_list) + result = [] + while i < n: + code1 = ord(char_list[i]) + code2 = code1 + 1 + i = i + 1 + while i < n and code2 >= ord(char_list[i]): + code2 = code2 + 1 + i = i + 1 + result.append(code1) + result.append(code2) + return result + +def uppercase_range(code1, code2): + """ + If the range of characters from code1 to code2-1 includes any + lower case letters, return the corresponding upper case range. + """ + code3 = max(code1, ord('a')) + code4 = min(code2, ord('z') + 1) + if code3 < code4: + d = ord('A') - ord('a') + return (code3 + d, code4 + d) + else: + return None + +def lowercase_range(code1, code2): + """ + If the range of characters from code1 to code2-1 includes any + upper case letters, return the corresponding lower case range. + """ + code3 = max(code1, ord('A')) + code4 = min(code2, ord('Z') + 1) + if code3 < code4: + d = ord('a') - ord('A') + return (code3 + d, code4 + d) + else: + return None + +def CodeRanges(code_list): + """ + Given a list of codes as returned by chars_to_ranges, return + an RE which will match a character in any of the ranges. + """ + re_list = [] + for i in xrange(0, len(code_list), 2): + re_list.append(CodeRange(code_list[i], code_list[i + 1])) + return apply(Alt, tuple(re_list)) + +def CodeRange(code1, code2): + """ + CodeRange(code1, code2) is an RE which matches any character + with a code |c| in the range |code1| <= |c| < |code2|. + """ + if code1 <= nl_code < code2: + return Alt(RawCodeRange(code1, nl_code), + RawNewline, + RawCodeRange(nl_code + 1, code2)) + else: + return RawCodeRange(code1, code2) + +# +# Abstract classes +# + +class RE: + """RE is the base class for regular expression constructors. + The following operators are defined on REs: + + re1 + re2 is an RE which matches |re1| followed by |re2| + re1 | re2 is an RE which matches either |re1| or |re2| + """ + + nullable = 1 # True if this RE can match 0 input symbols + match_nl = 1 # True if this RE can match a string ending with '\n' + str = None # Set to a string to override the class's __str__ result + + def build_machine(self, machine, initial_state, final_state, + match_bol, nocase): + """ + This method should add states to |machine| to implement this + RE, starting at |initial_state| and ending at |final_state|. + If |match_bol| is true, the RE must be able to match at the + beginning of a line. If nocase is true, upper and lower case + letters should be treated as equivalent. + """ + raise exceptions.UnimplementedMethod("%s.build_machine not implemented" % + self.__class__.__name__) + + def build_opt(self, m, initial_state, c): + """ + Given a state |s| of machine |m|, return a new state + reachable from |s| on character |c| or epsilon. + """ + s = m.new_state() + initial_state.link_to(s) + initial_state.add_transition(c, s) + return s + + def __add__(self, other): + return Seq(self, other) + + def __or__(self, other): + return Alt(self, other) + + def __str__(self): + if self.str: + return self.str + else: + return self.calc_str() + + def check_re(self, num, value): + if not isinstance(value, RE): + self.wrong_type(num, value, "Plex.RE instance") + + def check_string(self, num, value): + if type(value) <> type(''): + self.wrong_type(num, value, "string") + + def check_char(self, num, value): + self.check_string(num, value) + if len(value) <> 1: + raise Errors.PlexValueError("Invalid value for argument %d of Plex.%s." + "Expected a string of length 1, got: %s" % ( + num, self.__class__.__name__, repr(value))) + + def wrong_type(self, num, value, expected): + if type(value) == types.InstanceType: + got = "%s.%s instance" % ( + value.__class__.__module__, value.__class__.__name__) + else: + got = type(value).__name__ + raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s " + "(expected %s, got %s" % ( + num, self.__class__.__name__, expected, got)) + +# +# Primitive RE constructors +# ------------------------- +# +# These are the basic REs from which all others are built. +# + +## class Char(RE): +## """ +## Char(c) is an RE which matches the character |c|. +## """ + +## nullable = 0 + +## def __init__(self, char): +## self.char = char +## self.match_nl = char == '\n' + +## def build_machine(self, m, initial_state, final_state, match_bol, nocase): +## c = self.char +## if match_bol and c <> BOL: +## s1 = self.build_opt(m, initial_state, BOL) +## else: +## s1 = initial_state +## if c == '\n' or c == EOF: +## s1 = self.build_opt(m, s1, EOL) +## if len(c) == 1: +## code = ord(self.char) +## s1.add_transition((code, code+1), final_state) +## if nocase and is_letter_code(code): +## code2 = other_case_code(code) +## s1.add_transition((code2, code2+1), final_state) +## else: +## s1.add_transition(c, final_state) + +## def calc_str(self): +## return "Char(%s)" % repr(self.char) + +def Char(c): + """ + Char(c) is an RE which matches the character |c|. + """ + if len(c) == 1: + result = CodeRange(ord(c), ord(c) + 1) + else: + result = SpecialSymbol(c) + result.str = "Char(%s)" % repr(c) + return result + +class RawCodeRange(RE): + """ + RawCodeRange(code1, code2) is a low-level RE which matches any character + with a code |c| in the range |code1| <= |c| < |code2|, where the range + does not include newline. For internal use only. + """ + nullable = 0 + match_nl = 0 + range = None # (code, code) + uppercase_range = None # (code, code) or None + lowercase_range = None # (code, code) or None + + def __init__(self, code1, code2): + self.range = (code1, code2) + self.uppercase_range = uppercase_range(code1, code2) + self.lowercase_range = lowercase_range(code1, code2) + + def build_machine(self, m, initial_state, final_state, match_bol, nocase): + if match_bol: + initial_state = self.build_opt(m, initial_state, BOL) + initial_state.add_transition(self.range, final_state) + if nocase: + if self.uppercase_range: + initial_state.add_transition(self.uppercase_range, final_state) + if self.lowercase_range: + initial_state.add_transition(self.lowercase_range, final_state) + + def calc_str(self): + return "CodeRange(%d,%d)" % (self.code1, self.code2) + +class _RawNewline(RE): + """ + RawNewline is a low-level RE which matches a newline character. + For internal use only. + """ + nullable = 0 + match_nl = 1 + + def build_machine(self, m, initial_state, final_state, match_bol, nocase): + if match_bol: + initial_state = self.build_opt(m, initial_state, BOL) + s = self.build_opt(m, initial_state, EOL) + s.add_transition((nl_code, nl_code + 1), final_state) + +RawNewline = _RawNewline() + + +class SpecialSymbol(RE): + """ + SpecialSymbol(sym) is an RE which matches the special input + symbol |sym|, which is one of BOL, EOL or EOF. + """ + nullable = 0 + match_nl = 0 + sym = None + + def __init__(self, sym): + self.sym = sym + + def build_machine(self, m, initial_state, final_state, match_bol, nocase): + # Sequences 'bol bol' and 'bol eof' are impossible, so only need + # to allow for bol if sym is eol + if match_bol and self.sym == EOL: + initial_state = self.build_opt(m, initial_state, BOL) + initial_state.add_transition(self.sym, final_state) + + +class Seq(RE): + """Seq(re1, re2, re3...) is an RE which matches |re1| followed by + |re2| followed by |re3|...""" + + def __init__(self, *re_list): + nullable = 1 + for i in xrange(len(re_list)): + re = re_list[i] + self.check_re(i, re) + nullable = nullable and re.nullable + self.re_list = re_list + self.nullable = nullable + i = len(re_list) + match_nl = 0 + while i: + i = i - 1 + re = re_list[i] + if re.match_nl: + match_nl = 1 + break + if not re.nullable: + break + self.match_nl = match_nl + + def build_machine(self, m, initial_state, final_state, match_bol, nocase): + re_list = self.re_list + if len(re_list) == 0: + initial_state.link_to(final_state) + else: + s1 = initial_state + n = len(re_list) + for i in xrange(n): + if i < n - 1: + s2 = m.new_state() + else: + s2 = final_state + re = re_list[i] + re.build_machine(m, s1, s2, match_bol, nocase) + s1 = s2 + match_bol = re.match_nl or (match_bol and re.nullable) + + def calc_str(self): + return "Seq(%s)" % string.join(map(str, self.re_list), ",") + + +class Alt(RE): + """Alt(re1, re2, re3...) is an RE which matches either |re1| or + |re2| or |re3|...""" + + def __init__(self, *re_list): + self.re_list = re_list + nullable = 0 + match_nl = 0 + nullable_res = [] + non_nullable_res = [] + i = 1 + for re in re_list: + self.check_re(i, re) + if re.nullable: + nullable_res.append(re) + nullable = 1 + else: + non_nullable_res.append(re) + if re.match_nl: + match_nl = 1 + i = i + 1 + self.nullable_res = nullable_res + self.non_nullable_res = non_nullable_res + self.nullable = nullable + self.match_nl = match_nl + + def build_machine(self, m, initial_state, final_state, match_bol, nocase): + for re in self.nullable_res: + re.build_machine(m, initial_state, final_state, match_bol, nocase) + if self.non_nullable_res: + if match_bol: + initial_state = self.build_opt(m, initial_state, BOL) + for re in self.non_nullable_res: + re.build_machine(m, initial_state, final_state, 0, nocase) + + def calc_str(self): + return "Alt(%s)" % string.join(map(str, self.re_list), ",") + + +class Rep1(RE): + """Rep1(re) is an RE which matches one or more repetitions of |re|.""" + + def __init__(self, re): + self.check_re(1, re) + self.re = re + self.nullable = re.nullable + self.match_nl = re.match_nl + + def build_machine(self, m, initial_state, final_state, match_bol, nocase): + s1 = m.new_state() + s2 = m.new_state() + initial_state.link_to(s1) + self.re.build_machine(m, s1, s2, match_bol or self.re.match_nl, nocase) + s2.link_to(s1) + s2.link_to(final_state) + + def calc_str(self): + return "Rep1(%s)" % self.re + + +class SwitchCase(RE): + """ + SwitchCase(re, nocase) is an RE which matches the same strings as RE, + but treating upper and lower case letters according to |nocase|. If + |nocase| is true, case is ignored, otherwise it is not. + """ + re = None + nocase = None + + def __init__(self, re, nocase): + self.re = re + self.nocase = nocase + self.nullable = re.nullable + self.match_nl = re.match_nl + + def build_machine(self, m, initial_state, final_state, match_bol, nocase): + self.re.build_machine(m, initial_state, final_state, match_bol, + self.nocase) + + def calc_str(self): + if self.nocase: + name = "NoCase" + else: + name = "Case" + return "%s(%s)" % (name, self.re) + +# +# Composite RE constructors +# ------------------------- +# +# These REs are defined in terms of the primitive REs. +# + +Empty = Seq() +Empty.__doc__ = \ + """ + Empty is an RE which matches the empty string. + """ +Empty.str = "Empty" + +def Str1(s): + """ + Str1(s) is an RE which matches the literal string |s|. + """ + result = apply(Seq, tuple(map(Char, s))) + result.str = "Str(%s)" % repr(s) + return result + +def Str(*strs): + """ + Str(s) is an RE which matches the literal string |s|. + Str(s1, s2, s3, ...) is an RE which matches any of |s1| or |s2| or |s3|... + """ + if len(strs) == 1: + return Str1(strs[0]) + else: + result = apply(Alt, tuple(map(Str1, strs))) + result.str = "Str(%s)" % string.join(map(repr, strs), ",") + return result + +def Any(s): + """ + Any(s) is an RE which matches any character in the string |s|. + """ + #result = apply(Alt, tuple(map(Char, s))) + result = CodeRanges(chars_to_ranges(s)) + result.str = "Any(%s)" % repr(s) + return result + +def AnyBut(s): + """ + AnyBut(s) is an RE which matches any character (including + newline) which is not in the string |s|. + """ + ranges = chars_to_ranges(s) + ranges.insert(0, -maxint) + ranges.append(maxint) + result = CodeRanges(ranges) + result.str = "AnyBut(%s)" % repr(s) + return result + +AnyChar = AnyBut("") +AnyChar.__doc__ = \ + """ + AnyChar is an RE which matches any single character (including a newline). + """ +AnyChar.str = "AnyChar" + +def Range(s1, s2 = None): + """ + Range(c1, c2) is an RE which matches any single character in the range + |c1| to |c2| inclusive. + Range(s) where |s| is a string of even length is an RE which matches + any single character in the ranges |s[0]| to |s[1]|, |s[2]| to |s[3]|,... + """ + if s2: + result = CodeRange(ord(s1), ord(s2) + 1) + result.str = "Range(%s,%s)" % (s1, s2) + else: + ranges = [] + for i in range(0, len(s1), 2): + ranges.append(CodeRange(ord(s1[i]), ord(s1[i+1]) + 1)) + result = apply(Alt, tuple(ranges)) + result.str = "Range(%s)" % repr(s1) + return result + +def Opt(re): + """ + Opt(re) is an RE which matches either |re| or the empty string. + """ + result = Alt(re, Empty) + result.str = "Opt(%s)" % re + return result + +def Rep(re): + """ + Rep(re) is an RE which matches zero or more repetitions of |re|. + """ + result = Opt(Rep1(re)) + result.str = "Rep(%s)" % re + return result + +def NoCase(re): + """ + NoCase(re) is an RE which matches the same strings as RE, but treating + upper and lower case letters as equivalent. + """ + return SwitchCase(re, nocase = 1) + +def Case(re): + """ + Case(re) is an RE which matches the same strings as RE, but treating + upper and lower case letters as distinct, i.e. it cancels the effect + of any enclosing NoCase(). + """ + return SwitchCase(re, nocase = 0) + +# +# RE Constants +# + +Bol = Char(BOL) +Bol.__doc__ = \ + """ + Bol is an RE which matches the beginning of a line. + """ +Bol.str = "Bol" + +Eol = Char(EOL) +Eol.__doc__ = \ + """ + Eol is an RE which matches the end of a line. + """ +Eol.str = "Eol" + +Eof = Char(EOF) +Eof.__doc__ = \ + """ + Eof is an RE which matches the end of the file. + """ +Eof.str = "Eof" + diff --git a/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Scanners.py b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Scanners.py new file mode 100755 index 00000000..6278d88b --- /dev/null +++ b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Scanners.py @@ -0,0 +1,377 @@ +#======================================================================= +# +# Python Lexical Analyser +# +# +# Scanning an input stream +# +#======================================================================= + +import Errors +from Regexps import BOL, EOL, EOF + +class Scanner: + """ + A Scanner is used to read tokens from a stream of characters + using the token set specified by a Plex.Lexicon. + + Constructor: + + Scanner(lexicon, stream, name = '') + + See the docstring of the __init__ method for details. + + Methods: + + See the docstrings of the individual methods for more + information. + + read() --> (value, text) + Reads the next lexical token from the stream. + + position() --> (name, line, col) + Returns the position of the last token read using the + read() method. + + begin(state_name) + Causes scanner to change state. + + produce(value [, text]) + Causes return of a token value to the caller of the + Scanner. + + """ + + lexicon = None # Lexicon + stream = None # file-like object + name = '' + buffer = '' + buf_start_pos = 0 # position in input of start of buffer + next_pos = 0 # position in input of next char to read + cur_pos = 0 # position in input of current char + cur_line = 1 # line number of current char + cur_line_start = 0 # position in input of start of current line + start_pos = 0 # position in input of start of token + start_line = 0 # line number of start of token + start_col = 0 # position in line of start of token + text = None # text of last token read + initial_state = None # Node + state_name = '' # Name of initial state + queue = None # list of tokens to be returned + trace = 0 + + def __init__(self, lexicon, stream, name = ''): + """ + Scanner(lexicon, stream, name = '') + + |lexicon| is a Plex.Lexicon instance specifying the lexical tokens + to be recognised. + + |stream| can be a file object or anything which implements a + compatible read() method. + + |name| is optional, and may be the name of the file being + scanned or any other identifying string. + """ + self.lexicon = lexicon + self.stream = stream + self.name = name + self.queue = [] + self.initial_state = None + self.begin('') + self.next_pos = 0 + self.cur_pos = 0 + self.cur_line_start = 0 + self.cur_char = BOL + self.input_state = 1 + + def read(self): + """ + Read the next lexical token from the stream and return a + tuple (value, text), where |value| is the value associated with + the token as specified by the Lexicon, and |text| is the actual + string read from the stream. Returns (None, '') on end of file. + """ + queue = self.queue + while not queue: + self.text, action = self.scan_a_token() + if action is None: + self.produce(None) + self.eof() + else: + value = action.perform(self, self.text) + if value is not None: + self.produce(value) + result = queue[0] + del queue[0] + return result + + def scan_a_token(self): + """ + Read the next input sequence recognised by the machine + and return (text, action). Returns ('', None) on end of + file. + """ + self.start_pos = self.cur_pos + self.start_line = self.cur_line + self.start_col = self.cur_pos - self.cur_line_start +# if self.trace: +# action = self.run_machine() +# else: +# action = self.run_machine_inlined() + action = self.run_machine_inlined() + if action: + if self.trace: + print "Scanner: read: Performing", action, "%d:%d" % ( + self.start_pos, self.cur_pos) + base = self.buf_start_pos + text = self.buffer[self.start_pos - base : self.cur_pos - base] + return (text, action) + else: + if self.cur_pos == self.start_pos: + if self.cur_char == EOL: + self.next_char() + if not self.cur_char or self.cur_char == EOF: + return ('', None) + raise Errors.UnrecognizedInput(self, self.state_name) + + def run_machine(self): + """ + Run the machine until no more transitions are possible. + """ + self.state = self.initial_state + self.backup_state = None + while self.transition(): + pass + return self.back_up() + + def run_machine_inlined(self): + """ + Inlined version of run_machine for speed. + """ + state = self.initial_state + cur_pos = self.cur_pos + cur_line = self.cur_line + cur_line_start = self.cur_line_start + cur_char = self.cur_char + input_state = self.input_state + next_pos = self.next_pos + buffer = self.buffer + buf_start_pos = self.buf_start_pos + buf_len = len(buffer) + backup_state = None + trace = self.trace + while 1: + if trace: #TRACE# + print "State %d, %d/%d:%s -->" % ( #TRACE# + state['number'], input_state, cur_pos, repr(cur_char)), #TRACE# + # Begin inlined self.save_for_backup() + #action = state.action #@slow + action = state['action'] #@fast + if action: + backup_state = ( + action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos) + # End inlined self.save_for_backup() + c = cur_char + #new_state = state.new_state(c) #@slow + new_state = state.get(c, -1) #@fast + if new_state == -1: #@fast + new_state = c and state.get('else') #@fast + if new_state: + if trace: #TRACE# + print "State %d" % new_state['number'] #TRACE# + state = new_state + # Begin inlined: self.next_char() + if input_state == 1: + cur_pos = next_pos + # Begin inlined: c = self.read_char() + buf_index = next_pos - buf_start_pos + if buf_index < buf_len: + c = buffer[buf_index] + next_pos = next_pos + 1 + else: + discard = self.start_pos - buf_start_pos + data = self.stream.read(0x1000) + buffer = self.buffer[discard:] + data + self.buffer = buffer + buf_start_pos = buf_start_pos + discard + self.buf_start_pos = buf_start_pos + buf_len = len(buffer) + buf_index = buf_index - discard + if data: + c = buffer[buf_index] + next_pos = next_pos + 1 + else: + c = '' + # End inlined: c = self.read_char() + if c == '\n': + cur_char = EOL + input_state = 2 + elif not c: + cur_char = EOL + input_state = 4 + else: + cur_char = c + elif input_state == 2: + cur_char = '\n' + input_state = 3 + elif input_state == 3: + cur_line = cur_line + 1 + cur_line_start = cur_pos = next_pos + cur_char = BOL + input_state = 1 + elif input_state == 4: + cur_char = EOF + input_state = 5 + else: # input_state = 5 + cur_char = '' + # End inlined self.next_char() + else: # not new_state + if trace: #TRACE# + print "blocked" #TRACE# + # Begin inlined: action = self.back_up() + if backup_state: + (action, cur_pos, cur_line, cur_line_start, + cur_char, input_state, next_pos) = backup_state + else: + action = None + break # while 1 + # End inlined: action = self.back_up() + self.cur_pos = cur_pos + self.cur_line = cur_line + self.cur_line_start = cur_line_start + self.cur_char = cur_char + self.input_state = input_state + self.next_pos = next_pos + if trace: #TRACE# + if action: #TRACE# + print "Doing", action #TRACE# + return action + +# def transition(self): +# self.save_for_backup() +# c = self.cur_char +# new_state = self.state.new_state(c) +# if new_state: +# if self.trace: +# print "Scanner: read: State %d: %s --> State %d" % ( +# self.state.number, repr(c), new_state.number) +# self.state = new_state +# self.next_char() +# return 1 +# else: +# if self.trace: +# print "Scanner: read: State %d: %s --> blocked" % ( +# self.state.number, repr(c)) +# return 0 + +# def save_for_backup(self): +# action = self.state.get_action() +# if action: +# if self.trace: +# print "Scanner: read: Saving backup point at", self.cur_pos +# self.backup_state = ( +# action, self.cur_pos, self.cur_line, self.cur_line_start, +# self.cur_char, self.input_state, self.next_pos) + +# def back_up(self): +# backup_state = self.backup_state +# if backup_state: +# (action, self.cur_pos, self.cur_line, self.cur_line_start, +# self.cur_char, self.input_state, self.next_pos) = backup_state +# if self.trace: +# print "Scanner: read: Backing up to", self.cur_pos +# return action +# else: +# return None + + def next_char(self): + input_state = self.input_state + if self.trace: + print "Scanner: next:", " "*20, "[%d] %d" % (input_state, self.cur_pos), + if input_state == 1: + self.cur_pos = self.next_pos + c = self.read_char() + if c == '\n': + self.cur_char = EOL + self.input_state = 2 + elif not c: + self.cur_char = EOL + self.input_state = 4 + else: + self.cur_char = c + elif input_state == 2: + self.cur_char = '\n' + self.input_state = 3 + elif input_state == 3: + self.cur_line = self.cur_line + 1 + self.cur_line_start = self.cur_pos = self.next_pos + self.cur_char = BOL + self.input_state = 1 + elif input_state == 4: + self.cur_char = EOF + self.input_state = 5 + else: # input_state = 5 + self.cur_char = '' + if self.trace: + print "--> [%d] %d %s" % (input_state, self.cur_pos, repr(self.cur_char)) + +# def read_char(self): +# """ +# Get the next input character, filling the buffer if necessary. +# Returns '' at end of file. +# """ +# next_pos = self.next_pos +# buf_index = next_pos - self.buf_start_pos +# if buf_index == len(self.buffer): +# discard = self.start_pos - self.buf_start_pos +# data = self.stream.read(0x1000) +# self.buffer = self.buffer[discard:] + data +# self.buf_start_pos = self.buf_start_pos + discard +# buf_index = buf_index - discard +# if not data: +# return '' +# c = self.buffer[buf_index] +# self.next_pos = next_pos + 1 +# return c + + def position(self): + """ + Return a tuple (name, line, col) representing the location of + the last token read using the read() method. |name| is the + name that was provided to the Scanner constructor; |line| + is the line number in the stream (1-based); |col| is the + position within the line of the first character of the token + (0-based). + """ + return (self.name, self.start_line, self.start_col) + + def begin(self, state_name): + """Set the current state of the scanner to the named state.""" + self.initial_state = ( + self.lexicon.get_initial_state(state_name)) + self.state_name = state_name + + def produce(self, value, text = None): + """ + Called from an action procedure, causes |value| to be returned + as the token value from read(). If |text| is supplied, it is + returned in place of the scanned text. + + produce() can be called more than once during a single call to an action + procedure, in which case the tokens are queued up and returned one + at a time by subsequent calls to read(), until the queue is empty, + whereupon scanning resumes. + """ + if text is None: + text = self.text + self.queue.append((value, text)) + + def eof(self): + """ + Override this method if you want something to be done at + end of file. + """ + +# For backward compatibility: +setattr(Scanner, "yield", Scanner.produce) diff --git a/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Timing.py b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Timing.py new file mode 100755 index 00000000..f47c5c89 --- /dev/null +++ b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Timing.py @@ -0,0 +1,22 @@ +# +# Get time in platform-dependent way +# + +import os +from sys import platform, exit, stderr + +if platform == 'mac': + import MacOS + def time(): + return MacOS.GetTicks() / 60.0 + timekind = "real" +elif hasattr(os, 'times'): + def time(): + t = os.times() + return t[0] + t[1] + timekind = "cpu" +else: + stderr.write( + "Don't know how to get time on platform %s\n" % repr(platform)) + exit(1) + diff --git a/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Traditional.py b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Traditional.py new file mode 100755 index 00000000..b3148c1e --- /dev/null +++ b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Traditional.py @@ -0,0 +1,154 @@ +#======================================================================= +# +# Python Lexical Analyser +# +# Traditional Regular Expression Syntax +# +#======================================================================= + +from Regexps import * +from Errors import PlexError + +class RegexpSyntaxError(PlexError): + pass + +def re(s): + """ + Convert traditional string representation of regular expression |s| + into Plex representation. + """ + return REParser(s).parse_re() + +class REParser: + + def __init__(self, s): + self.s = s + self.i = -1 + self.end = 0 + self.next() + + def parse_re(self): + re = self.parse_alt() + if not self.end: + self.error("Unexpected %s" % repr(self.c)) + return re + + def parse_alt(self): + """Parse a set of alternative regexps.""" + re = self.parse_seq() + if self.c == '|': + re_list = [re] + while self.c == '|': + self.next() + re_list.append(self.parse_seq()) + re = apply(Alt, tuple(re_list)) + return re + + def parse_seq(self): + """Parse a sequence of regexps.""" + re_list = [] + while not self.end and not self.c in "|)": + re_list.append(self.parse_mod()) + return apply(Seq, tuple(re_list)) + + def parse_mod(self): + """Parse a primitive regexp followed by *, +, ? modifiers.""" + re = self.parse_prim() + while not self.end and self.c in "*+?": + if self.c == '*': + re = Rep(re) + elif self.c == '+': + re = Rep1(re) + else: # self.c == '?' + re = Opt(re) + self.next() + return re + + def parse_prim(self): + """Parse a primitive regexp.""" + c = self.get() + if c == '.': + re = AnyBut("\n") + elif c == '^': + re = Bol + elif c == '$': + re = Eol + elif c == '(': + re = self.parse_alt() + self.expect(')') + elif c == '[': + re = self.parse_charset() + self.expect(']') + else: + if c == '\\': + c = self.get() + re = Char(c) + return re + + def parse_charset(self): + """Parse a charset. Does not include the surrounding [].""" + char_list = [] + invert = 0 + if self.c == '^': + invert = 1 + self.next() + if self.c == ']': + char_list.append(']') + self.next() + while not self.end and self.c <> ']': + c1 = self.get() + if self.c == '-' and self.lookahead(1) <> ']': + self.next() + c2 = self.get() + for a in xrange(ord(c1), ord(c2) + 1): + char_list.append(chr(a)) + else: + char_list.append(c1) + chars = string.join(char_list, "") + if invert: + return AnyBut(chars) + else: + return Any(chars) + + def next(self): + """Advance to the next char.""" + s = self.s + i = self.i = self.i + 1 + if i < len(s): + self.c = s[i] + else: + self.c = '' + self.end = 1 + + def get(self): + if self.end: + self.error("Premature end of string") + c = self.c + self.next() + return c + + def lookahead(self, n): + """Look ahead n chars.""" + j = self.i + n + if j < len(self.s): + return self.s[j] + else: + return '' + + def expect(self, c): + """ + Expect to find character |c| at current position. + Raises an exception otherwise. + """ + if self.c == c: + self.next() + else: + self.error("Missing %s" % repr(c)) + + def error(self, mess): + """Raise exception to signal syntax error in regexp.""" + raise RegexpSyntaxError("Syntax error in regexp %s at position %d: %s" % ( + repr(self.s), self.i, mess)) + + + diff --git a/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Transitions.py b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Transitions.py new file mode 100755 index 00000000..c1edd5ef --- /dev/null +++ b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/Transitions.py @@ -0,0 +1,253 @@ +# +# Plex - Transition Maps +# +# This version represents state sets direcly as dicts +# for speed. +# + +from copy import copy +import string +from sys import maxint +from types import TupleType + +class TransitionMap: + """ + A TransitionMap maps an input event to a set of states. + An input event is one of: a range of character codes, + the empty string (representing an epsilon move), or one + of the special symbols BOL, EOL, EOF. + + For characters, this implementation compactly represents + the map by means of a list: + + [code_0, states_0, code_1, states_1, code_2, states_2, + ..., code_n-1, states_n-1, code_n] + + where |code_i| is a character code, and |states_i| is a + set of states corresponding to characters with codes |c| + in the range |code_i| <= |c| <= |code_i+1|. + + The following invariants hold: + n >= 1 + code_0 == -maxint + code_n == maxint + code_i < code_i+1 for i in 0..n-1 + states_0 == states_n-1 + + Mappings for the special events '', BOL, EOL, EOF are + kept separately in a dictionary. + """ + + map = None # The list of codes and states + special = None # Mapping for special events + + def __init__(self, map = None, special = None): + if not map: + map = [-maxint, {}, maxint] + if not special: + special = {} + self.map = map + self.special = special + #self.check() ### + + def add(self, event, new_state, + TupleType = TupleType): + """ + Add transition to |new_state| on |event|. + """ + if type(event) == TupleType: + code0, code1 = event + i = self.split(code0) + j = self.split(code1) + map = self.map + while i < j: + map[i + 1][new_state] = 1 + i = i + 2 + else: + self.get_special(event)[new_state] = 1 + + def add_set(self, event, new_set, + TupleType = TupleType): + """ + Add transitions to the states in |new_set| on |event|. + """ + if type(event) == TupleType: + code0, code1 = event + i = self.split(code0) + j = self.split(code1) + map = self.map + while i < j: + map[i + 1].update(new_set) + i = i + 2 + else: + self.get_special(event).update(new_set) + + def get_epsilon(self, + none = None): + """ + Return the mapping for epsilon, or None. + """ + return self.special.get('', none) + + def items(self, + len = len): + """ + Return the mapping as a list of ((code1, code2), state_set) and + (special_event, state_set) pairs. + """ + result = [] + map = self.map + else_set = map[1] + i = 0 + n = len(map) - 1 + code0 = map[0] + while i < n: + set = map[i + 1] + code1 = map[i + 2] + if set or else_set: + result.append(((code0, code1), set)) + code0 = code1 + i = i + 2 + for event, set in self.special.items(): + if set: + result.append((event, set)) + return result + + # ------------------- Private methods -------------------- + + def split(self, code, + len = len, maxint = maxint): + """ + Search the list for the position of the split point for |code|, + inserting a new split point if necessary. Returns index |i| such + that |code| == |map[i]|. + """ + # We use a funky variation on binary search. + map = self.map + hi = len(map) - 1 + # Special case: code == map[-1] + if code == maxint: + return hi + # General case + lo = 0 + # loop invariant: map[lo] <= code < map[hi] and hi - lo >= 2 + while hi - lo >= 4: + # Find midpoint truncated to even index + mid = ((lo + hi) / 2) & ~1 + if code < map[mid]: + hi = mid + else: + lo = mid + # map[lo] <= code < map[hi] and hi - lo == 2 + if map[lo] == code: + return lo + else: + map[hi:hi] = [code, map[hi - 1].copy()] + #self.check() ### + return hi + + def get_special(self, event): + """ + Get state set for special event, adding a new entry if necessary. + """ + special = self.special + set = special.get(event, None) + if not set: + set = {} + special[event] = set + return set + + # --------------------- Conversion methods ----------------------- + + def __str__(self): + map_strs = [] + map = self.map + n = len(map) + i = 0 + while i < n: + code = map[i] + if code == -maxint: + code_str = "-inf" + elif code == maxint: + code_str = "inf" + else: + code_str = str(code) + map_strs.append(code_str) + i = i + 1 + if i < n: + map_strs.append(state_set_str(map[i])) + i = i + 1 + special_strs = {} + for event, set in self.special.items(): + special_strs[event] = state_set_str(set) + return "[%s]+%s" % ( + string.join(map_strs, ","), + special_strs + ) + + # --------------------- Debugging methods ----------------------- + + def check(self): + """Check data structure integrity.""" + if not self.map[-3] < self.map[-1]: + print self + assert 0 + + def dump(self, file): + map = self.map + i = 0 + n = len(map) - 1 + while i < n: + self.dump_range(map[i], map[i + 2], map[i + 1], file) + i = i + 2 + for event, set in self.special.items(): + if set: + if not event: + event = 'empty' + self.dump_trans(event, set, file) + + def dump_range(self, code0, code1, set, file): + if set: + if code0 == -maxint: + if code1 == maxint: + k = "any" + else: + k = "< %s" % self.dump_char(code1) + elif code1 == maxint: + k = "> %s" % self.dump_char(code0 - 1) + elif code0 == code1 - 1: + k = self.dump_char(code0) + else: + k = "%s..%s" % (self.dump_char(code0), + self.dump_char(code1 - 1)) + self.dump_trans(k, set, file) + + def dump_char(self, code): + if 0 <= code <= 255: + return repr(chr(code)) + else: + return "chr(%d)" % code + + def dump_trans(self, key, set, file): + file.write(" %s --> %s\n" % (key, self.dump_set(set))) + + def dump_set(self, set): + return state_set_str(set) + +# +# State set manipulation functions +# + +#def merge_state_sets(set1, set2): +# for state in set2.keys(): +# set1[state] = 1 + +def state_set_str(set): + state_list = set.keys() + str_list = [] + for state in state_list: + str_list.append("S%d" % state.number) + return "[%s]" % string.join(str_list, ",") + + + diff --git a/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/__init__.py b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/__init__.py new file mode 100755 index 00000000..22b9bba3 --- /dev/null +++ b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/__init__.py @@ -0,0 +1,40 @@ +#======================================================================= +# +# Python Lexical Analyser +# +#======================================================================= + +""" +The Plex module provides lexical analysers with similar capabilities +to GNU Flex. The following classes and functions are exported; +see the attached docstrings for more information. + + Scanner For scanning a character stream under the + direction of a Lexicon. + + Lexicon For constructing a lexical definition + to be used by a Scanner. + + Str, Any, AnyBut, AnyChar, Seq, Alt, Opt, Rep, Rep1, + Bol, Eol, Eof, Empty + + Regular expression constructors, for building pattern + definitions for a Lexicon. + + State For defining scanner states when creating a + Lexicon. + + TEXT, IGNORE, Begin + + Actions for associating with patterns when + creating a Lexicon. +""" + +from Actions import TEXT, IGNORE, Begin +from Lexicons import Lexicon, State +from Regexps import RE, Seq, Alt, Rep1, Empty, Str, Any, AnyBut, AnyChar, Range +from Regexps import Opt, Rep, Bol, Eol, Eof, Case, NoCase +from Scanners import Scanner + + + diff --git a/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/test_tm.py b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/test_tm.py new file mode 100755 index 00000000..e08c9e68 --- /dev/null +++ b/debian/pyrex/pyrex-0.9.9/Pyrex/Plex/test_tm.py @@ -0,0 +1,24 @@ +import sys +sys.stderr = sys.stdout + +from TransitionMaps import TransitionMap + +m = TransitionMap() +print m + +def add(c, s): + print + print "adding", repr(c), "-->", repr(s) + m.add_transition(c, s) + print m + print "keys:", m.keys() + +add('a','alpha') +add('e', 'eta') +add('f', 'foo') +add('i', 'iota') +add('i', 'imp') +add('eol', 'elephant') + + + |