# cython: language_level=3str # cython: auto_pickle=False """ Python Lexical Analyser Scanning an input stream """ from __future__ import absolute_import import cython cython.declare(BOL=object, EOL=object, EOF=object, NOT_FOUND=object) # noqa:E402 from . import Errors from .Regexps import BOL, EOL, EOF NOT_FOUND = object() class Scanner(object): """ A Scanner is used to read tokens from a stream of characters using the token set specified by a Plex.Lexicon. Constructor: Scanner(lexicon, stream, name = '') See the docstring of the __init__ method for details. Methods: See the docstrings of the individual methods for more information. read() --> (value, text) Reads the next lexical token from the stream. position() --> (name, line, col) Returns the position of the last token read using the read() method. begin(state_name) Causes scanner to change state. produce(value [, text]) Causes return of a token value to the caller of the Scanner. """ # lexicon = None # Lexicon # stream = None # file-like object # name = '' # buffer = '' # # These positions are used by the scanner to track its internal state: # buf_start_pos = 0 # position in input of start of buffer # next_pos = 0 # position in input of next char to read # cur_pos = 0 # position in input of current char # cur_line = 1 # line number of current char # cur_line_start = 0 # position in input of start of current line # start_pos = 0 # position in input of start of token # current_scanner_position_tuple = ("", 0, 0) # tuple of filename, line number and position in line, really mainly for error reporting # # These positions are used to track what was read from the queue # (which may differ from the internal state when tokens are replaced onto the queue) # last_token_position_tuple = ("", 0, 0) # tuple of filename, line number and position in line # text = None # text of last token read # initial_state = None # Node # state_name = '' # Name of initial state # queue = None # list of tokens and positions to be returned # trace = 0 def __init__(self, lexicon, stream, name='', initial_pos=None): """ Scanner(lexicon, stream, name = '') |lexicon| is a Plex.Lexicon instance specifying the lexical tokens to be recognised. |stream| can be a file object or anything which implements a compatible read() method. |name| is optional, and may be the name of the file being scanned or any other identifying string. """ self.trace = 0 self.buffer = u'' self.buf_start_pos = 0 self.next_pos = 0 self.cur_pos = 0 self.cur_line = 1 self.start_pos = 0 self.current_scanner_position_tuple = ("", 0, 0) self.last_token_position_tuple = ("", 0, 0) self.text = None self.state_name = None self.lexicon = lexicon self.stream = stream self.name = name self.queue = [] self.initial_state = None self.begin('') self.next_pos = 0 self.cur_pos = 0 self.cur_line_start = 0 self.cur_char = BOL self.input_state = 1 if initial_pos is not None: self.cur_line, self.cur_line_start = initial_pos[1], -initial_pos[2] def read(self): """ Read the next lexical token from the stream and return a tuple (value, text), where |value| is the value associated with the token as specified by the Lexicon, and |text| is the actual string read from the stream. Returns (None, '') on end of file. """ queue = self.queue while not queue: self.text, action = self.scan_a_token() if action is None: self.produce(None) self.eof() else: value = action.perform(self, self.text) if value is not None: self.produce(value) result, self.last_token_position_tuple = queue[0] del queue[0] return result def unread(self, token, value, position): self.queue.insert(0, ((token, value), position)) def get_current_scan_pos(self): # distinct from the position of the last token due to the queue return self.current_scanner_position_tuple def scan_a_token(self): """ Read the next input sequence recognised by the machine and return (text, action). Returns ('', None) on end of file. """ self.start_pos = self.cur_pos self.current_scanner_position_tuple = ( self.name, self.cur_line, self.cur_pos - self.cur_line_start ) action = self.run_machine_inlined() if action is not None: if self.trace: print("Scanner: read: Performing %s %d:%d" % ( action, self.start_pos, self.cur_pos)) text = self.buffer[ self.start_pos - self.buf_start_pos: self.cur_pos - self.buf_start_pos] return (text, action) else: if self.cur_pos == self.start_pos: if self.cur_char is EOL: self.next_char() if self.cur_char is None or self.cur_char is EOF: return (u'', None) raise Errors.UnrecognizedInput(self, self.state_name) def run_machine_inlined(self): """ Inlined version of run_machine for speed. """ state = self.initial_state cur_pos = self.cur_pos cur_line = self.cur_line cur_line_start = self.cur_line_start cur_char = self.cur_char input_state = self.input_state next_pos = self.next_pos buffer = self.buffer buf_start_pos = self.buf_start_pos buf_len = len(buffer) b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \ None, 0, 0, 0, u'', 0, 0 trace = self.trace while 1: if trace: print("State %d, %d/%d:%s -->" % ( state['number'], input_state, cur_pos, repr(cur_char))) # Begin inlined self.save_for_backup() action = state['action'] if action is not None: b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos = \ action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos # End inlined self.save_for_backup() c = cur_char new_state = state.get(c, NOT_FOUND) if new_state is NOT_FOUND: new_state = c and state.get('else') if new_state: if trace: print("State %d" % new_state['number']) state = new_state # Begin inlined: self.next_char() if input_state == 1: cur_pos = next_pos # Begin inlined: c = self.read_char() buf_index = next_pos - buf_start_pos if buf_index < buf_len: c = buffer[buf_index] next_pos += 1 else: discard = self.start_pos - buf_start_pos data = self.stream.read(0x1000) buffer = self.buffer[discard:] + data self.buffer = buffer buf_start_pos += discard self.buf_start_pos = buf_start_pos buf_len = len(buffer) buf_index -= discard if data: c = buffer[buf_index] next_pos += 1 else: c = u'' # End inlined: c = self.read_char() if c == u'\n': cur_char = EOL input_state = 2 elif not c: cur_char = EOL input_state = 4 else: cur_char = c elif input_state == 2: # after EoL (1) -> BoL (3) cur_char = u'\n' input_state = 3 elif input_state == 3: # start new code line cur_line += 1 cur_line_start = cur_pos = next_pos cur_char = BOL input_state = 1 elif input_state == 4: # after final line (1) -> EoF (5) cur_char = EOF input_state = 5 else: # input_state == 5 (EoF) cur_char = u'' # End inlined self.next_char() else: # not new_state if trace: print("blocked") # Begin inlined: action = self.back_up() if b_action is not None: (action, cur_pos, cur_line, cur_line_start, cur_char, input_state, next_pos) = \ (b_action, b_cur_pos, b_cur_line, b_cur_line_start, b_cur_char, b_input_state, b_next_pos) else: action = None break # while 1 # End inlined: action = self.back_up() self.cur_pos = cur_pos self.cur_line = cur_line self.cur_line_start = cur_line_start self.cur_char = cur_char self.input_state = input_state self.next_pos = next_pos if trace: if action is not None: print("Doing %s" % action) return action def next_char(self): input_state = self.input_state if self.trace: print("Scanner: next: %s [%d] %d" % (" " * 20, input_state, self.cur_pos)) if input_state == 1: self.cur_pos = self.next_pos c = self.read_char() if c == u'\n': self.cur_char = EOL self.input_state = 2 elif not c: self.cur_char = EOL self.input_state = 4 else: self.cur_char = c elif input_state == 2: self.cur_char = u'\n' self.input_state = 3 elif input_state == 3: self.cur_line += 1 self.cur_line_start = self.cur_pos = self.next_pos self.cur_char = BOL self.input_state = 1 elif input_state == 4: self.cur_char = EOF self.input_state = 5 else: # input_state = 5 self.cur_char = u'' if self.trace: print("--> [%d] %d %r" % (input_state, self.cur_pos, self.cur_char)) def position(self): """ Return a tuple (name, line, col) representing the location of the last token read using the read() method. |name| is the name that was provided to the Scanner constructor; |line| is the line number in the stream (1-based); |col| is the position within the line of the first character of the token (0-based). """ return self.last_token_position_tuple def get_position(self): """ Python accessible wrapper around position(), only for error reporting. """ return self.position() def begin(self, state_name): """Set the current state of the scanner to the named state.""" self.initial_state = ( self.lexicon.get_initial_state(state_name)) self.state_name = state_name def produce(self, value, text=None): """ Called from an action procedure, causes |value| to be returned as the token value from read(). If |text| is supplied, it is returned in place of the scanned text. produce() can be called more than once during a single call to an action procedure, in which case the tokens are queued up and returned one at a time by subsequent calls to read(), until the queue is empty, whereupon scanning resumes. """ if text is None: text = self.text self.queue.append(((value, text), self.current_scanner_position_tuple)) def eof(self): """ Override this method if you want something to be done at end of file. """ pass @property def start_line(self): return self.last_token_position_tuple[1]