158 lines
4 KiB
Python
158 lines
4 KiB
Python
#=======================================================================
#
#   Python Lexical Analyser
#
#   Traditional Regular Expression Syntax
#
#=======================================================================
|
|
|
|
from __future__ import absolute_import
|
|
|
|
from .Regexps import Alt, Seq, Rep, Rep1, Opt, Any, AnyBut, Bol, Eol, Char
|
|
from .Errors import PlexError
|
|
|
|
|
|
class RegexpSyntaxError(PlexError):
    """Error raised when a traditional-syntax regexp string cannot be parsed."""
|
|
|
|
|
|
def re(s):
    """
    Parse |s|, a regular expression written in traditional string
    syntax, and return its Plex representation.

    The work is delegated to REParser; whatever REParser.parse_re()
    returns or raises is passed through unchanged.
    """
    parser = REParser(s)
    return parser.parse_re()
|
|
|
|
|
|
class REParser(object):
    """
    Recursive-descent parser for traditional regexp strings.

    Cursor state kept on the instance:
      s   -- the string being parsed
      i   -- index of the current character
      c   -- the current character, or '' once the input is exhausted
      end -- 0 while characters remain, 1 after running off the end
    """

    def __init__(self, s):
        """Remember |s| and position the cursor on its first character."""
        self.s = s
        self.i = -1
        self.end = 0
        # Load the first character (or flag end-of-input for an empty string).
        self.next()

    def parse_re(self):
        """Parse the whole string; report any unconsumed trailing input."""
        parsed = self.parse_alt()
        if self.end:
            return parsed
        # parse_alt stopped early, e.g. on an unmatched ')'.
        self.error("Unexpected %s" % repr(self.c))
        return parsed

    def parse_alt(self):
        """Parse one or more sequences separated by '|'."""
        first = self.parse_seq()
        if self.c != '|':
            # No alternation: hand back the single sequence as-is.
            return first
        branches = [first]
        while self.c == '|':
            self.next()
            branches.append(self.parse_seq())
        return Alt(*branches)

    def parse_seq(self):
        """Parse consecutive modified primitives up to '|', ')' or the end."""
        items = []
        while not (self.end or self.c in "|)"):
            items.append(self.parse_mod())
        return Seq(*items)

    def parse_mod(self):
        """Parse a primitive and apply any trailing *, +, ? modifiers."""
        node = self.parse_prim()
        while not self.end and self.c in "*+?":
            # '*' -> Rep, '+' -> Rep1, anything else here is '?' -> Opt.
            wrap = {'*': Rep, '+': Rep1}.get(self.c, Opt)
            node = wrap(node)
            self.next()
        return node

    def parse_prim(self):
        """Parse a single primitive: group, charset, special or literal."""
        ch = self.get()
        if ch == '(':
            group = self.parse_alt()
            self.expect(')')
            return group
        if ch == '[':
            charset = self.parse_charset()
            self.expect(']')
            return charset
        if ch == '.':
            return AnyBut("\n")
        if ch == '^':
            return Bol
        if ch == '$':
            return Eol
        if ch == '\\':
            # Backslash: the following character is taken literally.
            ch = self.get()
        return Char(ch)

    def parse_charset(self):
        """Parse the inside of a [...] charset; the brackets are excluded."""
        members = []
        negated = self.c == '^'
        if negated:
            self.next()
        if self.c == ']':
            # A ']' in first position is a literal member, not the closer.
            members.append(']')
            self.next()
        while not (self.end or self.c == ']'):
            low = self.get()
            if self.c == '-' and self.lookahead(1) != ']':
                # Range "low-high": skip the '-' and expand inclusively.
                self.next()
                high = self.get()
                members.extend(
                    chr(code) for code in range(ord(low), ord(high) + 1))
            else:
                members.append(low)
        chars = ''.join(members)
        return AnyBut(chars) if negated else Any(chars)

    def next(self):
        """Move the cursor forward one character, updating c and end."""
        pos = self.i + 1
        self.i = pos
        text = self.s
        if pos >= len(text):
            self.c = ''
            self.end = 1
        else:
            self.c = text[pos]

    def get(self):
        """Return the current character and advance past it.

        Calls error() if the input is already exhausted.
        """
        if self.end:
            self.error("Premature end of string")
        current = self.c
        self.next()
        return current

    def lookahead(self, n):
        """Return the character n positions past the cursor, or '' if none."""
        target = self.i + n
        return self.s[target] if target < len(self.s) else ''

    def expect(self, c):
        """
        Consume character |c| at the current position.
        Calls error() if the current character is something else.
        """
        if self.c != c:
            self.error("Missing %s" % repr(c))
            return
        self.next()

    def error(self, mess):
        """Raise RegexpSyntaxError describing |mess| at the cursor position."""
        details = (repr(self.s), self.i, mess)
        raise RegexpSyntaxError(
            "Syntax error in regexp %s at position %d: %s" % details)
|
|
|
|
|
|
|