204 lines
7.0 KiB
Python
204 lines
7.0 KiB
Python
|
"""
|
||
|
babel.messages.jslexer
|
||
|
~~~~~~~~~~~~~~~~~~~~~~
|
||
|
|
||
|
A simple JavaScript 1.5 lexer which is used for the JavaScript
|
||
|
extractor.
|
||
|
|
||
|
:copyright: (c) 2013-2023 by the Babel Team.
|
||
|
:license: BSD, see LICENSE for more details.
|
||
|
"""
|
||
|
from __future__ import annotations
|
||
|
|
||
|
import re
|
||
|
from collections.abc import Generator
|
||
|
from typing import NamedTuple
|
||
|
|
||
|
# Fixed-spelling operators and punctuators recognised by the lexer.  ``/`` and
# ``/=`` are deliberately absent: they are matched by `division_re`.
operators: list[str] = [
    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':',
]
# Longest spellings first (stable for equal lengths), so that a regex
# alternation built from this list prefers e.g. ``>>>=`` over ``>``.
operators.sort(key=len, reverse=True)
|
||
|
|
||
|
# Single-letter escape codes (the character following a backslash) mapped to
# the control character each one stands for.
escapes: dict[str, str] = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

# A plain identifier: one leading word character, ``$`` or ``_``, followed by
# any number of the same.
# NOTE(review): ``\w`` also matches digits, so the leading character class
# accepts a digit as well -- confirm whether that is intended.
name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
# Like `name_re`, but ``.`` is also allowed after the first character; the
# pattern needs at least two characters to match.
dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_.]', re.UNICODE)
# A single ``/``, optionally followed by ``=``.
division_re = re.compile(r'/=?')
# A regular expression literal: ``/``, a body in which a backslash escapes the
# next character (newlines included, because of DOTALL), a closing ``/`` and
# optional ASCII-letter flags.
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
# One line terminator (CRLF, LF or CR), captured as group 1.
line_re = re.compile(r'(\r\n|\n|\r)')
# A backslash immediately followed by a line terminator; group 1 is the
# terminator on its own.
line_join_re = re.compile(r'\\' + line_re.pattern)
# Between one and four hexadecimal digits.
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
# One or two hexadecimal digits.
hex_escape_re = re.compile(r'[a-fA-F0-9]{1,2}')
|
||
|
|
||
|
|
||
|
class Token(NamedTuple):
    """A single lexical token yielded by `tokenize`."""

    # Token category, e.g. 'name', 'operator', 'string' or 'regexp'.
    type: str
    # The exact source text that was matched for this token.
    value: str
    # Line number in effect at the position where the token starts.
    lineno: int
|
||
|
|
||
|
|
||
|
# Ordered tokenization rules as ``(token_type, pattern)`` pairs.  `tokenize`
# tries them from top to bottom at the current position and uses the first
# pattern that matches, so the order is significant.  A token type of ``None``
# marks text that is consumed without producing a token.
_rules: list[tuple[str | None, re.Pattern[str]]] = [
    # Whitespace (including newlines) is skipped.
    (None, re.compile(r'\s+', re.UNICODE)),
    # ``<!--`` up to the end of the line is skipped.
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
    # Must precede 'name' so that ``a.b.c`` is taken as one token when enabled.
    ('dotted_name', dotted_name_re),
    ('name', name_re),
    # NOTE(review): every alternative below starts with a digit, which the
    # name patterns above also accept as a first character (``\w``), so this
    # rule looks unreachable -- confirm before relying on 'number' tokens.
    ('number', re.compile(r'''(
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)? |
        (0x[a-fA-F0-9]+)
    )''', re.VERBOSE)),
    ('jsx_tag', re.compile(r'(?:</?[^>\s]+|/>)', re.I)),  # May be mangled in `get_rules`
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    # DOTALL lets ``\\.`` match a backslash followed by a newline (a line
    # continuation), exactly as the 'string' rule below does; without it a
    # template literal containing one is not recognised at all.
    ('template_string', re.compile(r'''`(?:[^`\\]*(?:\\.[^`\\]*)*)`''', re.UNICODE | re.DOTALL)),
    ('string', re.compile(r'''(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)'  |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )''', re.VERBOSE | re.DOTALL)),
]
|
||
|
|
||
|
|
||
|
def get_rules(jsx: bool, dotted: bool, template_string: bool) -> list[tuple[str | None, re.Pattern[str]]]:
    """
    Select the tokenization rules that apply for the given syntax options.

    Internal to this module.

    :param jsx: keep rules whose token type contains ``'jsx'``.
    :param dotted: keep the ``dotted_name`` rule; it is emitted under the
                   token type ``'name'``.
    :param template_string: keep the ``template_string`` rule.
    :return: the selected ``(token_type, pattern)`` pairs, in `_rules` order.
    """
    selected: list[tuple[str | None, re.Pattern[str]]] = []
    for kind, pattern in _rules:
        is_jsx_rule = kind is not None and 'jsx' in kind
        if is_jsx_rule and not jsx:
            continue
        if kind == 'template_string' and not template_string:
            continue
        if kind == 'dotted_name':
            if not dotted:
                continue
            # Dotted names are reported to callers as ordinary names.
            kind = 'name'
        selected.append((kind, pattern))
    return selected
|
||
|
|
||
|
|
||
|
def indicates_division(token: Token) -> bool:
    """Tell whether a ``/`` directly after *token* is a division operator.

    The tokenizer uses this to decide whether a following ``/`` starts a
    division operator or a regular expression literal.

    :param token: the most recently produced token.
    :return: ``True`` if *token* can end an expression (closing bracket,
             ``++``/``--``, or a name, number, string, template string or
             regexp token), ``False`` otherwise.
    """
    if token.type == 'operator':
        return token.value in (')', ']', '}', '++', '--')
    # A template string is a complete expression just like a quoted string,
    # so a following ``/`` can only be a division.
    return token.type in ('name', 'number', 'string', 'template_string', 'regexp')
|
||
|
|
||
|
|
||
|
def unquote_string(string: str) -> str:
    r"""Unquote a string with JavaScript rules.  The string has to start with
    string delimiters (``'``, ``"`` or the back-tick/grave accent (for template strings).)

    Processing steps:

    * the surrounding delimiters are removed;
    * a backslash followed by a line terminator (line continuation) is
      reduced to the terminator alone;
    * ``\b``, ``\f``, ``\n``, ``\r`` and ``\t`` become the matching control
      character;
    * ``\u`` followed by exactly four hex digits becomes that code unit; with
      fewer digits the backslash is dropped and the rest is kept verbatim;
    * ``\x`` followed by one or two hex digits becomes that character;
    * any other escaped character is kept without its backslash;
    * a lone backslash at the very end (nothing left to escape) is kept as is;
    * UTF-16 surrogate pairs produced by ``\u`` escapes are merged into the
      single code point they encode.

    :param string: the quoted literal, including its delimiters.
    :return: the unquoted text.
    :raises AssertionError: if *string* is empty or not delimited by matching
                            quote characters.
    """
    assert string and string[0] == string[-1] and string[0] in '"\'`', \
        'string provided is not properly delimited'
    string = line_join_re.sub('\\1', string[1:-1])
    length = len(string)
    result: list[str] = []
    add = result.append
    pos = 0

    while True:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # A backslash as the very last character has nothing to escape;
        # keep it literally instead of indexing past the end.
        if escape_pos + 1 >= length:
            add('\\')
            pos = length
            break

        # check which character is escaped
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])

        # unicode escapes.  Try to consume up to four hexadecimal characters
        # and interpret them as a unicode code unit.  If fewer than four are
        # available, put the consumed characters into the string verbatim.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    # Four hex digits are at most 0xFFFF, which `chr` always accepts.
                    add(chr(int(escaped_value, 16)))
                    pos = escape_pos + 6
                    continue
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            add(next_char)

        # hex escapes. conversion from 2-digits hex to char is infallible
        elif next_char in 'xX':
            escaped = hex_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                add(chr(int(escaped_value, 16)))
                pos = escape_pos + 2 + len(escaped_value)
                continue
            add(next_char)

        # bogus escape.  Just remove the backslash.
        else:
            add(next_char)
        pos = escape_pos + 2

    if pos < length:
        add(string[pos:])

    text = ''.join(result)
    # JavaScript strings are UTF-16, so characters outside the BMP are written
    # as two ``\u`` escapes.  Merge such pairs into one code point; unpaired
    # surrogates pass through unchanged thanks to ``surrogatepass``.
    if any('\ud800' <= char <= '\udfff' for char in text):
        text = text.encode('utf-16-le', 'surrogatepass').decode('utf-16-le', 'surrogatepass')
    return text
|
||
|
|
||
|
|
||
|
def tokenize(source: str, jsx: bool = True, dotted: bool = True, template_string: bool = True, lineno: int = 1) -> Generator[Token, None, None]:
    """
    Tokenize JavaScript/JSX source.  Returns a generator of tokens.

    Text that no rule recognises is skipped one character at a time, so
    invalid input never raises; it merely yields fewer tokens.

    :param source: the source text to tokenize.
    :param jsx: Enable (limited) JSX parsing.
    :param dotted: Read dotted names as single name token.
    :param template_string: Support ES6 template strings
    :param lineno: starting line number (optional)
    :return: a generator of `Token` tuples; each token's ``lineno`` is the
             line on which the token starts.
    """
    may_divide = False
    pos = 0
    end = len(source)
    rules = get_rules(jsx=jsx, dotted=dotted, template_string=template_string)

    while pos < end:
        # handle regular rules first
        for token_type, rule in rules:  # noqa: B007
            match = rule.match(source, pos)
            if match is not None:
                break
        # if we don't have a match we don't give up yet, but check for
        # division operators or regular expression literals, based on
        # the status of `may_divide` which is determined by the last
        # processed significant token using `indicates_division`.
        else:
            if may_divide:
                match = division_re.match(source, pos)
                token_type = 'operator'
            else:
                match = regex_re.match(source, pos)
                token_type = 'regexp'
            if match is None:
                # woops. invalid syntax. jump one char ahead and try again.
                pos += 1
                continue

        token_value = match.group()
        if token_type is not None:
            token = Token(token_type, token_value, lineno)
            # Comments are as transparent as whitespace for the
            # division-vs-regex decision: in ``a /* c */ / b`` the second
            # slash is still a division, so keep the previous state.
            if token_type not in ('linecomment', 'multilinecomment'):
                may_divide = indicates_division(token)
            yield token
        # Count line terminators inside the consumed text (whitespace,
        # multi-line comments, strings) to keep `lineno` current.
        lineno += len(line_re.findall(token_value))
        pos = match.end()
|