# flake8: noqa
# mypy: ignore-errors
import sre_parse
import string
from intxeger.core import Choice, Concatenate, Constant, Group, GroupRef, Node, Repeat
from intxeger.optimize import optimize
# Ordered pools of printable ASCII characters for each regex category escape
# (\s, \S, \d, \D, \w, \W). The pools are assembled from ``string`` constants;
# the order of characters inside each pool is significant and kept stable.
CATEGORY_MAP = {
    sre_parse.CATEGORY_SPACE: string.whitespace,
    sre_parse.CATEGORY_NOT_SPACE: (
        string.digits + string.ascii_letters + string.punctuation
    ),
    sre_parse.CATEGORY_DIGIT: string.digits,
    sre_parse.CATEGORY_NOT_DIGIT: (
        string.ascii_letters + string.punctuation + string.whitespace
    ),
    sre_parse.CATEGORY_WORD: string.digits + string.ascii_letters + "_",
    # "_" counts as a word character, so it is removed from the punctuation.
    sre_parse.CATEGORY_NOT_WORD: (
        string.punctuation.replace("_", "") + string.whitespace
    ),
}
def _to_nodes(tokens, max_repeat):
    """Convert every ``(op, args)`` token in ``tokens`` to a node, in order."""
    return [
        _to_node(tok_op, tok_args, max_repeat) for tok_op, tok_args in tokens
    ]


def _to_node(op, args, max_repeat):
    """Convert a single ``sre_parse`` token into a node of the internal tree.

    Args:
        op: The ``sre_parse`` opcode of the token.
        args: The opcode-specific argument of the token.
        max_repeat: Upper bound substituted for an unbounded repetition
            (``sre_parse.MAXREPEAT``).

    Returns:
        The node representing the token. For the ``NEGATE`` marker the string
        ``"NEGATE"`` is returned instead of a node.

    Raises:
        ValueError: If ``op`` is not one of the opcodes handled below.
    """
    if op == sre_parse.IN:
        # A negated class ([^...]) is flagged by a leading NEGATE token; check
        # the raw token so the marker never ends up in a list of nodes.
        negated = args[0][0] == sre_parse.NEGATE
        members = _to_nodes(args[1:] if negated else args, max_repeat)
        if negated:
            # Enumerate every character the class would match, then keep the
            # printable characters that are *not* among them.
            excluded = [
                member[i] for member in members for i in range(member.length)
            ]
            members = [
                Constant(char)
                for char in string.printable
                if char not in excluded
            ]
        return Choice(members)
    elif op == sre_parse.RANGE:
        # args is an inclusive (low, high) pair of code points.
        low, high = args
        return Choice([Constant(chr(code)) for code in range(low, high + 1)])
    elif op == sre_parse.LITERAL:
        return Constant(chr(args))
    elif op == sre_parse.NEGATE:
        # Kept for callers that pass a NEGATE token directly.
        return "NEGATE"
    elif op == sre_parse.CATEGORY:
        return Choice([Constant(char) for char in CATEGORY_MAP[args]])
    elif op == sre_parse.ANY:
        # "." draws from all of string.printable.
        return Choice([Constant(char) for char in string.printable])
    elif op == sre_parse.ASSERT:
        # args[1] is the assertion's sub-pattern; it is converted and
        # concatenated in place.
        return Concatenate(_to_nodes(args[1], max_repeat))
    elif op == sre_parse.BRANCH:
        # args[1] is the list of alternatives; each one is a token sequence.
        return Choice(
            [
                Concatenate(_to_nodes(alternative, max_repeat))
                for alternative in args[1]
            ]
        )
    elif op == sre_parse.SUBPATTERN:
        # args[0] is the group id and args[3] is the group's sub-pattern.
        return Group(Concatenate(_to_nodes(args[3], max_repeat)), args[0])
    elif op == sre_parse.GROUPREF:
        return GroupRef(ref_id=args)
    elif op == sre_parse.MAX_REPEAT or op == sre_parse.MIN_REPEAT:
        min_count, max_count, repeated = args
        if max_count == sre_parse.MAXREPEAT:
            max_count = max_repeat
        # The repeated sub-pattern may hold several tokens: the parser unwraps
        # a flag-less non-capturing group, so "(?:ab)+" arrives here as the two
        # literals "a" and "b". Convert all of them, not just the first.
        parts = _to_nodes(repeated, max_repeat)
        body = parts[0] if len(parts) == 1 else Concatenate(parts)
        return Repeat(body, min_count, max_count)
    elif op == sre_parse.NOT_LITERAL:
        # A single-character negated class such as [^a].
        forbidden = chr(args)
        return Choice(
            [Constant(char) for char in string.printable if char != forbidden]
        )
    else:
        raise ValueError(f"{op} {args}")
def build(regex: str, use_optimization: bool = True, max_repeat: int = 10) -> Node:
    """Parse the regex and return the root node.

    This parses the regex into an internal tree structure and returns the root
    node; the root node can then be used to generate samples.

    Args:
        regex: The regular expression string.
        use_optimization: Whether to apply the optimization routine.
        max_repeat: The maximum number of repeats when using the zero-or-more
            (``*``) or one-or-more (``+``) operators in the regex.

    Returns:
        The root node of the tree.

    Raises:
        ValueError: If the regex contains a construct that ``_to_node`` does
            not handle.
    """
    # Every top-level token becomes one child of the root concatenation.
    tokens = sre_parse.parse(regex)
    root = Concatenate([_to_node(op, args, max_repeat) for op, args in tokens])
    if use_optimization:
        return optimize(root)
    return root