Skip to content

Commit

Permalink
Use simple equality string matching when possible (#14)
Browse files Browse the repository at this point in the history
The `compile_matcher` function now returns a `Callable`. This allows us
to return one of two possible implementations:

- `re.Pattern.fullmatch()`
- `str.__eq__`

This allows significantly faster compilation and matching for non-regex
patterns.
  • Loading branch information
etianen authored Jan 20, 2024
1 parent 3c8b091 commit 5e71d51
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 79 deletions.
43 changes: 31 additions & 12 deletions logot/_match.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
from __future__ import annotations

import re
from functools import partial
from typing import Callable

# Compiled matcher callable: takes the string to test and returns whether it matches.
Matcher = Callable[[str], bool]

# Regex matching a simplified conversion specifier: a "%" followed by a single non-newline character, or a "%" with
# nothing left to consume at the end of the string. Group 1 captures the conversion character (empty in the latter
# case).
_RE_CONVERSION = re.compile(r"%(.|$)")
Expand Down Expand Up @@ -34,17 +39,31 @@
}


def _compile_replace(match: re.Match[str]) -> str:
    """Return the `_CONVERSION_MAP` entry for the conversion character captured by *match*.

    Intended for use as the replacement callback of `_RE_CONVERSION.sub()`.

    Raises:
        ValueError: If the captured character (group 1) is not a key of `_CONVERSION_MAP`. The message reports the
            character and its index within the matched string.
    """
    conversion = match.group(1)
    try:
        replacement = _CONVERSION_MAP[conversion]
    except KeyError:
        # Suppress the internal `KeyError`; only the user-facing error is useful to the caller.
        raise ValueError(f"Unsupported format character {conversion!r} at index {match.start(1)}") from None
    else:
        return replacement
def _match_regex(pattern: re.Pattern[str], value: str) -> bool:
    """Return `True` if the whole of *value* matches *pattern*, otherwise `False`."""
    # `fullmatch()` returns a match object on success and `None` on failure.
    result = pattern.fullmatch(value)
    return result is not None


def compile(pattern: str) -> re.Pattern[str]:
    """Compile *pattern*, which may contain simplified "%" conversion specifiers, into a regex.

    The pattern is first regex-escaped, then each conversion specifier matched by `_RE_CONVERSION` is replaced via
    `_compile_replace`, and the result is compiled with `re.DOTALL`.

    Raises:
        ValueError: Propagated from `_compile_replace` for an unsupported conversion character.
    """
    # Escaping leaves the simplified conversion specifiers intact, so they can still be found afterwards.
    escaped = re.escape(pattern)
    # Swap every simplified conversion specifier for its regex matcher.
    regex_source = _RE_CONVERSION.sub(_compile_replace, escaped)
    return re.compile(regex_source, re.DOTALL)
def compile_matcher(pattern: str) -> Matcher:
    """Compile *pattern*, which may contain simplified "%" conversion specifiers, into a matcher callable.

    Returns:
        A `partial` of `_match_regex` bound to a regex compiled with `re.DOTALL` when the pattern contains at least
        one conversion specifier other than "%%". Otherwise the bound `str.__eq__` of the pattern, with any
        conversion specifiers replaced by their `_CONVERSION_MAP` entries.

    Raises:
        ValueError: If the character following a "%" is not a key of `_CONVERSION_MAP`. The message reports the
            character and its index within *pattern*.
    """
    # Splitting on a regex with one capture group yields alternating pieces: literal text at even positions and
    # captured conversion characters at odd positions.
    pieces: list[str] = _RE_CONVERSION.split(pattern)
    piece_count = len(pieces)
    # A single piece means no conversion specifier was found, so plain equality is enough.
    if piece_count <= 1:
        return pattern.__eq__
    needs_regex = False
    # Replace each conversion character with its regex matcher.
    for position in range(1, piece_count, 2):
        conversion = pieces[position]
        try:
            pieces[position] = _CONVERSION_MAP[conversion]
        except KeyError:
            # Index of the offending character: the length of all preceding literal text, plus two characters for
            # each earlier specifier and one for this specifier's "%". As `position` is odd, that adds up to
            # `position` itself.
            conversion_index = sum(len(text) for text in pieces[:position:2]) + position
            raise ValueError(f"Unsupported format character {conversion!r} at index {conversion_index}") from None
        # A "%" is used as an escape sequence, and doesn't require a regex matcher. Anything else does.
        if conversion != "%":
            needs_regex = True
    if not needs_regex:
        # Only escape sequences were present: rebuild the literal pattern and compare by equality.
        return "".join(pieces).__eq__
    # Escape the literal text (even positions) and keep the regex matchers (odd positions) as they are.
    regex_source = "".join(
        re.escape(piece) if position % 2 == 0 else piece for position, piece in enumerate(pieces)
    )
    return partial(_match_regex, re.compile(regex_source, re.DOTALL))
Loading

0 comments on commit 5e71d51

Please sign in to comment.