'''
Implementation of :class:`IndexMap`,
:class:`CharacterMap` and :class:`ByteMap`.
'''
from __future__ import annotations
from abc import ABC, ABCMeta
from collections.abc import Callable, Iterable
from dataclasses import astuple, dataclass
from functools import partial
from itertools import chain
from typing import (
Any,
cast,
ClassVar,
Generic,
overload,
TYPE_CHECKING,
TypeGuard,
TypeVar
)
from typing_extensions import Self
from character_range.intervals import (
ByteInterval, CharacterInterval, Interval, NotAByte,
NotACharacter
)
_T = TypeVar('_T')
_Char = TypeVar('_Char', str, bytes)
_Index = int
_LookupChar = Callable[[_Char], _Index]
_LookupIndex = Callable[[_Index], _Char]
def _is_char_of_type(
value: str | bytes, expected_type: type[_Char], /
) -> TypeGuard[_Char]:
return isinstance(value, expected_type) and len(value) == 1
[docs]
class NoIntervals(ValueError):
'''
Raised when no intervals are passed
to the map constructor.
'''
pass
[docs]
class OverlappingIntervals(ValueError):
'''
Raised when there are at least two overlapping intervals
in the list of intervals passed to the map constructor.
'''
def __init__(self) -> None:
super().__init__('Intervals must not overlap')
[docs]
class ConfigurationConflict(ValueError):
'''
Raised when the map constructor is passed:
* A list of intervals whose elements don't have the same type.
* Only one lookup function but not the other.
'''
pass
[docs]
class InvalidIndex(LookupError):
'''
Raised when the index returned
by a ``lookup_char`` function
is not a valid index.
'''
def __init__(self, length: int, actual_index: object) -> None:
super().__init__(
f'Expected lookup_char to return an integer '
f'in the interval [0, {length}], got {actual_index!r}'
)
[docs]
class InvalidChar(LookupError):
'''
Raised when the character returned
by a ``lookup_index`` function
is not of the type expected by the map.
'''
def __init__(
self,
actual_char: object,
expected_type: type[str] | type[bytes]
) -> None:
if issubclass(expected_type, str):
expected_type_name = 'string'
else:
expected_type_name = 'bytes object'
super().__init__(
f'Expected lookup_index to return '
f'a {expected_type_name}, got {actual_char!r}'
)
@dataclass(frozen = True, slots = True)
class _Searchers(Generic[_Char]):
lookup_char: _LookupChar[_Char] | None
lookup_index: _LookupIndex[_Char] | None
@property
def both_given(self) -> bool:
'''
Whether both functions are not ``None``.
'''
return self.lookup_char is not None and self.lookup_index is not None
@property
def both_omitted(self) -> bool:
'''
Whether both functions are ``None``.
'''
return self.lookup_char is None and self.lookup_index is None
@property
def only_one_given(self) -> bool:
'''
Whether only one of the functions is ``None``.
'''
return not self.both_given and not self.both_omitted
class _RunCallbackAfterInitialization(type):
'''
:class:`_HasPrebuiltMembers`'s metaclass (a.k.a. a metametaclass).
Runs a callback defined at the instance's level.
'''
_callback_method_name: str
def __call__(cls, *args: object, **kwargs: object) -> Any:
class_with_prebuilt_members = super().__call__(*args, **kwargs)
callback = getattr(cls, cls._callback_method_name)
callback(class_with_prebuilt_members)
return class_with_prebuilt_members
# This cannot be generic due to
# https://github.com/python/mypy/issues/11672
class _HasPrebuiltMembers(
ABCMeta,
metaclass = _RunCallbackAfterInitialization
):
'''
:class:`CharacterMap` and :class:`ByteMap`'s metaclass.
'''
_callback_method_name: ClassVar[str] = '_instantiate_members'
# When the `cls` (`self`) argument is typed as 'type[_T]',
# mypy refuses to understand that it also has the '_member_names'
# attribute, regardless of assertions and castings.
# The way to circumvent this is to use 'getattr()',
# as demonstrated below in '__getitem__' and 'members'.
_member_names: list[str]
def __new__(
mcs,
name: str,
bases: tuple[type, ...],
namespace: dict[str, Any],
**kwargs: Any
) -> _HasPrebuiltMembers:
new_class = super().__new__(mcs, name, bases, namespace, **kwargs)
if ABC in bases:
return new_class
new_class._member_names = [
name for name, value in new_class.__dict__.items()
if not name.startswith('_') and not callable(value)
]
return new_class
def __getitem__(cls: type[_T], item: str) -> _T:
member_names: list[str] = getattr(cls, '_member_names')
if item not in member_names:
raise LookupError(f'No such member: {item!r}')
return cast(_T, getattr(cls, item))
@property
def members(cls: type[_T]) -> tuple[_T, ...]:
'''
Returns a tuple of pre-built members of the class.
'''
member_names: list[str] = getattr(cls, '_member_names')
return tuple(getattr(cls, name) for name in member_names)
def _instantiate_members(cls) -> None:
for member_name in cls._member_names:
value = getattr(cls, member_name)
if isinstance(value, tuple):
setattr(cls, member_name, cls(*value))
else:
setattr(cls, member_name, cls(value))
[docs]
class IndexMap(Generic[_Char], ABC):
'''
A two-way mapping between character or byte
to its corresponding index.
'''
__slots__ = (
'_intervals', '_char_to_index',
'_searchers', '_index_to_char',
'_maps_populated', '_element_type',
'_not_a_char_exception',
'_len', '_repr'
)
_intervals: tuple[Interval[_Char], ...]
_char_to_index: dict[_Char, _Index]
_index_to_char: dict[_Index, _Char]
_searchers: _Searchers[_Char]
_element_type: type[_Char]
_maps_populated: bool
_not_a_char_exception: type[NotACharacter] | type[NotAByte]
_len: int | None
_repr: str | None
def __init__(
self,
intervals: Iterable[Interval[_Char]],
lookup_char: _LookupChar[_Char] | None = None,
lookup_index: _LookupIndex[_Char] | None = None
) -> None:
r'''
Construct a new map from a number of intervals.
The underlying character-to-index and
index-to-character maps will not be populated if
lookup functions are given.
Lookup functions are expected to be the optimized
versions of the naive, brute-force lookup algorithm.
This relationship is similar to that of
``__method__``\ s and built-ins; for example,
while ``__contains__`` is automatically
implemented when a class defines both ``__len__``
and ``__getitem__``, a ``__contains__`` may still
be needed if manual iterations are too
unperformant, unnecessary or unwanted.
Lookup functions must raise either
:class:`LookupError` or :class:`ValueError`
if the index or character cannot be found.
If the index returned by ``lookup_char`` is
not in the interval ``[0, len(self) - 1]``,
a :class:`ValueError` is raised.
:raise ConfigurationConflict: \
If only one lookup function is given.
:raise NoIntervals: \
If no intervals are given.
'''
self._len = None
self._repr = None
self._intervals = tuple(intervals)
self._searchers = _Searchers(lookup_char, lookup_index)
self._char_to_index = {}
self._index_to_char = {}
if self._searchers.only_one_given:
raise ConfigurationConflict(
'The two lookup functions must be either '
'both given or both omitted'
)
if not self._intervals:
raise NoIntervals('At least one interval expected')
self._intervals_must_have_same_type()
self._element_type = self._intervals[0].element_type
if issubclass(self._element_type, str):
self._not_a_char_exception = NotACharacter
else:
self._not_a_char_exception = NotAByte
if self._searchers.both_given:
self._intervals_must_not_overlap()
self._maps_populated = False
return
self._populate_maps()
self._maps_populated = True
[docs]
def __hash__(self) -> int:
return hash(self._intervals)
[docs]
def __len__(self) -> int:
if self._len is not None:
return self._len
if self._maps_populated:
self._len = len(self._char_to_index)
else:
self._len = sum(len(interval) for interval in self._intervals)
return self._len
def __repr__(self) -> str:
if self._repr is not None:
return self._repr
joined_ranges = ''.join(str(interval) for interval in self._intervals)
self._repr = f'{self.__class__.__name__}({joined_ranges})'
return self._repr
@overload
def __getitem__(self, item: _Char) -> _Index:
...
@overload
def __getitem__(self, item: _Index) -> _Char:
...
[docs]
def __getitem__(self, item: _Char | _Index) -> _Index | _Char:
'''
Either look for the character/index in the underlying maps,
or delegate that task to the look-up functions given.
Results are cached.
:raise ValueError: \
If ``item`` is neither a character/byte nor an index.
:raise IndexError: \
If ``lookup_char``
'''
if isinstance(item, int):
return self._get_char_given_index(item)
if isinstance(item, self._element_type):
return self._get_index_given_char(item)
raise TypeError(f'Expected a character or an index, got {item!r}')
[docs]
def __contains__(self, item: object) -> bool:
if not isinstance(item, self._element_type | int):
return False
if isinstance(item, int):
return 0 <= item < len(self)
try:
# This is necessary for PyCharm,
# deemed redundant by mypy,
# and makes Pyright think that 'item' is of type 'object'.
item = cast(_Char, item) # type: ignore
_ = self._get_index_given_char(item) # pyright: ignore
except (LookupError, ValueError):
return False
else:
return True
def __eq__(self, other: object) -> bool:
if not isinstance(other, self.__class__):
return NotImplemented
return self._intervals == other._intervals
# Needed for testing and type hint convenience
def __add__(self, other: Self | IndexMap[_Char] | Interval[_Char]) -> Self:
if not isinstance(other, IndexMap | Interval):
return NotImplemented
if other.element_type is not self.element_type:
raise ConfigurationConflict('Different element types')
lookup_char, lookup_index = astuple(self._searchers)
if isinstance(other, Interval):
return self.__class__(
self._intervals + tuple([other]),
lookup_char = lookup_char,
lookup_index = lookup_index
)
if self._searchers != other._searchers:
raise ConfigurationConflict(
'Maps having different lookup functions '
'cannot be combined'
)
return self.__class__(
self._intervals + other._intervals,
lookup_char = lookup_char,
lookup_index = lookup_index
)
@property
def intervals(self) -> tuple[Interval[_Char], ...]:
'''
The intervals that make up the map.
'''
return self._intervals
@property
def element_type(self) -> type[_Char]:
'''
The type of the map's elements.
'''
return self._element_type
def _intervals_must_have_same_type(self) -> None:
interval_types = {type(interval) for interval in self._intervals}
if len(interval_types) > 1:
raise ConfigurationConflict('Intervals must be of same types')
def _intervals_must_not_overlap(self) -> None:
seen: list[Interval[_Char]] = []
for current_interval in self._intervals:
overlapped = any(
current_interval.intersects(seen_interval)
for seen_interval in seen
)
if overlapped:
raise OverlappingIntervals
seen.append(current_interval)
def _populate_maps(self) -> None:
chained_intervals = chain.from_iterable(self._intervals)
for index, char in enumerate(chained_intervals):
if char in self._char_to_index:
raise OverlappingIntervals
self._char_to_index[char] = index
self._index_to_char[index] = char
def _get_char_given_index(self, index: _Index, /) -> _Char:
if index not in self:
raise IndexError(f'Index {index} is out of range')
if index in self._index_to_char:
return self._index_to_char[index]
lookup_index = self._searchers.lookup_index
assert lookup_index is not None
result = lookup_index(index)
if not _is_char_of_type(result, self._element_type):
raise InvalidChar(result, self._element_type)
self._index_to_char[index] = result
return self._index_to_char[index]
def _get_index_given_char(self, char: _Char, /) -> _Index:
if not _is_char_of_type(char, self._element_type):
raise self._not_a_char_exception(char)
if char in self._char_to_index:
return self._char_to_index[char]
lookup_char = self._searchers.lookup_char
if lookup_char is None:
raise LookupError(f'Char {char!r} is not in the map')
result = lookup_char(char)
if not isinstance(result, int) or result not in self: # pyright: ignore
raise InvalidIndex(len(self), result)
self._char_to_index[char] = result
return self._char_to_index[char]
def _ascii_index_from_char_or_byte(char_or_byte: str | bytes) -> int:
codepoint = ord(char_or_byte)
if not 0 <= codepoint <= 0xFF:
raise ValueError('Not an ASCII character or byte')
return codepoint
@overload
def _ascii_char_or_byte_from_index(constructor: type[str], index: int) -> str:
...
@overload
def _ascii_char_or_byte_from_index(
constructor: type[bytes],
index: int
) -> bytes:
...
def _ascii_char_or_byte_from_index(
constructor: type[str] | type[bytes],
index: int
) -> str | bytes:
if issubclass(constructor, str):
return constructor(chr(index))
else:
# \x80 and higher would be converted
# to two bytes with .encode() alone.
return constructor(index.to_bytes(1, 'big'))
_ascii_char_from_index = cast(
Callable[[int], str],
partial(_ascii_char_or_byte_from_index, str)
)
_ascii_byte_from_index = cast(
Callable[[int], bytes],
partial(_ascii_char_or_byte_from_index, bytes)
)
if TYPE_CHECKING:
class CharacterMap(IndexMap[str], metaclass = _HasPrebuiltMembers):
# At runtime, this is a read-only class-level property.
members: ClassVar[tuple[CharacterMap, ...]]
ASCII_LOWERCASE: ClassVar[CharacterMap]
ASCII_UPPERCASE: ClassVar[CharacterMap]
ASCII_LETTERS: ClassVar[CharacterMap]
ASCII_DIGITS: ClassVar[CharacterMap]
LOWERCASE_HEX_DIGITS: ClassVar[CharacterMap]
UPPERCASE_HEX_DIGITS: ClassVar[CharacterMap]
LOWERCASE_BASE_36: ClassVar[CharacterMap]
UPPERCASE_BASE_36: ClassVar[CharacterMap]
ASCII: ClassVar[CharacterMap]
NON_ASCII: ClassVar[CharacterMap]
UNICODE: ClassVar[CharacterMap]
class ByteMap(IndexMap[bytes], metaclass = _HasPrebuiltMembers):
# At runtime, this is a read-only class-level property.
members: ClassVar[tuple[ByteMap, ...]]
ASCII_LOWERCASE: ClassVar[ByteMap]
ASCII_UPPERCASE: ClassVar[ByteMap]
ASCII_LETTERS: ClassVar[ByteMap]
ASCII_DIGITS: ClassVar[ByteMap]
LOWERCASE_HEX_DIGITS: ClassVar[ByteMap]
UPPERCASE_HEX_DIGITS: ClassVar[ByteMap]
LOWERCASE_BASE_36: ClassVar[ByteMap]
UPPERCASE_BASE_36: ClassVar[ByteMap]
ASCII: ClassVar[ByteMap]
else:
[docs]
class CharacterMap(IndexMap[str], metaclass = _HasPrebuiltMembers):
ASCII_LOWERCASE = [CharacterInterval('a', 'z')]
ASCII_UPPERCASE = [CharacterInterval('A', 'Z')]
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
ASCII_DIGITS = [CharacterInterval('0', '9')]
LOWERCASE_HEX_DIGITS = ASCII_DIGITS + [CharacterInterval('a', 'f')]
UPPERCASE_HEX_DIGITS = ASCII_DIGITS + [CharacterInterval('A', 'F')]
LOWERCASE_BASE_36 = ASCII_DIGITS + ASCII_LOWERCASE
UPPERCASE_BASE_36 = ASCII_DIGITS + ASCII_UPPERCASE
ASCII = (
[CharacterInterval('\x00', '\xFF')],
_ascii_index_from_char_or_byte,
_ascii_char_from_index
)
NON_ASCII = (
[CharacterInterval('\u0100', '\U0010FFFF')],
lambda char: ord(char) - 0x100,
lambda index: chr(index + 0x100)
)
UNICODE = ([CharacterInterval('\x00', '\U0010FFFF')], ord, chr)
[docs]
class ByteMap(IndexMap[bytes], metaclass = _HasPrebuiltMembers):
ASCII_LOWERCASE = [ByteInterval(b'a', b'z')]
ASCII_UPPERCASE = [ByteInterval(b'A', b'Z')]
ASCII_LETTERS = ASCII_LOWERCASE + ASCII_UPPERCASE
ASCII_DIGITS = [ByteInterval(b'0', b'9')]
LOWERCASE_HEX_DIGITS = ASCII_DIGITS + [ByteInterval(b'a', b'f')]
UPPERCASE_HEX_DIGITS = ASCII_DIGITS + [ByteInterval(b'A', b'F')]
LOWERCASE_BASE_36 = ASCII_DIGITS + ASCII_LOWERCASE
UPPERCASE_BASE_36 = ASCII_DIGITS + ASCII_UPPERCASE
ASCII = (
[ByteInterval(b'\x00', b'\xFF')],
_ascii_index_from_char_or_byte,
_ascii_byte_from_index
)