'''
Implementation of :class:`Interval`,
:class:`CharacterInterval` and :class:`ByteInterval`.
'''
from abc import ABC, abstractmethod
from collections.abc import Iterator
from typing import Any, ClassVar, Generic, overload, SupportsIndex, TypeGuard, TypeVar
from typing_extensions import override, Self
# Type variable constrained to exactly ``str`` or ``bytes``; it ties the
# type of an interval's endpoints to the type of the elements it yields.
_Char = TypeVar('_Char', str, bytes)
def _ascii_repr(char: str | bytes) -> str:
if isinstance(char, str):
char_is_ascii_printable = ' ' <= char <= '~'
else:
char_is_ascii_printable = b' ' <= char <= b'~'
if char in ('\\', b'\\'):
return r'\\'
if char in ('-', b'-'):
return r'\-'
if char_is_ascii_printable:
return char.decode() if isinstance(char, bytes) else char
codepoint = ord(char)
if codepoint <= 0xFF:
return fr'\x{codepoint:02X}'
if codepoint <= 0xFFFF:
return fr'\u{codepoint:04X}'
return fr'\U{codepoint:08X}'
def _is_char_of_type(
    value: object, expected_type: type[_Char], /
) -> TypeGuard[_Char]:
    '''
    Tell whether ``value`` is a single element of ``expected_type``,
    i.e. an instance of that type whose length is exactly 1.

    :param value: The object to inspect.
    :param expected_type: The required type of ``value``.
    :return: ``True`` if both conditions hold, ``False`` otherwise.
    '''
    if not isinstance(value, expected_type):
        return False

    return len(value) == 1
def _is_valid_codepoint_range(
codepoint_range: range, /,
upper_limit: int
) -> bool:
start, stop, step = (
codepoint_range.start,
codepoint_range.stop,
codepoint_range.step
)
return 0 <= start < stop <= upper_limit and step == 1
[docs]
class InvalidIntervalDirection(ValueError):
    '''
    Signals that an interval was requested with endpoints in the
    wrong order, i.e. with a ``start`` greater than its ``end``.
    '''

    def __init__(self, start: _Char, stop: _Char) -> None:
        '''
        :param start: The offending starting endpoint.
        :param stop: The offending ending endpoint.
        '''
        expectation = 'Expected stop to be greater than or equals to start, '
        observation = f'got {start!r} > {stop!r}'

        super().__init__(expectation + observation)
[docs]
class NotACharacter(ValueError):
    '''
    Signals that a character (a :class:`str` of length 1)
    was required but something else was given.
    '''

    def __init__(self, actual: object) -> None:
        '''
        :param actual: The object that is not a character.
        '''
        # A string of the wrong length is summarised by its length;
        # any other object is shown via its ``repr()``.
        description = (
            f'string of length {len(actual)}'
            if isinstance(actual, str)
            else repr(actual)
        )

        super().__init__(f'Expected a character, got {description}')
[docs]
class NotAByte(ValueError):
    '''
    Raised when an object is expected to be a byte
    (a :class:`bytes` object of length 1) but it is not one.
    '''

    def __init__(self, actual: object) -> None:
        '''
        :param actual: The object that is not a single byte.
        '''
        if isinstance(actual, bytes):
            value_repr = f'a bytes object of length {len(actual)}'
        else:
            value_repr = repr(actual)

        # ``value_repr`` is already display-ready text; formatting it
        # with ``!r`` again would wrap it in a spurious pair of quotes.
        super().__init__(f'Expected a single byte, got {value_repr}')
[docs]
class InvalidCodepointRange(ValueError):
    '''
    Signals that a :class:`range` does not describe a valid
    codepoint range for an :class:`Interval`.
    '''
[docs]
class Interval(Generic[_Char], ABC):
    '''
    A closed interval of characters: both ``start`` and ``end``
    belong to it. Depending on the concrete subclass, elements are
    :class:`str` or :class:`bytes` objects of length 1.
    '''

    _not_a_char_exception: ClassVar[type[ValueError]]
    '''
    Exception class raised by the constructor when
    an endpoint is not a valid element.
    '''

    _max_value: ClassVar[int]
    '''
    The largest integral value that can be converted to an element.
    '''

    __slots__ = ('_start', '_end')

    _start: _Char
    _end: _Char

    def __new__(cls, start: _Char, end: _Char) -> 'Self':
        '''
        Create an interval spanning ``start`` through ``end``.

        :param start: The first element of the interval, inclusive.
        :param end: The last element of the interval, inclusive.
        :raises ValueError: The subclass' ``_not_a_char_exception``,
            if either endpoint is not a single element of the
            expected type.
        :raises InvalidIntervalDirection: If ``start > end``.
        '''
        interval = super().__new__(cls)
        interval._start, interval._end = start, end

        invalid_endpoint_error = cls._not_a_char_exception
        expected_type = interval.element_type

        # Validate the endpoints one at a time, ``start`` first.
        for endpoint in (start, end):
            if not _is_char_of_type(endpoint, expected_type):
                raise invalid_endpoint_error(endpoint)

        if start > end:
            raise InvalidIntervalDirection(start, end)

        return interval

    def __hash__(self) -> int:
        '''
        Hash of the element type together with both endpoints.
        '''
        identity = (self.element_type, self.start, self.end)

        return hash(identity)

    def __iter__(self) -> Iterator[_Char]:
        '''
        Lazily produce every element, from ``start`` to ``end``.
        '''
        yield from map(self._make_element, self.to_codepoint_range())

    def __reversed__(self) -> Iterator[_Char]:
        '''
        Lazily produce every element, from ``end`` down to ``start``.
        '''
        descending = reversed(self.to_codepoint_range())

        yield from map(self._make_element, descending)

    @overload
    def __getitem__(self, item: slice) -> Self:
        ...

    @overload
    def __getitem__(self, item: SupportsIndex) -> _Char:
        ...

    def __getitem__(self, item: slice | SupportsIndex) -> Self | _Char:
        '''
        ``O(1)`` access to a single element by index, or to
        a sub-interval by :class:`slice`.

        :raises IndexError: If an index is out of bounds.
        :raises InvalidCodepointRange: If the sliced codepoint range
            cannot be turned back into an interval.
        '''
        if not isinstance(item, SupportsIndex):
            sliced_codepoints = self.to_codepoint_range()[item]

            try:
                return self.__class__.from_codepoint_range(sliced_codepoints)
            except InvalidCodepointRange as cause:
                raise InvalidCodepointRange(
                    f'The interval derived from slicing self with {item!r} is invalid'
                ) from cause

        item = int(item)
        codepoint = self.to_codepoint_range()[item]

        return self._make_element(codepoint)

    def __len__(self) -> int:
        '''
        The number of elements in the interval, that is
        ``codepoint(end) - codepoint(start) + 1``.
        '''
        codepoints = self.to_codepoint_range()

        return len(codepoints)

    def __contains__(self, item: Any) -> bool:
        '''
        Whether ``item`` is a single element of the right type
        that compares greater than or equal to ``start``
        and less than or equal to ``end``.
        '''
        return (
            _is_char_of_type(item, self.element_type)
            and self._start <= item <= self._end
        )

    def __repr__(self) -> str:
        '''
        The class name followed by :meth:`__str__` in parentheses.
        '''
        class_name = self.__class__.__name__

        return f'{class_name}({self})'

    def __str__(self) -> str:
        r'''
        An ASCII rendition of the interval, such as
        ``\x00-a``, ``\--\uFFFD`` or ``\U00100000``.
        Single-element intervals show that element only.
        '''
        start_text = _ascii_repr(self._start)

        if len(self) == 1:
            return start_text

        return '-'.join((start_text, _ascii_repr(self._end)))

    def __eq__(self, other: object) -> bool:
        '''
        Intervals compare equal when ``other`` is an instance of
        this interval's class and both cover the same codepoints.
        '''
        if isinstance(other, self.__class__):
            return self.to_codepoint_range() == other.to_codepoint_range()

        return NotImplemented

    def __and__(self, other: Self) -> bool:
        '''
        See :meth:`.intersects`.
        '''
        if isinstance(other, self.__class__):
            # The intervals overlap exactly when the later of the
            # two starts does not lie beyond the earlier of the two ends.
            return max(self._start, other._start) <= min(self._end, other._end)

        return NotImplemented

    @property
    def start(self) -> _Char:
        '''
        The first element of the interval.
        '''
        return self._start

    @property
    def end(self) -> _Char:
        '''
        The last element of the interval.
        '''
        return self._end

    @property
    @abstractmethod
    def element_type(self) -> type[_Char]:
        '''
        The type of the interval's elements.
        Concrete subclasses must override this property.
        '''
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def _make_element(cls, value: int, /) -> _Char:
        '''
        Turn an integral value into an element of the interval.

        Subclasses must raise :class:`ValueError`
        if ``value`` cannot be converted to an element.
        '''
        raise NotImplementedError

    def to_codepoint_range(self) -> range:
        '''
        A native :class:`range` over the codepoints of
        every element of the interval, in ascending order.
        '''
        first_codepoint = ord(self.start)
        last_codepoint = ord(self.end)

        return range(first_codepoint, last_codepoint + 1)

    def intersects(self, other: Self) -> bool:
        '''
        Whether this interval and ``other`` share any element.
        '''
        return self & other

    @classmethod
    def from_codepoint_range(cls, codepoint_range: range, /) -> Self:
        '''
        Build an interval out of a :class:`range` of codepoints.

        As a technical limit, for a :class:`CharacterInterval`,
        the codepoint of an endpoint must not be negative or
        greater than ``0x10FFFF``. Similarly, for a
        :class:`ByteInterval`, the integral value of an endpoint
        must be in the interval ``[0, 255]``.

        :raises InvalidCodepointRange: If the range is empty,
            leaves the limits above or has a step other than 1.
        '''
        upper_limit = cls._max_value + 1
        is_usable = _is_valid_codepoint_range(codepoint_range, upper_limit)

        if not is_usable:
            raise InvalidCodepointRange(
                f'Expected 0 <= start < stop <= {upper_limit} '
                f'and step == 1, got {codepoint_range!r}'
            )

        # ``stop`` is exclusive in a range, while ``end`` is inclusive.
        first_codepoint = codepoint_range.start
        last_codepoint = max(0, codepoint_range.stop - 1)

        first_element = cls._make_element(first_codepoint)
        last_element = cls._make_element(last_codepoint)

        return cls(first_element, last_element)
[docs]
class CharacterInterval(Interval[str]):
    '''
    An :class:`Interval` whose elements are single characters
    (:class:`str` objects of length 1).
    '''

    _max_value = 0x10FFFF
    '''
    The maximum integral value of a Unicode codepoint: ``0x10FFFF``.
    '''

    _not_a_char_exception = NotACharacter

    @classmethod
    @override
    def _make_element(cls, value: int, /) -> str:
        '''
        Return the character whose codepoint is ``value``.
        '''
        return chr(value)

    @property
    @override
    def element_type(self) -> type[str]:
        '''
        Always :class:`str`.
        '''
        return str
[docs]
class ByteInterval(Interval[bytes]):
    '''
    An :class:`Interval` whose elements are single bytes
    (:class:`bytes` objects of length 1).
    '''

    _not_a_char_exception = NotAByte
    _max_value = 0xFF
    '''
    The maximum integral value of a byte (8 bits): ``0xFF``.
    '''

    @property
    @override
    def element_type(self) -> type[bytes]:
        '''
        Always :class:`bytes`.
        '''
        return bytes

    @classmethod
    @override
    def _make_element(cls, value: int, /) -> bytes:
        '''
        Return the single byte whose integral value is ``value``.

        :raises ValueError: If ``value`` is not in ``[0, 255]``.
        '''
        # ``int.to_bytes`` reports out-of-range values with OverflowError,
        # which is not a ValueError. ``bytes([value])`` raises ValueError
        # instead, which is what ``Interval._make_element`` requires
        # of its overrides.
        return bytes([value])