"""Unicode helpers for measuring the display width of strings.

This is the content of ``cheuph/utils.py``.  The package ``__init__`` pulls
it in with ``from .utils import *`` and extends its own ``__all__`` with
``utils.__all__``; ``cheuph/attributed_lines.py`` carries a TODO to switch
to ``ulen`` and unicode-aware string splitting.
"""

import unicodedata

__all__ = ["ulen"]

# See http://www.unicode.org/reports/tr11/#ED7
#
# "In a broad sense, wide characters include W, F, and A (when in East Asian
# context), and narrow characters include N, Na, H, and A (when not in East
# Asian context)."
#
# These sets hold East Asian Width *category codes* as returned by
# unicodedata.east_asian_width(), not characters.
_WIDE = {"W", "F", "A"}  # when in East Asian context
_NARROW = {"N", "Na", "H", "A"}  # when not in East Asian context


def ulen(string: str, east_asian_context: bool = False) -> int:
    """Return the number of terminal columns *string* occupies.

    Every character counts as one column if it is narrow and two columns if
    it is wide, as decided by its East Asian Width property (UAX #11).

    Args:
        string: The text to measure.
        east_asian_context: If true, characters with ambiguous width
            (category ``A``) are counted as wide; otherwise they are
            counted as narrow.

    Returns:
        The summed width of all characters; ``0`` for an empty string.

    Note:
        Only the East Asian Width property is consulted.  Combining marks
        and control characters are not treated as zero-width.
    """
    length = 0

    if east_asian_context:
        for char in string:
            # Look up the width category of the character; comparing the
            # character itself against the category codes is meaningless.
            if unicodedata.east_asian_width(char) in _WIDE:
                length += 2
            else:
                length += 1

    else:
        for char in string:
            if unicodedata.east_asian_width(char) in _NARROW:
                length += 1
            else:
                length += 2

    return length

# TODO unicode string splitting based on the same principle as above