Calculate length of unicode string

2019-06-09 14:22:28 +00:00 · 2019-06-09 14:22:28 +00:00 · 0775d17357
commit 0775d17357
parent 17e49d529c
3 changed files with 35 additions and 0 deletions
--- a/cheuph/__init__.py
+++ b/cheuph/__init__.py
@ -10,6 +10,7 @@ from .element_supply import *
 from .exceptions import *
 from .markup import *
 from .rendered_element_cache import *
 from .utils import *
 __all__: List[str] = []
@ -23,3 +24,4 @@ __all__ += element_supply.__all__
 __all__ += exceptions.__all__
 __all__ += markup.__all__
 __all__ += rendered_element_cache.__all__
 __all__ += utils.__all__
--- a/cheuph/attributed_lines.py
+++ b/cheuph/attributed_lines.py
@ -1,5 +1,6 @@
 # TODO retrieve attributes of any (x, y) coordinates
 # TODO retrieve attributes of closest existing line (by y coordinate)
 # TODO use ulen and unicode string splitting
 import collections
 from typing import Any, Deque, Iterator, List, Optional, Set, Tuple
--- a/cheuph/utils.py
+++ b/cheuph/utils.py
@ -0,0 +1,32 @@
 import unicodedata
 __all__ = ["ulen"]
 # See http://www.unicode.org/reports/tr11/#ED7
 #
 # "In a broad sense, wide characters include W, F, and A (when in East Asian
 # context), and narrow characters include N, Na, H, and A (when not in East
 # Asian context)."
 # East Asian Width property values (UAX #11) that occupy two columns when
 # rendering in an East Asian context. "A" (ambiguous) counts as wide here.
 _WIDE = {"W", "F", "A"}
 # Property values that occupy one column when NOT in an East Asian context.
 # "A" (ambiguous) counts as narrow here.
 _NARROW = {"N", "Na", "H", "A"}


 def ulen(string: str, east_asian_context: bool = False) -> int:
     """Return the display width of *string* in terminal columns.

     Each character contributes one or two columns depending on its East
     Asian Width property as reported by ``unicodedata.east_asian_width``.

     Args:
         string: The text to measure.
         east_asian_context: If true, ambiguous-width ("A") characters are
             counted as two columns; otherwise they are counted as one.

     Returns:
         The sum of the per-character widths; ``0`` for an empty string.

     Note:
         Only the East Asian Width property is consulted, so every character
         counts as at least one column (combining marks and control
         characters included).
     """
     # The category must be looked up per character: comparing the character
     # itself against the category sets would (almost) never match.
     if east_asian_context:
         return sum(
             2 if unicodedata.east_asian_width(char) in _WIDE else 1
             for char in string
         )

     return sum(
         1 if unicodedata.east_asian_width(char) in _NARROW else 2
         for char in string
     )
 # TODO unicode string splitting based on the same principle as above