Calculate length of unicode string

This commit is contained in:
Joscha 2019-06-09 14:22:28 +00:00
parent 17e49d529c
commit 0775d17357
3 changed files with 35 additions and 0 deletions

View file

@ -10,6 +10,7 @@ from .element_supply import *
from .exceptions import * from .exceptions import *
from .markup import * from .markup import *
from .rendered_element_cache import * from .rendered_element_cache import *
from .utils import *
__all__: List[str] = [] __all__: List[str] = []
@ -23,3 +24,4 @@ __all__ += element_supply.__all__
__all__ += exceptions.__all__ __all__ += exceptions.__all__
__all__ += markup.__all__ __all__ += markup.__all__
__all__ += rendered_element_cache.__all__ __all__ += rendered_element_cache.__all__
__all__ += utils.__all__

View file

@ -1,5 +1,6 @@
# TODO retrieve attributes of any (x, y) coordinates # TODO retrieve attributes of any (x, y) coordinates
# TODO retrieve attributes of closest existing line (by y coordinate) # TODO retrieve attributes of closest existing line (by y coordinate)
# TODO use ulen and unicode string splitting
import collections import collections
from typing import Any, Deque, Iterator, List, Optional, Set, Tuple from typing import Any, Deque, Iterator, List, Optional, Set, Tuple

32
cheuph/utils.py Normal file
View file

@ -0,0 +1,32 @@
import unicodedata
__all__ = ["ulen"]
# See http://www.unicode.org/reports/tr11/#ED7
#
# "In a broad sense, wide characters include W, F, and A (when in East Asian
# context), and narrow characters include N, Na, H, and A (when not in East
# Asian context)."
# East Asian Width property values, as returned by
# unicodedata.east_asian_width(), grouped by how many columns they occupy.
# "A" (ambiguous) is in both sets: which one applies depends on the context.
_WIDE = {"W", "F", "A"} # when in East Asian context
_NARROW = {"N", "Na", "H", "A"} # when not in East Asian context


def ulen(string: str, east_asian_context: bool = False) -> int:
    """Return the display width of *string* in columns.

    Each character is classified by its East Asian Width property
    (``unicodedata.east_asian_width``): wide characters count as two
    columns, narrow characters as one.

    Args:
        string: The text to measure.
        east_asian_context: If true, characters with ambiguous width ("A")
            are counted as wide (two columns); otherwise they are counted
            as narrow (one column).

    Returns:
        The sum of the per-character widths; 0 for the empty string.

    Note:
        Only the East Asian Width property is consulted. Combining marks
        and control characters are not special-cased and are counted
        according to that property like any other character.
    """
    width_of = unicodedata.east_asian_width

    if east_asian_context:
        # Wide unless proven narrow is not needed here: test against the
        # wide set, which includes the ambiguous class in this context.
        return sum(2 if width_of(char) in _WIDE else 1 for char in string)

    # Outside East Asian context the ambiguous class is in _NARROW, so only
    # the remaining classes (W and F) take two columns.
    return sum(1 if width_of(char) in _NARROW else 2 for char in string)
# TODO unicode string splitting based on the same principle as above