Calculate length of unicode string
This commit is contained in:
parent
17e49d529c
commit
0775d17357
3 changed files with 35 additions and 0 deletions
|
|
@ -10,6 +10,7 @@ from .element_supply import *
|
||||||
from .exceptions import *
|
from .exceptions import *
|
||||||
from .markup import *
|
from .markup import *
|
||||||
from .rendered_element_cache import *
|
from .rendered_element_cache import *
|
||||||
|
from .utils import *
|
||||||
|
|
||||||
__all__: List[str] = []
|
__all__: List[str] = []
|
||||||
|
|
||||||
|
|
@ -23,3 +24,4 @@ __all__ += element_supply.__all__
|
||||||
__all__ += exceptions.__all__
|
__all__ += exceptions.__all__
|
||||||
__all__ += markup.__all__
|
__all__ += markup.__all__
|
||||||
__all__ += rendered_element_cache.__all__
|
__all__ += rendered_element_cache.__all__
|
||||||
|
__all__ += utils.__all__
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
# TODO retrieve attributes of any (x, y) coordinates
|
# TODO retrieve attributes of any (x, y) coordinates
|
||||||
# TODO retrieve attributes of closest existing line (by y coordinate)
|
# TODO retrieve attributes of closest existing line (by y coordinate)
|
||||||
|
# TODO use ulen and unicode string splitting
|
||||||
|
|
||||||
import collections
|
import collections
|
||||||
from typing import Any, Deque, Iterator, List, Optional, Set, Tuple
|
from typing import Any, Deque, Iterator, List, Optional, Set, Tuple
|
||||||
|
|
|
||||||
32
cheuph/utils.py
Normal file
32
cheuph/utils.py
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
__all__ = ["ulen"]
|
||||||
|
|
||||||
|
# See http://www.unicode.org/reports/tr11/#ED7
|
||||||
|
#
|
||||||
|
# "In a broad sense, wide characters include W, F, and A (when in East Asian
|
||||||
|
# context), and narrow characters include N, Na, H, and A (when not in East
|
||||||
|
# Asian context)."
|
||||||
|
# East Asian Width categories (as returned by unicodedata.east_asian_width)
# grouped by how many columns a character of that category occupies.
# "A" (ambiguous) appears in both sets: it is wide only in East Asian context.
_WIDE = {"W", "F", "A"}  # when in East Asian context
_NARROW = {"N", "Na", "H", "A"}  # when not in East Asian context


def ulen(string: str, east_asian_context: bool = False) -> int:
    """Return the display width of *string* in columns.

    Each character is classified by its East Asian Width property
    (``unicodedata.east_asian_width``) and counted as either one column
    (narrow) or two columns (wide).

    Args:
        string: The text to measure.
        east_asian_context: If true, characters with ambiguous width ("A")
            count as wide (2 columns); otherwise they count as narrow (1).

    Returns:
        The sum of the per-character widths; 0 for an empty string.

    Note:
        Zero-width and combining characters get no special treatment; every
        character contributes 1 or 2 columns.
    """
    # The membership tests must be done on the width *category* of the
    # character, not on the character itself.
    category_of = unicodedata.east_asian_width

    if east_asian_context:
        return sum(2 if category_of(char) in _WIDE else 1 for char in string)

    return sum(1 if category_of(char) in _NARROW else 2 for char in string)
|
||||||
|
|
||||||
|
# TODO unicode string splitting based on the same principle as above
|
||||||
Loading…
Add table
Add a link
Reference in a new issue