-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert.py
184 lines (163 loc) · 7.17 KB
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import re
import unicodedata
from collections.abc import Iterable
REPLACEMENTS_HINDI = [
# LIGATURES
# Pokémon GO's text system does not allow for the selection of context-sensitive ligatures like these,
# so they are each given a separate codepoint in the PUA.
# Conjuncts
('\uF01A', 'द्भु'), # dbhu
('\uF337', 'क्ष'), # ksha
('\uF339', 'ज्ञ'), # gya
('\uF363', 'क्र'), # kra (alternate)
('\uF364', 'ख्र'), # khra
('\uF365', 'ग्र'), # gra
('\uF366', 'घ्र'), # ghra
('\uF367', 'च्र'), # chra
('\uF368', 'ज्र'), # jra
('\uF369', 'झ्र'), # jhra
('\uF36B', 'ञ्र'), # ñra
('\uF36C', 'ण्र'), # ṇra
('\uF36E', 'त्र'), # tra
('\uF36F', 'थ्र'), # thra
('\uF370', 'द्र'), # dra
('\uF371', 'ध्र'), # dhra
('\uF373', 'न्र'), # nra
('\uF374', 'प्र'), # pra
('\uF375', 'फ्र'), # phra
('\uF376', 'ब्र'), # bra
('\uF377', 'भ्र'), # bhra
('\uF379', 'म्र'), # mra
('\uF37A', 'य्र'), # yra
('\uF37B', 'ल्र'), # lra
('\uF37C', 'व्र'), # vra
('\uF37D', 'श्र'), # śra
('\uF37E', 'ष्र'), # ṣra
('\uF37F', 'स्र'), # sra
('\uF380', 'ह्र'), # hra
('\uF3D1', 'क्र'), # kra (standard)
('\uF3E3', 'द्म'), # dma
('\uF3E4', 'द्य'), # dya
('\uF449', 'त्न'), # tna
('\uF45A', 'द्द'), # dda
('\uF46A', 'द्ध'), # ddha
('\uF493', 'द्व'), # dva
('\uF4F5', 'स्न'), # sna
('\uF555', 'रू'), # rū
('\uF564', 'हु'), # hu
('\uF565', 'हू'), # hū
# Full forms
('\uF334', 'छ'), # cha (unused, GO uses the half form in all contexts)
# Half forms
# We add a halant to allow them to ligate with the following character.
('\uF33A', 'क्'), # k
('\uF33B', 'ख्'), # kh
('\uF33C', 'ग्'), # g
('\uF33D', 'घ्'), # gh
('\uF33E', 'च्'), # c
('\uF33F', 'ज्'), # j
('\uF340', 'झ्'), # jh
('\uF343', 'ञ्'), # ñ
('\uF344', 'ण्'), # ṇ
('\uF346', 'त्'), # t
('\uF347', 'थ्'), # th
('\uF348', 'ढ्'), # ḍh
('\uF34A', 'न्'), # n
('\uF34B', 'प्'), # p
('\uF34C', 'फ्'), # ph
('\uF34D', 'ब्'), # b
('\uF34E', 'भ्'), # bh
('\uF350', 'म्'), # m
('\uF351', 'य्'), # y
('\uF352', 'र्'), # r (eyelash)
('\uF353', 'ल्'), # l
('\uF355', 'ळ्'), # ḷ
('\uF356', 'व्'), # v
('\uF357', 'श्'), # ś
('\uF35A', 'ष्'), # ṣ
('\uF35B', 'स्'), # s
('\uF35C', 'ह्'), # h
# Stacked forms
# We add a halant to the upper forms to allow them to ligate with the following consonant.
('\uF5A2', 'ट्'), # ṭ upper
('\uF5A3', 'ठ्'), # ṭh upper
('\uF5A4', 'ड्'), # ḍ upper
('\uF5A5', 'ढ्'), # ḍh upper
('\uF61F', 'ट'), # ṭ lower
('\uF625', 'ठ'), # ṭh lower
('\uF62B', 'ड'), # ḍ lower
('\uF633', 'ढ'), # ḍh lower
# COMBINING CHARACTERS
(re.compile(r'[\uF008-\uF00F]'), '\u0947'), # long e
(re.compile(r'[\uF010-\uF017]'), '\u0948'), # ai
('\uF018', '\u0941'), # short u
('\uF019', '\u0942'), # long u
('\uF01B', '\u0902'), # ṁ
('\uF300', '\u0901'), # chandrabindu
(re.compile(r'[\uF321-\uF32C]'), '\u0940'), # long i
# Pokémon GO writes "short i" in visual order, before the corresponding consonant cluster.
# In Unicode, "short i" should be written in logical order, after the corresponding consonant cluster.
# To find a consonant cluster, we match zero or more consonant + halant sequences followed by a consonant.
(re.compile(r'[\u093F\uF30C-\uF320]((?:[\u0915-\u0939\u0958-\u095F\uF000-\uF007\uF1B0\uF306]\u094D)*[\u0915-\u0939\u0958-\u095F\uF000-\uF007\uF1B0\uF306])'), '\\1\u093F'), # combining i
# Pokémon GO writes "initial r" in visual order, at the end of the corresponding consonant cluster.
# In Unicode, "initial r" should be written in logical order, at the start of the corresponding consonant cluster.
# To find a consonant cluster, we match zero or more consonant + halant sequences followed by a consonant.
# To allow the ligature to join properly, we add a halant between the r and the rest of the consonant cluster.
(re.compile(r'((?:[\u0915-\u0939\u0958-\u095F]\u094D)*[\u0915-\u0939\u0958-\u095F])[\uF000-\uF007\uF306]'), r'र्\1'), # combining r (repha)
# Pokémon GO writes "final r" in visual order, at the end of the corresponding consonant cluster.
# In Unicode, "final r" should be written in logical order, also at the end of the corresponding consonant cluster.
# To allow the ligature to join properly, we add a halant between the rest of the consonant cluster and the r.
(re.compile('([\u0915-\u0939\u0958-\u095F])\uF1B0'), '\\1\u094Dर'), # combining r (rakar)
]
REPLACEMENTS_THAI = [
# CONSONANTS
# Without lower curves to allow a vowel to be written below them
('\uF700', 'ฐ'),
('\uF70F', 'ญ'),
# VOWELS
# Shifted left to compensate for ascenders (ปฝฟฬ)
('\uF701', '\u0E34'), # short i
('\uF702', '\u0E35'), # long i
('\uF703', '\u0E36'), # short ue
('\uF704', '\u0E37'), # long ue
('\uF710', '\u0E31'), # mai han akat
('\uF711', '\u0E4D'), # nikkhahit
('\uF712', '\u0E47'), # mai tai khu
# Shifted down to compensate for descenders (ฎฏ, currently unused)
('\uF718', '\u0E38'), # short u
('\uF719', '\u0E39'), # long u
('\uF71A', '\u0E3A'), # phinthu
# TONE MARKS
# Shifted down and left to compensate for ascenders (ปฝฟฬ + ◌ุู◌ู)
('\uF705', '\u0E48'), # mai ek
('\uF706', '\u0E49'), # mai tho
('\uF707', '\u0E4A'), # mai tri
('\uF708', '\u0E4B'), # mai chattawa
('\uF709', '\u0E4C'), # thanthakhat (silent letter)
# Shifted down to compensate for shorter letters
# (กขคฆงจฉชซญฐฑณดตถทธนบผพมยรลวศษสหอฮ + ◌ุู◌ู)
('\uF70A', '\u0E48'), # mai ek
('\uF70B', '\u0E49'), # mai tho
('\uF70C', '\u0E4A'), # mai tri
('\uF70D', '\u0E4B'), # mai chattawa
('\uF70E', '\u0E4C'), # thanthakhat (silent letter)
# Compensates for ascenders (unused due to being the same as regular codepoint)
('\uF713', '\u0E48'), # mai ek
('\uF714', '\u0E49'), # mai tho
('\uF715', '\u0E4A'), # mai tri
('\uF716', '\u0E4B'), # mai chattawa
('\uF717', '\u0E4C'), # thanthakhat (silent letter)
]
def decode(s: str, key: Iterable[tuple[str | re.Pattern, str]]):
for pattern, repl in key:
if isinstance(pattern, re.Pattern):
s = pattern.sub(repl, s)
else:
s = s.replace(pattern, repl)
return unicodedata.normalize('NFC', s)
def decode_file(src, dst, key: Iterable[tuple[str | re.Pattern, str]]):
with open(src, 'r', encoding='utf-8') as f:
data = f.read()
data = decode(data, key)
with open(dst, 'w', encoding='utf-8') as f:
f.write(data)