Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sync with HarfBuzz 2.7.4 #75

Merged
merged 26 commits into from
Sep 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
15a37b9
Update gen-tag-table.py
bluebear94 Aug 17, 2023
e16c83b
Update gen-universal-table.py
bluebear94 Aug 17, 2023
da30f45
Fix missing constants
bluebear94 Aug 17, 2023
ae7c691
aat/feature_mappings.rs: Update comment for FEATURE_MAPPINGS
bluebear94 Aug 17, 2023
12aeeec
Buffer::serialize: surround serialized form with brackets
bluebear94 Aug 17, 2023
3a00bdd
Script::from_iso15924_tag: Add variants supported in HB 2.7.4
bluebear94 Aug 17, 2023
00ec212
has_arabic_joining: Update to match Harfbuzz implementation
bluebear94 Aug 17, 2023
247e240
Sync commit 6a38ade from HarfBuzz
bluebear94 Aug 17, 2023
5efcdcb
Add changes to complex shaping
bluebear94 Aug 18, 2023
8e9ad3b
Update IndicShapingInvalidCluster.txt file
bluebear94 Aug 18, 2023
553a133
Update generated shaping tests
bluebear94 Aug 18, 2023
4556187
Update language tag tests
bluebear94 Aug 18, 2023
42fe27e
Update test format
bluebear94 Aug 18, 2023
522a2c4
Fix out-of-bounds index in find_syllables
bluebear94 Aug 18, 2023
418b439
Fix myanmar_misc_001 test
bluebear94 Aug 18, 2023
1c4c6bd
Restore manual changes to fallback_positioning_001 test
bluebear94 Aug 18, 2023
f0b9f2e
Buffer: match work limits with HarfBuzz 2.7.4
bluebear94 Aug 18, 2023
8ecfceb
Revert changes to buffer serialization
bluebear94 Aug 18, 2023
d523ea2
Make initial attempt to avoid Vec allocation in find_syllables
bluebear94 Aug 18, 2023
aa3b70b
Explicitly pass buffer ref to included, next_glyph, and prev_glyph
bluebear94 Aug 19, 2023
6a607ec
Pass &[GlyphInfo] to included, prev_glyph, and next_glyph
bluebear94 Aug 19, 2023
71ba9a7
prev_glyph: Return 0 if we go out of bounds
bluebear94 Aug 19, 2023
87b9dab
Merge branch 'master' into mf/sync-with-2.7.4
bluebear94 Aug 25, 2023
4a10e3c
Get rid of the Vec allocation *again*
bluebear94 Aug 25, 2023
883c4e3
gen-tag-table: Remove override for Gumuz (SIL fonts)
bluebear94 Aug 26, 2023
8c9accb
Ignore morx_34_001 and morx_36_001 tests
bluebear94 Sep 4, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions scripts/.gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
ArabicShaping.txt
Blocks.txt
DerivedCoreProperties.txt
IndicPositionalCategory.txt
IndicSyllabicCategory.txt
Scripts.txt
Expand Down
18 changes: 13 additions & 5 deletions scripts/gen-shaping-tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
'indic_decompose_001',
# ttf-parser doesn't support phantom points
'variations_space_001',
# Resource exhaustion tests with large outputs
'morx_34_001',
'morx_36_001',

# text-rendering-tests tests
# Unknown issue. Investigate.
Expand Down Expand Up @@ -65,13 +68,17 @@ def convert_test(hb_dir, hb_shape_exe, tests_name, file_name, idx, data, fonts):

unicodes_rs = convert_unicodes(unicodes)

test_name = file_name.replace('.tests', '').replace('-', '_') + f'_{idx:03d}'
test_name = file_name.replace(
'.tests', '').replace('-', '_') + f'_{idx:03d}'
test_name = test_name.lower()

options = options.replace('--shaper=ot', '')
options = options.replace(' --font-funcs=ft', '').replace('--font-funcs=ft', '')
options = options.replace(' --font-funcs=ot', '').replace('--font-funcs=ot', '')
options = options.replace('--font-size=1000', '') # we don't support font scaling
options = options.replace(
' --font-funcs=ft', '').replace('--font-funcs=ft', '')
options = options.replace(
' --font-funcs=ot', '').replace('--font-funcs=ot', '')
# we don't support font scaling
options = options.replace('--font-size=1000', '')
options = options.strip()

# We have to actually run hb-shape instead of using predefined results,
Expand Down Expand Up @@ -178,7 +185,8 @@ def convert(hb_dir, hb_shape_exe, tests_dir, tests_name):

used_fonts += convert(hb_dir, hb_shape_exe, tests_dir, test_dir_name)

font_files += os.listdir(hb_dir / f'test/shaping/data/{test_dir_name}/fonts')
font_files += os.listdir(hb_dir /
f'test/shaping/data/{test_dir_name}/fonts')

# Check for unused fonts.
unused_fonts = sorted(list(set(font_files).difference(used_fonts)))
Expand Down
102 changes: 65 additions & 37 deletions scripts/gen-tag-table.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def expect(condition, message=None):
raise AssertionError
raise AssertionError(message)

DEFAULT_LANGUAGE_SYSTEM = ''

# from http://www-01.sil.org/iso639-3/iso-639-3.tab
ISO_639_3_TO_1 = {
Expand Down Expand Up @@ -457,11 +458,8 @@ def inherit_from_macrolanguages(self):
if ot_macrolanguages:
for ot_macrolanguage in ot_macrolanguages:
for language in languages:
# Remove the following condition if e.g. nn should map to NYN,NOR
# instead of just NYN.
if language not in original_ot_from_bcp_47:
self.add_language(language, ot_macrolanguage)
self.ranks[ot_macrolanguage] += 1
self.add_language(language, ot_macrolanguage)
self.ranks[ot_macrolanguage] += 1
else:
for language in languages:
if language in original_ot_from_bcp_47:
Expand Down Expand Up @@ -546,7 +544,7 @@ def parse(self, filename):
self.grandfathered.add(subtag.lower())
elif line.startswith('Description: '):
description = line.split(' ', 1)[1].replace('(individual language)', '')
description = re.sub('(\((individual |macro)language\)|languages)$', '',
description = re.sub('(\(family\)|\((individual |macro)language\)|languages)$', '',
description)
if subtag in self.names:
self.names[subtag] += '\n' + description
Expand Down Expand Up @@ -581,7 +579,9 @@ def parse(self, filename):
elif not has_preferred_value and line.startswith('Macrolanguage: '):
self._add_macrolanguage(line.split(' ')[1], subtag)
elif subtag_type == 'variant':
if line.startswith('Prefix: '):
if line.startswith('Deprecated: '):
self.scopes[subtag] = ' (retired code)' + self.scopes.get(subtag, '')
elif line.startswith('Prefix: '):
self.prefixes[subtag].add(line.split(' ')[1])
elif line.startswith('File-Date: '):
self.header = line
Expand Down Expand Up @@ -612,6 +612,15 @@ def remove_extra_macrolanguages(self):
for macrolanguage in macrolanguages:
self._add_macrolanguage(biggest_macrolanguage, macrolanguage)

def _get_name_piece(self, subtag):
"""Return the first name of a subtag plus its scope suffix.
Args:
subtag (str): A BCP 47 subtag.
Returns:
The name form of ``subtag``.
"""
return self.names[subtag].split('\n')[0] + self.scopes.get(subtag, '')

def get_name(self, lt):
"""Return the names of the subtags in a language tag.

Expand All @@ -621,13 +630,13 @@ def get_name(self, lt):
Returns:
The name form of ``lt``.
"""
name = self.names[lt.language].split('\n')[0]
name = self._get_name_piece(lt.language)
if lt.script:
name += '; ' + self.names[lt.script.title()].split('\n')[0]
name += '; ' + self._get_name_piece(lt.script.title())
if lt.region:
name += '; ' + self.names[lt.region.upper()].split('\n')[0]
name += '; ' + self._get_name_piece(lt.region.upper())
if lt.variant:
name += '; ' + self.names[lt.variant].split('\n')[0]
name += '; ' + self._get_name_piece(lt.variant)
return name


Expand Down Expand Up @@ -664,22 +673,18 @@ def get_name(self, lt):
ot.remove_language_ot('IRT')
ot.add_language('ga-Latg', 'IRT')

ot.add_language('hy-arevmda', 'HYE')

ot.remove_language_ot('KGE')
ot.add_language('und-Geok', 'KGE')

bluebear94 marked this conversation as resolved.
Show resolved Hide resolved
ot.add_language('guk', 'GUK')
ot.names['GUK'] = 'Gumuz(SIL fonts)'
ot.ranks['GUK'] = ot.ranks['GMZ'] + 1

bcp_47.macrolanguages['id'] = {'in'}

bcp_47.macrolanguages['ijo'] = {'ijc'}

ot.add_language('kht', 'KHN')
ot.names['KHN'] = ot.names['KHT'] + '(Microsoft fonts)'
ot.names['KHT'] = ot.names['KHT'] + '(OpenType spec and SIL fonts)'
ot.ranks['KHN'] = ot.ranks['KHT']
ot.ranks['KHT'] += 1
ot.ranks['KHN'] = ot.ranks['KHT'] + 1

ot.ranks['LCR'] = ot.ranks['MCR'] + 1

Expand All @@ -689,6 +694,11 @@ def get_name(self, lt):
bcp_47.names['mhv'] = 'Arakanese'
bcp_47.scopes['mhv'] = '(retired code)'

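# Downstream change (not part of the upstream HarfBuzz script): the note
# attached to the Thailand Mon entry on Microsoft’s page of language tags
# breaks the parser, so the MONT mapping is replaced by hand with 'mnw'.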
ot.remove_language_ot('MONT')
ot.add_language('mnw', 'MONT')

Comment on lines +699 to +701
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't appear in the original set of HarfBuzz changes. What is it for?

Copy link
Collaborator Author

@bluebear94 bluebear94 Aug 26, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current version of Microsoft’s page of language tags has a note for Thailand Mon that chokes the parser: the note text ends up fused onto the `mnw` subtag (see the key in the `KeyError` below). I had to add this for the script to finish successfully; otherwise, I get this error:

Traceback (most recent call last):
  File "/home/felirovas/moddev/rustybuzz/scripts/gen-tag-table.py", line 1036, in <module>
    print('                // %s' % bcp_47.get_name(lt))
                                    ^^^^^^^^^^^^^^^^^^^
  File "/home/felirovas/moddev/rustybuzz/scripts/gen-tag-table.py", line 633, in get_name
    name = self._get_name_piece(lt.language)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/felirovas/moddev/rustybuzz/scripts/gen-tag-table.py", line 622, in _get_name_piece
    return self.names[subtag].split('\n')[0] + self.scopes.get(subtag, '')
           ~~~~~~~~~~^^^^^^^^
KeyError: 'mnwseehttps://www.unicode.org/l2/l2020/20163'

ot.add_language('no', 'NOR')

ot.add_language('oc-provenc', 'PRO')
Expand All @@ -697,6 +707,7 @@ def get_name(self, lt):
ot.add_language('qub', 'QWH')
ot.add_language('qud', 'QVI')
ot.add_language('qug', 'QVI')
ot.add_language('qul', 'QUH')
ot.add_language('qup', 'QVI')
ot.add_language('qur', 'QWH')
ot.add_language('qus', 'QUH')
Expand Down Expand Up @@ -727,10 +738,6 @@ def get_name(self, lt):
bcp_47.macrolanguages['ro'].remove('mo')
bcp_47.macrolanguages['ro-MD'].add('mo')

ot.add_language('sgw', 'SGW')
ot.names['SGW'] = ot.names['CHG'] + '(SIL fonts)'
ot.ranks['SGW'] = ot.ranks['CHG'] + 1

ot.remove_language_ot('SYRE')
ot.remove_language_ot('SYRJ')
ot.remove_language_ot('SYRN')
Expand All @@ -747,14 +754,17 @@ def get_name(self, lt):
ot.remove_language_ot('ZHH')
ot.remove_language_ot('ZHP')
ot.remove_language_ot('ZHT')
ot.remove_language_ot('ZHTM')
bcp_47.macrolanguages['zh'].remove('lzh')
bcp_47.macrolanguages['zh'].remove('yue')
ot.add_language('zh-Hant-MO', 'ZHH')
ot.add_language('zh-Hant-MO', 'ZHTM')
ot.add_language('zh-Hant-HK', 'ZHH')
ot.add_language('zh-Hans', 'ZHS')
ot.add_language('zh-Hant', 'ZHT')
ot.add_language('zh-HK', 'ZHH')
ot.add_language('zh-MO', 'ZHH')
ot.add_language('zh-MO', 'ZHTM')
ot.add_language('zh-TW', 'ZHT')
ot.add_language('lzh', 'ZHT')
ot.add_language('lzh-Hans', 'ZHS')
Expand Down Expand Up @@ -788,6 +798,7 @@ def rank_delta(bcp_47, ot):
disambiguation = {
'ALT': 'alt',
'ARK': 'rki',
'ATH': 'ath',
'BHI': 'bhb',
'BLN': 'bjt',
'BTI': 'beb',
Expand All @@ -799,6 +810,7 @@ def rank_delta(bcp_47, ot):
'ECR': 'crj',
'HAL': 'cfm',
'HND': 'hnd',
'HYE': 'hyw',
'KIS': 'kqs',
'LRC': 'bqi',
'NDB': 'nd',
Expand All @@ -810,15 +822,23 @@ def rank_delta(bcp_47, ot):
'QVI': 'qvi',
'QWH': 'qwh',
'SIG': 'stv',
'TNE': 'yrk',
'SRB': 'sr',
'ZHH': 'zh-HK',
'ZHS': 'zh-Hans',
'ZHT': 'zh-Hant',
'ZHTM': 'zh-Hant-MO',
}

ot.inherit_from_macrolanguages()
bcp_47.remove_extra_macrolanguages()
ot.inherit_from_macrolanguages()
ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/'
ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max(ot.ranks.values()) + 1
for tricky_ot_tag in filter(lambda tag: re.match('[A-Z]{3}$', tag), ot.names):
possible_bcp_47_tag = tricky_ot_tag.lower()
if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]:
ot.add_language(possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM)
bcp_47.macrolanguages[possible_bcp_47_tag] = set()
ot.sort_languages()

print('// WARNING: this file was generated by ../scripts/gen-tag-table.py')
Expand All @@ -834,6 +854,8 @@ def rank_delta(bcp_47, ot):


def hb_tag(tag):
if tag == DEFAULT_LANGUAGE_SYSTEM:
return 'Tag(0)\t '
return 'Tag::from_bytes(b\"%s%s%s%s\")' % tuple(('%-4s' % tag)[:4])


Expand Down Expand Up @@ -889,14 +911,18 @@ def same_tag(bcp_47_tag, ot_tags):
print(' // ', end='')
bcp_47_name = bcp_47.names.get(language, '')
bcp_47_name_candidates = bcp_47_name.split('\n')
intersection = language_name_intersection(bcp_47_name, ot.names[tag])
ot_name = ot.names[tag]
scope = bcp_47.scopes.get(language, '')
if not intersection:
print('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot.names[tag]))
if tag == DEFAULT_LANGUAGE_SYSTEM:
print(f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper()]}')
else:
name = get_matching_language_name(intersection, bcp_47_name_candidates)
bcp_47.names[language] = name
print('%s%s' % (name if len(name) > len(ot.names[tag]) else ot.names[tag], scope))
intersection = language_name_intersection(bcp_47_name, ot_name)
if not intersection:
print('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name))
else:
name = get_matching_language_name(intersection, bcp_47_name_candidates)
bcp_47.names[language] = name
print('%s%s' % (name if len(name) > len(ot_name) else ot_name, scope))

print('];')
print()
Expand Down Expand Up @@ -988,23 +1014,25 @@ def print_subtag_matches(subtag, new_line):
print(" b'%s' => {" % initial)
for lt, tags in items:
print(' if ', end='')
script = lt.script
region = lt.region
if lt.grandfathered:
print('&language[1..] == "%s" ' % lt.language[1:], end='')
else:
string_literal = lt.language[1:] + '-'
if lt.script:
string_literal += lt.script
lt.script = None
if lt.region:
string_literal += '-' + lt.region
lt.region = None
if script:
string_literal += script
script = None
if region:
string_literal += '-' + region
region = None
if string_literal[-1] == '-':
print('strncmp(&language[1..], "%s", %i)' % (string_literal, len(string_literal)), end='')
else:
print('lang_matches(&language[1..], "%s")' % string_literal, end='')

print_subtag_matches(lt.script, True)
print_subtag_matches(lt.region, True)
print_subtag_matches(script, True)
print_subtag_matches(region, True)
print_subtag_matches(lt.variant, True)
print('{')
print(' // %s' % bcp_47.get_name(lt))
Expand Down
Loading
Loading