-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
spellcheck_xml.py
79 lines (61 loc) · 3.44 KB
/
spellcheck_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import xml.etree.ElementTree as ET
from xml.dom import minidom
from spellchecker import SpellChecker
import re
import os
import argparse
# List of transliterated foreign words; abbreviations; and unusual words,
# compounds or inflections. None of these should be flagged as misspellings.
# Note: "l8" (that's a lowercase 'L', not the numeral one) is skipped because
# we need to retain that typo in a hidden alternative.
words_to_skip = {
    "euangelion", "theos", "apostolos", "ethnê", "dunamis", "denarii",
    "diabolos", "katharizw", "hina", "thaumazw", "sheol", "logikos",
    "hierateuma", "peritemnw", "abussos", "km", "vv", "etc", "lbs",
    "floodwaters", "firepot", "firepans", "lampstand", "lampstands",
    "slingstones", "breastpiece", "algum", "spiritist", "spiritists",
    "versifications", "fowler", "fowlers", "dreamt", "waterer", "offerers",
    "righteousnesses", "backslidings", "compassions", "fornications",
    "revealer", "desolations", "l8",
}

# A token is either a "word" (one word character followed by any mix of word
# characters, apostrophes and hyphens) or a run of non-word characters
# (spaces, punctuation). Every character belongs to exactly one token, so
# joining the tokens reproduces the input text.
_TOKEN_RE = re.compile(r"\w[\w'-]*|[^\w]+")


def _check_word(word, spell):
    """Return *word* unchanged if it is acceptable, else wrapped in ``***``."""
    lowered = word.lower()
    if lowered in words_to_skip or "ê" in lowered or "ô" in lowered:
        return word
    # Tokens that do not start with a letter (spaces, numbers, punctuation)
    # are not words and are passed through unchanged.
    if not word[0].isalpha():
        return word
    # Don't spellcheck capitalized words, possessives, or hyphenated words.
    if word[0].isupper() or word.endswith(("s'", "'s")) or "-" in word:
        return word
    # The tokenizer glues a closing single quote onto the preceding word
    # ("'blessed'" yields the token "blessed'"); look the word up without it
    # so correctly spelled quoted words are not flagged.
    bare = lowered.rstrip("'")
    if lowered in spell or bare in spell or bare in words_to_skip:
        # Known words are returned exactly as written; only unknown words
        # are ever altered (by being flagged).
        return word
    return f"***{word}***"


def spell_check_text(text, spell):
    """Flag unknown words in *text* by wrapping them in ``***``.

    The text is split into word and non-word tokens. Each lowercase-initial
    word that is not in ``words_to_skip``, is not a possessive, contains no
    hyphen and no 'ê'/'ô', is looked up (lowercased) in *spell*. Words that
    are not found are replaced by ``***word***``; everything else, including
    all whitespace and punctuation, is kept exactly as it was.

    Args:
        text: The string to check.
        spell: Dictionary object supporting ``word in spell`` lookups (this
            file passes a ``spellchecker.SpellChecker`` instance).

    Returns:
        The text with every unknown word wrapped in ``***``.
    """
    return "".join(_check_word(token, spell) for token in _TOKEN_RE.findall(text))
def spell_check_xml(file_path, max_elements=None, output_file=None):
    """Spell-check the text content of an XML file and write the result.

    The file is parsed, and the ``text`` and ``tail`` of each element yielded
    by ``root.iter()`` are passed through ``spell_check_text``. A progress
    percentage is printed while the elements are processed, and the resulting
    tree is serialized as UTF-8 bytes.

    Args:
        file_path: Path of the XML file to read.
        max_elements: If given, only the first ``max_elements`` elements are
            checked; ``None`` checks all of them.
        output_file: Path to write to. If ``None``, ``file_path`` is
            overwritten.
    """
    document = ET.parse(file_path)
    root = document.getroot()
    spell = SpellChecker()

    # A slice bound of None keeps the whole list, so the "no limit" case
    # needs no separate branch.
    selected = list(root.iter())[:max_elements]
    checked_elements = len(selected)

    for position, node in enumerate(selected, start=1):
        for attribute in ("text", "tail"):
            content = getattr(node, attribute)
            if content:
                setattr(node, attribute, spell_check_text(content, spell))

        # Display progress
        progress = position / checked_elements * 100
        print(f"Progress: {progress:.2f}%", end='\r')

    if output_file is None:
        output_file = file_path

    # Serialize before opening the destination so a serialization failure
    # cannot leave a truncated file behind.
    serialized = ET.tostring(root, encoding="utf-8")
    with open(output_file, "wb") as destination:
        destination.write(serialized)

    print(f"\nSpell-checked file saved as {output_file}")
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run the spell check.
    cli = argparse.ArgumentParser(description="Spell check an XML file.")
    cli.add_argument(
        "file_path",
        nargs='?',
        default=os.path.join("Transcelerator", "TxlQuestions.xml"),
        help="Path to the XML file to spell check.",
    )
    cli.add_argument(
        "--max_elements",
        type=int,
        default=None,
        help="Maximum number of elements to check.",
    )
    cli.add_argument(
        "-o",
        "--output_file",
        type=str,
        default=None,
        help="Output file name. If not specified, the input file will be overwritten.",
    )
    options = cli.parse_args()
    spell_check_xml(options.file_path, options.max_elements, options.output_file)