Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

including kab locale #111

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/corporacreator/preprocessors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .cy import cy
from .de import de
from .ky import ky
from .kab import kab
123 changes: 123 additions & 0 deletions src/corporacreator/preprocessors/kab.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# created by Mestafa Kamal

import string

"""
Keep Upper case
Keep Punctuation

Replace wrong characters
Remove bad spaces

Strip
Invalidate sentences containing characters that are not allowed
"""

# Lowercase letters accepted in a sentence: ASCII a-z followed by the
# additional letters listed below.
allowed = list(string.ascii_lowercase)
allowed.extend("ẓṛṭɛṣḍǧḥɣč")

# Uppercase form of every lowercase letter collected so far, in the same order.
majuscule = [letter.upper() for letter in allowed]

# After this line `allowed` holds all lowercase letters, then all uppercase.
allowed.extend(majuscule)

# Maps wrongly used look-alike characters to the character that should
# appear instead. A key holding several characters lists several wrong
# characters sharing one replacement; the loop that builds `replacements`
# expands such keys character by character.
# Review request: give the Unicode code points in a comment for each entry,
# e.g. "Ğ": "Ǧ", # 011E → 01E6
replacer = {
    "Ğ": "Ǧ",  # 011E → 01E6
    "ğ": "ǧ",  # 011F → 01E7
    "Γ": "Ɣ",  # 0393 → 0194
    "γ": "ɣ",  # 03B3 → 0263
    "Σ": "Ɛ",  # 03A3 → 0190
    "Ԑ": "Ɛ",  # 0510 → 0190
    "εσ": "ɛ",  # 03B5, 03C3 → 025B
    "«»“”": "\"",  # 00AB, 00BB, 201C, 201D → 0022
}

# Non-letter characters (the space included) that a sentence may contain.
# Each character of the string below becomes one list element, in order.
punctuation = list(" -.?,!;_:/(){}[]\"")

# Flattened form of `replacer`: one entry per single wrong character, so a
# key such as "εσ" yields two entries pointing at the same replacement.
# Built with a comprehension so that no loop variable leaks into the module
# namespace (the previous loop variable was named `all`, which hid the
# builtin of the same name for the rest of the module).
replacements = {
    to_replace: replacement
    for wrong_chars, replacement in replacer.items()
    for to_replace in wrong_chars
}

def remplaceSymbols(word):
    """Substitute every character listed in `replacements`.

    Args:
      word (str): Text to fix.

    Returns:
      (str): The text with each key of `replacements` replaced by its value.
    """
    # Substitutions are applied one after another, in dictionary order.
    for wrong_char in replacements:
        word = word.replace(wrong_char, replacements[wrong_char])
    return word

def removeBadSpace(sentence):
    """Remove the spaces that directly touch a hyphen.

    Args:
      sentence (str): Sentence to fix.

    Returns:
      (str): The sentence with no space left immediately before or after
      any "-".
    """
    # One pass is not enough when several spaces surround a hyphen:
    # "a  - b" would become "a -b". Repeat until nothing is left to remove;
    # every pass shortens the string, so the loop always terminates.
    while " -" in sentence or "- " in sentence:
        sentence = sentence.replace(" -", "-").replace("- ", "-")
    return sentence

def replaceTs(word):
    """Rewrite every "ţ" of a word using plain "t".

    At the end of the word, "ţţ" and "ţ" become "t" and "-ţ" becomes "-tt".
    Anywhere else, "ţţ" and "ţ" both become "tt".

    Args:
      word (str): Word to fix.

    Returns:
      (str): The word without any "ţ".
    """
    # (ending, number of trailing characters dropped, text appended).
    # Order matters: the longer ending "ţţ" must be tried before "ţ".
    endings = (
        ("ţţ", 2, "t"),
        ("-ţ", 2, "-tt"),
        ("ţ", 1, "t"),
    )
    for ending, dropped, appended in endings:
        if word.endswith(ending):
            word = word[:-dropped] + appended
            break

    # Whatever "ţ" remains is inside the word; doubled ones are handled first
    # so that "ţţ" yields "tt" rather than "tttt".
    return word.replace("ţţ", "tt").replace("ţ", "tt")

def checkSentence(sentence):
    """Tell whether a sentence is made of permitted characters only.

    Args:
      sentence (str): Sentence to check.

    Returns:
      (bool): False as soon as one character is found in neither `allowed`
      nor `punctuation`, True otherwise (an empty sentence gives True).
    """
    return not any(
        char not in allowed and char not in punctuation for char in sentence
    )

def cleanSentence(sentence):
    """Normalise a sentence and reject it if it still has bad characters.

    The spaces around hyphens are removed, wrong characters are replaced,
    then each space-separated word has its "ţ" rewritten and is stripped.

    Args:
      sentence (str): Sentence to clean.

    Returns:
      (str): The cleaned sentence, or " " when the cleaned sentence fails
      `checkSentence`.
    """
    sentence = remplaceSymbols(removeBadSpace(sentence))

    # Split on a single space rather than on any whitespace, so the spacing
    # between words is reproduced when the words are joined again.
    cleanedWords = [
        replaceTs(word).strip() for word in sentence.strip().split(" ")
    ]
    result = " ".join(cleanedWords)

    if not checkSentence(result):
        # A whitespace-only string is how an invalid sentence is reported.
        return " "

    return result

def kab(client_id, sentence):
    """Clean up the passed sentence by delegating to `cleanSentence`.

    Args:
      client_id (str): Client ID of sentence's speaker; not used here.
      sentence (str): Sentence to be cleaned up.

    Returns:
      (str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid.
    """
    return cleanSentence(sentence)