-
Notifications
You must be signed in to change notification settings - Fork 31
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
setup basic FR preprocessing #87
base: master
Are you sure you want to change the base?
Changes from all commits
3933cd0
2cd6595
f4156ff
600715d
7c34361
40fe406
fe6d5b9
934ec38
5b1a6fa
168a555
7f37c78
5090d1c
149e960
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,3 +47,4 @@ MANIFEST | |
|
||
# Per-project virtualenvs | ||
.virtualenv/ | ||
.python-version |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,53 @@ | ||
import re | ||
|
||
from corporacreator.utils import maybe_normalize, FIND_MULTIPLE_SPACES_REG | ||
|
||
|
||
SPELLED_ACRONYMS = { | ||
'ANPE', | ||
'APL', | ||
'CDI', | ||
'CICE', | ||
'DRH', | ||
'EDF', | ||
'HLM', | ||
'IGN', | ||
'INPI', | ||
'ISF', | ||
'IUT', | ||
'PHP', | ||
'PMA', | ||
'PME', | ||
'RSA', | ||
'RSI', | ||
'RTE', | ||
'SNCF', | ||
'TGV', | ||
'TVA', | ||
'UDI', | ||
'UMP', | ||
'USA', | ||
} | ||
REPLACE_SPELLED_ACRONYMS = [ | ||
re.compile(r'(^|\s|\'|’)(' + '|'.join(SPELLED_ACRONYMS) + r')(\s|\.|,|\?|!|$)'), | ||
lambda match: f"{match.group(1)}{' '.join(match.group(2))}{match.group(3)}", | ||
] | ||
|
||
|
||
FR_NORMALIZATIONS = [ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this exhaustive? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same as before There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fine here too |
||
['Jean-Paul II', 'Jean-Paul deux'], | ||
[re.compile(r'(^|\s)/an(\s|\.|,|\?|!|$)'), r'\1par an\2'], | ||
[re.compile(r'(^|\s)km(\s|\.|,|\?|!|$)'), r'\1 kilomètres \2'], | ||
['%', ' pourcent'], | ||
[re.compile(r'(^|\s)\+(\s|\.|,|\?|!|$)'), r'\1 plus \2'], | ||
nicolaspanel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
[re.compile(r'(^|\s)m(?:2|²)(\s|\.|,|\?|!|$)'), r'\1mètre carré\2'], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like there are a few leftover digits being searched for here. |
||
[re.compile(r'/\s?m(?:2|²)(\s|\.|,|\?|!|$)'), r' par mètre carré\1'], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like there are a few leftover digits being searched for here. |
||
[re.compile(r'\s?€(.+)'), r' euros\1'], | ||
[re.compile(r'\s?€$'), r' euros'], | ||
[re.compile(r'(^| )(n)(?:°|º|°)(\s)?', flags=re.IGNORECASE), r'\1\2uméro '], | ||
] | ||
|
||
|
||
def fr(client_id, sentence): | ||
"""Cleans up the passed sentence, removing or reformatting invalid data. | ||
|
||
|
@@ -8,5 +58,8 @@ def fr(client_id, sentence): | |
Returns: | ||
(str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid. | ||
""" | ||
# TODO: Clean up fr data | ||
return sentence | ||
text = maybe_normalize(sentence, mapping=FR_NORMALIZATIONS + [REPLACE_SPELLED_ACRONYMS]) | ||
# TODO: restore this once we are clear on which punctuation marks should be kept or removed | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right now we should only remove punctuation used incorrectly. For example in the following sentence "!" is used incorrectly and should be removed...
otherwise we should keep all punctuation. So I'd suggest removed this commented out code. |
||
# text = FIND_PUNCTUATIONS_REG.sub(' ', text) | ||
text = FIND_MULTIPLE_SPACES_REG.sub(' ', text) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This has no effect as multiple white spaces are already removed here. So it seems like it should be removed. |
||
return text.strip() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import re | ||
from typing import Pattern | ||
|
||
|
||
FIND_MULTIPLE_SPACES_REG = re.compile(r'\s{2,}') | ||
FIND_PUNCTUATIONS_REG = re.compile(r"[/°\-,;!?.()\[\]*…—«»]") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In parallel with removal of the above commented out code, I'd ask that this is also removed as it's not used. |
||
|
||
|
||
def maybe_normalize(value: str, mapping): | ||
for norm in mapping: | ||
if isinstance(norm[0], str): | ||
value = value.replace(norm[0], norm[1]) | ||
elif isinstance(norm[0], Pattern): | ||
value = norm[0].sub(norm[1], value) | ||
else: | ||
raise ValueError(f'expect first parameter to be a string or a regex, not {norm[0]}') | ||
|
||
return value |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import pytest | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
|
||
from corporacreator import preprocessors | ||
|
||
|
||
@pytest.mark.parametrize('locale, client_id, sentence, expected', [ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd suggest removing tests that no longer make sense in like of digits being banned. For example ('fr', '*', "donc, ce sera 299 € + 99 €", "donc, ce sera deux cent quatre-vingt-dix-neuf euros plus quatre-vingt-dix-neuf euros"), Some of the tests here, for example ('fr', '*', "Jean-Paul II.", "Jean-Paul deux.") have nothing to do with digits and can actually be run independent of this comment. |
||
('fr', '*', 'Faisons donc attention à utiliser les bons mots.', 'Faisons donc attention à utiliser les bons mots.'), | ||
('fr', '*', "bah 98%", "bah quatre-vingt-dix-huit pourcent"), | ||
('fr', '*', "prix au m2", "prix au mètre carré"), | ||
('fr', '*', "prix au m²", "prix au mètre carré"), | ||
('fr', '*', "prix /m²", "prix par mètre carré"), | ||
('fr', '*', "10 m²", "dix mètre carré"), | ||
('fr', '*', "2éme page", "deuxième page"), | ||
('fr', '*', "donc, ce sera 299 € + 99 €", "donc, ce sera deux cent quatre-vingt-dix-neuf euros plus quatre-vingt-dix-neuf euros"), | ||
('fr', '*', "ok pour 18h", "ok pour dix-huit heure"), | ||
('fr', '*', '2 0 200', "deux zéro deux cents"), | ||
('fr', '*', 'rue Coq-Héron au nº13', "rue Coq-Héron au numéro treize"), | ||
('fr', '*', "En comparaison, la Lune orbite en moyenne à 390 000 km de la Terre", "En comparaison, la Lune orbite en moyenne à trois cent quatre-vingt-dix mille kilomètres de la Terre"), | ||
('fr', '*', "le vendredi 13 mars à 11 h 10.", "le vendredi treize mars à onze heure dix."), | ||
('fr', '*', "le 13 mars à 11 h.", "le treize mars à onze heure ."), | ||
('fr', '*', "Demain%2C il n’y aura plus d’entreprises", "Demain, il n’y aura plus d’entreprises"), | ||
('fr', '*', "À la 5è rue", "À la cinquième rue"), | ||
('fr', '*', "Telle est la raison d’être du CICE.", "Telle est la raison d’être du C I C E."), | ||
('fr', '*', "Tout le monde titrait sur « la bataille de l’ISF ». ", "Tout le monde titrait sur « la bataille de l’I S F »."), | ||
('fr', '*', "Nous parlons de CDI saisonnier", "Nous parlons de C D I saisonnier"), | ||
('fr', '*', "Nous nous accordons tous à dire que dix-huit milliards d’A P L, ce n’est pas tenable.", "Nous nous accordons tous à dire que dix-huit milliards d’A P L, ce n’est pas tenable."), | ||
('fr', '*', "Quelques-uns seulement bénéficient du RSA.", "Quelques-uns seulement bénéficient du R S A."), | ||
('fr', '*', "Jean-Paul II.", "Jean-Paul deux."), | ||
('fr', '*', "nº deux", "numéro deux"), | ||
('fr', '*', "Une capacité qui pourrait être équivalente à une production de 120 000T de poudre de lait /an.", "Une capacité qui pourrait être équivalente à une production de cent vingt mille tonnes de poudre de lait par an."), | ||
('fr', '*', "30 euros/m2", "trente euros par mètre carré"), | ||
]) | ||
def test_preprocessor(locale, client_id, sentence, expected): | ||
preprocessor = getattr(preprocessors, locale.replace('-', '')) | ||
is_valid, sentence = preprocessors.common(sentence) | ||
if not is_valid: | ||
pytest.skip('not supported right now (see https://github.com/mozilla/CorporaCreator/pull/87#issuecomment-466296310 for more info)') | ||
assert expected == preprocessor(client_id, sentence) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this exhaustive?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, the goal here is to fix existing issues
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this contain all acronyms in the current set of French texts?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no, the one from clips.tsv for
fr
languageThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this contains all the acronyms in fr for the current clips.tsv, then we're fine.