Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

setup basic FR preprocessing #87

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,4 @@ MANIFEST

# Per-project virtualenvs
.virtualenv/
.python-version
57 changes: 55 additions & 2 deletions src/corporacreator/preprocessors/fr.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,53 @@
import re

from corporacreator.utils import maybe_normalize, FIND_MULTIPLE_SPACES_REG


SPELLED_ACRONYMS = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this exhaustive?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, the goal here is to fix existing issues

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this contain all acronyms in the current set of French texts?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no, the one from clips.tsv for fr language

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this contains all the acronyms in fr for the current clips.tsv, then we're fine.

'ANPE',
'APL',
'CDI',
'CICE',
'DRH',
'EDF',
'HLM',
'IGN',
'INPI',
'ISF',
'IUT',
'PHP',
'PMA',
'PME',
'RSA',
'RSI',
'RTE',
'SNCF',
'TGV',
'TVA',
'UDI',
'UMP',
'USA',
}
REPLACE_SPELLED_ACRONYMS = [
re.compile(r'(^|\s|\'|’)(' + '|'.join(SPELLED_ACRONYMS) + r')(\s|\.|,|\?|!|$)'),
lambda match: f"{match.group(1)}{' '.join(match.group(2))}{match.group(3)}",
]


FR_NORMALIZATIONS = [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this exhaustive?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as before

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fine here too

['Jean-Paul II', 'Jean-Paul deux'],
[re.compile(r'(^|\s)/an(\s|\.|,|\?|!|$)'), r'\1par an\2'],
[re.compile(r'(^|\s)km(\s|\.|,|\?|!|$)'), r'\1 kilomètres \2'],
['%', ' pourcent'],
[re.compile(r'(^|\s)\+(\s|\.|,|\?|!|$)'), r'\1 plus \2'],
nicolaspanel marked this conversation as resolved.
Show resolved Hide resolved
[re.compile(r'(^|\s)m(?:2|²)(\s|\.|,|\?|!|$)'), r'\1mètre carré\2'],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like there are a few leftover digits being searched for here.

[re.compile(r'/\s?m(?:2|²)(\s|\.|,|\?|!|$)'), r' par mètre carré\1'],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like there are a few leftover digits being searched for here.

[re.compile(r'\s?€(.+)'), r' euros\1'],
[re.compile(r'\s?€$'), r' euros'],
[re.compile(r'(^| )(n)(?:°|º|°)(\s)?', flags=re.IGNORECASE), r'\1\2uméro '],
]


def fr(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.

Expand All @@ -8,5 +58,8 @@ def fr(client_id, sentence):
Returns:
(str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid.
"""
# TODO: Clean up fr data
return sentence
text = maybe_normalize(sentence, mapping=FR_NORMALIZATIONS + [REPLACE_SPELLED_ACRONYMS])
# TODO: restore this once we are clear on which punctuation marks should be kept or removed
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right now we should only remove punctuation used incorrectly.

For example in the following sentence "!" is used incorrectly and should be removed...

Hello! world.

otherwise we should keep all punctuation.

So I'd suggest removed this commented out code.

# text = FIND_PUNCTUATIONS_REG.sub(' ', text)
text = FIND_MULTIPLE_SPACES_REG.sub(' ', text)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has no effect as multiple white spaces are already removed here. So it seems like it should be removed.

return text.strip()
18 changes: 18 additions & 0 deletions src/corporacreator/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import re
from typing import Pattern


FIND_MULTIPLE_SPACES_REG = re.compile(r'\s{2,}')
FIND_PUNCTUATIONS_REG = re.compile(r"[/°\-,;!?.()\[\]*…—«»]")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In parallel with removal of the above commented out code, I'd ask that this is also removed as it's not used.



def maybe_normalize(value: str, mapping):
for norm in mapping:
if isinstance(norm[0], str):
value = value.replace(norm[0], norm[1])
elif isinstance(norm[0], Pattern):
value = norm[0].sub(norm[1], value)
else:
raise ValueError(f'expect first parameter to be a string or a regex, not {norm[0]}')

return value
38 changes: 38 additions & 0 deletions tests/test_preprocessors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pytest
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍


from corporacreator import preprocessors


@pytest.mark.parametrize('locale, client_id, sentence, expected', [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd suggest removing tests that no longer make sense in like of digits being banned. For example

('fr', '*', "donc, ce sera 299 € + 99 €", "donc, ce sera deux cent quatre-vingt-dix-neuf euros plus quatre-vingt-dix-neuf euros"),

Some of the tests here, for example

('fr', '*', "Jean-Paul II.", "Jean-Paul deux.")

have nothing to do with digits and can actually be run independent of this comment.

('fr', '*', 'Faisons donc attention à utiliser les bons mots.', 'Faisons donc attention à utiliser les bons mots.'),
('fr', '*', "bah 98%", "bah quatre-vingt-dix-huit pourcent"),
('fr', '*', "prix au m2", "prix au mètre carré"),
('fr', '*', "prix au m²", "prix au mètre carré"),
('fr', '*', "prix /m²", "prix par mètre carré"),
('fr', '*', "10 m²", "dix mètre carré"),
('fr', '*', "2éme page", "deuxième page"),
('fr', '*', "donc, ce sera 299 € + 99 €", "donc, ce sera deux cent quatre-vingt-dix-neuf euros plus quatre-vingt-dix-neuf euros"),
('fr', '*', "ok pour 18h", "ok pour dix-huit heure"),
('fr', '*', '2 0 200', "deux zéro deux cents"),
('fr', '*', 'rue Coq-Héron au nº13', "rue Coq-Héron au numéro treize"),
('fr', '*', "En comparaison, la Lune orbite en moyenne à 390 000 km de la Terre", "En comparaison, la Lune orbite en moyenne à trois cent quatre-vingt-dix mille kilomètres de la Terre"),
('fr', '*', "le vendredi 13 mars à 11 h 10.", "le vendredi treize mars à onze heure dix."),
('fr', '*', "le 13 mars à 11 h.", "le treize mars à onze heure ."),
('fr', '*', "Demain%2C il n’y aura plus d’entreprises", "Demain, il n’y aura plus d’entreprises"),
('fr', '*', "À la 5è rue", "À la cinquième rue"),
('fr', '*', "Telle est la raison d’être du CICE.", "Telle est la raison d’être du C I C E."),
('fr', '*', "Tout le monde titrait sur « la bataille de l’ISF ». ", "Tout le monde titrait sur « la bataille de l’I S F »."),
('fr', '*', "Nous parlons de CDI saisonnier", "Nous parlons de C D I saisonnier"),
('fr', '*', "Nous nous accordons tous à dire que dix-huit milliards d’A P L, ce n’est pas tenable.", "Nous nous accordons tous à dire que dix-huit milliards d’A P L, ce n’est pas tenable."),
('fr', '*', "Quelques-uns seulement bénéficient du RSA.", "Quelques-uns seulement bénéficient du R S A."),
('fr', '*', "Jean-Paul II.", "Jean-Paul deux."),
('fr', '*', "nº deux", "numéro deux"),
('fr', '*', "Une capacité qui pourrait être équivalente à une production de 120 000T de poudre de lait /an.", "Une capacité qui pourrait être équivalente à une production de cent vingt mille tonnes de poudre de lait par an."),
('fr', '*', "30 euros/m2", "trente euros par mètre carré"),
])
def test_preprocessor(locale, client_id, sentence, expected):
preprocessor = getattr(preprocessors, locale.replace('-', ''))
is_valid, sentence = preprocessors.common(sentence)
if not is_valid:
pytest.skip('not supported right now (see https://github.com/mozilla/CorporaCreator/pull/87#issuecomment-466296310 for more info)')
assert expected == preprocessor(client_id, sentence)