-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
131 lines (116 loc) · 5.22 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from lxml import etree as ET
import csv
def save_list(regestes, csv_file):
"""
:param regestes: List or regestes
:param csv_file: file to save that list
:return: file csv with regeste by row
"""
with open(csv_file, 'w', encoding="utf-8", newline='') as f:
spamwriter = csv.writer(f, delimiter='\t')
for analyse in regestes:
spamwriter.writerow(list(analyse))
def extract_regeste(fichier_cei):
    """Extract the regestes contained in a CEI/XML charter file.

    :param fichier_cei: path to the XML source file (CEI namespace)
    :return: tuple of three dicts keyed by charter id:
             (usable regestes, unusable regestes, issuer text found in markup)
    """
    NAMESPACES = {
        "cei": "http://www.monasterium.net/NS/cei"
    }
    xml_doc = ET.parse(fichier_cei)
    regestes = xml_doc.xpath("//cei:abstract", namespaces=NAMESPACES)
    regestes_plein, regestes_inutilisables, issuer_balised = {}, {}, {}
    for r in regestes:
        balise_id = r.xpath('./parent::cei:chDesc/preceding-sibling::cei:idno', namespaces=NAMESPACES)
        # Prefer the @id attribute of <cei:idno>; fall back to its text content.
        # (renamed from `id`, which shadowed the builtin)
        if balise_id[0].xpath("./@id"):
            charter_id = balise_id[0].xpath("./@id")[0]
        else:
            charter_id = balise_id[0].xpath("normalize-space()")
        issuer = r.xpath('./cei:issuer', namespaces=NAMESPACES)
        if issuer:
            issuer_balised[charter_id] = issuer[0].xpath("normalize-space()")
        else:
            issuer_balised[charter_id] = ""
        # Drop trailing commas/spaces. rstrip also survives an all-separator
        # string, where the original character-by-character while loop raised
        # IndexError after emptying the string.
        issuer_balised[charter_id] = issuer_balised[charter_id].rstrip(", ")
        text = r.xpath("normalize-space()").replace("Régeste", "")
        if text and text[0] in [" ", "."]:
            # Drop the single leading separator left over by the removal above.
            text = text[1:]
        if text and "Chirographe" not in text:
            regestes_plein[charter_id] = text
        else:
            regestes_inutilisables[charter_id] = [r.xpath("normalize-space()"), issuer_balised[charter_id]]
    return regestes_plein, regestes_inutilisables, issuer_balised
import spacy
def cut_first_word_groupe(text, regeste):
    """Return the part of *regeste* that precedes its first verb.

    :param text: spaCy ``Doc`` built from (a cleaned variant of) *regeste*
    :param regeste: original regeste string to cut
    :return: text before the first VERB token, trailing spaces/commas removed

    NOTE(review): if *text* contains no VERB token the index runs past the end
    of the doc — callers are expected to check for a verb beforehand.
    """
    i = 0
    # Advance to the first token tagged VERB; token 0 is skipped on purpose.
    while i == 0 or text[i].pos_ != "VERB":
        # and not (text[i].pos_ == "PUNCT" and text[i + 1].pos_ == "ADP") -> +2 recognized in French, -10 in German
        i += 1
    if text[i].text == ",":  # Avoid cutting on a repeatable character
        i += 1
    # NOTE(review): splits on the verb's surface form — if that exact word also
    # occurs earlier in the regeste, the cut happens at the earlier occurrence.
    premier_groupe_mot = regeste.split(text[i].text)[0]
    # Trim trailing spaces and commas from the extracted prefix.
    while premier_groupe_mot[-1] in [" ", ","]:
        premier_groupe_mot = premier_groupe_mot[:-1]
    return premier_groupe_mot
def nlp_regestes(regestes, language):
    """Tag the regestes with spaCy and keep those that start with an issuer.

    :param regestes: dict of regestes keyed by charter id
    :param language: two-letter code ("fr" or "de") selecting the spaCy model
    :return: tuple ``(regestes_inutilisables, trial_with_part_of_speech)`` —
             unusable regestes, and kept regestes paired with the text that
             precedes their first verb
    """
    model_spacy = language + "_core_news_lg"
    nlp = spacy.load(model_spacy)
    trial_with_part_of_speech, regestes_inutilisables = {}, {}
    for id_regeste in regestes:
        regeste = regeste_original = regestes[id_regeste]
        # Remove every "[...]" editorial insertion (plus the run of spaces just
        # before it) before tagging; the untouched original is kept for output.
        while "[" in regeste:
            crochet, i = "[", 1
            if regeste[0] != crochet:
                # Grow the split marker with the spaces preceding "[" so they
                # are removed along with the bracketed text.
                while regeste.split("[", 1)[0][-i] == " ":
                    crochet = " " + crochet
                    i += 1
            # NOTE(review): assumes every "[" has a matching "]" — an
            # unbalanced bracket would raise IndexError here.
            regeste = regeste.split(crochet, 1)[0] + regeste.split("]", 1)[1]
        text = nlp(regeste)
        if language == "fr":  # Different filters depending on the language
            # French: require a verb, at least one entity, and a first token
            # that is a proper noun or a single character (e.g. an initial).
            if "VERB" in [tok.pos_ for tok in text] \
                    and text.ents and (text[0].pos_ == "PROPN" or len(text[0].text) == 1):
                trial_with_part_of_speech[id_regeste] = [regeste_original,
                                                         cut_first_word_groupe(text, regeste_original)]
            else:
                regestes_inutilisables[id_regeste] = [regeste_original]
        elif language == "de":
            # German: only require a verb and at least one entity.
            if "VERB" in [tok.pos_ for tok in text] and text.ents:
                trial_with_part_of_speech[id_regeste] = [regeste_original,
                                                         cut_first_word_groupe(text, regeste_original)]
            else:
                regestes_inutilisables[id_regeste] = [regeste_original]
    return regestes_inutilisables, trial_with_part_of_speech
from langdetect import detect
import click
@click.command()
@click.argument("path", type=str)
def reco_issuer(path):
    """Extract issuers from the regestes of a CEI file.

    :param path: path for cei source file
    :return: None; writes 2 files, 'results.csv' for extracted issuers and
             'unused.csv' for unusable regests
    """
    result_unused, result_pos = 'unused.csv', 'results.csv'
    list_regestes, no_regeste, balises = extract_regeste(path)
    # Fail with a clear CLI error instead of an opaque IndexError when the
    # file yields no usable regeste.
    if not list_regestes:
        raise click.ClickException("No usable regeste found in " + path)
    # Detect the language from the first regeste; the corpus is assumed to be
    # monolingual.
    language = detect(next(iter(list_regestes.values())))
    unusables_regestes, regestes_pos = nlp_regestes(list_regestes, language)
    for result in regestes_pos:
        balise = balises[result]
        # Compare the extracted pre-verb string with the issuer found in markup.
        yf = "Same" if regestes_pos[result][1] == balise else "Not_same"
        regestes_pos[result] += [balise, yf]
    save_list([["id", "regeste", "issuer"]] +
              [[key] + values for key, values in no_regeste.items()] +
              [[key] + values for key, values in unusables_regestes.items()], result_unused)
    save_list([["id", "regeste", "string_before_first_verb", "issuer_jacky", "is_same"]] +
              [[key] + values for key, values in regestes_pos.items()], result_pos)
if __name__ == '__main__':
    # Click parses the CLI arguments and invokes the command.
    reco_issuer()