# =========================
# ==== Helper Methods =====
import re
import numpy as np
from nltk import ngrams

# Clean/Normalize Arabic Text
def clean_str(text):
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى","\\",'\n', '\t','"','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ']

    # remove tashkeel (diacritics)
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel, "", text)

    # remove elongation (collapse any run of a repeated character to two)
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')

    # apply the character/word substitutions defined above
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])

    # trim surrounding whitespace
    text = text.strip()
    return text
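
# Illustrative examples of clean_str (derived from the substitutions above;
# outputs shown for orientation, not a formal spec):
#   clean_str("أحمد")  -> "احمد"   # hamza forms normalized to bare alef
#   clean_str("كتابٌ") -> "كتاب"   # tashkeel (diacritics) stripped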

def get_vec(n_model, dim, token):
    # Return the embedding for `token`; if the joined n-gram token is not in the
    # model vocabulary, fall back to averaging the vectors of its "_"-separated parts.
    vec = np.zeros(dim)
    if token not in n_model.wv:
        _count = 0
        for w in token.split("_"):
            if w in n_model.wv:
                _count += 1
                vec += n_model.wv[w]
        if _count > 0:
            vec = vec / _count
    else:
        vec = n_model.wv[token]
    return vec

def calc_vec(pos_tokens, neg_tokens, n_model, dim):
    # Sum the vectors of the positive tokens and subtract those of the negative tokens.
    vec = np.zeros(dim)
    for p in pos_tokens:
        vec += get_vec(n_model, dim, p)
    for n in neg_tokens:
        vec -= get_vec(n_model, dim, n)
    return vec
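
# Illustrative use of calc_vec for word-analogy style queries (assumes `model`
# is a loaded gensim Word2Vec/KeyedVectors-style object and that the tokens
# below exist in its vocabulary; both are placeholders):
#
#   query_vec = calc_vec(pos_tokens=["ملك", "امرأة"], neg_tokens=["رجل"],
#                        n_model=model, dim=model.vector_size)
#   model.wv.similar_by_vector(query_vec, topn=10)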

## -- Retrieve all n-grams of a text for every n from 2 up to nrange
def get_all_ngrams(text, nrange=3):
    text = re.sub(r'[\,\.\;\(\)\[\]\_\+\#\@\!\?\؟\^]', ' ', text)
    tokens = [token for token in text.split(" ") if token.strip() != ""]
    ngs = []
    for n in range(2, nrange + 1):
        ngs += [ng for ng in ngrams(tokens, n)]
    return ["_".join(ng) for ng in ngs if len(ng) > 0]

## -- Retrieve all n-grams of a text for a specific n
def get_ngrams(text, n=2):
    text = re.sub(r'[\,\.\;\(\)\[\]\_\+\#\@\!\?\؟\^]', ' ', text)
    tokens = [token for token in text.split(" ") if token.strip() != ""]
    ngs = [ng for ng in ngrams(tokens, n)]
    return ["_".join(ng) for ng in ngs if len(ng) > 0]

## -- Keep only the tokens that exist in the model's vocabulary
def get_existed_tokens(tokens, n_model):
    return [tok for tok in tokens if tok in n_model.wv]
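
# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the original helpers).
# It assumes a gensim Word2Vec model (e.g. AraVec-style Arabic embeddings)
# saved at the hypothetical path below; the model exposes its vectors through
# `.wv`, which is what the helpers above expect.
if __name__ == "__main__":
    from gensim.models import Word2Vec

    model = Word2Vec.load("path/to/arabic_word2vec.model")  # hypothetical path
    dim = model.vector_size

    text = clean_str("أهلاً وسهلاً بالعالم")              # normalize the raw Arabic text
    grams = get_all_ngrams(text, nrange=3)                # 2- and 3-grams joined with "_"
    tokens = get_existed_tokens(text.split(), model)      # keep in-vocabulary tokens only

    vec = calc_vec(pos_tokens=tokens, neg_tokens=[], n_model=model, dim=dim)
    print(model.wv.similar_by_vector(vec, topn=10))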