forked from sajaddarabi/TAPER-EHR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_text.py
44 lines (41 loc) · 1.37 KB
/
process_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import pickle
from nltk.corpus import stopwords
import collections
import string
import os
def preprocess_corpus(corpus, stop_words=None):
    """Turn raw text into newline-separated, stopword-free sentences.

    The corpus is split on "."; each piece becomes one output line built
    from its whitespace-separated tokens, with leading/trailing punctuation
    removed from every token and stopwords dropped. Matching against the
    stopword collection is case-sensitive.

    Args:
        corpus: Raw text to process.
        stop_words: Optional iterable of words to drop. When omitted, NLTK's
            English stopword list is used (requires the NLTK ``stopwords``
            corpus to be available).

    Returns:
        The processed pieces joined with newlines, with surrounding
        whitespace stripped. A piece that ends up with no tokens yields an
        empty line unless it is at the very start or end of the result.
    """
    if stop_words is None:
        sw = set(stopwords.words('english'))
    else:
        sw = set(stop_words)

    lines = []
    for sentence in corpus.split("."):
        kept = []
        for word in sentence.split():
            token = word.strip(string.punctuation)
            # Test the stripped token as well as the raw word so that
            # stopwords touching punctuation (e.g. "the,") are removed, and
            # skip punctuation-only tokens, which would otherwise leave
            # stray double spaces in the output line.
            if token and token not in sw and word not in sw:
                kept.append(token)
        lines.append(" ".join(kept))

    # join() instead of repeated "+=" keeps this linear in the corpus size.
    return "\n".join(lines).strip()
def create_dataset(data, text_type, keys):
    """Concatenate the raw text of one type for the selected keys.

    Args:
        data: Mapping from key to an iterable of records; each record is
            indexed with ``'text_<text_type>_raw'`` and must yield a string.
        text_type: Name inserted into the record field
            ``'text_{}_raw'`` (e.g. ``'discharge'``).
        keys: Iterable of keys of ``data`` to include, in output order.

    Returns:
        All selected texts concatenated in order, or ``""`` when there is
        nothing to concatenate.

    Raises:
        KeyError: If a key is missing from ``data`` or a record lacks the
            requested text field.
    """
    field = 'text_{}_raw'.format(text_type)
    # NOTE(review): texts are joined with no separator, so the last word of
    # one record can fuse with the first word of the next unless the texts
    # end in whitespace or punctuation -- confirm this is intended.
    return "".join(record[field] for k in keys for record in data[k])
if __name__ == '__main__':
    # Script entry point: for each text type, concatenate the raw notes of
    # the keys in train_idx, preprocess them, and write the result to
    # ./data/input_<text_type>.txt.
    PATH = './data'
    PATH_DATA = os.path.join(PATH, 'textcode/biobert_pubmed_raw/')

    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by a trusted step of this pipeline.
    with open(os.path.join(PATH_DATA, 'data.pkl'), 'rb') as f:
        data = pickle.load(f)['data']
    with open(os.path.join(PATH_DATA, 'splits', 'split_0.pkl'), 'rb') as f:
        # The split file unpacks into two index collections; only the first
        # one is used here.
        train_idx, valid_idx = pickle.load(f)

    for text_type in ('discharge', 'rest'):
        text = create_dataset(data, text_type, train_idx)
        text = preprocess_corpus(text)
        out_path = os.path.join(PATH, 'input_{}.txt'.format(text_type))
        # Context manager guarantees the file is closed even on error;
        # write() emits the string in one call instead of per character.
        with open(out_path, 'w', encoding='utf-8') as out_file:
            out_file.write(text)