# finetune_pretrained.py
import numpy as np
import evaluate
from datasets import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder

from utils.load_datasets import load_MR, load_Semeval2017A

DATASET = 'MR'  # 'MR' or 'Semeval2017A'
# PRETRAINED_MODEL = 'bert-base-cased'

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    # eval_pred is a (logits, labels) pair; argmax over the logits gives the
    # predicted class, and metric.compute returns a dict like {'accuracy': <float>}
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
def tokenize_function(examples):
    if DATASET == 'Semeval2017A':
        # cap sequences at 512 tokens, the maximum input length of these encoders
        return tokenizer(examples["text"], padding="max_length", max_length=512, truncation=True)
    else:
        return tokenizer(examples["text"], padding="max_length", truncation=True)
def prepare_dataset(X, y):
    # wrap the raw texts and integer labels in a Hugging Face Dataset
    return Dataset.from_dict({'text': list(X), 'label': list(y)})
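# A tiny usage sketch for prepare_dataset (toy rows, for illustration only);
# the 'text' and 'label' columns are what tokenize_function and Trainer expect:
#   demo = prepare_dataset(['good movie', 'bad movie'], [1, 0])
#   demo[0]  # -> {'text': 'good movie', 'label': 1}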
if __name__ == '__main__':
    # candidate pretrained checkpoints for each dataset
    if DATASET == "Semeval2017A":
        pretrained_models = ['cardiffnlp/twitter-roberta-base-sentiment',
                             'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis',
                             'Seethal/sentiment_analysis_generic_dataset']
    elif DATASET == "MR":
        pretrained_models = ['siebert/sentiment-roberta-large-english',
                             'textattack/bert-base-uncased-imdb',
                             'textattack/bert-base-uncased-yelp-polarity']
    else:
        raise ValueError("Invalid dataset")

    for PRETRAINED_MODEL in pretrained_models:
        # load the raw data
        if DATASET == "Semeval2017A":
            X_train, y_train, X_test, y_test = load_Semeval2017A()
        else:
            X_train, y_train, X_test, y_test = load_MR()

        # encode string labels as integer ids
        le = LabelEncoder()
        le.fit(y_train)
        y_train = le.transform(y_train)
        y_test = le.transform(y_test)
        n_classes = len(le.classes_)
        # prepare datasets
        train_set = prepare_dataset(X_train, y_train)
        test_set = prepare_dataset(X_test, y_test)

        # define model and tokenizer
        tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
        model = AutoModelForSequenceClassification.from_pretrained(
            PRETRAINED_MODEL, num_labels=n_classes)
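        # Note: if a checkpoint's classification head was trained with a
        # different number of labels than n_classes, from_pretrained raises a
        # size-mismatch error; ignore_mismatched_sizes=True (a standard
        # from_pretrained kwarg) would reinitialize the head instead:
        #   model = AutoModelForSequenceClassification.from_pretrained(
        #       PRETRAINED_MODEL, num_labels=n_classes, ignore_mismatched_sizes=True)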
        # tokenize datasets
        tokenized_train_set = train_set.map(tokenize_function)
        tokenized_test_set = test_set.map(tokenize_function)
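        # Note: map() above tokenizes one example at a time; passing
        # batched=True (a standard datasets.Dataset.map option) hands the
        # tokenizer lists of texts at once and is typically much faster, e.g.:
        #   tokenized_train_set = train_set.map(tokenize_function, batched=True)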
        # TODO: Main-lab-Q7 - remove this section once you are ready to execute on a GPU
        # create a smaller subset of the dataset
        n_samples = 40
        small_train_dataset = tokenized_train_set.shuffle(
            seed=42).select(range(n_samples))
        small_eval_dataset = tokenized_test_set.shuffle(
            seed=42).select(range(n_samples))

        # TODO: Main-lab-Q7 - customize hyperparameters once you are ready to execute on a GPU
        # training setup
        args = TrainingArguments(
            output_dir="output",
            evaluation_strategy="epoch",
            num_train_epochs=5,
            per_device_train_batch_size=8,
        )
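        # Note: recent transformers releases renamed `evaluation_strategy` to
        # `eval_strategy`; if the argument above raises a TypeError, switch to
        # the newer name for your installed version.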
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=small_train_dataset,  # tokenized_train_set,
            eval_dataset=small_eval_dataset,  # tokenized_test_set,
            compute_metrics=compute_metrics,
        )

        # train; note that Trainer.train() returns a TrainOutput of training
        # stats, not the model (the fine-tuned weights live in `model`)
        train_output = trainer.train()
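        # A minimal follow-up sketch (not part of the original skeleton):
        # run a final evaluation pass and save the fine-tuned weights.
        # The save path below is an illustrative choice, not a fixed convention.
        metrics = trainer.evaluate()
        print(PRETRAINED_MODEL, metrics)  # includes 'eval_accuracy' from compute_metrics
        trainer.save_model("output/" + PRETRAINED_MODEL.replace("/", "_"))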