xinyadu · LouisCastricato · Sep 10, 2020 · Sep 18, 2020
diff --git a/.gitignore b/.gitignore
@@ -133,4 +133,7 @@ dmypy.json
 *_output
 *.o*
 archive
-code/script_trigger_debug.sh
+code/script_trigger_debug.sh
+
+.txt
+proc/FFO/Stories/
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": "/usr/bin/python3.7"
+}
diff --git a/code/run_args_qa_thresh.py b/code/run_args_qa_thresh.py
@@ -87,8 +87,9 @@ def read_ace_examples(input_file, is_training):
     """Read a ACE json file into a list of AceExample."""
     examples = []
     with open(input_file, "r", encoding='utf-8') as f:
-        for line in f:
-            example = json.loads(line)
+        print(input_file)
+        lines = json.load(f)
+        for example in lines:
             sentence, events, s_start = example["sentence"], example["event"], example["s_start"]
             example = AceExample(sentence=sentence, events=events, s_start=s_start)
             examples.append(example)

diff --git a/code/run_trigger_qa.py b/code/run_trigger_qa.py
@@ -27,6 +27,9 @@
 from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
                                                   BertTokenizer,
                                                   whitespace_tokenize)
+from spacy.lang.en import English # updated
+nlp = English()
+nlp.add_pipe(nlp.create_pipe('sentencizer')) # updated
 
 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                     datefmt='%m/%d/%Y %H:%M:%S',
@@ -56,9 +59,10 @@ def create_vocab(self, files_list):
         self.category_to_index["None"] = 0
         self.index_to_category[0] = "None"
         for file in files_list:
+            print(file)
             with open(file) as f:
-                for line in f:
-                    example = json.loads(line)
+                lines = json.load(f)
+                for example in lines:
                     events, sentence = example["event"], example["sentence"] 
                     if len(sentence) > self.max_sent_length: self.max_sent_length = len(sentence)
                     for event in events:
@@ -103,14 +107,95 @@ def __init__(self,
         self.labels = labels
 
 
+def read_arb_examples(nth_query, input_files, tokenizer, category_vocab, is_training):
+    """Turn raw text files into unlabeled trigger-QA features, one per sentence.
+
+    Used when only plain sentences (no gold events) are available. Each file in
+    `input_files` is read as UTF-8, split into sentences by the module-level
+    `nlp` pipeline, and every non-empty sentence is whitespace-tokenized and
+    laid out as `[CLS] query [SEP] sentence [SEP]`, zero-padded to
+    `category_vocab.max_sent_length`. Over-long sentences are truncated to fit.
+
+    Args:
+        nth_query: index into the module-level `candidate_queries`.
+        input_files: iterable of paths to plain-text files.
+        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
+        category_vocab: object whose `max_sent_length` is the padded length.
+        is_training: unused; kept so the signature matches `read_ace_examples`.
+
+    Returns:
+        `(examples, features)`: the whitespace-split sentences (never
+        truncated) and their `InputFeatures`, index-aligned. `sentence_id`
+        counts across all files and `labels` is always an empty list.
+    """
+
+    def first_piece(token):
+        # Keep only the first word piece so every word occupies exactly one
+        # position. A word the tokenizer drops entirely (empty result) maps
+        # to [UNK] instead of raising IndexError.
+        pieces = tokenizer.tokenize(token)
+        return pieces[0] if pieces else "[UNK]"
+
+    max_len = category_vocab.max_sent_length
+    # The query prefix is identical for every sentence, so build it once.
+    query = candidate_queries[nth_query]
+    prefix = ["[CLS]"] + [first_piece(token) for token in query] + ["[SEP]"]
+    # Sentence tokens that still fit after the prefix and the closing [SEP].
+    budget = max(max_len - len(prefix) - 1, 0)
+
+    features = []
+    examples = []
+    for input_path in input_files:
+        with open(input_path, "r", encoding='utf-8') as f:
+            raw_text = f.read()
+
+        # `Span.text` is used instead of `Span.string`, which spaCy 3 removed.
+        for sent in nlp(raw_text).sents:
+            words = sent.text.split()
+            if not words:
+                # Whitespace-only "sentence": nothing to tag.
+                continue
+
+            # Free text can be longer than the vocabulary's maximum length;
+            # truncate instead of tripping the length checks below.
+            body = [first_piece(word) for word in words[:budget]]
+            tokens = prefix + body + ["[SEP]"]
+            pad = max_len - len(tokens)
+
+            input_ids = tokenizer.convert_tokens_to_ids(tokens) + [0] * pad
+            input_mask = [1] * len(tokens) + [0] * pad
+            # Segment 0 = [CLS] + query + [SEP]; segment 1 = sentence + [SEP].
+            segment_ids = [0] * len(prefix) + [1] * (len(body) + 1) + [0] * pad
+            # Only real sentence tokens are flagged; specials and padding are 0.
+            in_sentence = [0] * len(prefix) + [1] * len(body) + [0] * (pad + 1)
+
+            # Internal consistency check: every list has the padded length.
+            assert len(input_ids) == max_len
+            assert len(segment_ids) == max_len
+            assert len(in_sentence) == max_len
+            assert len(input_mask) == max_len
+
+            # The running feature count doubles as the sentence id.
+            features.append(
+                InputFeatures(
+                    sentence_id=len(features),
+                    tokens=tokens,
+                    input_ids=input_ids,
+                    input_mask=input_mask,
+                    segment_ids=segment_ids,
+                    in_sentence=in_sentence,
+                    labels=[]))
+            examples.append(words)
+
+    return examples, features
 def read_ace_examples(nth_query, input_file, tokenizer, category_vocab, is_training):
     """Read an ACE json file, transform to features"""
     features = []
     examples = []
     sentence_id = 0
     with open(input_file, "r", encoding='utf-8') as f:
-        for line in f:
-            example = json.loads(line)
+        lines = json.load(f)
+        for example in lines:
             sentence, events, s_start = example["sentence"], example["event"], example["s_start"]
             offset_category = dict()
             for event in events:
@@ -197,7 +282,35 @@ def read_ace_examples(nth_query, input_file, tokenizer, category_vocab, is_train
 
     return examples, features   
 
+def infer(args, eval_examples, category_vocab, model, device, eval_dataloader):
+    """Predict event triggers for unlabeled sentences.
+
+    Each batch from `eval_dataloader` is `(sentence_id, input_ids, segment_ids,
+    in_sentence, input_mask)`. Returns a dict mapping every sentence id, as a
+    plain `int`, to its `[offset, category]` pairs; offsets count only positions
+    flagged in `in_sentence`. `args` and `eval_examples` are not used.
+    """
+    model.eval()
 
+    pred_triggers = dict()
+    for sentence_id, input_ids, segmend_ids, in_sentence, input_mask in eval_dataloader:
+        input_ids = input_ids.to(device)
+        segmend_ids = segmend_ids.to(device)
+        input_mask = input_mask.to(device)
+        with torch.no_grad():
+            logits = model(input_ids, token_type_ids=segmend_ids, attention_mask=input_mask)
+        # Best category index for every token of every sentence in the batch.
+        batch_tags = logits.argmax(dim=-1).cpu().tolist()
+        for i, in_sent in enumerate(in_sentence.tolist()):
+            # Keep only positions flagged in `in_sentence`, so an offset
+            # indexes the kept positions rather than the padded input.
+            sent_tags = [tag for tag, keep in zip(batch_tags[i], in_sent) if keep]
+            categories = (category_vocab.index_to_category[t] for t in sent_tags)
+            # A tensor key hashes by identity and could never be looked up by
+            # value, so the sentence id is stored as a plain int.
+            pred_triggers[int(sentence_id[i])] = [
+                [offset, cat] for offset, cat in enumerate(categories) if cat != "None"]
+    return pred_triggers
 
 def evaluate(args, eval_examples, category_vocab, model, device, eval_dataloader, pred_only=False):
     # eval_examples, eval_features, na_prob_thresh=1.0, pred_only=False):
@@ -538,6 +651,33 @@ def main(args):
                                         writer.write("%s = %s\n" % (key, str(best_result[key])))
 
             del model
+    if args.do_infer:
+        # Run the fine-tuned trigger model over raw text (no gold annotation).
+        # TODO: take the input files from the command line; hard-coded for now.
+        files = ["proc/FFO/Stories/32_The_Snow-White_Heart.txt"]
+        eval_examples, eval_features = read_arb_examples(
+            nth_query=args.nth_query, input_files=files, tokenizer=tokenizer,
+            category_vocab=category_vocab, is_training=False)
+        all_sentence_id = torch.tensor([f.sentence_id for f in eval_features], dtype=torch.long)
+        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+        all_segmend_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+        all_in_sentence = torch.tensor([f.in_sentence for f in eval_features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+        eval_data = TensorDataset(all_sentence_id, all_input_ids, all_segmend_ids, all_in_sentence, all_input_mask)
+        eval_dataloader = DataLoader(eval_data, batch_size=1)
+
+        model = BertForTriggerClassification.from_pretrained(args.output_dir, num_labels=len(category_vocab.index_to_category))
+        if args.fp16:
+            model.half()
+        model.to(device)
+        preds = infer(args, eval_examples, category_vocab, model, device, eval_dataloader)
+
+        # `preds` maps sentence id -> triggers; iterating the dict would emit only the
+        # ids. NOTE(review): with --do_eval also set, this same file is rewritten later.
+        to_write = [{"sentence_id": int(sid), "triggers": trig} for sid, trig in preds.items()]
+        with open(os.path.join(args.output_dir, "trigger_predictions.json"), "w") as writer:
+            writer.write(json.dumps(to_write, default=int))
+
 
     if args.do_eval:
         if args.eval_test:
@@ -571,8 +711,10 @@ def main(args):
             for key in result:
                 writer.write("%s = %s\n" % (key, str(result[key])))
         with open(os.path.join(args.output_dir, "trigger_predictions.json"), "w") as writer:
+            to_write=[]
             for line in preds:
-                writer.write(json.dumps(line, default=int) + "\n")
+                to_write.append(line)
+            writer.write(json.dumps(to_write, default=int))
 
 if __name__ == "__main__":
         parser = argparse.ArgumentParser()
@@ -586,6 +728,7 @@ def main(args):
                             help="How many times it evaluates on dev set per epoch")
         parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
         parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
+        parser.add_argument("--do_infer", action='store_true', help="Whether to run inference on a set of files given.")
         parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
         parser.add_argument("--eval_test", action='store_true', help='Wehther to run eval on the test set.')
         parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")

diff --git a/code/script_args_qa_thresh.sh b/code/script_args_qa_thresh.sh
@@ -19,10 +19,10 @@ python code/run_args_qa_thresh.py \
   --do_train \
   --do_eval \
   --model bert-base-uncased \
-  --train_file $ACE_DIR/toy.json \
+  --train_file $ACE_DIR/train_convert.json \
   --dev_file $ACE_PRE_DIR/trigger_predictions.json \
   --test_file $ACE_PRE_DIR/trigger_predictions.json \
-  --gold_file $ACE_DIR/toy.json \
+  --gold_file $ACE_DIR/test_convert.json \
   --train_batch_size 8 \
   --eval_batch_size 8  \
   --learning_rate 4e-5 \

diff --git a/code/script_trigger_qa.sh b/code/script_trigger_qa.sh
@@ -8,13 +8,13 @@ echo "                                          query 5 'verb'
 echo "=========================================================================================="
 
 python code/run_trigger_qa.py \
-  --do_train \
+  --do_infer \
   --do_eval \
   --eval_test \
   --model bert-base-uncased \
-  --train_file $ACE_DIR/toy.json \
-  --dev_file $ACE_DIR/toy.json  \
-  --test_file $ACE_DIR/toy.json \
+  --train_file $ACE_DIR/train_convert.json \
+  --dev_file $ACE_DIR/dev_convert.json  \
+  --test_file $ACE_DIR/test_convert.json \
   --train_batch_size 8 \
   --eval_batch_size 8  \
   --eval_per_epoch 20 \
@@ -23,4 +23,4 @@ python code/run_trigger_qa.py \
   --learning_rate 4e-5 \
   --nth_query 5 \
   --warmup_proportion 0.1 \
-
+
diff --git a/proc/data/ace-event/processed-data/json/toy.json b/proc/data/ace-event/processed-data/json/toy.json
@@ -1,2 +1,3 @@
 {"sentence": ["Tom", "visited", "all", "their", "friends", "."], "s_start": 462, "ner": [[466, 466, "PER"]], "relation": [], "event": [[[463, "Contact.Meet"], [466, 466, "Entity"]]]}
-{"sentence": ["Mary", "visited", "all", "her", "friends", "."], "s_start": 462, "ner": [[466, 466, "PER"]], "relation": [], "event": [[[463, "Contact.Meet"], [466, 466, "Entity"]]]}
+{"sentence": ["Mary", "visited", "all", "her", "friends", "."], "s_start": 462, "ner": [[466, 466, "PER"]], "relation": [], "event": [[[463, "Contact.Meet"], [466, 466, "Entity"]]]}
+
diff --git a/proc/scripts/data/ace-event/convert_examples.py b/proc/scripts/data/ace-event/convert_examples.py
@@ -1,13 +1,15 @@
 from os import path
 import json
 import collections
+import sys
 
-output_dir = "./data/ace-event/processed-data/json"
+output_dir = "./data/ace-event/processed-data/default-settings/json"
 for fold in ["train", "dev", "test"]:
     g_convert = open(path.join(output_dir, fold + "_convert.json"), "w")
+    to_write = []
     with open(path.join(output_dir, fold + ".json"), "r") as g:
-        for line in g:
-            line = json.loads(line)
+        lines = json.load(g)
+        for line in lines:
             sentences = line["sentences"]
             ner = line["ner"]
             relations = line["relations"]
@@ -25,6 +27,8 @@
                 sentence_annotated["ner"] = ner
                 sentence_annotated["relation"] = relation
                 sentence_annotated["event"] = event
-
+                
                 # if sentence_annotated["s_start"]>5:
-                g_convert.write(json.dumps(sentence_annotated, default=int) + "\n")
+                to_write.append(sentence_annotated)
+
+        g_convert.write(json.dumps(to_write, default=int))
diff --git a/proc/scripts/data/ace-event/parse_ace_event.py b/proc/scripts/data/ace-event/parse_ace_event.py
@@ -738,15 +738,16 @@ def one_fold(fold, output_dir, heads_only=True, real_entities_only=True, include
     with open(path.join(split_path, fold + ".filelist")) as f:
         for line in f:
             doc_keys.append(line.strip())
-
+    to_file = []
     with open(path.join(output_dir, fold + ".json"), "w") as g:
         for doc_key in doc_keys:
             annotation_path = path.join(doc_path, doc_key + ".apf.xml")
             text_path = path.join(doc_path, doc_key + ".sgm")
             document = Document(annotation_path, text_path, doc_key, fold, heads_only,
                                 real_entities_only, include_pronouns)
             js = document.to_json()
-            g.write(json.dumps(js, default=int, indent = 4) + "\n")
+            to_file.append(js)
+        g.write(json.dumps(to_file, default=int, indent=4))
 
 
 def main():