-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_pipeline.sh
146 lines (126 loc) · 4.32 KB
/
run_pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env bash
#
# Usage: run_pipeline.sh PARAMS_FILE
#
# PARAMS_FILE is a shell fragment that is sourced into this script. The setup
# below reads CONDA_PATH, CDSE_COVID_ENV_NAME and CHECKPOINT_DIR from it; the
# later stages read further variables defined there.

# Abort on the first failing command, including a failure inside a pipeline.
set -e
set -o pipefail

# Load params
if [[ -z "${1:-}" ]]; then
  printf 'Usage: %s PARAMS_FILE\n' "${0##*/}" >&2
  exit 2
fi
PARAMS=$1
. "$PARAMS"

# Fail early, with a readable message, if the params file left out a value
# that the setup below depends on.
: "${CONDA_PATH:?CONDA_PATH must be set in $PARAMS}"
: "${CDSE_COVID_ENV_NAME:?CDSE_COVID_ENV_NAME must be set in $PARAMS}"
: "${CHECKPOINT_DIR:?CHECKPOINT_DIR must be set in $PARAMS}"

# Setup
# Quoted so that a path containing spaces or glob characters is sourced as-is.
source "$CONDA_PATH"
conda activate "$CDSE_COVID_ENV_NAME"
# Each stage drops a marker file here after it succeeds.
mkdir -p "$CHECKPOINT_DIR"
# Preprocessing -- runs only while the spacified_ckpt marker is absent.
if ! [[ -e "$CHECKPOINT_DIR/spacified_ckpt" ]]; then
  printf '%s\n' "Starting preprocessing..."
  mkdir -p "$SPACIFIED_OUTPUT"
  spacify_cmd=(
    python "$PROJECT_DIR/cdse_covid/pegasus_pipeline/ingesters/aida_txt_ingester.py"
    --corpus "$CORPUS_PATH"
    --output "$SPACIFIED_OUTPUT"
    --spacy-model "$SPACY_MODEL_PATH"
  )
  "${spacify_cmd[@]}"
  # Marker is written only after the ingester exits successfully (set -e).
  touch "$CHECKPOINT_DIR/spacified_ckpt"
  printf '%s\n' "Spacified files are now in $SPACIFIED_OUTPUT"
fi
# EDL ingestion -- runs only while the edl_ckpt marker is absent.
if ! [[ -e "$CHECKPOINT_DIR/edl_ckpt" ]]; then
  printf '%s\n' "Starting EDL ingestion..."
  mkdir -p "$EDL_DIR"
  edl_ingester="$PROJECT_DIR/cdse_covid/pegasus_pipeline/ingesters/edl_output_ingester.py"
  python "$edl_ingester" --edl-output "$EDL_FINAL" --output "$EDL_MAPPING_FILE"
  # Marker is written only after the ingester exits successfully (set -e).
  touch "$CHECKPOINT_DIR/edl_ckpt"
  printf '%s\n' "Ingested EDL is now saved to $EDL_MAPPING_FILE"
fi
# AMR-all
# The environment switch happens whether or not the stage itself is skipped.
conda activate transition-amr-parser
if ! [[ -e "$CHECKPOINT_DIR/amr_all_ckpt" ]]; then
  printf '%s\n' "Starting AMR parsing over all sentences..."
  mkdir -p "$AMR_All_OUTPUT"
  amr_all_cmd=(
    python "$PROJECT_DIR/cdse_covid/pegasus_pipeline/run_amr_parsing_all.py"
    --corpus "$CORPUS_PATH"
    --output "$AMR_All_OUTPUT"
    --amr-parser-model "$PROJECT_DIR/../transition-amr-parser"
    --max-tokens "$MAX_TOKENS"
  )
  "${amr_all_cmd[@]}"
  # Marker is written only after the parser exits successfully (set -e).
  touch "$CHECKPOINT_DIR/amr_all_ckpt"
  printf '%s\n' "AMR graphs are now in $AMR_All_OUTPUT"
fi
# Claim detection
# Switch to the environment named by CDSE_COVID_ENV_NAME. The expansion is
# quoted so the name is passed to conda as exactly one argument, matching the
# activation done during setup.
conda activate "$CDSE_COVID_ENV_NAME"
# Skip the stage when its marker file from an earlier run is still present.
if [[ ! -e "$CHECKPOINT_DIR"/claims_ckpt ]]; then
  echo "Starting claims detection..."
  mkdir -p "$CLAIMS_OUTPUT"
  python "$PROJECT_DIR"/cdse_covid/claim_detection/run_claim_detection.py \
    --input "$SPACIFIED_OUTPUT" \
    --patterns "$PATTERNS_FILE" \
    --out "$CLAIMS_OUTPUT" \
    --spacy-model "$SPACY_MODEL_PATH"
  # Reached only if the Python step succeeded, because of set -e.
  touch "$CHECKPOINT_DIR"/claims_ckpt
  echo "Claims are now in $CLAIMS_OUTPUT"
fi
# AMR-claims
# The environment switch happens whether or not the stage itself is skipped.
conda activate transition-amr-parser
if ! [[ -e "$CHECKPOINT_DIR/amr_ckpt" ]]; then
  printf '%s\n' "Starting AMR parsing on claims..."
  mkdir -p "$AMR_OUTPUT"
  amr_claims_cmd=(
    python "$PROJECT_DIR/cdse_covid/semantic_extraction/run_amr_parsing.py"
    --input "$CLAIMS_OUTPUT"
    --output "$AMR_OUTPUT"
    --amr-parser-model "$PROJECT_DIR/../transition-amr-parser"
    --state-dict "$STATE_DICT"
    --max-tokens "$MAX_TOKENS"
    --domain "$DOMAIN"
    --device "$DEVICE"
  )
  "${amr_claims_cmd[@]}"
  # Marker is written only after the parser exits successfully (set -e).
  touch "$CHECKPOINT_DIR/amr_ckpt"
  printf '%s\n' "Claim data from AMR is now in $AMR_OUTPUT"
fi
# SRL
# Switch to the environment named by CDSE_COVID_ENV_NAME. The expansion is
# quoted so the name is passed to conda as exactly one argument, matching the
# activation done during setup.
conda activate "$CDSE_COVID_ENV_NAME"
# Skip the stage when its marker file from an earlier run is still present.
if [[ ! -e "$CHECKPOINT_DIR"/srl_ckpt ]]; then
  echo "Starting SRL..."
  mkdir -p "$SRL_OUTPUT"
  python "$PROJECT_DIR"/cdse_covid/semantic_extraction/run_srl.py \
    --input "$AMR_OUTPUT" \
    --output "$SRL_OUTPUT" \
    --spacy-model "$SPACY_MODEL_PATH"
  # Reached only if the Python step succeeded, because of set -e.
  touch "$CHECKPOINT_DIR"/srl_ckpt
  echo "Claim data from SRL is now in $SRL_OUTPUT"
fi
# Wikidata linking
# The environment switch happens whether or not the stage itself is skipped.
conda activate transition-amr-parser
if ! [[ -e "$CHECKPOINT_DIR/wikidata_ckpt" ]]; then
  printf '%s\n' "Starting Wikidata linking..."
  mkdir -p "$WIKIDATA_OUTPUT"
  wikidata_cmd=(
    python "$PROJECT_DIR/cdse_covid/semantic_extraction/run_wikidata_linking.py"
    --claim-input "$CLAIMS_OUTPUT"
    --amr-input "$AMR_OUTPUT"
    --srl-input "$SRL_OUTPUT"
    --state-dict "$STATE_DICT"
    --output "$WIKIDATA_OUTPUT"
    --device "$DEVICE"
  )
  "${wikidata_cmd[@]}"
  # Marker is written only after the linker exits successfully (set -e).
  touch "$CHECKPOINT_DIR/wikidata_ckpt"
  printf '%s\n' "Output from Wikidata linking is now in $WIKIDATA_OUTPUT"
fi
# Entity unification
# Switch to the environment named by CDSE_COVID_ENV_NAME. The expansion is
# quoted so the name is passed to conda as exactly one argument, matching the
# activation done during setup. This activation sits outside the checkpoint
# guard, so the environment is selected even when the stage is skipped.
conda activate "$CDSE_COVID_ENV_NAME"
# Skip the stage when its marker file from an earlier run is still present.
if [[ ! -e "$CHECKPOINT_DIR"/entity_ckpt ]]; then
  echo "Starting entity unification..."
  mkdir -p "$ENTITY_OUTPUT"
  python "$PROJECT_DIR"/cdse_covid/semantic_extraction/run_entity_merging.py \
    --edl "$EDL_MAPPING_FILE" \
    --qnode-freebase "$QNODE_FREEBASE" \
    --freebase-to-qnodes "$FREEBASE_TO_QNODES" \
    --claims "$WIKIDATA_OUTPUT" \
    --output "$ENTITY_OUTPUT" \
    --include-contains
  # Reached only if the Python step succeeded, because of set -e.
  touch "$CHECKPOINT_DIR"/entity_ckpt
  echo "Entity output is now in $ENTITY_OUTPUT"
fi
# Postprocessing -- has no checkpoint guard, so the merge runs on every invocation.
printf '%s\n' "Merging output..."
merge_cmd=(python "$PROJECT_DIR/cdse_covid/pegasus_pipeline/merge.py")
merge_cmd+=(--input "$ENTITY_OUTPUT" --output "$FINAL_OUTPUT_FILE")
"${merge_cmd[@]}"
printf '%s\n' "Final output has been saved to $FINAL_OUTPUT_FILE"
# Finished; remove checkpoints
# Refuse to continue with an empty or unset CHECKPOINT_DIR: the glob below
# would otherwise expand to /* and target files in the filesystem root.
: "${CHECKPOINT_DIR:?CHECKPOINT_DIR must be set before removing checkpoints}"
for f in "$CHECKPOINT_DIR"/*; do
  # When nothing matches, the pattern is left as a literal string; skip it so
  # an empty directory does not make rm fail and abort the script under set -e.
  [[ -e "$f" || -L "$f" ]] || continue
  echo "Removing checkpoint file $f"
  # "--" keeps a name that begins with "-" from being parsed as an option.
  rm -- "$f"
done