-
Notifications
You must be signed in to change notification settings - Fork 0
/
run-pipeline.sh
executable file
·156 lines (118 loc) · 3.82 KB
/
run-pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/bin/bash
#list of paths to genotype array data
INPUT_LIST=$1
#path to .ped file for beagle4 use
PEDIGREE=$2
#path to intermediate and result files
WORKING_DIRECTORY=./inputs
#path to binaries
BIN_DIRECTORY=./bin
#unique identifier for a pipeline run, signature of inputs, the MD5 hash of the input list file
INPUT_MD5=`md5sum ${INPUT_LIST} | awk '{print $1}'`
#INPUT_MD5=`md5 -q ${INPUT_LIST}`
#path to the 2vcf binary
TO_VCF="${BIN_DIRECTORY}/2vcf"
# path to 2vcf reference
REFERENCE="${BIN_DIRECTORY}/2vcf-v2.1.vcf.gz"
# path to beagle4 jar
BEAGLE4_JAR="${BIN_DIRECTORY}/beagle.r1399.jar"
# path to beagle5 jar
BEAGLE5_JAR="${BIN_DIRECTORY}/beagle.24Aug19.3e8.jar"
# path to refined IBD jar
REFINED_IBD_JAR="${BIN_DIRECTORY}/refined-ibd.16May19.ad5.jar"
# path to merged VCF after conversion of inputs
MERGED_VCF="${WORKING_DIRECTORY}/${INPUT_MD5}-merged.vcf.gz"
# path to phased VCF
PHASED_VCF="${WORKING_DIRECTORY}/${INPUT_MD5}-merged-phased"
# path to output VCF of IBD calling
PHASED_IBD_OUT="${WORKING_DIRECTORY}/${INPUT_MD5}-merged-phased-ibd-called"
PHASED_IBD_ESTIMATED="${PHASED_IBD_OUT}-estimated"
# human genetic map
GENETIC_MAP="${BIN_DIRECTORY}/plink.GRCh37.map"
CM_THRESHOLD="1"
RELATEDNESS="${BIN_DIRECTORY}/IBD_relatedness/relatedness_v1.py"
RELATEDNESS_OUTPUT="${WORKING_DIRECTORY}/${INPUT_MD5}-relatedness"
function indexVCF () {
local VCF=$1
tabix ${VCF}
}
function getVCFName () {
local SUBJECT=$1
local SUBJECT_NAME=`basename ${SUBJECT} ".zip"`
VCF_PATH="${WORKING_DIRECTORY}/${SUBJECT_NAME}.vcf.gz"
}
# convert incoming genotype array calls to vcf
function convert2VCF () {
local SUBJECT=$1
getVCFName ${SUBJECT}
echo ${SUBJECT}
local CMD="${TO_VCF} conv 23andme --ref ${REFERENCE} --input ${SUBJECT} --output ${VCF_PATH} --fixAllos"
eval ${CMD}
indexVCF ${VCF_PATH}
}
# Stage 1 - convert input into VCF
function ingestion () {
for SUBJECT in $(cat ${INPUT_LIST})
do
convert2VCF ${SUBJECT}
done
}
# Stage 2 - merge VCFs
function merge () {
local CMD="bcftools merge -O z -o ${MERGED_VCF}"
for SUBJECT in $(cat ${INPUT_LIST})
do
getVCFName ${SUBJECT}
CMD="${CMD} ${VCF_PATH}"
done
eval "${CMD}"
indexVCF ${MERGED_VCF}
}
# Stage 3 - phasing
function phasingB4 () {
local CMD="java -Xmx7g -jar ${BEAGLE4_JAR} nthreads=4 gt=${MERGED_VCF} impute=false gprobs=false ped=${PEDIGREE} ibd=true ibdlod=4.0 out=${PHASED_VCF}"
eval ${CMD}
}
function phasingB5 () {
local CMD="java -Xmx7g -jar ${BEAGLE5_JAR} nthreads=2 gt=${MERGED_VCF} impute=false out=${PHASED_VCF}"
eval ${CMD}
}
function refinedIBD () {
local CMD="java -Xmx7g -jar ${REFINED_IBD_JAR} nthreads=4 gt=${PHASED_VCF}.vcf.gz out=${PHASED_IBD_OUT}"
eval ${CMD}
}
function estimateCM () {
cat ${PHASED_VCF}.hbd | awk '{print $0"\t"($7-$6)/1000000}' > ${PHASED_VCF}.estimated.hbd
cat ${PHASED_VCF}.ibd | awk '{print $0"\t"($7-$6)/1000000}' > ${PHASED_VCF}.estimated.ibd
}
function relatednessB4 () {
local CMD="cat ${PHASED_VCF}.estimated.hbd | python ${RELATEDNESS} ${GENETIC_MAP} ${CM_THRESHOLD} > ${RELATEDNESS_OUTPUT}.ibd.txt"
echo "CMD=${CMD}"
eval ${CMD}
}
function relatednessB5 () {
local CMD="zcat ${PHASED_IBD_OUT}.hbd.gz | python ${RELATEDNESS} ${GENETIC_MAP} ${CM_THRESHOLD} > ${RELATEDNESS_OUTPUT}.hbd.txt"
echo "CMD=${CMD}"
eval ${CMD}
CMD="zcat ${PHASED_IBD_OUT}.ibd.gz | python ${RELATEDNESS} ${GENETIC_MAP} ${CM_THRESHOLD} > ${RELATEDNESS_OUTPUT}.ibd.txt"
echo "CMD=${CMD}"
eval ${CMD}
}
#################
# main pipeline #
#################
echo "inputs/family.ped MD5: ${INPUT_MD5}"
# run stage 1 - convert input to VCF
ingestion
# run stage 2 - merge VCF
merge
# run stage 3 - phasing with beagle 4.1
phasingB4
estimateCM
# run stage 3 - phasing with beagle 5.1 and refinedIBD
#phasingB5
#refinedIBD
# run stage 4 - relatedness
relatednessB4
# run stage 4 - relatedness
#relatednessB5