-
Notifications
You must be signed in to change notification settings - Fork 0
/
Snakefile
executable file
·156 lines (123 loc) · 7.5 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import pandas as pd
import os
sys.stderr.write("\n############################################################ \n\n")
sys.stderr.write(" ++++++ \n")
sys.stderr.write(" ++++++++++++ \n")
sys.stderr.write(" ++++++++++++++++++ \n")
sys.stderr.write(" +++++++++++++++++++++++++ \n")
sys.stderr.write(" ++++++++++++++++++++++++++++++ \n")
sys.stderr.write(" ++++| _ \(_)/ ____| __ \++++ \n")
sys.stderr.write(" ++++| |+) |_| |++++| |++) |+++ \n")
sys.stderr.write(" ++++| _ <| | |+|_ | _ /++++ \n")
sys.stderr.write(" ++++| |+) | | |++| | |+\ \++++ \n")
sys.stderr.write(" ++++|____/|_|\_____|_|++\_\+++ \n")
sys.stderr.write(" ++++++++++++++++++++++++++++++ \n")
sys.stderr.write(" +++++++++++++++++++++++++ \n")
sys.stderr.write(" +++++++++++++++++++ \n")
sys.stderr.write(" +++++++++++++ \n")
sys.stderr.write(" +++++++ \n")
sys.stderr.write("\n Editing RNA pipeline \n")
sys.stderr.write("\nFor any question, sent an email to [email protected]")
sys.stderr.write("\n############################################################ \n\n")
### parameters ###################################################################################################################################
#pipeline directory
PIPELINE_DIR = workflow.snakefile
PIPELINE_DIR = PIPELINE_DIR.replace("/Snakefile", "")
sys.stderr.write("\n################### Checking Config and Design File ###################\n\n")
#output directory
if "output_dir" in config:
OUTPUT_DIR = config["output_dir"]
sys.stderr.write("Output directory set to " + OUTPUT_DIR + ".\n")
else:
OUTPUT_DIR = os.getcwd()
sys.stderr.write("No 'output_dir' found in config file, output directory set to " + OUTPUT_DIR + ".\n")
#design file
if "design" in config :
if os.path.isfile(config["design"]):
sys.stderr.write("Design file found: " + config["design"] + ".\n")
else:
sys.exit("Error: 'design' file not found at the given path. Check your configfile.\n")
else:
sys.exit("Error: no 'design' found in config file. Check your configfile.\n")
#reference
if "reference" in config and config["reference"] in ["hg19", "hg38", "mm10", "mm9"]:
sys.stderr.write("Reference set to: " + config["reference"] + ".\n")
else:
sys.exit("Error in 'reference' set! Check your configfile. Reference can be: hg19, hg38, mm10 or mm9.\n")
#read design file
design=pd.read_table(config["design"], sep=",")
#check if input_format and variant_calling_mode are agree with design format
format_design=["sample_id","R1_fastq","R2_fastq"]
if set(format_design).issubset(design.columns):
sys.stderr.write("Design file well formated.\n")
design=design[format_design]
else: sys.exit("Error in the format of the design file: missing column(s).")
#check if all sample_id are differents
if not len(set(design["sample_id"])) == len(design["sample_id"]): sys.exit("Error: All sample_id have to be different! Check your design file.")
#check if all files are differents on each line
for i in range(0, len(design["sample_id"]), 1):
if not len(set(design.iloc[i])) == len(design.iloc[i]): sys.exit("Error: All files have to be different! Check your design file.")
#make the data structure
SAMPLE_NAME = []
FQ_NAME = []
ORIG_FILE = []
SYMLINK_FILES = []
for col in design.drop(columns = "sample_id").columns.tolist():
for line in range(0,len(design[col].tolist()),1):
SAMPLE_NAME.append(design["sample_id"].iloc[line])
ORIG_FILE.append(design[col].iloc[line])
if col == "R1_fastq": SUPPL_NAME = "_R1"
if col == "R2_fastq": SUPPL_NAME = "_R2"
FQ_NAME.append(design["sample_id"].iloc[line] + SUPPL_NAME)
SYMLINK_FILES.append(os.path.normpath(OUTPUT_DIR + "/symlink_input/" + design["sample_id"].iloc[line] + SUPPL_NAME + ".fastq.gz"))
#samples_order_for_ggplot
if not "samples_order_for_ggplot" in config:
config["samples_order_for_ggplot"] = ",".join(sorted(design["sample_id"]))
else:
if config["samples_order_for_ggplot"] == "" :
config["samples_order_for_ggplot"] = ",".join(sorted(design["sample_id"]))
if not set(config["samples_order_for_ggplot"].split(",")).issubset(design["sample_id"]):
sys.exit("Error sampes names of 'samples_order_for_ggplot' from config file don't match 'Sample_id' of design file.")
#extra parameters:
if not "SPRINT_extra" in config:
config["SPRINT_extra"] = ""
if not "RNAEditingIndexer_extra" in config:
config["RNAEditingIndexer_extra"] = ""
#SPRINT type of results
RES_TYPE=["identified_all", "identified_hyper", "identified_regular"]
sys.stderr.write("\n########################### Run ############################\n\n")
### rule all ###
rule all:
input:
#symlink_qc_filtering
expand(os.path.normpath(OUTPUT_DIR + "/fastp/trimmed/{sample_name}_R1.fastq"), sample_name=SAMPLE_NAME),
expand(os.path.normpath(OUTPUT_DIR + "/fastp/trimmed/{sample_name}_R2.fastq"), sample_name=SAMPLE_NAME),
os.path.normpath(OUTPUT_DIR + "/multiqc_report.html"),
#SPRINT
expand(os.path.normpath(OUTPUT_DIR + "/SPRINT/{sample_name}/tmp/genome/all.bam"), sample_name=SAMPLE_NAME),
expand(os.path.normpath(OUTPUT_DIR + "/SPRINT/{sample_name}/PARAMETER.txt"), sample_name=SAMPLE_NAME),
expand(os.path.normpath(OUTPUT_DIR + "/SPRINT/{sample_name}/SPRINT_identified_all.res"), sample_name=SAMPLE_NAME),
expand(os.path.normpath(OUTPUT_DIR + "/SPRINT/{sample_name}/SPRINT_identified_hyper.res"), sample_name=SAMPLE_NAME),
expand(os.path.normpath(OUTPUT_DIR + "/SPRINT/{sample_name}/SPRINT_identified_regular.res"), sample_name=SAMPLE_NAME),
expand(os.path.normpath(OUTPUT_DIR + "/SPRINT/Number_of_all_edition_by_sample_{res_type}.png"), res_type=RES_TYPE),
expand(os.path.normpath(OUTPUT_DIR + "/SPRINT/Number_of_each_edition_by_sample_log_y_scale_{res_type}.png"), res_type=RES_TYPE),
expand(os.path.normpath(OUTPUT_DIR + "/SPRINT/Number_of_each_edition_by_sample_{res_type}.png"), res_type=RES_TYPE),
expand(os.path.normpath(OUTPUT_DIR + "/SPRINT/Number_of_reads_supporting_the_edition_for_each_sample_{res_type}.png"), res_type=RES_TYPE),
expand(os.path.normpath(OUTPUT_DIR + "/SPRINT/Percentages_of_reads_supporting_the_edition_for_each_sample_{res_type}.png"), res_type=RES_TYPE),
expand(os.path.normpath(OUTPUT_DIR + "/SPRINT/Summary_table_counts_SPRINT_{res_type}_by_strand.tsv"), res_type=RES_TYPE),
expand(os.path.normpath(OUTPUT_DIR + "/SPRINT/Summary_table_counts_SPRINT_{res_type}.tsv"), res_type=RES_TYPE),
#RNAEditingIndexer
os.path.normpath(OUTPUT_DIR + "/RNAEditingIndexer/summary/EditingIndex.csv"),
os.path.normpath(OUTPUT_DIR + "/RNAEditingIndexer/Coverage_by_base_reference_by_sample.png"),
os.path.normpath(OUTPUT_DIR + "/RNAEditingIndexer/Scores_of_all_edition_by_sample.png"),
os.path.normpath(OUTPUT_DIR + "/RNAEditingIndexer/Scores_of_edition_and_background_noise_for_each_edition_type.png")
message:
"Pipeline finished!"
### global wildcard constraints ###
wildcard_constraints:
fq_name = '|'.join([x for x in FQ_NAME]),
sample_name = '|'.join([x for x in SAMPLE_NAME])
### other rules ###
include: "rules/symlink_qc_filtering.smk"
include: "rules/SPRINT.smk"
include: "rules/RNAEditingIndexer.smk"