-
Notifications
You must be signed in to change notification settings - Fork 2
/
MPRA_package.R
342 lines (309 loc) · 21 KB
/
MPRA_package.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
# -----------------MPRA Data analysis-----------------------
# Feng_yuanqing
# This script is used to compare the allele activity using MPRA barcode counts data from DNA and RNA.
#1. Install mpra and its dependencies by running the lines below in R or RStudio.
if (!requireNamespace("BiocManager", quietly=TRUE))
install.packages("BiocManager")
BiocManager::install("mpra")
library("mpra")
library("debrowser")
library("tidyverse")
library("ggrepel")
library("ggpubr")
library("ggExtra")
library("reshape2")
## --------------------------------- -----read in a data frame-----------------------
## read in a data frame that have MPRA counts data, with col1:Oligo_name; col2: Sample1; col3: Sample2; .... MPRA_DNA_RNA_barcode.count.tsv
data <- read.table(file.choose(),header =T)
head(data)
## ------------------------extract Ref and Alt Allele separately------------------------
alt <- data %>% filter(str_detect(Enhancer_name,"^Alt"))
alt$oligo <- gsub("Alt_","",alt$Enhancer_name, perl=TRUE)
ref <- data %>% filter(str_detect(Enhancer_name,"^Ref"))
ref$oligo <- gsub("Ref_","",ref$Enhancer_name, perl=TRUE)
## ------------------------extract SNPs have both Ref and Alt Allele------------------------
alt2 <- dplyr::filter(alt, alt$oligo %in% ref$oligo)
ref2 <- dplyr::filter(ref, ref$oligo %in% alt$oligo)
col_name <- c("ref_1","ref_2","ref_3", "alt_1","alt_2","alt_3")
WM88_DNA <- data.frame(cbind(ref2[,4:6], alt2[,4:6]),row.names = alt2$oligo); colnames(WM88_DNA) <- col_name
colSums(WM88_DNA, na.rm = TRUE)
WM88_RNA <- data.frame(cbind(ref2[,7:9], alt2[,7:9]),row.names = alt2$oligo); colnames(WM88_RNA) <- col_name
colSums(WM88_RNA, na.rm = TRUE)
MNT1_DNA <- data.frame(cbind(ref2[,10:12], alt2[,10:12]),row.names = alt2$oligo); colnames(MNT1_DNA) <- col_name
colSums(MNT1_DNA, na.rm = TRUE)
MNT1_RNA <- data.frame(cbind(ref2[,13:15], alt2[,13:15]),row.names = alt2$oligo); colnames(MNT1_RNA) <- col_name
colSums(MNT1_RNA, na.rm = TRUE)
##---------------------------- construct a MPRASet ------------------------
mpra_MNT <- MPRASet(DNA = MNT1_DNA, RNA = MNT1_RNA, eid = alt2$oligo, eseq = NULL, barcode = NULL)
mpra_WM88 <- MPRASet(DNA = WM88_DNA, RNA = WM88_RNA, eid = alt2$oligo, eseq = NULL, barcode = NULL)
##----------------------------- normalization of the barcode counts bwtween samples------------------
## Calculate the RNA/DNA ratio of normalized counts (both DNA and RNA are sceled to have 10 million counts )
Ratio_mnt1 <- getRNA(normalize_counts(mpra_MNT)) / getDNA(normalize_counts(mpra_MNT))
Ratio_wm88 <- getRNA(normalize_counts(mpra_WM88)) / getDNA(normalize_counts(mpra_WM88))
write.table(Ratio_mnt1, file = "MNT1_Ref_vs_Alt_ratio.txt", row.names = T, sep = "\t", quote = F)
write.table(Ratio_wm88, file = "WM88_Ref_vs_Alt_ratio.txt", row.names = T, sep = "\t", quote = F)
# Should normalized the DNA or RNA counts independently using limma
#------------------------ MPRASet limma_voom_eBayes DEG Analysis for MNT1------------------------
detach("package:tidyverse", unload=TRUE)
mpra_MNT <- MPRASet(DNA = MNT1_DNA, RNA = MNT1_RNA, eid = alt2$oligo, eseq = NULL, barcode = NULL)
# the desgin of the MPRASet, 3 ref, 3 alt.
design <- data.frame(intcpt = 1, alleleB = grepl("alt", colnames(mpra_MNT)))
# block is a vector that is used when the columns of the MPRAset object are paired.
block_vector <- rep(1:3, 2)
# Linear models for differential analysis of MPRA data
mpralm_MNT_fit <- mpralm(object = mpra_MNT, design = design, aggregate = "none", normalize = TRUE, block = block_vector, model_type = "corr_groups", plot = TRUE)
toptab_MNT <- topTable(mpralm_MNT_fit, coef = 2, number = Inf)
head(toptab_MNT)
toptab_MNT_ranked <- toptab_MNT %>% rownames_to_column() %>% arrange(adj.P.Val)
## Write the results into a txt file
write.table(toptab_MNT_ranked, file = "MNT1_Ref_vs_Alt_mpralm_sigdiff.tsv", row.names = F, sep = "\t", quote = F)
#------------------------ MPRASet limma_voom_eBayes DEG Analysis for WM88 ------------------------
mpra_WM88 <- MPRASet(DNA = WM88_DNA, RNA = WM88_RNA, eid = alt2$oligo, eseq = NULL, barcode = NULL)
design <- data.frame(intcpt = 1, alleleB = grepl("alt", colnames(mpra_WM88)))
block_vector <- rep(1:3, 2)
mpralm_WM88_fit <- mpralm(object = mpra_WM88, design = design, aggregate = "none", normalize = TRUE, block = block_vector, model_type = "corr_groups", plot = TRUE) ## here using normalized counts, All columns are scaled to have 10 million counts.
toptab_WM88 <- topTable(mpralm_WM88_fit, coef = 2, number = Inf)
head(toptab_WM88)
toptab_WM88_ranked <- toptab_WM88 %>% rownames_to_column() %>% arrange(adj.P.Val)
## Write the results into a txt file
write.table(toptab_WM88_ranked, file = "WM88_Ref_vs_Alt_mpralm_sigdiff.tsv", row.names = F, sep = "\t", quote = F)
# ----------------------------------------FC P-values plot using a package-------------------------------- ----------
library("EnhancedVolcano")
library("scales")
# wm88 select top snp to highlight
selectLab <- toptab_WM88 %>% filter(adj.P.Val<1e-20 & abs(logFC)>0.25 ) %>% rownames()
summary(-log10(toptab_WM88$adj.P.Val))
summary(toptab_WM88$logFC)
# plotting ggplot
p<-EnhancedVolcano(toptab_WM88_M, lab=rownames(toptab_WM88), selectLab=selectLab, x="logFC", y="adj.P.Val",hlineCol="steelblue",hline= c(1e-20), borderWidth = 0.25, labSize = 4, title = NULL,titleLabSize = 16, subtitle=NULL,axisLabSize=16, pCutoff =1e-2, FCcutoff =0.2, xlim = c(-1.2, 1.2), ylim = c(0, 35), pointSize=1, colAlpha = 0.8,shape = 19,legendLabSize =15, captionLabSize = 15, cutoffLineType = 'longdash', cutoffLineWidth = 0.25, drawConnectors = T)
p
ggsave("Volcano_WM88.pdf", width =5, height = 6, units = "in")
# MNT1
selectLab2 <- toptab_MNT %>% filter(adj.P.Val<1e-20 & abs(logFC)>0.25 ) %>% rownames()
summary(-log10(toptab_MNT$adj.P.Val))
summary(toptab_MNT$logFC)
p2<-EnhancedVolcano(toptab_MNT, lab=rownames(toptab_MNT), selectLab=selectLab2, x="logFC", y="adj.P.Val",hlineCol="steelblue",hline= c(1e-20), borderWidth = 0.25, labSize = 4, title = NULL,titleLabSize = 16, subtitle=NULL,axisLabSize=16, pCutoff =1e-2, FCcutoff =0.2, xlim = c(-2.4, 1), ylim = c(0, 120), pointSize=1, colAlpha = 0.6,shape = 19,legendLabSize =15, captionLabSize = 15, cutoffLineType = 'longdash', cutoffLineWidth = 0.25, drawConnectors = T)
p2
ggsave("Volcano_MNT1.pdf", width =5, height = 6, units = "in")
#-------------- correlation of RNA/DNA ratio mnt and wm88 ------------
panel.hist <- function(x, ...){
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE,breaks = "FD" )
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col = "cyan2", ...)}
allcor <- function (data, cex=2) {
pcor2 <- function(x, y, ...) panel.cor(x, y, cex.cor = cex, prefix = "")
nr <- nrow(data)
if (nr > 2000)
nr <- 2000
pairs(log10(data[1:nr, ]), cex = 0.25, diag.panel = panel.hist, lower.panel = pcor2)}
## Calculate the RNA/DNA ratio of normalized counts
Ratio_mnt1 <- getRNA(normalize_counts(mpra_MNT)) / getDNA(normalize_counts(mpra_MNT))
Ratio_wm88 <- getRNA(normalize_counts(mpra_WM88)) / getDNA(normalize_counts(mpra_WM88))
allcor(Ratio_mnt1)
allcor(Ratio_wm88)
colnames(Ratio_mnt1)<-c("MNT1_Ref1","MNT1_Ref2","MNT1_Ref3","MNT1_Alt1","MNT1_Alt2","MNT1_Alt3")
colnames(Ratio_wm88)<-c("WM88_Ref1","WM88_Ref2","WM88_Ref3","WM88_Alt1","WM88_Alt2","WM88_Alt3")
allratio<-cbind(Ratio_mnt1,Ratio_wm88)
allcor(allratio)
#------------------- PCA plot of RNA/DNA ratios ref.alt in mnt and wm88 ----------------
PCA_plot <- function(x) {
pca_data <- debrowser::run_pca(x)
# x is a read counts data.frame.
p_data <- debrowser::prepPCADat(pca_data)
pcx=1;pcy=2
xaxis <- sprintf("PC%d (%.2f%%)", pcx, round(pca_data$explained[pcx] * 100, 2))
yaxis <- sprintf("PC%d (%.2f%%)", pcy, round(pca_data$explained[pcy] * 100, 2))
plot1 <- ggplot(data = p_data, aes(x = x, y = y)) + geom_point(aes(color = color), size = 4) + labs(x = xaxis, y = yaxis) + theme( legend.text=element_text(size=12), axis.text=element_text(size=12), axis.title=element_text(size=14),plot.margin=margin(t=20, r=20, b=20, l=20, "pt")) + geom_text(aes(label = samples), vjust = 0, nudge_y = -1) + theme_light()
return(plot1)
}
PCA_plot(getDNA(normalize_counts(mpra_WM88)))
PCA_plot(Ratio_wm88)
PCA_plot(allratio)
##------------Compare Fold change and pvalues between two cell lines using 4 Quadrants --------------------
# sorting the data by oligo name
wm_rank <- toptab_WM88 %>% rownames_to_column() %>% arrange(rowname)
mnt_rank <- toptab_MNT %>% rownames_to_column() %>% arrange(rowname)
# build a factor for intersected significant snps and uniq ones.
sig_mnt <- dplyr::filter(mnt_rank,adj.P.Val<0.05)
sig_wm <- dplyr::filter(wm_rank,adj.P.Val<0.05)
both <- dplyr::filter(sig_mnt,sig_mnt$rowname %in% sig_wm$rowname) %>% pull(rowname)
mnt_only <- dplyr::filter(sig_mnt,!(sig_mnt$rowname %in% sig_wm$rowname)) %>% pull(rowname)
wm_only <- dplyr::filter(sig_wm,!(sig_wm$rowname %in% sig_mnt$rowname)) %>% pull(rowname)
snp_rank <-as.data.frame(wm_rank$rowname)
snp_rank= mutate(snp_rank,b =ifelse(snp_rank[,1] %in% both ,1,0))
snp_rank= mutate(snp_rank,mnt =ifelse(snp_rank[,1] %in% mnt_only ,2,0))
snp_rank= mutate(snp_rank,wm =ifelse(snp_rank[,1] %in% wm_only ,3,0))
snp_rank= mutate(snp_rank,sum = rowSums(snp_rank[,2:4]))
head(snp_rank)
# combine the fold change data from MNT and WM88, With factor info.
fc2 <- cbind.data.frame(mnt_rank$logFC,wm_rank$logFC,snp_rank$sum)
colnames(fc2)=c("MNT_log2FC","WM88_log2FC","factor")
# simple plot
p1<-ggscatter(fc2, x = "MNT_log2FC", y = "WM88_log2FC", conf.int = TRUE, cor.coef = TRUE, cor.method = "pearson", main="MNT1_vs_WM88",font.main=c(12, "bold.italic", "blue4"), xlab="MNT_log2FC ", ylab="WM88_log2FC", shape = 20, size=2, color ="dodgerblue4", cor.coeff.args = list(method = "pearson", label.x = -2, label.sep = "\n"))
p1 + geom_hline(yintercept=0,linetype = 2,color="grey") + geom_vline(xintercept=0,linetype = 2,color="grey")
# plot with classification factors.
# using summary(fc2) to determine the axis scale.
p3 <- ggplot(fc2, aes(x=MNT_log2FC, y=WM88_log2FC,color=factor(fc2$factor))) +
theme_bw() + geom_point(size=0.5,alpha=1) +
scale_color_manual(values=c("grey","#1B9E77","#D95F02","#7570B3"),labels = c("NS","both","MNT_only","WM88_only"))+
labs(x ="MNT_log2FC", y="WM88_log2FC") +
geom_hline(yintercept=0,linetype = 2,color="grey") +
geom_vline(xintercept=0,linetype = 2,color="grey") +
xlim(-2.5, 1) + ylim(-0.75, 0.75) +
theme(legend.text=element_text(size=12), axis.text=element_text(size=12),
axis.title=element_text(size=14),plot.margin=margin(t=20, r=20, b=20, l=20, "pt"))
p3
## add the percentage at four Quadrants
library("scales")
total <- length(both)+length(mnt_only)+length(wm_only)
per1 <- filter(fc2,fc2$MNT_log2FC>0 &fc2$WM88_log2FC>0) %>% filter(factor!=0); pp<-length(per1$factor)/total
per2 <- filter(fc2,fc2$MNT_log2FC>0 &fc2$WM88_log2FC<0) %>% filter(factor!=0); pn<-length(per2$factor)/total
per3 <- filter(fc2,fc2$MNT_log2FC<0 &fc2$WM88_log2FC>0) %>% filter(factor!=0); np<-length(per3$factor)/total
per4 <- filter(fc2,fc2$MNT_log2FC<0 &fc2$WM88_log2FC<0) %>% filter(factor!=0); nn<-length(per4$factor)/total
p3+ annotate("text", x=0.75, y=0.75, label=percent(pp), size=5)+
annotate("text", x=0.75, y=-0.75, label=percent(pn), size=5)+
annotate("text", x=-2, y=0.75, label=percent(np), size=5)+
annotate("text", x=-2, y=-0.75, label=percent(nn), size=5)
##----------------------------- --------- Density plot of the barcode counts ------------------------------ --------------
##data transformation, mark each row by samples using melt, for density plot
dens_tf <- function(x) {
ID <- rownames(x)
dat1 <- cbind(ID, x)
colnames(dat1) <- c("ID", colnames(x))
data <- dat1
head(data)
mdata <- melt(as.data.frame(data), "ID")
data2 <-data.frame(samples=mdata$variable, logcount= log10(as.numeric(mdata$value)))
return(data2)}
# raw count data
tfdata <- dens_tf(MNT1_DNA)
tfdata <- dens_tf(MNT1_RNA)
tfdata <- dens_tf(WM88_DNA)
tfdata <- dens_tf(WM88_RNA)
# normalized count data. All columns are scaled to have 10 million counts.
tfdata <- dens_tf(getDNA(normalize_counts(mpra_MNT)))
tfdata <- dens_tf(getRNA(normalize_counts(mpra_MNT)))
tfdata <- dens_tf(getDNA(normalize_counts(mpra_WM88)))
tfdata <- dens_tf(getRNA(normalize_counts(mpra_WM88)))
# Density plot using ggplot
p<-ggplot(data = tfdata, aes(x = logcount,fill = samples, colour = samples)) + geom_density( alpha = 0.5) + labs(x = "log10(Barcode_Counts)", y = "Density") + theme_bw() + theme(legend.position="top",legend.text=element_text(size=12),axis.text=element_text(size=14), axis.title=element_text(size=14,face=NULL, color = NULL), plot.margin=margin(t=20, r=20, b=20, l=20, "pt")) ;p
ggsave("WM88_RNA_cpm_norm.pdf", width =14, height = 14, units = "cm")
## plot using Plot_ly, click "show in a new window", and save as an svg.
ggplotly(p, width =800, height = 660) %>% config(toImageButtonOptions = list(format = "svg"))
##------------------------------- ----------- IQR plot using ggplot ------------------------------ ----------
cond <- rep(c("ref", "alt"), each = length(tfdata$samples)/2)
p<-ggplot(data = tfdata, aes(x =samples,y= logcount,fill= cond)) + geom_boxplot(outlier.colour="coral2", outlier.size = 1) + labs(y = "log10(Barcode_Counts)", x=NULL) + theme_bw() + theme(legend.position="top",legend.text=element_text(size=12),axis.text=element_text(size=14), axis.title=element_text(size=14,face=NULL, color = NULL), plot.margin=margin(t=20, r=20, b=20, l=30, "pt")) +scale_fill_brewer(palette="Accent")
ggsave("IQR_MNT1_RNA_norm.pdf", width =12, height = 10, units = "cm");p
## IQR plot using Plot_ly
plot_ly(tfdata, x = ~samples, y = ~logcount, type = "box", width = 500, height = 300, marker = list(color = "rgb(8,81,156)", outliercolor = "rgba(219, 64, 82, 0.6)", line = list(outliercolor = "rgba(219, 64, 82, 1.0)", outlierwidth = 2))) %>% plotly::layout( xaxis = list(title = "samples"), yaxis = list(title = "log10(Barcode_Counts)")) %>% config(toImageButtonOptions = list(format = "svg"))
##------------------------------------ PCA Plot----------------------------------------------
PCA_plot <- function(x) {
pca_data <- debrowser::run_pca(x)
# x is a read counts data.frame.
p_data <- debrowser::prepPCADat(pca_data)
shape2 <- as.factor(cbind("Alt","Alt","Alt","Ref","Ref","Ref"))
pcx=1;pcy=2
xaxis <- sprintf("PC%d (%.2f%%)", pcx, round(pca_data$explained[pcx] * 100, 2))
yaxis <- sprintf("PC%d (%.2f%%)", pcy, round(pca_data$explained[pcy] * 100, 2))
plot1 <- ggplot(data = p_data, aes(x = x, y = y)) + geom_point(aes(color = color,shape =shape2), size = 4) + labs(x = xaxis, y = yaxis) + theme(plot.margin = margin(t = 10, r = 10, b = 10, l = 10, "pt")) + geom_text(aes(label = samples), vjust = 0, nudge_y = 1)
return(plot1)
}
p <-PCA_plot(getDNA(normalize_counts(mpra_WM88)))
ggsave("PCA_WM88_DNA_norm.pdf", width =12, height = 10, units = "cm");p
#plot interactive PCA
ggplotly(p, width = 600, height = 500) %>% config(toImageButtonOptions = list(format = "svg"))
# ---------------------------------------- Volcano plot using a package-------------------------------- ----------
library("EnhancedVolcano")
library("scales")
# wm88 select top snp to highlight
selectLab <- toptab_WM88 %>% filter(adj.P.Val<1e-20 & abs(logFC)>0.25 ) %>% rownames()
summary(-log10(toptab_WM88$adj.P.Val))
summary(toptab_WM88$logFC)
# plotting ggplot
p<-EnhancedVolcano(toptab_WM88_M, lab=rownames(toptab_WM88), selectLab=selectLab, x="logFC", y="adj.P.Val",hlineCol="steelblue",hline= c(1e-20), borderWidth = 0.25, labSize = 4, title = NULL,titleLabSize = 16, subtitle=NULL,axisLabSize=16, pCutoff =1e-2, FCcutoff =0.2, xlim = c(-1.2, 1.2), ylim = c(0, 35), pointSize=1, colAlpha = 0.8,shape = 19,legendLabSize =15, captionLabSize = 15, cutoffLineType = 'longdash', cutoffLineWidth = 0.25, drawConnectors = T)
p
ggsave("Volcano_WM88.pdf", width =5, height = 6, units = "in")
# MNT1
selectLab2 <- toptab_MNT %>% filter(adj.P.Val<1e-20 & abs(logFC)>0.25 ) %>% rownames()
summary(-log10(toptab_MNT$adj.P.Val))
summary(toptab_MNT$logFC)
p2<-EnhancedVolcano(toptab_MNT, lab=rownames(toptab_MNT), selectLab=selectLab2, x="logFC", y="adj.P.Val",hlineCol="steelblue",hline= c(1e-20), borderWidth = 0.25, labSize = 4, title = NULL,titleLabSize = 16, subtitle=NULL,axisLabSize=16, pCutoff =1e-2, FCcutoff =0.2, xlim = c(-2.4, 1), ylim = c(0, 120), pointSize=1, colAlpha = 0.6,shape = 19,legendLabSize =15, captionLabSize = 15, cutoffLineType = 'longdash', cutoffLineWidth = 0.25, drawConnectors = T)
p2
ggsave("Volcano_MNT1.pdf", width =5, height = 6, units = "in")
#------------------------------------ ----------- Scatter plot----------------------------- ----------------
## geom_text_repel is used to avoid overlap.
Scatt_plot <- function(Ratio,x=12,y=12,z=1){
Ref=rowMeans(Ratio[,1:3])
Alt=rowMeans(Ratio[,4:6])
LFC=as.data.frame(log2(Ref/Alt))
Name=rownames(FC)
Ratio= mutate(Ratio,sig=ifelse(abs(LFC)>z ,2,1))
p <- ggplot(Ratio, aes(x=Ref, y=Alt,color=factor(Ratio$sig))) +
theme_bw() + geom_point(size=1,alpha=1) +
labs(x ="Average RNA/DNA Ratio of Ref", y="Average RNA/DNA Ratio of Alt") +
geom_abline(intercept = 0, slope = 1, linetype = 2, color="grey") +
xlim(0, x) + ylim(0, y)+ coord_fixed() +
theme(legend.title=NULL, legend.text=element_text(size=12), axis.text=element_text(size=12),
axis.title=element_text(size=12.5),plot.margin=margin(t=20, r=20, b=20, l=20, "pt")) +
geom_text_repel(aes(label=ifelse(abs(LFC)>z ,Name,''),hjust=-0.1, vjust=0.1),size=3, show.legend= F )
return(p)
}
Ratio <- getRNA(normalize_counts(mpra_MNT)) / getDNA(normalize_counts(mpra_MNT))
p<-Scatt_plot(Ratio,x=12,y=12,z=1) # x:xlim, y:ylim, z:fold_change.
p + scale_colour_manual(values = c("grey80", "red"),labels = c("NS", "Log2|FC|>1") )
ggsave("Scatter_MNT.pdf", width =16, height = 16, units = "cm")
Ratio <- getRNA(normalize_counts(mpra_WM88)) / getDNA(normalize_counts(mpra_WM88))
p<-Scatt_plot(Ratio,x=5.2,y=5.2,z=0.5) # x:xlim, y:ylim, z:fold_change.
p + scale_colour_manual(values = c("grey80", "red"),labels = c("NS", "Log2|FC|>0.5") )
ggsave("Scatter_WM88.pdf", width =16, height = 16, units = "cm")
##------------Bar plot of GO or KEGG --------------------
#Read in the GO enrichment dataframe or the mouse phenotype enrichment dataframe from GREAT
data <- read.table(file.choose(),header= T,sep="\t")
head(data)
GO_rank <- cbind.data.frame(data[,1],data[,9])
colnames(GO_rank) <- c("GO_Term","qvalue")
GO_rank$qvalue <- -log10(GO_rank$qvalue)
head(GO_rank)
GO_rank <- arrange(GO_rank, desc(qvalue))
ggplot(GO_rank, aes(x= reorder(GO_Term, qvalue), y=qvalue)) + geom_bar(stat="identity",fill="#08519c", width=.5, position="dodge" ) + coord_flip() + ggtitle("GO_term enrichment by GREAT") + labs(y="-log10(Binomial q-Value)", x="GO_Term") + theme_classic() + theme(legend.title=NULL, axis.text=element_text(size=14), axis.title=element_text(size=14), plot.margin=margin(t=20, r=20, b=20, l=20, "pt"))
ggsave("Scatter_WM88.pdf", width =8.5, height = 7.2, units = "in")
#Read in the mouse phenotype enrichment dataframe from GREAT
data <- read.table(file.choose(), sep="\t")
head(data)
GO_rank <- cbind.data.frame(data[,1],data[,4])
head(GO_rank)
colnames(GO_rank) <- c("GO_Term","qvalue")
GO_rank$qvalue <- -log10(GO_rank$qvalue)
head(GO_rank)
GO_rank <- arrange(GO_rank, desc(qvalue))
ggplot(GO_rank, aes(x= reorder(GO_Term, qvalue), y=qvalue)) + geom_bar(stat="identity",fill="#08519c", width=.5, position="dodge" ) + coord_flip() + ggtitle("Mouse Phenotype Enrichment") + labs(y="-log10(Binomial q-Value)", x="GO_Term") + theme_classic() + theme(legend.title=NULL, axis.text=element_text(size=14), axis.title=element_text(size=14), plot.margin=margin(t=20, r=20, b=20, l=20, "pt"))
ggsave("Scatter_WM88.pdf", width =8.5, height = 7.2, units = "in")
#-------------------------------- ---------- Volcano plot ----------------------------- ----------------
# read in the dataframe of toptable /Users/fengyq/Documents/2018_Pigmentation/2019_MPRA_data/Yuanqing_0815/0_MPRA_Rpackage_analysis/DEG/MNT1_Ref_vs_Alt.txt
toptab <- read.table(file.choose(),header =T)
head(toptab)
Vplot<-function(toptab){
toptab$logp <- -log10(toptab$adj.P.Val)
snp_loc <- rownames(toptab)
rownames(toptab) <- seq(1:length(toptab$logFC))
toptab$color= ifelse(toptab$logp>3, 2, 1)
p<-ggplot(data = toptab, aes(x = logFC, y = logp)) +
geom_point(aes(color=factor(toptab$color)),size=1,alpha=1) +
theme_bw() + labs(x ="Log2(Fold Change)", y="Log10(Adjust P)") +
xlim(-2.5, 2) + ylim(0, 9) +
theme(legend.title=NULL, legend.text=element_text(size=12), axis.text=element_text(size=12),
axis.title=element_text(size=12.5), plot.margin=margin(t=20, r=20, b=20, l=20, "pt")) +
geom_text_repel(aes(label=ifelse(toptab$logp>3,snp_loc,''),hjust=-0.1, vjust=0.1),size=3, show.legend= F) +
scale_colour_manual(values = c("grey80", "red"),labels = c("NS", "Log10(Adjust P)>3")) +
geom_abline(intercept = 3, slope = 0, linetype = 2,color="grey")
return(p)
}
## could change the xlim and ylim scale to avoid lost extrem values.
p<-Vplot(toptab_MNT)
p
Vplot(toptab_MNT); ggsave("Volcano_MNT-1.pdf", width =16, height = 16, units = "cm")
Vplot(toptab_WM88); ggsave("Volcano_WM88-1.pdf", width =16, height = 16, units = "cm")
ggplotly(p)