-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_data_for_ML.R
103 lines (70 loc) · 3.33 KB
/
prepare_data_for_ML.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
library(edgeR)
library(sva)
setwd("/home/user/data")
# read in covar file and reads file
covar <- read.delim("full_mRNA_cohort.csv", sep = ",", header = TRUE,row.names = 1)
reads <- read.delim("rsem_gene_counts_638.tsv",row.names = 1,sep="\t", header = TRUE, stringsAsFactors = FALSE)
# order covar and read files so that they match
covar=covar[order(covar$sampleID), ]
reads1 <- reads[,order(names(reads))]
# read in list of cases to exclude then remove them from covar and reads
remove_list_ml <- readRDS("ml_exclusions.rds")
reads1 <- reads1[,!(names(reads1) %in% remove_list_ml)]
covar <- covar[!covar$sampleID %in% remove_list_ml, ]
# rename read files so that diagnosis is included in sample names (AD_sampleID,CTRL_sampleID)
names(reads1) <- covar$Dx_sampleID
# check for columns with abnormally low counts
dim(reads1)
reads1=reads1[,which(!colSums(reads1)<200000)]
dim(reads1)
dim(covar)
# filter the reads to only keep genes that have a total sum of 2500 across all subjects (roughly an average of 5)
reads_fil <- reads_wo_ctrl[which(!rowSums(reads_wo_ctrl)<=2500),]
dim(reads_fil)
# normalise the reads using the TMM method using edgeR
y=DGEList(counts=reads_fil)
y=calcNormFactors(y,method = "TMM")
reads_norm<-cpm(y, normalized.lib.sizes=T)
# log transform the reads to make the counts closer to a normal distribution
dat1 <- log2(reads_norm+1)
# apply ComBat batch correction to output batch corrected counts (input for machine learning)
combat <- ComBat(dat=dat1,batch=covar$batch,mod=NULL,par.prior=TRUE,prior.plots = FALSE)
dat <- t(data.frame(combat))
# save gene set and covar from full ml cohort
saveRDS(dat, "gene_input_full_ml_cohort.rds")
saveRDS(covar, "covar_full_ml_cohort.rds")
# prepare label and data objects for ml classification and regression
# Full cohort used for ml regression. Subset with equal numbers of AD and control used for classification
#select covariate columns for machine learning
vars <- c("Dx_sampleID","cogdx","episodic_memory","global_cognition","fast_decline_episodic","fast_decline_global")
ml_covar <- covar[,names(covar) %in% vars]
# confirm covar and reads are in the same order
covar_names <- covar$Dx_sampleID
reads_names <- rownames(dat)
identical(reads_names, covar_names)
# combine genes and covariates into one object
covar_genes <- cbind(dat,ml_covar)
# save full cohort with covariate columns for machine learning regression
saveRDS(covar_genes, "ml_regression_input_full_cohort.rds")
# balanced sub cohort for machine learning classification
# reag1_reag0
set.seed(123)
mldata <- covar_genes
#mldata$keep[mldata$cogdx==1 | mldata$cogdx==2] <- 1
#mldata <- mldata[complete.cases(mldata),]
mldata$labels[mldata$cogdx==1] <- "NCI"
mldata$labels[mldata$cogdx==4] <- "DEM"
table(mldata$cogdx)
cogdx1 <- mldata[mldata$cogdx==1,]
cogdx4 <- mldata[mldata$cogdx==4,]
cogdx4_172 <- cogdx1[sample(nrow(cogdx1), 172), ]
mldata <- rbind(cogdx1,cogdx4_172)
mldata$labels[mldata$cogdx==1] <- "NCI"
mldata$labels[mldata$cogdx==4] <- "DEM"
mldata <- mldata[complete.cases(mldata),]
mldata <- droplevels(mldata)
contrast_labels <- mldata$labels
rm_cols <- vars <- c("Dx_sampleID","cogdx","episodic_memory","global_cognition","fast_decline_episodic","fast_decline_global")
mldata <- mldata[,!(names(mldata) %in% rm_cols)]
saveRDS(mldata,"data_dem_vs_nci_ml_class.rds")
saveRDS(contrast_labels,"labels_dem_vs_nci_ml_class.rds")