-
Notifications
You must be signed in to change notification settings - Fork 0
/
boruta_classification_and_regression.R
83 lines (54 loc) · 3.41 KB
/
boruta_classification_and_regression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
library(plyr)
library(tidyverse)
library(caret)
library(Boruta)
set.seed(123)
path <- "/home/user/data"
setwd(path)
# input data set up - the labels and data files are produced by the "prepare_for_m_learn" script
labels_dem_vs_nci <- readRDS("labels_dem_vs_nci_ml_class.rds")
data_dem_vs_nci <- readRDS("data_dem_vs_nci_ml_class.rds")
#######################################################################################################
# Example code for running boruta for Dementia vs NCI classification
# the same code is used for classication and regression (just change the colum)
data_labels_dem_vs_nci <- cbind(data_dem_vs_nci, labels_dem_vs_nci)
# Boruta can be slow to run. Running with x = data and y = labels instead of the formula interface can
# lead to shorter run times
boruta_dem_vs_nci_model=Boruta(labels_dem_vs_nci~. , data=data_labels_dem_vs_nci, doTrace = 2 , maxRuns = 10000)
saveRDS(boruta_dem_vs_nci_model,file="boruta_dem_vs_nci_model.rds")
final_boruta_dem_vs_nci_model=TentativeRoughFix(boruta_dem_vs_nci_model)
dem_vs_nci_model_sel_feat=getSelectedAttributes(final_boruta_dem_vs_nci_model)
saveRDS(dem_vs_nci_model_sel_feat,file="dem_vs_nci_model_boruta_sel_feat.rds")
k <-lapply(1:ncol(boruta_dem_vs_nci_model$ImpHistory),function(i)
boruta_dem_vs_nci_model$ImpHistory[is.finite(boruta_dem_vs_nci_model$ImpHistory[,i]),i])
names(k) <- colnames(boruta_dem_vs_nci_model$ImpHistory)
boruta_dem_vs_nci_model_feature_rank <- sort(sapply(k,median),decreasing = TRUE)
boruta_dem_vs_nci_model_rank_out <- data.frame(features=names(boruta_dem_vs_nci_model_feature_rank), z_score=boruta_dem_vs_nci_model_feature_rank, row.names=NULL)
# outputs all genes ranked by z-score. Genes with z-scores above the ShadowMax value are considered "important"
write.csv(boruta_dem_vs_nci_model_rank_out,file="boruta_dem_vs_nci_model_feature_rank_predict.csv",row.names = FALSE)
#######################################################################################################
# Example of a regression analysis from the gene association analysis
# Here we were finding the others genes that were predictive of PRTN3 levels
# ##### prtn3 ENSG00000196415.9
#
#
# read in batch-corrected gene expression data
data_prtn3 <- gene_data
#covar_prtn3 <- readRDS("prtn3_covar.rds")
# find the ENSG number for your gene of interest
labels_prtn3 <- data_prtn3$ENSG00000196415.9
rm_list <- ("ENSG00000196415.9")
data_prtn3 <- data_prtn3[,!(names(data_prtn3) %in% rm_list)]
#prtn3
data_labels_prtn3 <- cbind(data_prtn3, labels_prtn3)
boruta_prtn3_model=Boruta(labels_prtn3~. , data=data_labels_prtn3, doTrace = 2 , maxRuns = 10000)
saveRDS(boruta_prtn3_model,file="boruta_prtn3_model_genes_predict_2500.rds")
final_boruta_prtn3_model=TentativeRoughFix(boruta_prtn3_model)
prtn3_model_sel_feat=getSelectedAttributes(final_boruta_prtn3_model)
saveRDS(prtn3_model_sel_feat,file="prtn3_model_boruta_sel_feat_genes_predict_2500.rds")
k <-lapply(1:ncol(boruta_prtn3_model$ImpHistory),function(i)
boruta_prtn3_model$ImpHistory[is.finite(boruta_prtn3_model$ImpHistory[,i]),i])
names(k) <- colnames(boruta_prtn3_model$ImpHistory)
boruta_prtn3_model_feature_rank <- sort(sapply(k,median),decreasing = TRUE)
boruta_prtn3_model_rank_out <- data.frame(features=names(boruta_prtn3_model_feature_rank), z_score=boruta_prtn3_model_feature_rank, row.names=NULL)
write.csv(boruta_prtn3_model_rank_out,file="boruta_prtn3_model_feature_rank_predict_2500.csv",row.names = FALSE)