forked from stephenliu0423/PyDTI
functions.py
import os
import numpy as np
from collections import defaultdict

def load_data_from_file(dataset, folder):
    """Load the interaction, drug-similarity and target-similarity matrices for a dataset."""
    with open(os.path.join(folder, dataset + "_admat_dgc.txt"), "r") as inf:
        next(inf)  # skip the header row of drug names
        int_array = [line.strip("\n").split()[1:] for line in inf]
    with open(os.path.join(folder, dataset + "_simmat_dc.txt"), "r") as inf:  # the drug similarity file
        next(inf)
        drug_sim = [line.strip("\n").split()[1:] for line in inf]
    with open(os.path.join(folder, dataset + "_simmat_dg.txt"), "r") as inf:  # the target similarity file
        next(inf)
        target_sim = [line.strip("\n").split()[1:] for line in inf]
    intMat = np.array(int_array, dtype=np.float64).T    # drug-target interaction matrix
    drugMat = np.array(drug_sim, dtype=np.float64)      # drug similarity matrix
    targetMat = np.array(target_sim, dtype=np.float64)  # target similarity matrix
    return intMat, drugMat, targetMat

def get_drugs_targets_names(dataset, folder):
    """Read drug names (header row) and target names (first column) from the interaction file."""
    with open(os.path.join(folder, dataset + "_admat_dgc.txt"), "r") as inf:
        drugs = next(inf).strip("\n").split()
        targets = [line.strip("\n").split()[0] for line in inf]
    return drugs, targets
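
# Note on the assumed on-disk layout (inferred from the parsing above, not
# stated in the original file): <dataset>_admat_dgc.txt starts with a header
# row of drug names, each following row starts with a target name, and the
# matrix is transposed into drug x target order; the two _simmat_ files follow
# the same header-row / label-column convention for their square similarity
# matrices.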

def cross_validation(intMat, seeds, cv=0, num=10):
    """Build num-fold CV splits; cv=0 holds out whole drug rows, cv=1 holds out single entries."""
    cv_data = defaultdict(list)
    for seed in seeds:
        num_drugs, num_targets = intMat.shape
        prng = np.random.RandomState(seed)
        if cv == 0:
            index = prng.permutation(num_drugs)
        if cv == 1:
            index = prng.permutation(intMat.size)
        step = index.size // num  # integer fold size; the last fold takes the remainder
        for i in range(num):
            if i < num - 1:
                ii = index[i*step:(i+1)*step]
            else:
                ii = index[i*step:]
            if cv == 0:
                test_data = np.array([[k, j] for k in ii for j in range(num_targets)], dtype=np.int32)
            elif cv == 1:
                test_data = np.array([[k // num_targets, k % num_targets] for k in ii], dtype=np.int32)
            x, y = test_data[:, 0], test_data[:, 1]
            test_label = intMat[x, y]
            W = np.ones(intMat.shape)  # training mask: 1 = observed, 0 = held out
            W[x, y] = 0
            cv_data[seed].append((W, test_data, test_label))
    return cv_data
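
# Illustrative sketch (not part of the original PyDTI code): for any fold
# produced by cross_validation, the mask W is 1 everywhere except at the
# held-out positions, so a model fitted on W * intMat never sees the test labels.
def _check_fold_mask(intMat, W, test_data, test_label):
    x, y = test_data[:, 0], test_data[:, 1]
    assert np.all(W[x, y] == 0)                        # held-out entries are masked out
    assert np.all(intMat[x, y] == test_label)          # labels come straight from intMat
    assert W.sum() == intMat.size - test_label.size    # every other entry stays visible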

def train(model, cv_data, intMat, drugMat, targetMat):
    aupr, auc = [], []
    for seed in cv_data.keys():
        for W, test_data, test_label in cv_data[seed]:
            model.fix_model(W, intMat, drugMat, targetMat, seed)
            aupr_val, auc_val = model.evaluation(test_data, test_label)
            aupr.append(aupr_val)
            auc.append(auc_val)
    return np.array(aupr, dtype=np.float64), np.array(auc, dtype=np.float64)

def svd_init(M, num_factors):
    from scipy.linalg import svd
    U, s, V = svd(M, full_matrices=False)
    ii = np.argsort(s)[::-1][:num_factors]  # indices of the num_factors largest singular values
    s1 = np.sqrt(np.diag(s[ii]))            # split each singular value between the two factors
    U0, V0 = U[:, ii].dot(s1), s1.dot(V[ii, :])
    return U0, V0.T
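
# Illustrative sketch (not in the original file): svd_init returns a rank-
# num_factors factorization, so U0.dot(V0.T) should be close to M when
# num_factors is large enough; the Frobenius norm quantifies the gap.
def _svd_init_error(M, num_factors):
    U0, V0 = svd_init(M, num_factors)
    return np.linalg.norm(M - U0.dot(V0.T))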

def mean_confidence_interval(data, confidence=0.95):
    from scipy import stats
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    h = se * stats.t.ppf((1 + confidence) / 2.0, n - 1)  # half-width of the confidence interval
    return m, h

def write_metric_vector_to_file(auc_vec, file_name):
    np.savetxt(file_name, auc_vec, fmt='%.6f')

def load_metric_vector(file_name):
    return np.loadtxt(file_name, dtype=np.float64)
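
# Minimal usage sketch (assumption: no dataset files are available here, so a
# random matrix stands in for intMat; train() is omitted because it needs a
# model object exposing fix_model/evaluation).
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    toy_intMat = (rng.rand(20, 30) < 0.1).astype(np.float64)  # toy drug-target interactions
    folds = cross_validation(toy_intMat, seeds=[0, 1], cv=1, num=5)
    print("folds per seed:", {s: len(v) for s, v in folds.items()})
    U0, V0 = svd_init(toy_intMat, num_factors=5)
    print("rank-5 reconstruction error:", np.linalg.norm(toy_intMat - U0.dot(V0.T)))
    m, h = mean_confidence_interval(rng.rand(10))
    print("mean and CI half-width of 10 random numbers:", m, h)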