Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update: speed up func sparse_mutual_reachability 20+ times #257

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions hdbscan/_hdbscan_reachability.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,61 @@ cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5,
return result.tocsr()


def sparse_mutual_reachability_old(lil_matrix, min_points=5):
    """Compute the sparse mutual-reachability graph in pure Python.

    Pure-Python reference implementation of ``sparse_mutual_reachability``
    (defined above in this file), kept for performance comparison.

    Parameters
    ----------
    lil_matrix : scipy.sparse.lil_matrix
        Sparse distance matrix. It is read through its per-row ``data``
        lists, ``nonzero()`` and ``[i, j]`` indexing.
    min_points : int, optional (default=5)
        Index into each row's sorted stored distances that is used as the
        core distance of that row. A row holding ``min_points`` or fewer
        stored distances gets an infinite core distance.

    Returns
    -------
    scipy.sparse.csr_matrix
        For every nonzero entry ``(i, j)`` of the input, the value
        ``max(core[i], core[j], d[i, j])``. Entries whose value is not
        finite are left out of the result.
    """
    from scipy.sparse import lil_matrix as sparse_matrix

    result = sparse_matrix(lil_matrix.shape)

    # Start from "unreachable" and overwrite only rows with enough neighbours.
    # np.inf is used because the np.infty alias was removed in NumPy 2.0.
    core_distance = np.full(lil_matrix.shape[0], np.inf, dtype=np.double)
    for i, row_data in enumerate(lil_matrix.data):
        if min_points < len(row_data):
            core_distance[i] = sorted(row_data)[min_points]

    nz_row_data, nz_col_data = lil_matrix.nonzero()

    for i, j in zip(nz_row_data, nz_col_data):
        mr_dist = max(core_distance[i], core_distance[j], lil_matrix[i, j])
        # Infinite distances mean "no edge"; do not store them.
        if np.isfinite(mr_dist):
            result[i, j] = mr_dist

    return result.tocsr()


def sparse_mutual_reachability_new(lil_mat, min_points=5):
    """Compute the sparse mutual-reachability graph with vectorized NumPy.

    Vectorized counterpart of ``sparse_mutual_reachability_old``; the
    original author reported it as 20+ times faster than that version.

    Parameters
    ----------
    lil_mat : scipy.sparse.lil_matrix
        Sparse distance matrix. It is read through its per-row ``data``
        lists and then converted to CSR.
    min_points : int, optional (default=5)
        Index into each row's sorted stored distances that is used as the
        core distance of that row. A row holding ``min_points`` or fewer
        stored distances gets an infinite core distance.

    Returns
    -------
    scipy.sparse.csr_matrix
        For every nonzero entry ``(i, j)`` of the input, the value
        ``max(core[i], core[j], d[i, j])``. Entries whose value is not
        finite are left out, matching ``sparse_mutual_reachability_old``.
    """
    from scipy.sparse import csr_matrix

    # Start from "unreachable" and overwrite only rows with enough neighbours.
    # np.inf is used because the np.infty alias was removed in NumPy 2.0.
    core_distance = np.full(lil_mat.shape[0], np.inf, dtype=np.double)
    for i, row_data in enumerate(lil_mat.data):
        if min_points < len(row_data):
            core_distance[i] = sorted(row_data)[min_points]

    # tocsr() builds a new matrix, so dropping explicit zeros here does not
    # touch the caller's input.
    csr_mat = lil_mat.tocsr()
    csr_mat.eliminate_zeros()

    # Take row, col and data from one COO view so the three arrays are
    # aligned by construction.
    coo_mat = csr_mat.tocoo()
    row, col, data = coo_mat.row, coo_mat.col, coo_mat.data

    # Element-wise max of both core distances and the raw distance.
    # np.maximum propagates NaN; such entries are removed by the finite
    # filter below.
    mr_dist = np.maximum(np.maximum(core_distance[row], core_distance[col]),
                         data)

    # Infinite distances mean "no edge"; do not store them.
    finite = np.isfinite(mr_dist)
    return csr_matrix((mr_dist[finite], (row[finite], col[finite])),
                      shape=lil_mat.shape)

def kdtree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5,
alpha=1.0, **kwargs):
dim = distance_matrix.shape[0]
Expand Down