Source code for geno4sd.utils.clustering_tools

import numpy as np
from scipy.sparse import csgraph
from numpy import linalg as LA
from scipy.spatial.distance import pdist, squareform
from matplotlib import pyplot as plt

[docs]def getAffinityMatrix(coordinates, n_cluster = 10): """ Calculate affinity matrix based on input coordinates matrix and the numeber of nearest neighbours. Apply local scaling based on the k nearest neighbour References: https://papers.nips.cc/paper/2619-self-tuning-spectral-clustering.pdf """ # calculate euclidian distance matrix dists = squareform(pdist(coordinates)) # for each row, sort the distances ascendingly and take the index of the #k-th position (nearest neighbour) knn_distances = np.sort(dists, axis=0)[n_cluster] knn_distances = knn_distances[np.newaxis].T # calculate sigma_i * sigma_j local_scale = knn_distances.dot(knn_distances.T) affinity_matrix = dists * dists affinity_matrix = -affinity_matrix / local_scale # divide square distance matrix by local scale affinity_matrix[np.where(np.isnan(affinity_matrix))] = 0.0 # apply exponential affinity_matrix = np.exp(affinity_matrix) np.fill_diagonal(affinity_matrix, 0) return affinity_matrix
[docs]def eigenDecomposition(A, plot = True, topK = 5): """ This method performs the eigen decomposition on a given affinity matrix Parameters: ----------- A: Affinity matrix plot: plots the sorted eigen values for visual inspection Returns: -------- A tuple containing: - the optimal number of clusters by eigengap heuristic - all eigen values - all eigen vectors References: https://papers.nips.cc/paper/2619-self-tuning-spectral-clustering.pdf http://www.kyb.mpg.de/fileadmin/user_upload/files/publications/attachments/Luxburg07_tutorial_4488%5b0%5d.pdf """ L = csgraph.laplacian(A, normed=True) # LM parameter : Eigenvalues with largest magnitude (eigs, eigsh), that is, largest eigenvalues in # the euclidean norm of complex numbers. eigenvalues, eigenvectors = LA.eig(L) if plot: plt.title('Largest eigen values of input matrix') plt.scatter(np.arange(len(eigenvalues)), eigenvalues) plt.grid() plt.show() plt.close() # Identify the optimal number of clusters as the index corresponding # to the larger gap between eigen values index_largest_gap = np.argsort(np.diff(eigenvalues))[::-1][:topK] nb_clusters = index_largest_gap + 1 return nb_clusters, eigenvalues, eigenvectors