
"""Graph clustering algorithms"""

import os
from collections.abc import Iterable
from math import log, prod
from typing import Sequence

import community
import torch
import pymetis
import numpy as np
import numba

from local2global_embedding.network import TGraph, NPGraph
from local2global_embedding import progress


def distributed_clustering(graph: TGraph, beta, rounds=None, patience=3, min_samples=2):
    r"""
    Distributed clustering algorithm

    Implements the algorithm of [#dist]_ with gpu support

    Args:
        graph: input graph
        beta: :math:`\beta` value of the algorithm (controls the number of seeds)
        rounds: number of iteration rounds (default: ``3*int(log(graph.num_nodes))``)
        patience: number of rounds without label changes before early stopping (default: ``3``)
        min_samples: minimum number of seed nodes (default: ``2``)

    .. rubric:: Reference

    .. [#dist] H. Sun and L. Zanetti. "Distributed Graph Clustering and Sparsification".
               ACM Transactions on Parallel Computing 6.3 (2019), pp. 1-23.
               doi: `10.1145/3364208 <https://doi.org/10.1145/3364208>`_.

    """
    if rounds is None:
        rounds = 3 * int(log(graph.num_nodes))
    strength = graph.strength

    # sample seed nodes (resample until at least min_samples seeds are selected)
    index = torch.rand((graph.num_nodes,)) < 1 / beta * log(1 / beta) * strength / strength.sum()
    while index.sum() < min_samples:
        index = torch.rand((graph.num_nodes,)) < 1 / beta * log(1 / beta) * strength / strength.sum()
    seeds = torch.nonzero(index).flatten()
    n_samples = seeds.numel()

    states = torch.zeros((graph.num_nodes, n_samples), dtype=torch.double, device=graph.device)
    states[index, torch.arange(n_samples, device=graph.device)] = 1 / torch.sqrt(strength[index]).to(dtype=torch.double)
    clusters = torch.argmax(states, dim=1)
    weights = graph.weights / torch.sqrt(strength[graph.edge_index[0]] * strength[graph.edge_index[1]])
    weights = weights.to(dtype=torch.double)

    r = 0
    num_same = 0
    while r < rounds and num_same < patience:
        # keep iterating until clustering does not change for 'patience' rounds
        r += 1
        states *= 0.5
        states.index_add_(0, graph.edge_index[0], 0.5 * states[graph.edge_index[1]] * weights.view(-1, 1))
        old_clusters = clusters
        clusters = torch.argmax(states, dim=1)
        if torch.equal(old_clusters, clusters):
            num_same += 1
        else:
            num_same = 0

    # nodes never reached by any seed remain unlabelled
    clusters[states[range(graph.num_nodes), clusters] == 0] = -1
    uc, clusters = torch.unique(clusters, return_inverse=True)
    if uc[0] == -1:
        clusters -= 1
    return clusters

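# Illustrative note (derived from the seed-sampling step above, not part of the
# original code): each node becomes a seed with probability
# (1/beta) * log(1/beta) * strength_i / strength.sum(), so the expected number of
# seeds is roughly (1/beta) * log(1/beta), independent of graph size, e.g.
#
#     beta = 0.01
#     expected_seeds = 1 / beta * log(1 / beta)  # ~460 seeds
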
def fennel_clustering(graph, num_clusters, load_limit=1.1, alpha=None, gamma=1.5, num_iters=1, clusters=None):
    r"""
    FENNEL single-pass graph clustering algorithm

    Implements the graph clustering algorithm of [#fennel]_.

    Args:
        graph: input graph
        num_clusters: target number of clusters
        load_limit: maximum cluster size is ``load_limit * graph.num_nodes / num_clusters`` (default: ``1.1``)
        alpha: :math:`\alpha` value for the algorithm (default as suggested in [#fennel]_)
        gamma: :math:`\gamma` value for the algorithm (default: ``1.5``)
        num_iters: number of cluster assignment iterations (default: ``1``)
        clusters: input clustering to refine (optional)

    Returns:
        cluster index tensor

    References:
        .. [#fennel] C. Tsourakakis et al. "FENNEL: Streaming Graph Partitioning for Massive Scale Graphs".
                     In: Proceedings of the 7th ACM international conference on Web search and data mining.
                     WSDM'14 (2014). doi: `10.1145/2556195.2556213 <https://doi.org/10.1145/2556195.2556213>`_.

    """
    graph = graph.to(NPGraph)
    if clusters is None:
        clusters = _fennel_clustering(graph.edge_index, graph.adj_index, graph.num_nodes, num_clusters,
                                      load_limit, alpha, gamma, num_iters)
    else:
        clusters = _fennel_clustering(graph.edge_index, graph.adj_index, graph.num_nodes, num_clusters,
                                      load_limit, alpha, gamma, num_iters, clusters)
    return torch.as_tensor(clusters)

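# Worked example (illustrative) of the default alpha used by the kernel below: for a
# graph with 1000 nodes, 10000 (directed) edges and num_clusters=10 at the default
# gamma=1.5,
#
#     alpha = 10000 * 10 ** 0.5 / 1000 ** 1.5  # ~1.0
#
# which keeps the size penalty alpha * gamma * size ** (gamma - 1) on a comparable
# scale to the neighbour-count gain for clusters of around average size.
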
@numba.njit
def _fennel_clustering(edge_index, adj_index, num_nodes, num_clusters, load_limit=1.1, alpha=None, gamma=1.5,
                       num_iters=1, clusters=np.empty(0, dtype=np.int64)):
    """Numba-compiled core of :py:func:`fennel_clustering` operating on raw CSR-style arrays."""
    if num_iters is None:
        num_iters = 1

    num_edges = edge_index.shape[1]

    if alpha is None:
        alpha = num_edges * (num_clusters ** (gamma - 1)) / (num_nodes ** gamma)

    partition_sizes = np.zeros(num_clusters, dtype=np.int64)
    if clusters.size == 0:
        clusters = np.full((num_nodes,), -1, dtype=np.int64)
    else:
        clusters = np.copy(clusters)
        for index in clusters:
            partition_sizes[index] += 1

    load_limit *= num_nodes / num_clusters

    # marginal cost of adding a node to each cluster at its current size
    deltas = -alpha * gamma * (partition_sizes ** (gamma - 1))

    with numba.objmode:
        progress.reset(num_nodes)

    for it in range(num_iters):
        not_converged = 0

        progress_it = 0
        for i in range(num_nodes):
            # gather the cluster labels of node i's neighbours
            cluster_indices = np.empty((adj_index[i + 1] - adj_index[i],), dtype=np.int64)
            for ni, index in enumerate(range(adj_index[i], adj_index[i + 1])):
                cluster_indices[ni] = clusters[edge_index[1, index]]
            old_cluster = clusters[i]
            if old_cluster >= 0:
                partition_sizes[old_cluster] -= 1
            cluster_indices = cluster_indices[cluster_indices >= 0]

            if cluster_indices.size > 0:
                c_size = np.zeros(num_clusters, dtype=np.int64)
                for index in cluster_indices:
                    c_size[index] += 1
                ind = np.argmax(deltas + c_size)
            else:
                ind = np.argmax(deltas)

            clusters[i] = ind
            partition_sizes[ind] += 1

            if partition_sizes[ind] >= load_limit:  # '>=' rather than '==', as the limit is generally fractional
                deltas[ind] = -np.inf
            else:
                deltas[ind] = -alpha * gamma * (partition_sizes[ind] ** (gamma - 1))
            not_converged += ind != old_cluster

            if i % 10000 == 0 and i > 0:
                progress_it = i
                with numba.objmode:
                    progress.update(10000)
        with numba.objmode:
            progress.update(num_nodes - progress_it)

        print('iteration: ' + str(it) + ', not converged: ' + str(not_converged))

        if not_converged == 0:
            print('converged after ' + str(it) + ' iterations.')
            break

    with numba.objmode:
        progress.close()

    return clusters

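# Usage sketch (illustrative; the public entry point is ``fennel_clustering``): the
# kernel expects CSR-style arrays where ``edge_index[1]`` lists neighbours node by
# node and ``adj_index`` holds each node's offsets into that list. For the path
# graph 0-1-2-3:
#
#     edge_index = np.array([[0, 1, 1, 2, 2, 3], [1, 0, 2, 1, 3, 2]], dtype=np.int64)
#     adj_index = np.array([0, 1, 3, 5, 6], dtype=np.int64)
#     clusters = _fennel_clustering(edge_index, adj_index, 4, 2)
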
def louvain_clustering(graph: TGraph, *args, **kwargs):
    r"""
    Implements clustering using the Louvain [#l]_ algorithm for modularity optimisation

    Args:
        graph: input graph

    Returns:
        partition tensor

    This is a minimal wrapper around :py:func:`community.best_partition` from the
    `python-louvain <https://github.com/taynaud/python-louvain>`_ package. Any other arguments provided are
    passed through.

    References:
        .. [#l] V. D. Blondel et al. "Fast unfolding of communities in large networks".
                Journal of Statistical Mechanics: Theory and Experiment 2008.10 (2008), P10008.
                doi: `10.1088/1742-5468/2008/10/P10008 <https://doi.org/10.1088/1742-5468/2008/10/P10008>`_

    """
    clusters = community.best_partition(graph.to_networkx().to_undirected(), *args, **kwargs)
    return torch.tensor([clusters[i] for i in range(graph.num_nodes)], dtype=torch.long)

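# Usage sketch (illustrative): extra arguments are forwarded to
# ``community.best_partition``, e.g. its ``resolution`` parameter. The ``TGraph``
# construction below is an assumption; see ``local2global_embedding.network.TGraph``
# for the actual constructor signature.
#
#     graph = TGraph(torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]]))
#     clusters = louvain_clustering(graph, resolution=0.8)
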
def metis_clustering(graph: TGraph, num_clusters):
    """
    Implements clustering using METIS

    Args:
        graph: input graph
        num_clusters: number of clusters

    Returns:
        partition tensor

    This uses the `pymetis <https://github.com/inducer/pymetis>`_ package.

    References:
        .. [#metis] G. Karypis and V. Kumar.
                    "A Fast and High Quality Multilevel Scheme for Partitioning Irregular Graphs".
                    SIAM Journal on Scientific Computing 20.1 (1999), pp. 359-392.

    """
    graph = graph.to(NPGraph)
    n_cuts, memberships = pymetis.part_graph(num_clusters, adjncy=graph.edge_index[1], xadj=graph.adj_index,
                                             eweights=graph.edge_attr)
    return torch.as_tensor(memberships, dtype=torch.long, device=graph.device)

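# Note (illustrative): ``pymetis.part_graph`` consumes the same CSR-style layout as
# ``_fennel_clustering`` above, with ``xadj`` the per-node offsets and ``adjncy`` the
# flattened neighbour list, e.g. for the path graph 0-1-2-3:
#
#     n_cuts, memberships = pymetis.part_graph(2, adjncy=[1, 0, 2, 1, 3, 2],
#                                              xadj=[0, 1, 3, 5, 6])
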
def spread_clustering(graph, num_clusters, max_degree_init=True):
    """
    Cluster a graph by spreading from seed nodes

    Args:
        graph: input graph
        num_clusters: initial number of clusters (may grow if the graph has disconnected components)
        max_degree_init: if ``True``, use the highest-degree nodes as seeds,
                         else sample seeds with probability proportional to degree (default: ``True``)

    Returns:
        partition tensor
    """
    clusters = torch.full((graph.num_nodes,), -1, dtype=torch.long, device=graph.device)
    if max_degree_init:
        seeds = torch.topk(torch.as_tensor(graph.degree), k=num_clusters).indices
    else:
        seeds = torch.multinomial(torch.as_tensor(graph.degree), num_clusters, replacement=False)
    clusters[seeds] = torch.arange(num_clusters)
    spread_weights = torch.zeros((num_clusters, graph.num_nodes), dtype=torch.double, device=graph.device)
    spread_weights[:, seeds] = -1
    unassigned = clusters < 0
    for seed in seeds:
        c = clusters[seed]
        inds, weights = graph.adj_weighted(seed)
        keep = unassigned[inds]
        spread_weights[c, inds[keep]] += weights[keep] / graph.strength[inds[keep]]

    num_unassigned = graph.num_nodes - num_clusters
    while num_unassigned > 0:
        made_progress = False
        for c in range(num_clusters):
            node = torch.argmax(spread_weights[c])
            if spread_weights[c, node] > 0:  # make sure node is actually connected to cluster
                made_progress = True
                clusters[node] = c
                spread_weights[:, node] = -1  # should not be chosen again
                unassigned[node] = False
                num_unassigned -= 1
                inds, weights = graph.adj_weighted(node)
                keep = unassigned[inds]
                spread_weights[c, inds[keep]] += weights[keep] / graph.strength[inds[keep]]
        if not made_progress:
            print('increasing number of clusters due to disconnected components')
            unassigned_nodes = torch.nonzero(unassigned).ravel()
            if max_degree_init:
                seed = unassigned_nodes[torch.argmax(torch.as_tensor(graph.degree[unassigned_nodes]))]
            else:
                seed = unassigned_nodes[torch.multinomial(torch.as_tensor(graph.degree[unassigned_nodes]), 1)]
            clusters[seed] = num_clusters
            spread_weights = torch.cat((spread_weights,
                                        torch.zeros((1, graph.num_nodes), dtype=torch.double, device=graph.device)))
            unassigned[seed] = False
            spread_weights[:, seed] = -1
            inds, weights = graph.adj_weighted(seed)
            keep = unassigned[inds]
            spread_weights[num_clusters, inds[keep]] += weights[keep] / graph.strength[inds[keep]]
            num_clusters += 1
            num_unassigned -= 1
    return clusters

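# Usage sketch (illustrative; the ``TGraph`` construction is an assumption, see
# ``local2global_embedding.network.TGraph``): with ``max_degree_init=True`` the seeds
# are the two highest-degree nodes and clusters grow along the strongest connections
# to unassigned nodes.
#
#     graph = TGraph(torch.tensor([[0, 1, 1, 2, 3, 4], [1, 0, 2, 1, 4, 3]]))
#     clusters = spread_clustering(graph, num_clusters=2)
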
def hierarchical_aglomerative_clustering(graph, method=spread_clustering, levels=None, branch_factors=None):
    """
    Hierarchical agglomerative clustering

    Args:
        graph: input graph
        method: clustering method used at each level (default: :py:func:`spread_clustering`)
        levels: number of levels in the hierarchy
        branch_factors: branching factor per level (scalar or sequence of length ``levels``)

    Returns:
        list of cluster index tensors, one per level, ordered from finest to coarsest
    """
    if branch_factors is None:
        branch_factors = [graph.num_nodes ** (1 / (levels + 1)) for _ in range(levels)]
    else:
        if not isinstance(branch_factors, Iterable):
            branch_factors = [branch_factors] * levels
        else:
            if levels is None:
                levels = len(branch_factors)
            elif len(branch_factors) != levels:
                raise ValueError(f"{levels=} does not match {len(branch_factors)=}")
    num_clusters = np.cumprod(branch_factors)[::-1]
    clusters = []
    rgraph = graph
    for c in num_clusters:
        cluster = method(rgraph, int(c))
        rgraph = rgraph.partition_graph(cluster)
        clusters.append(cluster)
    return clusters

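# Usage sketch (illustrative): with ``levels=2`` and ``branch_factors=4``,
# ``num_clusters`` evaluates to ``[16, 4]``, so the graph is first clustered into 16
# parts and the resulting partition graph into 4; the returned list runs from finest
# to coarsest.
#
#     clusters = hierarchical_aglomerative_clustering(graph, levels=2, branch_factors=4)
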
class Partition(Sequence):
    """Sequence view of a partition tensor: ``partition[i]`` is the tensor of nodes in part ``i``."""

    def __init__(self, partition_tensor):
        partition_tensor = torch.as_tensor(partition_tensor)
        counts = torch.bincount(partition_tensor)
        self.num_parts = len(counts)
        self.nodes = torch.argsort(partition_tensor)
        self.part_index = torch.zeros(self.num_parts + 1, dtype=torch.long)
        self.part_index[1:] = torch.cumsum(counts, dim=0)

    def __getitem__(self, item):
        return self.nodes[self.part_index[item]:self.part_index[item + 1]]

    def __len__(self):
        return self.num_parts
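
# Example (derived from the class above): sequence-style access to the parts of a
# partition tensor.
#
#     partition = Partition(torch.tensor([0, 1, 0, 2, 1]))
#     len(partition)  # 3
#     partition[0]    # tensor([0, 2]) -- the nodes in part 0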