
"""Graph clustering algorithms"""

import os
from collections.abc import Iterable
from math import log, prod
from typing import Sequence

import community
import torch
import pymetis
import numpy as np
import numba

from local2global_embedding.network import TGraph, NPGraph
from local2global_embedding import progress


def distributed_clustering(graph: TGraph, beta, rounds=None, patience=3, min_samples=2):
    r"""
    Distributed clustering algorithm

    Implements the algorithm of [#dist]_ with gpu support

    Args:
        graph: input graph
        beta: :math:`\beta` value of the algorithm (controls the number of seeds)
        rounds: number of iteration rounds (default: ``3*int(log(graph.num_nodes))``)
        patience: number of rounds without label changes before early stopping (default: ``3``)
        min_samples: minimum number of seed nodes (default: ``2``)

    .. rubric:: Reference

    .. [#dist] H. Sun and L. Zanetti. "Distributed Graph Clustering and Sparsification".
               ACM Transactions on Parallel Computing 6.3 (2019), pp. 1-23.
               doi: `10.1145/3364208 <https://doi.org/10.1145/3364208>`_.

    """
    if rounds is None:
        rounds = 3 * int(log(graph.num_nodes))
    strength = graph.strength

    # sample seed nodes (resample until at least min_samples seeds are selected)
    index = torch.rand((graph.num_nodes,)) < 1 / beta * log(1 / beta) * strength / strength.sum()
    while index.sum() < min_samples:
        index = torch.rand((graph.num_nodes,)) < 1 / beta * log(1 / beta) * strength / strength.sum()
    seeds = torch.nonzero(index).flatten()
    n_samples = seeds.numel()

    states = torch.zeros((graph.num_nodes, n_samples), dtype=torch.double, device=graph.device)
    states[index, torch.arange(n_samples, device=graph.device)] = 1 / torch.sqrt(strength[index]).to(dtype=torch.double)
    clusters = torch.argmax(states, dim=1)
    weights = graph.weights / torch.sqrt(strength[graph.edge_index[0]] * strength[graph.edge_index[1]])
    weights = weights.to(dtype=torch.double)

    r = 0
    num_same = 0
    while r < rounds and num_same < patience:
        # keep iterating until clustering does not change for 'patience' rounds
        r += 1
        states *= 0.5
        states.index_add_(0, graph.edge_index[0], 0.5 * states[graph.edge_index[1]] * weights.view(-1, 1))
        old_clusters = clusters
        clusters = torch.argmax(states, dim=1)
        if torch.equal(old_clusters, clusters):
            num_same += 1
        else:
            num_same = 0

    # nodes never reached by any seed remain unlabelled
    clusters[states[range(graph.num_nodes), clusters] == 0] = -1
    uc, clusters = torch.unique(clusters, return_inverse=True)
    if uc[0] == -1:
        clusters -= 1
    return clusters

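# Illustrative note (derived from the seed-sampling step above, not part of the
# original code): each node becomes a seed with probability
# (1/beta) * log(1/beta) * strength_i / strength.sum(), so the expected number of
# seeds is roughly (1/beta) * log(1/beta), independent of graph size, e.g.
#
#     beta = 0.01
#     expected_seeds = 1 / beta * log(1 / beta)  # ~460 seeds
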
def fennel_clustering(graph, num_clusters, load_limit=1.1, alpha=None, gamma=1.5, num_iters=1, clusters=None):
    r"""
    FENNEL single-pass graph clustering algorithm

    Implements the graph clustering algorithm of [#fennel]_.

    Args:
        graph: input graph
        num_clusters: target number of clusters
        load_limit: maximum cluster size is ``load_limit * graph.num_nodes / num_clusters`` (default: ``1.1``)
        alpha: :math:`\alpha` value for the algorithm (default as suggested in [#fennel]_)
        gamma: :math:`\gamma` value for the algorithm (default: ``1.5``)
        num_iters: number of cluster assignment iterations (default: ``1``)
        clusters: input clustering to refine (optional)

    Returns:
        cluster index tensor

    References:
        .. [#fennel] C. Tsourakakis et al. "FENNEL: Streaming Graph Partitioning for Massive Scale Graphs".
                     In: Proceedings of the 7th ACM international conference on Web search and data mining.
                     WSDM'14 (2014). doi: `10.1145/2556195.2556213 <https://doi.org/10.1145/2556195.2556213>`_.

    """
    graph = graph.to(NPGraph)
    if clusters is None:
        clusters = _fennel_clustering(graph.edge_index, graph.adj_index, graph.num_nodes, num_clusters,
                                      load_limit, alpha, gamma, num_iters)
    else:
        clusters = _fennel_clustering(graph.edge_index, graph.adj_index, graph.num_nodes, num_clusters,
                                      load_limit, alpha, gamma, num_iters, clusters)
    return torch.as_tensor(clusters)

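# Worked example (illustrative) of the default alpha used by the kernel below: for a
# graph with 1000 nodes, 10000 (directed) edges and num_clusters=10 at the default
# gamma=1.5,
#
#     alpha = 10000 * 10 ** 0.5 / 1000 ** 1.5  # ~1.0
#
# which keeps the size penalty alpha * gamma * size ** (gamma - 1) on a comparable
# scale to the neighbour-count gain for clusters of around average size.
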
@numba.njit
def _fennel_clustering(edge_index, adj_index, num_nodes, num_clusters, load_limit=1.1, alpha=None, gamma=1.5,
                       num_iters=1, clusters=np.empty(0, dtype=np.int64)):
    """Numba-compiled core of :py:func:`fennel_clustering` operating on raw CSR-style arrays."""
    if num_iters is None:
        num_iters = 1

    num_edges = edge_index.shape[1]

    if alpha is None:
        alpha = num_edges * (num_clusters ** (gamma - 1)) / (num_nodes ** gamma)

    partition_sizes = np.zeros(num_clusters, dtype=np.int64)
    if clusters.size == 0:
        clusters = np.full((num_nodes,), -1, dtype=np.int64)
    else:
        clusters = np.copy(clusters)
        for index in clusters:
            partition_sizes[index] += 1

    load_limit *= num_nodes / num_clusters

    # marginal cost of adding a node to each cluster at its current size
    deltas = -alpha * gamma * (partition_sizes ** (gamma - 1))

    with numba.objmode:
        progress.reset(num_nodes)

    for it in range(num_iters):
        not_converged = 0

        progress_it = 0
        for i in range(num_nodes):
            # gather the cluster labels of node i's neighbours
            cluster_indices = np.empty((adj_index[i + 1] - adj_index[i],), dtype=np.int64)
            for ni, index in enumerate(range(adj_index[i], adj_index[i + 1])):
                cluster_indices[ni] = clusters[edge_index[1, index]]
            old_cluster = clusters[i]
            if old_cluster >= 0:
                partition_sizes[old_cluster] -= 1
            cluster_indices = cluster_indices[cluster_indices >= 0]

            if cluster_indices.size > 0:
                c_size = np.zeros(num_clusters, dtype=np.int64)
                for index in cluster_indices:
                    c_size[index] += 1
                ind = np.argmax(deltas + c_size)
            else:
                ind = np.argmax(deltas)

            clusters[i] = ind
            partition_sizes[ind] += 1

            if partition_sizes[ind] >= load_limit:  # '>=' rather than '==', as the limit is generally fractional
                deltas[ind] = -np.inf
            else:
                deltas[ind] = -alpha * gamma * (partition_sizes[ind] ** (gamma - 1))
            not_converged += ind != old_cluster

            if i % 10000 == 0 and i > 0:
                progress_it = i
                with numba.objmode:
                    progress.update(10000)
        with numba.objmode:
            progress.update(num_nodes - progress_it)

        print('iteration: ' + str(it) + ', not converged: ' + str(not_converged))

        if not_converged == 0:
            print('converged after ' + str(it) + ' iterations.')
            break

    with numba.objmode:
        progress.close()

    return clusters

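# Usage sketch (illustrative; the public entry point is ``fennel_clustering``): the
# kernel expects CSR-style arrays where ``edge_index[1]`` lists neighbours node by
# node and ``adj_index`` holds each node's offsets into that list. For the path
# graph 0-1-2-3:
#
#     edge_index = np.array([[0, 1, 1, 2, 2, 3], [1, 0, 2, 1, 3, 2]], dtype=np.int64)
#     adj_index = np.array([0, 1, 3, 5, 6], dtype=np.int64)
#     clusters = _fennel_clustering(edge_index, adj_index, 4, 2)
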
def louvain_clustering(graph: TGraph, *args, **kwargs):
    r"""
    Implements clustering using the Louvain [#l]_ algorithm for modularity optimisation

    Args:
        graph: input graph

    Returns:
        partition tensor

    This is a minimal wrapper around :py:func:`community.best_partition` from the
    `python-louvain <https://github.com/taynaud/python-louvain>`_ package. Any other arguments provided are
    passed through.

    References:
        .. [#l] V. D. Blondel et al. "Fast unfolding of communities in large networks".
                Journal of Statistical Mechanics: Theory and Experiment 2008.10 (2008), P10008.
                doi: `10.1088/1742-5468/2008/10/P10008 <https://doi.org/10.1088/1742-5468/2008/10/P10008>`_

    """
    clusters = community.best_partition(graph.to_networkx().to_undirected(), *args, **kwargs)
    return torch.tensor([clusters[i] for i in range(graph.num_nodes)], dtype=torch.long)

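# Usage sketch (illustrative): extra arguments are forwarded to
# ``community.best_partition``, e.g. its ``resolution`` parameter. The ``TGraph``
# construction below is an assumption; see ``local2global_embedding.network.TGraph``
# for the actual constructor signature.
#
#     graph = TGraph(torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]]))
#     clusters = louvain_clustering(graph, resolution=0.8)
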
def metis_clustering(graph: TGraph, num_clusters):
    """
    Implements clustering using METIS

    Args:
        graph: input graph
        num_clusters: number of clusters

    Returns:
        partition tensor

    This uses the `pymetis <https://github.com/inducer/pymetis>`_ package.

    References:
        .. [#metis] G. Karypis and V. Kumar.
                    "A Fast and High Quality Multilevel Scheme for Partitioning Irregular Graphs".
                    SIAM Journal on Scientific Computing 20.1 (1999), pp. 359-392.

    """
    graph = graph.to(NPGraph)
    n_cuts, memberships = pymetis.part_graph(num_clusters, adjncy=graph.edge_index[1], xadj=graph.adj_index,
                                             eweights=graph.edge_attr)
    return torch.as_tensor(memberships, dtype=torch.long, device=graph.device)

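# Note (illustrative): ``pymetis.part_graph`` consumes the same CSR-style layout as
# ``_fennel_clustering`` above, with ``xadj`` the per-node offsets and ``adjncy`` the
# flattened neighbour list, e.g. for the path graph 0-1-2-3:
#
#     n_cuts, memberships = pymetis.part_graph(2, adjncy=[1, 0, 2, 1, 3, 2],
#                                              xadj=[0, 1, 3, 5, 6])
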
def spread_clustering(graph, num_clusters, max_degree_init=True):
    """
    Cluster a graph by spreading from seed nodes

    Args:
        graph: input graph
        num_clusters: initial number of clusters (may grow if the graph has disconnected components)
        max_degree_init: if ``True``, use the highest-degree nodes as seeds,
                         else sample seeds with probability proportional to degree (default: ``True``)

    Returns:
        partition tensor
    """
    clusters = torch.full((graph.num_nodes,), -1, dtype=torch.long, device=graph.device)
    if max_degree_init:
        seeds = torch.topk(torch.as_tensor(graph.degree), k=num_clusters).indices
    else:
        seeds = torch.multinomial(torch.as_tensor(graph.degree), num_clusters, replacement=False)
    clusters[seeds] = torch.arange(num_clusters)
    spread_weights = torch.zeros((num_clusters, graph.num_nodes), dtype=torch.double, device=graph.device)
    spread_weights[:, seeds] = -1
    unassigned = clusters < 0
    for seed in seeds:
        c = clusters[seed]
        inds, weights = graph.adj_weighted(seed)
        keep = unassigned[inds]
        spread_weights[c, inds[keep]] += weights[keep] / graph.strength[inds[keep]]

    num_unassigned = graph.num_nodes - num_clusters
    while num_unassigned > 0:
        made_progress = False
        for c in range(num_clusters):
            node = torch.argmax(spread_weights[c])
            if spread_weights[c, node] > 0:  # make sure node is actually connected to cluster
                made_progress = True
                clusters[node] = c
                spread_weights[:, node] = -1  # should not be chosen again
                unassigned[node] = False
                num_unassigned -= 1
                inds, weights = graph.adj_weighted(node)
                keep = unassigned[inds]
                spread_weights[c, inds[keep]] += weights[keep] / graph.strength[inds[keep]]
        if not made_progress:
            print('increasing number of clusters due to disconnected components')
            unassigned_nodes = torch.nonzero(unassigned).ravel()
            if max_degree_init:
                seed = unassigned_nodes[torch.argmax(torch.as_tensor(graph.degree[unassigned_nodes]))]
            else:
                seed = unassigned_nodes[torch.multinomial(torch.as_tensor(graph.degree[unassigned_nodes]), 1)]
            clusters[seed] = num_clusters
            spread_weights = torch.cat((spread_weights,
                                        torch.zeros((1, graph.num_nodes), dtype=torch.double, device=graph.device)))
            unassigned[seed] = False
            spread_weights[:, seed] = -1
            inds, weights = graph.adj_weighted(seed)
            keep = unassigned[inds]
            spread_weights[num_clusters, inds[keep]] += weights[keep] / graph.strength[inds[keep]]
            num_clusters += 1
            num_unassigned -= 1
    return clusters

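# Usage sketch (illustrative; the ``TGraph`` construction is an assumption, see
# ``local2global_embedding.network.TGraph``): with ``max_degree_init=True`` the seeds
# are the two highest-degree nodes and clusters grow along the strongest connections
# to unassigned nodes.
#
#     graph = TGraph(torch.tensor([[0, 1, 1, 2, 3, 4], [1, 0, 2, 1, 4, 3]]))
#     clusters = spread_clustering(graph, num_clusters=2)
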
def hierarchical_aglomerative_clustering(graph, method=spread_clustering, levels=None, branch_factors=None):
    """
    Hierarchical agglomerative clustering

    Args:
        graph: input graph
        method: clustering method used at each level (default: :py:func:`spread_clustering`)
        levels: number of levels in the hierarchy
        branch_factors: branching factor per level (scalar or sequence of length ``levels``)

    Returns:
        list of cluster index tensors, one per level, ordered from finest to coarsest
    """
    if branch_factors is None:
        branch_factors = [graph.num_nodes ** (1 / (levels + 1)) for _ in range(levels)]
    else:
        if not isinstance(branch_factors, Iterable):
            branch_factors = [branch_factors] * levels
        else:
            if levels is None:
                levels = len(branch_factors)
            elif len(branch_factors) != levels:
                raise ValueError(f"{levels=} does not match {len(branch_factors)=}")
    num_clusters = np.cumprod(branch_factors)[::-1]
    clusters = []
    rgraph = graph
    for c in num_clusters:
        cluster = method(rgraph, int(c))
        rgraph = rgraph.partition_graph(cluster)
        clusters.append(cluster)
    return clusters

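# Usage sketch (illustrative): with ``levels=2`` and ``branch_factors=4``,
# ``num_clusters`` evaluates to ``[16, 4]``, so the graph is first clustered into 16
# parts and the resulting partition graph into 4; the returned list runs from finest
# to coarsest.
#
#     clusters = hierarchical_aglomerative_clustering(graph, levels=2, branch_factors=4)
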
class Partition(Sequence):
    """Sequence view of a partition tensor: ``partition[i]`` is the tensor of nodes in part ``i``."""

    def __init__(self, partition_tensor):
        partition_tensor = torch.as_tensor(partition_tensor)
        counts = torch.bincount(partition_tensor)
        self.num_parts = len(counts)
        self.nodes = torch.argsort(partition_tensor)
        self.part_index = torch.zeros(self.num_parts + 1, dtype=torch.long)
        self.part_index[1:] = torch.cumsum(counts, dim=0)

    def __getitem__(self, item):
        return self.nodes[self.part_index[item]:self.part_index[item + 1]]

    def __len__(self):
        return self.num_parts
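
# Example (derived from the class above): sequence-style access to the parts of a
# partition tensor.
#
#     partition = Partition(torch.tensor([0, 1, 0, 2, 1]))
#     len(partition)  # 3
#     partition[0]    # tensor([0, 2]) -- the nodes in part 0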