Source code for netrd.distance.dk_series


Graph distance based on the dk-series.

author: Brennan Klein & Stefan McCabe
Submitted as part of the 2019 NetSI Collabathon.


import networkx as nx
import numpy as np
from scipy.sparse import coo_matrix
from collections import defaultdict
from .base import BaseDistance
from ..utilities import entropy, undirected, unweighted

[docs]class dkSeries(BaseDistance): """Compare graphs based on their :math:`dk`-series."""
[docs] @unweighted @undirected def dist(self, G1, G2, d=2): r"""Compute the distance between two graphs by using the Jensen-Shannon divergence between the :math:`dk`-series of the graphs. The :math:`dk`-series of a graph is the collection of distributions of size :math:`d` subgraphs, where nodes are labelled by degrees. For simplicity, we currently consider only the :math:`1k`-series, i.e., the degree distribution, or the :math:`2k`-series, i.e., the distribution of edges between nodes of degree :math:`(k_i, k_j)`. The distance between these :math:`dk`-series is calculated using the Jensen-Shannon divergence. Parameters ---------- G1, G2 (nx.Graph) two networkx graphs to be compared d (int) the size of the subgraph to consider Returns ------- dist (float) the distance between `G1` and `G2`. References ---------- .. [1] Orsini, Chiara, Marija M. Dankulov, Pol Colomer-de-Simón, Almerima Jamakovic, Priya Mahadevan, Amin Vahdat, Kevin E. Bassler, et al. 2015. “Quantifying Randomness in Real Networks.” Nature Communications 6 (1). """ N = max(len(G1), len(G2)) if d == 1: from .degree_divergence import DegreeDivergence degdiv = DegreeDivergence() dist = degdiv.dist() # the 2k-distance stores the distribution in a sparse matrix, # so here we take the output of DegreeDivergence and # produce a comparable object hist1, hist2 = degdiv.results["degree_histograms"] hist1 /= len(G1) hist2 /= len(G2) hist1 = coo_matrix(hist1) hist2 = coo_matrix(hist2) self.results["dk_distributions"] = hist1, hist2 elif d == 2: D1 = dk2_series(G1, N) D2 = dk2_series(G2, N) # store the 2K-distributions self.results["dk_distributions"] = D1, D2 # flatten matrices. this is safe because we've padded to the same size G1_dk_normed = D1.toarray()[np.triu_indices(N)].flatten() G2_dk_normed = D2.toarray()[np.triu_indices(N)].flatten() assert np.isclose(G1_dk_normed.sum(), 1) assert np.isclose(G2_dk_normed.sum(), 1) dist = entropy.js_divergence(G1_dk_normed, G2_dk_normed) else: raise NotImplementedError() self.results["dist"] = dist return dist
def dk2_series(G, N=None): """ Calculate the 2k-series (i.e. the number of edges between degree-labelled nodes) for G. """ if N is None: N = len(G) k_dict = dict( dk2 = defaultdict(int) for i, j in G.edges: k_i = k_dict[i] k_j = k_dict[j] # We're enforcing order here because at the end we're going to # leverage that all the information can be stored in the upper # triangular for convenience. if k_i <= k_j: dk2[(k_i, k_j)] += 1 else: dk2[(k_j, k_i)] += 1 # every edge should be counted once assert sum(list(dk2.values())) == G.size() # convert from dict to sparse matrix row = [i for (i, j) in dk2.keys()] col = [j for (i, j) in dk2.keys()] data = [x for x in dk2.values()] D = coo_matrix((data, (row, col)), shape=(N, N)) # this should be normalized by the number of edges D = D / G.size() return D