"""
dk_series.py
--------------------------
Graph distance based on the dk-series.
author: Brennan Klein & Stefan McCabe
email: brennanjamesklein@gmail.com
Submitted as part of the 2019 NetSI Collabathon.
"""
import networkx as nx
import numpy as np
from scipy.sparse import coo_matrix
from collections import defaultdict
from .base import BaseDistance
from ..utilities import entropy, undirected, unweighted
[docs]class dkSeries(BaseDistance):
"""Compare graphs based on their :math:`dk`-series."""
[docs] @unweighted
@undirected
def dist(self, G1, G2, d=2):
r"""Compute the distance between two graphs by using the Jensen-Shannon
divergence between the :math:`dk`-series of the graphs.
The :math:`dk`-series of a graph is the collection of distributions of
size :math:`d` subgraphs, where nodes are labelled by degrees. For
simplicity, we currently consider only the :math:`1k`-series, i.e., the
degree distribution, or the :math:`2k`-series, i.e., the
distribution of edges between nodes of degree :math:`(k_i, k_j)`. The
distance between these :math:`dk`-series is calculated using the
Jensen-Shannon divergence.
Parameters
----------
G1, G2 (nx.Graph)
two networkx graphs to be compared
d (int)
the size of the subgraph to consider
Returns
-------
dist (float)
the distance between `G1` and `G2`.
References
----------
.. [1] Orsini, Chiara, Marija M. Dankulov, Pol Colomer-de-Simón,
Almerima Jamakovic, Priya Mahadevan, Amin Vahdat, Kevin E.
Bassler, et al. 2015. “Quantifying Randomness in Real Networks.”
Nature Communications 6 (1). https://doi.org/10.1038/ncomms9627.
"""
N = max(len(G1), len(G2))
if d == 1:
from .degree_divergence import DegreeDivergence
degdiv = DegreeDivergence()
dist = degdiv.dist()
# the 2k-distance stores the distribution in a sparse matrix,
# so here we take the output of DegreeDivergence and
# produce a comparable object
hist1, hist2 = degdiv.results["degree_histograms"]
hist1 /= len(G1)
hist2 /= len(G2)
hist1 = coo_matrix(hist1)
hist2 = coo_matrix(hist2)
self.results["dk_distributions"] = hist1, hist2
elif d == 2:
D1 = dk2_series(G1, N)
D2 = dk2_series(G2, N)
# store the 2K-distributions
self.results["dk_distributions"] = D1, D2
# flatten matrices. this is safe because we've padded to the same size
G1_dk_normed = D1.toarray()[np.triu_indices(N)].flatten()
G2_dk_normed = D2.toarray()[np.triu_indices(N)].flatten()
assert np.isclose(G1_dk_normed.sum(), 1)
assert np.isclose(G2_dk_normed.sum(), 1)
dist = entropy.js_divergence(G1_dk_normed, G2_dk_normed)
else:
raise NotImplementedError()
self.results["dist"] = dist
return dist
def dk2_series(G, N=None):
"""
Calculate the 2k-series (i.e. the number of edges between
degree-labelled nodes) for G.
"""
if N is None:
N = len(G)
k_dict = dict(nx.degree(G))
dk2 = defaultdict(int)
for i, j in G.edges:
k_i = k_dict[i]
k_j = k_dict[j]
# We're enforcing order here because at the end we're going to
# leverage that all the information can be stored in the upper
# triangular for convenience.
if k_i <= k_j:
dk2[(k_i, k_j)] += 1
else:
dk2[(k_j, k_i)] += 1
# every edge should be counted once
assert sum(list(dk2.values())) == G.size()
# convert from dict to sparse matrix
row = [i for (i, j) in dk2.keys()]
col = [j for (i, j) in dk2.keys()]
data = [x for x in dk2.values()]
D = coo_matrix((data, (row, col)), shape=(N, N))
# this should be normalized by the number of edges
D = D / G.size()
return D