# Source code for netrd.distance.dk_series

"""
dk_series.py
--------------------------

Graph distance based on the dk-series.

author: Brennan Klein & Stefan McCabe
email: brennanjamesklein@gmail.com
Submitted as part of the 2019 NetSI Collabathon.

"""

import networkx as nx
import numpy as np
from scipy.sparse import coo_matrix
from collections import defaultdict
from .base import BaseDistance
from ..utilities import entropy, undirected, unweighted

[docs]class dkSeries(BaseDistance):
"""Compare graphs based on their :math:dk-series."""

[docs]    @unweighted
@undirected
def dist(self, G1, G2, d=2):
r"""Compute the distance between two graphs by using the Jensen-Shannon
divergence between the :math:dk-series of the graphs.

The :math:dk-series of a graph is the collection of distributions of
size :math:d subgraphs, where nodes are labelled by degrees. For
simplicity, we currently consider only the :math:1k-series, i.e., the
degree distribution, or the :math:2k-series, i.e., the
distribution of edges between nodes of degree :math:(k_i, k_j). The
distance between these :math:dk-series is calculated using the
Jensen-Shannon divergence.

Parameters
----------

G1, G2 (nx.Graph)
two networkx graphs to be compared

d (int)
the size of the subgraph to consider

Returns
-------

dist (float)
the distance between G1 and G2.

References
----------

.. [1] Orsini, Chiara, Marija M. Dankulov, Pol Colomer-de-Simón,
Almerima Jamakovic, Priya Mahadevan, Amin Vahdat, Kevin E.
Bassler, et al. 2015. “Quantifying Randomness in Real Networks.”
Nature Communications 6 (1). https://doi.org/10.1038/ncomms9627.

"""

N = max(len(G1), len(G2))

if d == 1:
from .degree_divergence import DegreeDivergence

degdiv = DegreeDivergence()
dist = degdiv.dist()

# the 2k-distance stores the distribution in a sparse matrix,
# so here we take the output of DegreeDivergence and
# produce a comparable object
hist1, hist2 = degdiv.results["degree_histograms"]
hist1 /= len(G1)
hist2 /= len(G2)
hist1 = coo_matrix(hist1)
hist2 = coo_matrix(hist2)

self.results["dk_distributions"] = hist1, hist2

elif d == 2:
D1 = dk2_series(G1, N)
D2 = dk2_series(G2, N)

# store the 2K-distributions
self.results["dk_distributions"] = D1, D2

# flatten matrices. this is safe because we've padded to the same size
G1_dk_normed = D1.toarray()[np.triu_indices(N)].flatten()
G2_dk_normed = D2.toarray()[np.triu_indices(N)].flatten()

assert np.isclose(G1_dk_normed.sum(), 1)
assert np.isclose(G2_dk_normed.sum(), 1)

dist = entropy.js_divergence(G1_dk_normed, G2_dk_normed)
else:
raise NotImplementedError()

self.results["dist"] = dist
return dist

def dk2_series(G, N=None):
"""
Calculate the 2k-series (i.e. the number of edges between
degree-labelled nodes) for G.
"""

if N is None:
N = len(G)

k_dict = dict(nx.degree(G))
dk2 = defaultdict(int)

for i, j in G.edges:
k_i = k_dict[i]
k_j = k_dict[j]

# We're enforcing order here because at the end we're going to
# leverage that all the information can be stored in the upper
# triangular for convenience.
if k_i <= k_j:
dk2[(k_i, k_j)] += 1
else:
dk2[(k_j, k_i)] += 1

# every edge should be counted once
assert sum(list(dk2.values())) == G.size()

# convert from dict to sparse matrix
row = [i for (i, j) in dk2.keys()]
col = [j for (i, j) in dk2.keys()]
data = [x for x in dk2.values()]

D = coo_matrix((data, (row, col)), shape=(N, N))

# this should be normalized by the number of edges
D = D / G.size()

return D