Source code for netrd.distance.dmeasure

"""
d_measure.py
--------------------------

Distance measure based on the Jensen-Shannon Divergence
between the network node dispersion distributions of two graphs.

Schieber, T. A. et al.
Quantification of network structural dissimilarities.
Nat. Commun. 8, 13928 (2017).

https://www.nature.com/articles/ncomms13928

author: Brennan Klein
email: brennanjamesklein@gmail.com
Submitted as part of the 2019 NetSI Collabathon.

"""

from collections import Counter
import networkx as nx
import numpy as np
from scipy.stats import entropy
from .base import BaseDistance
from ..utilities.entropy import js_divergence
from ..utilities import undirected


class DMeasure(BaseDistance):
    """Compare two graphs by their network node dispersion."""

    @undirected
    def dist(self, G1, G2, w1=0.45, w2=0.45, w3=0.10, niter=50):
        r"""The D-Measure is a comparison of structural dissimilarities between graphs.

        The key concept is the network node dispersion

        .. math::
            NND(G) = \frac{\mathcal{J}(\mathbf{P}_1,\ldots,\mathbf{P}_N)}{\log(d+1)},

        where :math:`\mathcal{J}` is the Jensen-Shannon divergence between
        :math:`N` node-distance distributions

        .. math::
            \mathbf{P}_i = \{p_i(j)\},

        and :math:`p_i(j)` is the fraction of nodes at distance :math:`j`
        from node :math:`i`.

        The D-measure itself is a weighted sum of three components. The first
        is the square root of the Jensen-Shannon divergence between the
        average node-distance probabilities of the two graphs

        .. math::
            \mu_j = \frac{1}{N}\sum_{i=1}^N p_i(j),

        the second is the absolute value of the difference between the square
        roots of the network node dispersions of the two graphs, and the
        third is half the sum of the square roots of the Jensen-Shannon
        divergences between the probability distributions of the alpha
        centralities of the two graphs and of their complements.

        Parameters
        ----------

        G1 (nx.Graph): the first graph to be compared.

        G2 (nx.Graph): the second graph to be compared.

        w1 (float): weight of the first term in the calculation;
                    with w2 and w3, must sum to 1.0.

        w2 (float): weight of the second term in the calculation;
                    with w1 and w3, must sum to 1.0.

        w3 (float): weight of the third term in the calculation;
                    with w1 and w2, must sum to 1.0.

        niter (int): the alpha centralities are calculated using power
                     iteration, with this many iterations.

        Returns
        -------

        dist (float): between 0 and 1, the D-measure distance between
                      G1 and G2.

        Notes
        -----
        The default values for w1, w2, and w3 are from the original paper.

        References
        ----------

        .. [1] Schieber, T. A. et al. Quantification of network structural
               dissimilarities. Nat. Commun. 8, 13928 (2017).
               https://www.nature.com/articles/ncomms13928

        """
        # the weights must form a convex combination; compare with a
        # tolerance rather than exact float equality
        if not np.isclose(w1 + w2 + w3, 1.0):
            raise ValueError("Weights must sum to one.")

        first_term = 0
        second_term = 0
        third_term = 0

        if w1 + w2 > 0:
            g1_nnd, g1_pdfs = network_node_dispersion(G1)
            g2_nnd, g2_pdfs = network_node_dispersion(G2)

            first_term = np.sqrt(js_divergence(g1_pdfs, g2_pdfs))
            second_term = np.abs(np.sqrt(g1_nnd) - np.sqrt(g2_nnd))

        if w3 > 0:

            def alpha_jsd(G1, G2):
                """
                Compute the Jensen-Shannon divergence between the
                alpha-centrality probability distributions of two graphs.
                """
                p1 = alpha_centrality_prob(G1, niter=niter)
                p2 = alpha_centrality_prob(G2, niter=niter)

                # zero-pad the shorter distribution so both have length m
                m = max([len(p1), len(p2)])
                P1 = np.zeros(m)
                P2 = np.zeros(m)

                P1[(m - len(p1)) : m] = p1
                P2[(m - len(p2)) : m] = p2

                return js_divergence(P1, P2)

            G1c = nx.complement(G1)
            G2c = nx.complement(G2)

            first_jsd = alpha_jsd(G1, G2)
            second_jsd = alpha_jsd(G1c, G2c)
            third_term = 0.5 * (np.sqrt(first_jsd) + np.sqrt(second_jsd))

        dist = w1 * first_term + w2 * second_term + w3 * third_term

        self.results["components"] = (first_term, second_term, third_term)
        self.results["weights"] = (w1, w2, w3)
        self.results["dist"] = dist

        return dist
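
# A minimal usage sketch (illustrative; not part of the original module),
# assuming the usual netrd import path. The two graphs are arbitrary
# choices for demonstration; any pair of connected graphs of the same
# size will do:
#
#     >>> import networkx as nx
#     >>> from netrd.distance import DMeasure
#     >>> G1 = nx.cycle_graph(20)
#     >>> G2 = nx.path_graph(20)
#     >>> dm = DMeasure()
#     >>> d = dm.dist(G1, G2)  # defaults: w1=0.45, w2=0.45, w3=0.10
#     >>> 0 <= d <= 1
#     True
#     >>> len(dm.results["components"])  # (first, second, third term)
#     3
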
def shortest_path_matrix(G):
    """
    Return a matrix of pairwise shortest path lengths between nodes.

    Parameters
    ----------

    G (nx.Graph): the graph in question.

    Returns
    -------

    pmat (np.ndarray): a matrix of shortest path lengths between nodes in G.

    """
    N = G.number_of_nodes()
    pmat = np.zeros((N, N)) + N

    paths = nx.all_pairs_shortest_path_length(G)
    for node_i, node_ij in paths:
        for node_j, length_ij in node_ij.items():
            pmat[node_i, node_j] = length_ij

    pmat[pmat == np.inf] = N

    return pmat


def node_distance(G):
    """
    Return an NxN matrix whose rows are histograms of the shortest path
    lengths from each node to every other node. This is useful for
    eventually taking information-theoretic distances between the nodes.

    Parameters
    ----------

    G (nx.Graph): the graph in question.

    Returns
    -------

    out (np.ndarray): a matrix of binned node-distance values.

    """
    N = G.number_of_nodes()
    a = np.zeros((N, N))

    dists = nx.shortest_path_length(G)
    for idx, row in enumerate(dists):
        counts = Counter(row[1].values())
        a[idx] = [counts[l] for l in range(1, N + 1)]

    return a / (N - 1)


def network_node_dispersion(G):
    """
    Calculate the network node dispersion of a graph G. This function also
    returns the average of the node-distance distributions.

    Parameters
    ----------

    G (nx.Graph): the graph in question.

    Returns
    -------

    nnd (float): the network node dispersion.

    nd_vec (np.ndarray): a vector of averages of the node-distance
                         distributions.

    """
    N = G.number_of_nodes()
    nd = node_distance(G)
    pdfm = np.mean(nd, axis=0)

    # NOTE: the paper says that the normalization is the diameter plus one,
    # but the previous implementation uses the number of nonzero entries in
    # the node-distance matrix. This number should typically be the diameter
    # plus one anyway.
    norm = np.log(nx.diameter(G) + 1)

    ndf = nd.flatten()
    # calculate the entropy, with the convention that 0 * log(0) = 0
    entr = -1 * sum(ndf * np.log(ndf, out=np.zeros_like(ndf), where=(ndf != 0)))

    nnd = max([0, entropy(pdfm) - entr / N]) / norm

    return nnd, pdfm


def alpha_centrality_prob(G, niter):
    """
    Return a probability distribution over the alpha centralities of the
    network.

    Parameters
    ----------

    G (nx.Graph): the graph in question.

    niter (int): the number of iterations needed to converge properly.

    Returns
    -------

    alpha_prob (np.ndarray): a vector of probabilities, one per node in G,
                             plus a final residual entry so that the vector
                             sums to one.

    """
    # calculate the alpha centrality of each node by power iteration
    N = G.number_of_nodes()
    alpha = 1 / N

    A = nx.to_numpy_array(G)
    s = A.sum(axis=1)
    cr = s.copy()

    for _ in range(niter):
        cr = s + alpha * A.dot(cr)

    # turn the alpha centralities into a probability distribution
    cr = cr / (N - 1)
    r = sorted(cr / (N**2))
    alpha_prob = list(r) + [max([0, 1 - sum(r)])]

    return np.array(alpha_prob)
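

# A minimal sketch (not part of the original module) exercising the helper
# functions directly. It assumes a connected graph whose nodes are labeled
# 0..N-1, since shortest_path_matrix indexes its output array by node label.
if __name__ == "__main__":
    G = nx.karate_club_graph()

    # pairwise shortest-path lengths; unreachable pairs default to N
    pmat = shortest_path_matrix(G)

    # network node dispersion and the mean node-distance distribution
    nnd, mu = network_node_dispersion(G)
    print(f"NND = {nnd:.4f}")

    # alpha-centrality probabilities; the residual entry makes them sum to 1
    p = alpha_centrality_prob(G, niter=50)
    print(f"alpha-centrality probabilities sum to {p.sum():.4f}")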