Source code for netrd.distance.dmeasure

"""
d_measure.py
--------------------------

Distance measure based on the Jensen-Shannon Divergence
between the network node dispersion distributions of two graphs.

Schieber, T. A. et al.
Quantification of network structural dissimilarities.
Nat. Commun. 8, 13928 (2017).

https://www.nature.com/articles/ncomms13928

author: Brennan Klein
email: brennanjamesklein@gmail.com
Submitted as part of the 2019 NetSI Collabathon.

"""

from collections import Counter
import networkx as nx
import numpy as np
from scipy.stats import entropy
from .base import BaseDistance
from ..utilities.entropy import js_divergence
from ..utilities import undirected


class DMeasure(BaseDistance):
    """Compare two graphs by their network node dispersion."""

    @undirected
    def dist(self, G1, G2, w1=0.45, w2=0.45, w3=0.10, niter=50):
        r"""The D-Measure is a comparison of structural dissimilarities between graphs.

        The key concept is the network node dispersion

        .. math::
            NND(G) = \frac{\mathcal{J}(\mathbf{P}_1,\ldots,\mathbf{P}_N)}{\log(d+1)},

        where :math:`\mathcal{J}` is the Jensen-Shannon divergence between
        :math:`N` node-distance distributions

        .. math::
            \mathbf{P}_i = \{p_i(j)\},

        and :math:`p_i(j)` is the fraction of nodes at distance :math:`j`
        from node :math:`i`.

        The D-measure itself is a weighted sum of three components. The first
        is the square root of the Jensen-Shannon divergence between the
        average node-distance probabilities of the two graphs

        .. math::
            \mu_j = \frac{1}{N}\sum_{i=1}^N p_i(j),

        the second is the absolute value of the difference between the square
        roots of the network node dispersions of the two graphs, and the
        third is half the sum of the square roots of the Jensen-Shannon
        divergences between the probability distributions of the alpha
        centralities of the two graphs and of their complements.

        Parameters
        ----------

        G1 (nx.Graph): the first graph to be compared.

        G2 (nx.Graph): the second graph to be compared.

        w1 (float): weight of the first term in the calculation;
                    with w2 and w3, must sum to 1.0.

        w2 (float): weight of the second term in the calculation;
                    with w1 and w3, must sum to 1.0.

        w3 (float): weight of the third term in the calculation;
                    with w1 and w2, must sum to 1.0.

        niter (int): the alpha centralities are calculated using power
                     iteration, with this many iterations.

        Returns
        -------

        dist (float): between 0 and 1, the D-measure distance between
                      G1 and G2.

        Notes
        -----
        The default values for w1, w2, and w3 are from the original paper.

        References
        ----------

        .. [1] Schieber, T. A. et al. Quantification of network structural
               dissimilarities. Nat. Commun. 8, 13928 (2017).
               https://www.nature.com/articles/ncomms13928

        """
        # the weights must form a convex combination; compare with a
        # tolerance rather than exact float equality
        if not np.isclose(w1 + w2 + w3, 1.0):
            raise ValueError("Weights must sum to one.")

        first_term = 0
        second_term = 0
        third_term = 0

        if w1 + w2 > 0:
            g1_nnd, g1_pdfs = network_node_dispersion(G1)
            g2_nnd, g2_pdfs = network_node_dispersion(G2)

            first_term = np.sqrt(js_divergence(g1_pdfs, g2_pdfs))
            second_term = np.abs(np.sqrt(g1_nnd) - np.sqrt(g2_nnd))

        if w3 > 0:

            def alpha_jsd(G1, G2):
                """
                Compute the Jensen-Shannon divergence between the
                alpha-centrality probability distributions of two graphs.
                """
                p1 = alpha_centrality_prob(G1, niter=niter)
                p2 = alpha_centrality_prob(G2, niter=niter)

                # zero-pad the shorter distribution so both have length m
                m = max([len(p1), len(p2)])
                P1 = np.zeros(m)
                P2 = np.zeros(m)

                P1[(m - len(p1)) : m] = p1
                P2[(m - len(p2)) : m] = p2

                return js_divergence(P1, P2)

            G1c = nx.complement(G1)
            G2c = nx.complement(G2)

            first_jsd = alpha_jsd(G1, G2)
            second_jsd = alpha_jsd(G1c, G2c)
            third_term = 0.5 * (np.sqrt(first_jsd) + np.sqrt(second_jsd))

        dist = w1 * first_term + w2 * second_term + w3 * third_term

        self.results["components"] = (first_term, second_term, third_term)
        self.results["weights"] = (w1, w2, w3)
        self.results["dist"] = dist

        return dist
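
# A minimal usage sketch (illustrative; not part of the original module),
# assuming the usual netrd import path. The two graphs are arbitrary
# choices for demonstration; any pair of connected graphs of the same
# size will do:
#
#     >>> import networkx as nx
#     >>> from netrd.distance import DMeasure
#     >>> G1 = nx.cycle_graph(20)
#     >>> G2 = nx.path_graph(20)
#     >>> dm = DMeasure()
#     >>> d = dm.dist(G1, G2)  # defaults: w1=0.45, w2=0.45, w3=0.10
#     >>> 0 <= d <= 1
#     True
#     >>> len(dm.results["components"])  # (first, second, third term)
#     3
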
def shortest_path_matrix(G):
    """
    Return a matrix of pairwise shortest path lengths between nodes.

    Parameters
    ----------

    G (nx.Graph): the graph in question.

    Returns
    -------

    pmat (np.ndarray): a matrix of shortest path lengths between nodes in G.

    """
    N = G.number_of_nodes()
    pmat = np.zeros((N, N)) + N

    paths = nx.all_pairs_shortest_path_length(G)
    for node_i, node_ij in paths:
        for node_j, length_ij in node_ij.items():
            pmat[node_i, node_j] = length_ij

    pmat[pmat == np.inf] = N

    return pmat


def node_distance(G):
    """
    Return an NxN matrix whose rows are histograms of the shortest path
    lengths from each node to every other node. This is useful for
    eventually taking information-theoretic distances between the nodes.

    Parameters
    ----------

    G (nx.Graph): the graph in question.

    Returns
    -------

    out (np.ndarray): a matrix of binned node-distance values.

    """
    N = G.number_of_nodes()
    a = np.zeros((N, N))

    dists = nx.shortest_path_length(G)
    for idx, row in enumerate(dists):
        counts = Counter(row[1].values())
        a[idx] = [counts[l] for l in range(1, N + 1)]

    return a / (N - 1)


def network_node_dispersion(G):
    """
    Calculate the network node dispersion of a graph G. This function also
    returns the average of the node-distance distributions.

    Parameters
    ----------

    G (nx.Graph): the graph in question.

    Returns
    -------

    nnd (float): the network node dispersion.

    nd_vec (np.ndarray): a vector of averages of the node-distance
                         distributions.

    """
    N = G.number_of_nodes()
    nd = node_distance(G)
    pdfm = np.mean(nd, axis=0)

    # NOTE: the paper says that the normalization is the diameter plus one,
    # but the previous implementation uses the number of nonzero entries in
    # the node-distance matrix. This number should typically be the diameter
    # plus one anyway.
    norm = np.log(nx.diameter(G) + 1)

    ndf = nd.flatten()
    # calculate the entropy, with the convention that 0 * log(0) = 0
    entr = -1 * sum(ndf * np.log(ndf, out=np.zeros_like(ndf), where=(ndf != 0)))

    nnd = max([0, entropy(pdfm) - entr / N]) / norm

    return nnd, pdfm


def alpha_centrality_prob(G, niter):
    """
    Return a probability distribution over the alpha centralities of the
    network.

    Parameters
    ----------

    G (nx.Graph): the graph in question.

    niter (int): the number of iterations needed to converge properly.

    Returns
    -------

    alpha_prob (np.ndarray): a vector of probabilities, one per node in G,
                             plus a final residual entry so that the vector
                             sums to one.

    """
    # calculate the alpha centrality of each node by power iteration
    N = G.number_of_nodes()
    alpha = 1 / N

    A = nx.to_numpy_array(G)
    s = A.sum(axis=1)
    cr = s.copy()

    for _ in range(niter):
        cr = s + alpha * A.dot(cr)

    # turn the alpha centralities into a probability distribution
    cr = cr / (N - 1)
    r = sorted(cr / (N**2))
    alpha_prob = list(r) + [max([0, 1 - sum(r)])]

    return np.array(alpha_prob)
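

# A minimal sketch (not part of the original module) exercising the helper
# functions directly. It assumes a connected graph whose nodes are labeled
# 0..N-1, since shortest_path_matrix indexes its output array by node label.
if __name__ == "__main__":
    G = nx.karate_club_graph()

    # pairwise shortest-path lengths; unreachable pairs default to N
    pmat = shortest_path_matrix(G)

    # network node dispersion and the mean node-distance distribution
    nnd, mu = network_node_dispersion(G)
    print(f"NND = {nnd:.4f}")

    # alpha-centrality probabilities; the residual entry makes them sum to 1
    p = alpha_centrality_prob(G, niter=50)
    print(f"alpha-centrality probabilities sum to {p.sum():.4f}")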