Source code for netrd.distance.portrait_divergence

"""
portrait_divergence.py
---------------------

Adapted from "An information-theoretic, all-scales approach to comparing
networks" by James P. Bagrow and Erik M. Bollt, 2018 arXiv:1804.03665 and [this
repository](https://github.com/bagrow/portrait-divergence)

author: Brennan Klein
email: brennanjamesklein at gmail dot com
submitted as part of the 2019 NetSI Collabathon

"""

# Much of the following is adapted from Jim Bagrow's implementation of
# portrait divergence, which is available under the following MIT license:

# Copyright (c) 2018 Jim Bagrow
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from .base import BaseDistance
from collections import Counter
import numpy as np
import networkx as nx
from ..utilities import entropy


[docs]class PortraitDivergence(BaseDistance):
    """Compares graph portraits."""

[docs]    def dist(self, G1, G2, bins=None, binedges=None):
        """Distance measure based on the two graphs' "portraits".

        The results dictionary also stores a 2-tuple of the underlying
        adjacency matrices in the key `'adjacency_matrices'` and the
        portrait matrices in `'portrait_matrices'`.

        Parameters
        ----------

        G1, G2 (nx.Graph)
            two graphs

        bins (int)
            width of bins in percentiles

        binedges (list)
            vector of bin edges (mutually exclusive from bins)


        Returns
        -------
        dist (float)
            the portrait divergence between two graphs.


        References
        ----------
        [1] An information-theoretic, all-scales approach to comparing networks
        James P. Bagrow and Erik M. Bollt, 2018 arXiv:1804.03665

        [2] https://github.com/bagrow/portrait-divergence

        """

        adj1 = nx.to_numpy_array(G1)
        adj2 = nx.to_numpy_array(G2)

        ## NOTE dijkstra cannot handle negative weights
        if (adj1 < 0).any() or (adj2 < 0).any():
            adj1 = np.abs(adj1)
            adj2 = np.abs(adj2)
            G1 = nx.from_numpy_array(adj1)
            G2 = nx.from_numpy_array(adj2)

        paths_G1 = list(nx.all_pairs_dijkstra_path_length(G1))
        paths_G2 = list(nx.all_pairs_dijkstra_path_length(G2))

        # get bin_edges in common for G and H:
        if binedges is None:
            if bins is None:
                bins = 1

            UPL_G1 = set(_get_unique_path_lengths(G1, paths=paths_G1))
            UPL_G2 = set(_get_unique_path_lengths(G2, paths=paths_G2))

            unique_path_lengths = sorted(list(UPL_G1 | UPL_G2))
            binedges = np.percentile(unique_path_lengths, np.arange(0, 101, bins))

        # get weighted portraits:
        BG1 = weighted_portrait(G1, paths=paths_G1, binedges=binedges)
        BG2 = weighted_portrait(G2, paths=paths_G2, binedges=binedges)

        dist = portrait_divergence(
            BG1, BG2, N1=G1.number_of_nodes(), N2=G2.number_of_nodes()
        )

        self.results['dist'] = dist
        self.results['adjacency_matrices'] = adj1, adj2
        self.results['portrait_matrices'] = BG1, BG2

        return dist


def portrait(G):
    """
    Parameters
    ----------
    G (nx.Graph or nx.DiGraph):
        a graph.

    Returns
    -------
    B (np.ndarray):
        a matrix :math:`B` such that :math:`B_{i,j}` is the number of starting
        nodes in graph with :math:`j` nodes in shell :math:`i`.
    """

    dia = nx.diameter(G)
    N = G.number_of_nodes()

    # B indices are 0...dia x 0...N-1:
    B = np.zeros((dia + 1, N))

    max_path = 1
    adj = G.adj

    for starting_node in G.nodes():
        nodes_visited = {starting_node: 0}
        search_queue = [starting_node]
        d = 1

        while search_queue:
            next_depth = []
            extend = next_depth.extend

            for n in search_queue:
                l = [i for i in adj[n] if i not in nodes_visited]
                extend(l)

                for j in l:
                    nodes_visited[j] = d

            search_queue = next_depth
            d += 1

        node_distances = nodes_visited.values()
        max_node_distances = max(node_distances)

        curr_max_path = max_node_distances
        if curr_max_path > max_path:
            max_path = curr_max_path

        # build individual distribution:
        dict_distribution = dict.fromkeys(node_distances, 0)
        for d in node_distances:
            dict_distribution[d] += 1

        # add individual distribution to matrix:
        for shell, count in dict_distribution.items():
            B[shell][count] += 1

        # HACK: count starting nodes that have zero nodes in farther shells
        max_shell = dia
        while max_shell > max_node_distances:
            B[max_shell][0] += 1
            max_shell -= 1

    return B[: max_path + 1, :]


def weighted_portrait(G, paths=None, binedges=None):
    """Compute weighted portrait, using Dijkstra's algorithm for finding
    shortest paths.

    Parameters
    ----------
    G (nx.Graph or nx.DiGraph):
        a graph.

    paths (list):
        a list of all pairs of paths.

    binedges (list):
        sampled path lengths.

    Returns
    -------
    B (np.ndarray):
        a matrix :math:`B` where :math:`B_{i,j}` is the number of starting
        nodes in graph with :math:`j` nodes at distance :math:`d_i < d <
        d_{i+1}`.

    """

    # all pairs path lengths
    if paths is None:
        paths = list(nx.all_pairs_dijkstra_path_length(G))

    if binedges is None:
        unique_path_lengths = _get_unique_path_lengths(G, paths=paths)
        sampled_path_lengths = np.percentile(unique_path_lengths, np.arange(0, 101, 1))
    else:
        sampled_path_lengths = binedges

    UPL = np.array(sampled_path_lengths)

    l_s_v = []
    for i, (s, dist_dict) in enumerate(paths):
        distances = np.array(list(dist_dict.values()))
        s_v, e = np.histogram(distances, bins=UPL)
        l_s_v.append(s_v)

    M = np.array(l_s_v)

    B = np.zeros((len(UPL) - 1, G.number_of_nodes() + 1))

    for i in range(len(UPL) - 1):
        col = M[:, i]  # ith col = numbers of nodes at d_i <= distance < d_i+1

        for n, c in Counter(col).items():
            B[i, n] += c

    return B


def _get_unique_path_lengths(G, paths=None):
    """
    Compute the unique path lengths.

    Parameters
    ----------
    G (nx.Graph or DiGraph):
        a graph.

    paths (list):
        list of paths.

    Returns
    -------
    unique_path_lengths (list):
        sorted unique path lengths.
    """

    if paths is None:
        paths = list(nx.all_pairs_dijkstra_path_length(G))

    unique_path_lengths = set()

    for starting_node, dist_dict in paths:
        unique_path_lengths |= set(dist_dict.values())

    unique_path_lengths = sorted(list(unique_path_lengths))
    return unique_path_lengths


def pad_portraits_to_same_size(B1, B2):
    """
    Make sure that two matrices are padded with zeros and/or trimmed of
    zeros to be the same dimensions.

    Parameters
    ----------
    B1, B2 (np.ndarray):
        Portrait matrices of a graph (k x N)

    Returns
    -------
    BigB1, BigB2 (np.ndarray):
        padded versions of B1 and B2 with the same dimensions
    """
    ns, ms = B1.shape
    nl, ml = B2.shape

    # Bmats have N columns, find last *occupied* column and trim both down:
    lastcol1 = max(np.nonzero(B1)[1])
    lastcol2 = max(np.nonzero(B2)[1])
    lastcol = max(lastcol1, lastcol2)

    B1 = B1[:, : lastcol + 1]
    B2 = B2[:, : lastcol + 1]

    BigB1 = np.zeros((max(ns, nl), lastcol + 1))
    BigB2 = np.zeros((max(ns, nl), lastcol + 1))

    BigB1[: B1.shape[0], : B1.shape[1]] = B1
    BigB2[: B2.shape[0], : B2.shape[1]] = B2

    return BigB1, BigB2


def _graph_or_portrait(X):
    """
    Check if X is a nx.(Di)Graph. If it is, get its portrait. Otherwise assume
    it's a portrait and just return it.
    """
    if isinstance(X, (nx.Graph, nx.DiGraph)):
        return portrait(X)

    return X


def _get_prob_distance(B):
    """
    Helper function.
    """
    d, K = B.shape

    v = np.arange(0, K)
    f = (B * v).sum(axis=1)
    return f / f.sum()


def _get_prob_k_given_L(B, N=None):
    """
    Helper function.
    """
    if N is None:
        N = int(B[0, 1])
    return B / N


def portrait_divergence(G1, G2, N1=None, N2=None):
    """
    Compute the portrait divergence between graphs G1 and G2.

    Parameters
    ----------
    G1, G2 (nx.Graph or nx.DiGraph):
        Two graphs to compare.

    Returns
    -------
    JSDpq (float):
        the Jensen-Shannon divergence between the portraits of G1 and G2

    """
    BG1 = _graph_or_portrait(G1)
    BG2 = _graph_or_portrait(G2)
    BG1, BG2 = pad_portraits_to_same_size(BG1, BG2)

    # build joint distribution for G:
    P_L = _get_prob_distance(BG1)
    P_KgL = _get_prob_k_given_L(BG1, N=N1)
    P_KaL = P_KgL * P_L[:, None]

    # build joint distribution for H:
    Q_L = _get_prob_distance(BG2)
    Q_KgL = _get_prob_k_given_L(BG2, N=N2)
    Q_KaL = Q_KgL * Q_L[:, None]

    # flatten distribution matrices as arrays:
    P = P_KaL.ravel()
    Q = Q_KaL.ravel()

    return entropy.js_divergence(P, Q)