Source code for netrd.reconstruction.partial_correlation_influence

"""
partial_correlation_influence.py
--------------------------------

Reconstruction of graphs using the partial correlation influence, as defined in:

Kenett, D. Y. et al. Dominating clasp of the financial sector revealed by
partial correlation analysis of the stock market. PLoS ONE 5, e15032 (2010).

The index variable option is implemented as described in:

Kenett, D. Y., Huang, X., Vodenska, I., Havlin, S. & Stanley, H. E. Partial correlation
analysis: applications for financial markets. Quantitative Finance 15, 569–578 (2015).


author: Carolina Mattsson and Chia-Hung Yang
email: mattsson dot c at northeastern dot edu
Submitted as part of the 2019 NetSI Collabathon
"""
from .base import BaseReconstructor
import numpy as np
from scipy import linalg
from ..utilities import create_graph, threshold


class PartialCorrelationInfluence(BaseReconstructor):
    """Reconstruct a graph from the average effect of each sensor on the
    correlations between the other sensors (partial correlation influence)."""

    def fit(self, TS, index=None, threshold_type='range', **kwargs):
        r"""Uses the average effect of a series :math:`Z` on the correlation between
        a series :math:`X` and all other series.

        The partial correlation influence:

        .. math::

            d(X:Z) = \langle d(X,Y:Z) \rangle_{Y \neq X},

        where :math:`d(X,Y:Z) = \rho(X,Y) - \rho(X,Y:Z)`


        If an index is given, both terms become partial correlations:

        .. math::

            d(X,Y:Z) \equiv \rho(X,Y:M) - \rho(X,Y:M,Z)


        The results dictionary also stores the matrix of partial
        correlation influences as `'weights_matrix'` and the thresholded
        version of that matrix as `'thresholded_matrix'`.

        In the non-index block of the weights matrix, entry ``[x, z]`` holds
        :math:`d(X:Z)`. Rows of index variables are set to ``inf`` and
        columns of index variables are set to 0. When fewer than three
        non-index sensors are available there is no third series :math:`Y`
        to average over, so the off-diagonal entries of that block are NaN.

        Parameters
        ----------
        TS (np.ndarray)
            Array consisting of :math:`L` observations from :math:`N` sensors,
            of shape :math:`N \times L` (one row per sensor).

        index (int, array of ints, or None)
            An index variable or set of index variables, which are assumed to
            be confounders of all other variables. They are held constant when
            calculating the partial correlations. Default to None.

        threshold_type (str):
            Which thresholding function to use on the matrix of
            weights. See `netrd.utilities.threshold.py` for
            documentation. Pass additional arguments to the thresholder
            using ``**kwargs``.

        Returns
        -------

        G (nx.Graph)
            a reconstructed graph.

        References
        -----------

        .. [1] Kenett, D. Y. et al. Dominating clasp of the financial
               sector revealed by partial correlation analysis of the stock
               market. PLoS ONE 5, e15032 (2010).

        .. [2] Kenett, D. Y., Huang, X., Vodenska, I., Havlin, S. &
               Stanley, H. E. Partial correlation analysis: applications
               for financial markets. Quantitative Finance 15, 569–578
               (2015).

        """
        data = TS.T  # columns are sensors, rows are observations
        N = data.shape[1]

        # Create masks to separate variables of interests from the pre-included
        # index variables (True = variable of interest, False = index variable)
        mask = np.ones(N, dtype=bool)
        if index is not None:
            mask[index] = False

        # Compute partial correlations with the index variables held constant
        p_corr = np.full((N, N), np.nan)
        p_corr[np.ix_(mask, mask)] = partial_corr(data[:, mask], data[:, ~mask])

        # For every non-index variable Z, compute partial correlation influence
        # between other variables when Z is also held constant.
        # p_corr_inf[x, y, z] stores d(X,Y:Z); entries never written stay NaN
        # and are ignored by the NaN-aware average below.
        p_corr_inf = np.full((N, N, N), np.nan)
        for z in np.arange(N)[mask]:
            m_new = mask.copy()  # New mask including variable Z
            m_new[z] = False

            # Fancy indexing returns a copy, so the in-place subtraction
            # below does not modify p_corr itself.
            diff = p_corr[np.ix_(m_new, m_new)]
            diff -= partial_corr(data[:, m_new], data[:, ~m_new])
            p_corr_inf[np.ix_(m_new, m_new, [z])] = diff[:, :, np.newaxis]

            # Exclude the cases of Y = X
            np.fill_diagonal(p_corr_inf[:, :, z], np.nan)
            # Set PCI for X = Z to 0 for consistency after averaging
            p_corr_inf[z, :, z] = 0

        # Obtain the average partial correlation influence by averaging over
        # Y (axis 1). np.ix_ is required here: indexing with two boolean
        # arrays directly (``a[mask, mask]``) pairs them element-wise and
        # would only pick the diagonal instead of the full sub-block.
        influence = np.zeros((N, N))  # Default self-influence by zero
        influence[np.ix_(mask, mask)] = np.nanmean(
            p_corr_inf[np.ix_(mask, mask, mask)], axis=1
        )

        # NOTE(review): the block above is indexed [X, Z] (row = affected
        # series, column = influencing series), whereas the two lines below
        # treat rows as the influencing side -- confirm the intended
        # orientation before relying on edge direction.
        influence[~mask, :] = np.inf  # Index variables influence all others
        influence[:, ~mask] = 0  # but no one influences the index variables

        self.results['weights_matrix'] = influence

        # threshold the network
        W_thresh = threshold(influence, threshold_type, **kwargs)

        # construct the network
        self.results['graph'] = create_graph(W_thresh)
        self.results['thresholded_matrix'] = W_thresh

        G = self.results['graph']

        return G


def partial_corr(_vars, idx_vars):
    """
    Return the partial correlations between pairs of variables, given a set of
    index variables held constant.

    The partial correlation is computed as the Pearson correlation between
    the residuals left after linearly regressing every variable of interest
    on the index variables. The regression includes an intercept (obtained
    by centering both sides before the least-squares fit), so the result
    does not depend on the mean level of any series.

    Parameters
    ----------
    _vars (numpy.ndarray)
        Variables of interests (which are columns of the array).

    idx_vars (numpy.ndarray)
        Index variables to be held constant (which are columns of the array).
        If the array has zero size, namely no index variable, return the
        Pearson correlations between variables.

    Return
    ------
    p_corr (numpy.ndarray)
         Square array of pairwise partial correlations between variables.

    Note
    ----
    Precondition: The index variables should not contain or synchronize with
                  a variable of interests.

    """
    if idx_vars.size == 0:
        return np.corrcoef(_vars, rowvar=False)

    # Remove column means first: a least-squares fit on centered data is
    # equivalent to a fit with an intercept term. Without it the residuals
    # (and hence the correlations) would change when a constant offset is
    # added to any series.
    centered_vars = _vars - _vars.mean(axis=0)
    centered_idx = idx_vars - idx_vars.mean(axis=0)

    coef = linalg.lstsq(centered_idx, centered_vars)[0]  # Regression coefficients
    resid = centered_vars - centered_idx.dot(coef)  # Residuals
    return np.corrcoef(resid, rowvar=False)