# Source code for netrd.reconstruction.marchenko_pastur

"""
marchenko_pastur.py
--------------

Graph reconstruction algorithm based on Marchenko, V. A., & Pastur, L. A. (1967).
Distribution of eigenvalues for some sets of random matrices. Matematicheskii
Sbornik, 114(4), 507-536.

author: Matteo Chinazzi
Submitted as part of the 2019 NetSI Collabathon.
"""

from .base import BaseReconstructor
import numpy as np
import networkx as nx
from ..utilities import create_graph, threshold

[docs]class MarchenkoPastur(BaseReconstructor):
"""Uses Marchenko-Pastur law to remove noise."""

[docs]    def fit(
self,
TS,
remove_largest=False,
metric_distance=False,
threshold_type='range',
**kwargs
):
r"""Create a correlation-based graph using Marchenko-Pastur law to remove noise.

A signed graph is built by constructing a projection of the
empirical correlation matrix generated from the time series data
after having removed noisy components.  This method combines the
results presented in [1]_, [2]_, and [3]_.

The results dictionary also stores the weight matrix as
'weights_matrix' and the thresholded version of the weight matrix
as 'thresholded_matrix'.

Parameters
----------

TS (np.ndarray)
:math:N \times L array consisting of :math:L observations
from :math:N sensors.

remove_largest (bool), optional
If False, all the eigenvectors associated to the
significant eigenvalues will be used to reconstruct the
de-noised empirical correlation matrix. If True, the
eigenvector associated to the largest eigenvalue (normally
known as the market mode, [2]) is going to be excluded from
the recontruction step.  metric_distance (bool), optional: If
False, a signed graph is obtained.  The weights associated
to the edges represent the de-noised correlation coefficient
:math:\rho_{i,j} between time series :math:i and :math:j.
If True, the correlation is transformed by defining a
metric distance between each pair of nodes where :math:d_{i,j}
= \sqrt{2(1-\rho_{i,j})} as proposed in [3].  threshold_type
(str): Which thresholding function to use on the matrix of
weights. See netrd.utilities.threshold.py for
documentation. Pass additional arguments to the thresholder
using **kwargs.

Returns
-------

G (nx.Graph)
A reconstructed graph with :math:N nodes.

Examples
--------
.. code:: python

import numpy as np
import networkx as nx
from matplotlib import pyplot as plt
from netrd.reconstruction import MarchenkoPastur

N = 250
T = 300
M = np.random.normal(size=(N,T))

print('Create correlated time series')
market_mode = 0.4*np.random.normal(size=(1,T))
M += market_mode

sector_modes = {d: 0.5*np.random.normal(size=(1,T)) for d in range(5)}
for sector_mode, vals in sector_modes.items():
M[sector_mode*50:(sector_mode+1)*50,:] += vals

print('Network reconstruction step')
mp_net = MarchenkoPastur()
G = mp_net.fit(M, only_positive=True)
G_no_market = mp_net.fit(M, only_positive=True, remove_largest=True)

print('Observed noisy correlation')
C = np.corrcoef(M)
C[C<0] = 0 # remove negative values
np.fill_diagonal(C,0) # remove self-loops
G_noisy = nx.from_numpy_array(C) # create graph

print('Plot observed noisy correlation graph')
fig, ax = plt.subplots()
nx.draw(G_noisy, ax=ax)

print('Plot reconstructed correlation graph')
fig, ax = plt.subplots()
nx.draw(G, ax=ax)

print('Plot reconstructed correlation graph without market mode')
fig, ax = plt.subplots()
nx.draw(G_no_market, ax=ax)

References
----------
.. [1] Marchenko, V. A., & Pastur, L. A. (1967). Distribution of
eigenvalues for some sets of random
matrices. Matematicheskii Sbornik, 114(4), 507-536.

.. [2] Laloux, L., Cizeau, P., Bouchaud, J. P., & Potters,
M. (1999). Noise dressing of financial correlation
matrices. Physical review letters, 83(7), 1467.
https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.83.1467

.. [3] Bonanno, G., Caldarelli, G., Lillo, F., Micciche, S.,
Vandewalle, N., & Mantegna, R. N. (2004). Networks of
equities in financial markets. The European Physical Journal
B, 38(2), 363-371.

"""
N, L = TS.shape
if N > L:
raise ValueError("L must be greater or equal than N.")

Q = L / N
C = np.corrcoef(TS)  # Empirical correlation matrix

w, v = np.linalg.eigh(C)  # Spectral decomposition of C

w_min = 1 + 1 / Q - 2 * np.sqrt(1 / Q)
w_max = 1 + 1 / Q + 2 * np.sqrt(1 / Q)

selected = (w < w_min) | (w > w_max)

if selected.sum() == 0:
G = nx.empty_graph(n=N)
self.results['graph'] = G
return G

if remove_largest:
selected[-1] = False

w_signal = w[selected]
v_signal = v[:, selected]

C_signal = v_signal.dot(np.diag(w_signal)).dot(v_signal.T)

if metric_distance:
C_signal = np.sqrt(2 * (1 - C_signal))

self.results['weights_matrix'] = C_signal

# threshold signal matrix

self.results['thresholded_matrix'] = threshold(
C_signal, threshold_type, **kwargs
)

G = create_graph(self.results['thresholded_matrix'])

self.results['graph'] = G
return G