Source code for netrd.utilities.entropy

"""
entropy.py
----------

Utility functions computing entropy of variables in time series data.

author: Chia-Hung Yang

Submitted as part of the 2019 NetSI Collabathon.
"""

from collections import defaultdict
import numpy as np
from scipy.stats import entropy as sp_entropy


def js_divergence(P, Q):
    """Jensen-Shannon divergence between `P` and `Q`.

    Parameters
    ----------
    P, Q (np.ndarray)
        Two discrete distributions represented as 1D arrays. They are
        assumed to have the same support.

    Returns
    -------
    float
        The Jensen-Shannon divergence between `P` and `Q`.

    """
    M = 0.5 * (P + Q)
    jsd = 0.5 * (sp_entropy(P, M, base=2) + sp_entropy(Q, M, base=2))

    # If the input distributions are identical, floating-point error in the
    # construction of the mixture distribution can result in negative values
    # that are very close to zero. If one wants to compute the root-JSD
    # metric, these negative values lead to undesirable nans.
    if np.isclose(jsd, 0.0):
        return 0
    else:
        return jsd
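# Illustrative usage sketch (the distributions below are made-up examples
# over the same binary support, not part of the original module):
#
#     P = np.array([0.5, 0.5])
#     Q = np.array([1.0, 0.0])
#     js_divergence(P, Q)  # positive, at most 1 bit; js_divergence(P, P) == 0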
def entropy_from_seq(var):
    r"""Return the Shannon entropy of a variable.

    This differs from SciPy's entropy by taking a sequence of observations
    as input rather than a histogram or probability distribution.

    Parameters
    ----------
    var (ndarray)
        1D array of observations of the variable.

    Returns
    -------
    float
        Shannon entropy of the variable.

    Notes
    -----
    1. :math:`H(X) = - \sum p(X) \log_2(p(X))`

    2. Data of the variable must be categorical.

    """
    return joint_entropy(var[:, np.newaxis])
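# Illustrative usage sketch (the observation sequence below is made up):
#
#     var = np.array([0, 0, 1, 1])
#     entropy_from_seq(var)  # two equally frequent categories -> 1 bit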
def joint_entropy(data):
    r"""Joint entropy of all variables in the data.

    Parameters
    ----------
    data (np.ndarray)
        Array of data with variables as columns and observations as rows.

    Returns
    -------
    float
        Joint entropy of the variables of interest.

    Notes
    -----
    1. :math:`H(\{X_i\}) = - \sum p(\{X_i\}) \log_2(p(\{X_i\}))`

    2. The data of variables must be categorical.

    """
    # Entropy is computed by summing the contributions of states with
    # non-zero empirical probability in the data
    count = defaultdict(int)
    for state in data:
        key = tuple(state)
        count[key] += 1

    return sp_entropy(list(count.values()), base=2)
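# Illustrative usage sketch (rows are observations of two made-up binary
# variables):
#
#     data = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
#     joint_entropy(data)  # four equally likely joint states -> 2 bits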
def conditional_entropy(data, given):
    r"""Conditional entropy of variables in the data conditioned on a given
    set of variables.

    Parameters
    ----------
    data (np.ndarray)
        Array of data with the variables of interest as columns and
        observations as rows.

    given (np.ndarray)
        Array of data with the conditioned variables as columns and
        observations as rows.

    Returns
    -------
    float
        Conditional entropy of the variables :math:`\{X_i\}` of interest
        conditioned on the variables :math:`\{Y_j\}`.

    Notes
    -----
    1. :math:`H(\{X_i\}|\{Y_j\}) = - \sum p(\{X_i\}\cup\{Y_j\})
       \log_2(p(\{X_i\}|\{Y_j\}))`

    2. The data of variables must be categorical.

    """
    joint = np.hstack((data, given))
    entrp = joint_entropy(joint) - joint_entropy(given)

    return entrp
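# Illustrative usage sketch (X and Y are made-up binary variables passed as
# single-column arrays):
#
#     X = np.array([[0], [1], [0], [1]])
#     Y = np.array([[0], [0], [1], [1]])
#     conditional_entropy(X, Y)  # X independent of Y here -> H(X|Y) = 1 bit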
def categorized_data(raw, n_bins):
    """Categorize data.

    Each entry of the returned array is the index of the linear bin that the
    corresponding raw continuous value falls into.

    Parameters
    ----------
    raw (np.ndarray)
        Array of raw continuous data.

    n_bins (int)
        A universal number of bins for all the variables.

    Returns
    -------
    np.ndarray
        Array of bin indices after categorizing the raw data.

    """
    bins = linear_bins(raw, n_bins)
    data = np.ones(raw.shape, dtype=int)

    # Find the index of the bin each element in the raw data array belongs to
    for (i, j), val in np.ndenumerate(raw):
        data[i, j] = np.argmax(bins[1:, j] >= val)

    return data
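# Illustrative usage sketch (a single made-up continuous variable split into
# two linear bins):
#
#     raw = np.array([[0.1], [0.5], [0.9]])
#     categorized_data(raw, n_bins=2)  # bin indices, here [[0], [0], [1]]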
def linear_bins(raw, n_bins):
    r"""Separators of linear bins for each variable in the raw data.

    Parameters
    ----------
    raw (np.ndarray)
        Array of raw continuous data.

    n_bins (int)
        A universal number of bins for all the variables.

    Returns
    -------
    np.ndarray
        Array where each column contains the bin separators for a variable.

    Notes
    -----
    The bins are :math:`B_0 = [b_0, b_1]` and :math:`B_i = (b_i, b_{i+1}]`,
    where the :math:`b_i` are the bin separators.

    """
    _min = raw.min(axis=0)
    _max = raw.max(axis=0)
    bins = np.array(
        [np.linspace(start, end, num=n_bins + 1) for start, end in zip(_min, _max)]
    )

    return bins.T
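# Illustrative usage sketch (one made-up variable spanning [0, 4]):
#
#     raw = np.array([[0.0], [1.0], [2.0], [3.0], [4.0]])
#     linear_bins(raw, n_bins=2)  # one column of separators: [0., 2., 4.]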