# Source code for mprod._ml_helpers

import numpy as np
from typing import List, Tuple, Dict, Mapping
from ._base import NumpynDArray
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd
from itertools import product


def table2tensor(table: pd.DataFrame, missing_flag: bool = False) -> Tuple[np.ndarray, Mapping, Mapping]:
    """
    Reshape a `nm x p` (`(samples x reps) x features`) multi-indexed dataframe into a `m x p x n` tensor
    `(subjects, features, reps)`.

    The first index level of ``table`` identifies the subject and the second index level identifies
    the repeat; every row of the table becomes the `p`-long fiber ``tensor[subject, :, rep]``.

    Parameters
    ----------
    table: pd.DataFrame
        a `nm x p` table of samples x features, indexed by a two-level index (subject, rep)

    missing_flag: `bool`, default = False
        When set to `False` (default), the function will raise an error in case there are missing samples.
        Setting to `True` will result in a tensor with masked entries.

    Returns
    -------
    tensor: ndarray, np.ma.array
        3'rd order tensor `m x p x n` (subjects, features, reps). A plain ``ndarray`` when
        ``missing_flag`` is `False`; a masked array (``fill_value=0``) in which every
        (subject, rep) pair absent from the table stays masked when ``missing_flag`` is `True`.

    mode1_mapping : dict
        Maps each subject name found in the table to its index along the first axis of the tensor

    mode3_mapping : dict
        Maps each rep id found in the table to its index along the third axis of the tensor

    Raises
    ------
    KeyError
        If ``missing_flag`` is `False` and some (subject, rep) combination is absent from the table.
        The lookup error that triggered it is attached as the exception's cause.

    Examples
    --------
    Suppose that ``table_data`` is a dataframe with no missing values.

    >>> from mprod import table2tensor
    >>> import pandas as pd
    >>> np.random.seed(0)
    >>> table_data.iloc[:5,:4]
                        f1        f2        f3        f4
    SubjetID rep
    a        t1   0.251259  0.744838  -0.45889 -0.208525
             t10   2.39831  0.248772   0.65873   1.36994
             t2  -0.303154 -0.337603 -0.568608   -1.0239
             t3    1.36369  0.978895  0.161972 -0.804368
             t4     1.8548   1.52954   0.78576  0.538041
    >>> msk_tensor, mode1_mapping, mode3_mapping = table2tensor(table_data, missing_flag=False)
    >>> msk_tensor[:3,:3,:2]
    [[[0.25125853442243695 2.398308745102709]
      [0.7448378210349296 0.2487716728987871]
      [-0.4588901621837434 0.6587302072601999]]
     [[-0.5689263433318329 -0.06564253839123065]
      [1.0017636851038796 -0.49265853128383713]
      [0.45266517056628647 -1.4812390563653883]]
     [[0.7690616486878629 0.49302719962677855]
      [0.3186320585255899 1.469576084933633]
      [0.9609169837347897 -0.19564077520234632]]]
    >>> mode1_mapping
    {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4}
    >>> mode3_mapping
    {'t1': 0,
     't10': 1,
     't2': 2,
     't3': 3,
     't4': 4,
     't5': 5,
     't6': 6,
     't7': 7,
     't8': 8,
     't9': 9}

    **missing values**

    >>> msk_tensor, mode1_mapping, mode3_mapping = table2tensor(table_data.sample(40)
    ...                                                          , missing_flag=True)
    >>> msk_tensor[:3,:3,:2]
    masked_array(
      data=[[[0.07664420134210018, --],
             [-0.7358062254334045, --],
             [0.5562074188402509, --]],
            [[2.088982483926928, -0.06564253839123065],
             [0.7697757466063808, -0.49265853128383713],
             [0.4147812728859107, -1.4812390563653883]],
            [[-0.004794963866429985, 1.2262908375944879],
             [-0.15033350807209261, -0.3068131758163276],
             [0.6461670563178799, 0.1769508046682527]]],
      mask=[[[False,  True],
             [False,  True],
             [False,  True]],
            [[False, False],
             [False, False],
             [False, False]],
            [[False, False],
             [False, False],
             [False, False]]], fill_value=0.0)
    >>> mode1_mapping
    {'a': 3, 'b': 1, 'c': 0, 'd': 4, 'e': 2}
    >>> mode3_mapping
    {'t1': 2,
     't10': 1,
     't2': 3,
     't3': 6,
     't4': 5,
     't5': 7,
     't6': 8,
     't7': 4,
     't8': 0,
     't9': 9}
    """

    samples_map, usamples = table.index.get_level_values(0).factorize()
    reps_map, ureps = table.index.get_level_values(1).factorize()

    m, p, n = usamples.size, table.shape[1], ureps.size

    # The sorted unique codes (0..k-1) are paired positionally with the unique labels,
    # giving a label -> tensor index lookup for each of the two index levels.
    samples_map_dict = pd.Series(np.unique(samples_map), usamples).to_dict()
    reps_map_dict = pd.Series(np.unique(reps_map), ureps).to_dict()

    if missing_flag:
        # Start fully masked; assigning a row below unmasks exactly the fiber it fills,
        # so (subject, rep) pairs that never show up in the table remain masked.
        tensor = np.ma.array(np.zeros((m, p, n)), mask=np.ones((m, p, n), dtype=bool), fill_value=0)
        for (m1, m3), val in table.iterrows():
            tensor[samples_map_dict[m1], :, reps_map_dict[m3]] = val.values
        return tensor, samples_map_dict, reps_map_dict

    tensor = np.zeros((m, p, n))
    try:
        # Walk the full subjects x reps grid: a pair missing from the table fails the
        # ``.loc`` lookup with a KeyError, which is how incomplete data is detected.
        for m1, m3 in product(usamples, ureps):
            tensor[samples_map_dict[m1], :, reps_map_dict[m3]] = table.loc[(m1, m3)].values
    except KeyError as ke:
        raise KeyError("Discovered missing data in the table, which is not allowed by default. "
                       "To work with missing data and have a masked array returned, set missing_flag to True") from ke

    return tensor, samples_map_dict, reps_map_dict


# noinspection PyPep8Naming
# noinspection PyUnusedLocal
class MeanDeviationForm(TransformerMixin, BaseEstimator):
    r"""Center a dataset by subtracting its mean sample.

    The mean deviation form of a tensor :math:`X \in \mathbb{R}^{m \times p \times n}` is calculated as:

    .. math::
        Z = X - U

    where `U` is the mean sample of `X`, calculated as follows:

    .. math::
        U = \frac{1}{m} \sum_{i=1}^{m} X[i,:,:]

    The sum is taken with ``np.nansum``, so NaN entries contribute nothing to it, while the
    divisor is always the number of samples `m` (the size of the first axis). Only this
    ``1/m`` normalisation is implemented.


    Attributes
    ----------
    _mean_sample : array of shape (1, p_features, n_repeats), or `None`
        The mean sample of the dataset; `None` until ``fit`` has been called.
        The leading axis of length 1 is kept so it broadcasts against the samples axis.


    Methods
    -------
    fit:
        Fits a MeanDeviationForm transformer by computing the mean sample of a training dataset
    transform:
        Shift dataset by fitted sample mean
    fit_transform:
        Compute the mean sample of a dataset and transform it to its mean deviation form
    inverse_transform:
        Add precomputed mean sample to a dataset

    """

    def __init__(self):
        self._mean_sample = None

    def _fit(self, X):
        """Store the mean sample of ``X`` (NaN-skipping sum over axis 0, divided by ``X.shape[0]``)."""
        n_samples = X.shape[0]
        # keepdims=True keeps a length-1 leading axis so the mean broadcasts in transform()
        self._mean_sample = np.nansum(X, axis=0, keepdims=True) / n_samples

    def fit(self, X, y=None, **fit_param):
        """Compute the mean sample of a tensor

        Parameters
        ----------
        X : {array-like} of shape (m_samples, p_features, n_repeats)
            The data used to compute the mean sample
            used for later centering along the features-repeats axes.
        y : None
            Ignored.
        **fit_param
            Ignored.

        Returns
        -------
        self : object
            Fitted MeanDeviationForm object

        Examples
        --------
        >>> from mprod import MeanDeviationForm
        >>> import numpy as np
        >>> X = np.random.randn(10,20,4)
        >>> mdf = MeanDeviationForm()
        >>> mdf = mdf.fit(X)
        """
        self._fit(X)
        return self

    def transform(self, X, y=None):
        """Perform standardization by centering.

        Parameters
        ----------
        X : array-like of shape (k_samples, p_features, n_repeats)
            The data used to center along the features-repeats axes.
        y : None
            Ignored.

        Returns
        -------
        X_tr : ndarray of shape (k_samples, p_features, n_repeats)
            Transformed tensor. When the subtraction yields a masked array, the masked
            entries are replaced by the array's fill value and a plain ndarray is returned.

        Examples
        --------
        >>> from mprod import MeanDeviationForm
        >>> import numpy as np
        >>> X = np.random.randn(10,20,4)
        >>> y = np.random.randn(50,20,4)
        >>> mdf = MeanDeviationForm()
        >>> mdf_fit = mdf.fit(X)
        >>> yt = mdf.transform(y)

        """

        X_transform = X - self._mean_sample
        if isinstance(X_transform, np.ma.MaskedArray):
            # filled() already returns an ndarray; taking `.data` of that ndarray would
            # hand back a raw buffer object instead of an array.
            return X_transform.filled()
        return X_transform

    def fit_transform(self, X, y=None, **fit_params):
        """Fit the mean sample on ``X`` and return ``X`` in its mean deviation form.

        Parameters
        ----------
        X : array-like of shape (m_samples, p_features, n_repeats)
            The data used both to compute the mean sample and to be centered.
        y : None
            Passed through to ``fit``, where it is ignored.
        **fit_params
            Passed through to ``fit``, where they are ignored.

        Returns
        -------
        X_tr : ndarray of shape (m_samples, p_features, n_repeats)
            Transformed tensor.
        """
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def inverse_transform(self, Y):
        """Undo the centering of a dataset according to the fitted mean sample.

        Parameters
        ----------
        Y : array-like of shape (m_samples, p_features, n_repeats)
            Input data that will be transformed.

        Returns
        -------
        Yt : ndarray of shape (m_samples, p_features, n_repeats)
            Transformed data. When the addition yields a masked array, the masked
            entries are replaced by the array's fill value and a plain ndarray is returned.

        Examples
        --------
        >>> from mprod import MeanDeviationForm
        >>> import numpy as np
        >>> X = np.random.randn(10,20,4)
        >>> mdf = MeanDeviationForm()
        >>> Xt = mdf.fit_transform(X)
        >>> mdf.inverse_transform(Xt) - X

        """
        Y_transform = Y + self._mean_sample

        # Inspect the result rather than the input: the sum is masked whenever either
        # operand is, which keeps this method consistent with transform().
        if isinstance(Y_transform, np.ma.MaskedArray):
            return Y_transform.filled()
        return Y_transform