Source code for mprod._ml_helpers
import numpy as np
from typing import List, Tuple, Dict, Mapping
from ._base import NumpynDArray
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd
from itertools import product
def table2tensor(table: pd.DataFrame, missing_flag: bool = False) -> Tuple[np.ndarray, Mapping, Mapping]:
    """
    Reshape a multi-indexed ``nm x p`` table into an ``m x p x n`` tensor.

    The table rows are indexed by ``(subject, rep)`` pairs (index level 0 and
    index level 1 respectively) and the columns are features. The result is a
    3rd order tensor laid out as ``(subjects, features, reps)``.

    Parameters
    ----------
    table: pd.DataFrame
        a `nm x p` table of samples x features, with a two-level row index
        (level 0: subject, level 1: rep)
    missing_flag: `bool`, default = False
        When set to `False` (default), the function will raise an error in case there are missing samples
        (i.e. some ``(subject, rep)`` combination has no row in the table).
        Setting to `True` will result in a tensor with masked entries.

    Returns
    -------
    tensor: ndarray, np.ma.array
        3'rd order tensor `m x p x n` (subjects, features, reps). A plain ``ndarray`` when
        ``missing_flag`` is `False`, a masked array (``fill_value=0``) when it is `True`.
    mode1_mapping : dict
        Maps each subject label of the table to its index along the first axis of the tensor
    mode3_mapping : dict
        Maps each rep label of the table to its index along the third axis of the tensor

    Raises
    ------
    KeyError
        If ``missing_flag`` is `False` and at least one ``(subject, rep)`` combination is absent
        from the table.

    Notes
    -----
    Labels are numbered in order of first appearance in the table index (see the
    second example below, where the rows were shuffled).

    Examples
    --------
    Suppose that ``table_data`` is a dataframe with no missing values.

    >>> from mprod import table2tensor
    >>> import pandas as pd
    >>> np.random.seed(0)
    >>> table_data.iloc[:5,:4]
                        f1        f2        f3        f4
    SubjetID rep
    a        t1   0.251259  0.744838  -0.45889 -0.208525
             t10   2.39831  0.248772   0.65873   1.36994
             t2  -0.303154 -0.337603 -0.568608   -1.0239
             t3    1.36369  0.978895  0.161972 -0.804368
             t4     1.8548   1.52954   0.78576  0.538041

    >>> msk_tensor, mode1_mapping, mode3_mapping = table2tensor(table_data, missing_flag=False)
    >>> msk_tensor[:3,:3,:2]
    [[[0.25125853442243695 2.398308745102709]
      [0.7448378210349296 0.2487716728987871]
      [-0.4588901621837434 0.6587302072601999]]
     [[-0.5689263433318329 -0.06564253839123065]
      [1.0017636851038796 -0.49265853128383713]
      [0.45266517056628647 -1.4812390563653883]]
     [[0.7690616486878629 0.49302719962677855]
      [0.3186320585255899 1.469576084933633]
      [0.9609169837347897 -0.19564077520234632]]]
    >>> mode1_mapping
    {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4}
    >>> mode3_mapping
    {'t1': 0,
     't10': 1,
     't2': 2,
     't3': 3,
     't4': 4,
     't5': 5,
     't6': 6,
     't7': 7,
     't8': 8,
     't9': 9}

    **missing values**

    >>> msk_tensor, mode1_mapping, mode3_mapping = table2tensor(table_data.sample(40)
    ...                                                         , missing_flag=True)
    >>> msk_tensor[:3,:3,:2]
    masked_array(
      data=[[[0.07664420134210018, --],
             [-0.7358062254334045, --],
             [0.5562074188402509, --]],
            [[2.088982483926928, -0.06564253839123065],
             [0.7697757466063808, -0.49265853128383713],
             [0.4147812728859107, -1.4812390563653883]],
            [[-0.004794963866429985, 1.2262908375944879],
             [-0.15033350807209261, -0.3068131758163276],
             [0.6461670563178799, 0.1769508046682527]]],
      mask=[[[False, True],
             [False, True],
             [False, True]],
            [[False, False],
             [False, False],
             [False, False]],
            [[False, False],
             [False, False],
             [False, False]]], fill_value=0.0)
    >>> mode1_mapping
    {'a': 3, 'b': 1, 'c': 0, 'd': 4, 'e': 2}
    >>> mode3_mapping
    {'t1': 2,
     't10': 1,
     't2': 3,
     't3': 6,
     't4': 5,
     't5': 7,
     't6': 8,
     't7': 4,
     't8': 0,
     't9': 9}
    """
    # factorize() returns (codes, uniques); only the unique labels, in order
    # of first appearance, are needed to number the tensor slices.
    _, subjects = table.index.get_level_values(0).factorize()
    _, reps = table.index.get_level_values(1).factorize()
    m, p, n = subjects.size, table.shape[1], reps.size

    mode1_mapping = {label: pos for pos, label in enumerate(subjects)}
    mode3_mapping = {label: pos for pos, label in enumerate(reps)}

    if missing_flag:
        # Start fully masked; assigning a row un-masks exactly the entries
        # that are present in the table.
        tensor = np.ma.array(np.zeros((m, p, n)), mask=np.ones((m, p, n), dtype=bool), fill_value=0)
        for (subject, rep), row in table.iterrows():
            tensor[mode1_mapping[subject], :, mode3_mapping[rep]] = row.values
        return tensor, mode1_mapping, mode3_mapping

    tensor = np.zeros((m, p, n))
    for subject, rep in product(subjects, reps):
        try:
            # The explicit ", :" pins the tuple to the row axis. A bare
            # table.loc[(subject, rep)] may be read as (row, column) when a
            # rep label coincides with a column name.
            row = table.loc[(subject, rep), :]
        except KeyError as ke:
            raise KeyError("Discovered missing data in the table, which is not allowed by default. "
                           "To work with missing data and have a masked array returned, set missing_flag to True") from ke
        tensor[mode1_mapping[subject], :, mode3_mapping[rep]] = row.values
    return tensor, mode1_mapping, mode3_mapping
# noinspection PyPep8Naming
# noinspection PyUnusedLocal
class MeanDeviationForm(TransformerMixin, BaseEstimator):
    r"""Center the data by subtracting the mean sample

    The mean deviation form of a tensor :math:`X \in \mathbb{R}^{m \times p \times n}` is calculated as:

        Z = X - U

    where `U` is the mean sample of `X` , calculated as follows:

    .. math::
        U = \frac{1}{m} \sum_{i=1}^{m} X[i,:,:]

    The sum is taken with ``np.nansum``, so NaN entries contribute zero to the
    numerator while the denominator is always the number of samples `m`.

    Attributes
    ----------
    _mean_sample : ndarray of shape (1, p_features, n_repeats), or `None`
        The mean sample of the dataset; `None` until ``fit`` has been called

    Methods
    -------
    fit:
        Fits a MeanDeviationForm transformer by computing the mean sample of a training dataset
    transform:
        Shift dataset by fitted sample mean
    fit_transform:
        Compute the mean sample of a dataset and transform it to its mean deviation form
    inverse_transform:
        Add precomputed mean sample to a dataset
    """

    def __init__(self):
        self._mean_sample = None

    @staticmethod
    def _as_plain_array(arr):
        """Return ``arr`` with masked entries filled in, if it is a masked array.

        ``MaskedArray.filled()`` already yields a regular ndarray; it is
        returned as-is (``ndarray.data`` would be a raw ``memoryview`` instead
        of an array). Anything that is not a masked array passes through.
        """
        if isinstance(arr, np.ma.MaskedArray):
            return arr.filled()
        return arr

    def _fit(self, X):
        # keepdims=True keeps a leading axis of length 1 so the stored mean
        # broadcasts against (k, p, n) inputs in transform/inverse_transform.
        n_samples = np.shape(X)[0]
        self._mean_sample = np.nansum(X, axis=0, keepdims=True) / n_samples

    def fit(self, X, y=None, **fit_param):
        """Compute the mean sample of a tensor

        Parameters
        ----------
        X : {array-like} of shape (m_samples, p_features, n_repeats)
            The data used to compute the mean sample
            used for later centering along the features-repeats axes.
        y : None
            Ignored.
        **fit_param
            Ignored.

        Returns
        -------
        self : object
            Fitted MeanDeviationForm object

        Examples
        --------
        >>> from mprod import MeanDeviationForm
        >>> import numpy as np
        >>> X = np.random.randn(10,20,4)
        >>> mdf = MeanDeviationForm()
        >>> mdf = mdf.fit(X)
        """
        self._fit(X)
        return self

    def transform(self, X, y=None):
        """Perform centering by subtracting the fitted mean sample.

        Parameters
        ----------
        X : array-like of shape (k_samples, p_features, n_repeats)
            The data used to center along the features-repeats axes.
        y : None
            Ignored.

        Returns
        -------
        X_tr : ndarray of shape (k_samples, p_features, n_repeats)
            Transformed tensor. If the subtraction yields a masked array, its
            masked entries are replaced by the array's fill value.

        Examples
        --------
        >>> from mprod import MeanDeviationForm
        >>> import numpy as np
        >>> X = np.random.randn(10,20,4)
        >>> y = np.random.randn(50,20,4)
        >>> mdf = MeanDeviationForm()
        >>> mdf_fit = mdf.fit(X)
        >>> yt = mdf.transform(y)
        """
        return self._as_plain_array(X - self._mean_sample)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit the mean sample on ``X`` and return ``X`` in mean deviation form.

        Parameters
        ----------
        X : array-like of shape (m_samples, p_features, n_repeats)
            The data to fit on and then center.
        y : None
            Ignored.

        Returns
        -------
        X_tr : ndarray of shape (m_samples, p_features, n_repeats)
            Transformed tensor.
        """
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def inverse_transform(self, Y):
        """Undo the centering by adding the fitted mean sample back.

        Parameters
        ----------
        Y : array-like of shape (m_samples, p_features, n_repeats)
            Input data that will be transformed.

        Returns
        -------
        Xt : ndarray of shape (m_samples, p_features, n_repeats)
            Transformed data. If the addition yields a masked array, its
            masked entries are replaced by the array's fill value.

        Examples
        --------
        >>> from mprod import MeanDeviationForm
        >>> import numpy as np
        >>> X = np.random.randn(10,20,4)
        >>> mdf = MeanDeviationForm()
        >>> Xt = mdf.fit_transform(X)
        >>> mdf.inverse_transform(Xt) - X
        """
        # Inspect the result (not the input) so the behaviour matches
        # transform() even when only the stored mean is masked.
        return self._as_plain_array(Y + self._mean_sample)