Minimal working example

[1]:
import pandas as pd
import numpy as np

from mprod.dimensionality_reduction import TCAM
from mprod import table2tensor

import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')
%matplotlib inline

Load files

[2]:
# Read the data table; its first four columns become the (multi-level) row index.
file_path = "./data/Suez2018.txt"
data_raw = pd.read_csv(
    file_path,
    index_col=[0, 1, 2, 3],
    dtype={'Participant': int, 'Day': int},
)

# Turn the index levels into ordinary columns: one metadata row per row of data_raw.
meta_time = data_raw.index.to_frame().reset_index(drop=True)

# Discard the time-related columns and de-duplicate what remains
# (the Participant and Group columns).
meta = meta_time.drop(columns=['Day', 'Phase']).drop_duplicates()

# Preview the raw table and both metadata frames.
display(data_raw.head(), meta_time.head(), meta.head())


s__Vagococcus_lutrae s__Asaccharobacter_celatus s__Megasphaera_elsdenii s__Leuconostoc_carnosum s__Streptococcus_agalactiae s__Tyzzerella_nexilis s__Akkermansia_muciniphila s__Alistipes_timonensis s__Peptostreptococcus_anaerobius s__Streptococcus_anginosus_group ... s__Clostridium_celatum s__Fusobacterium_periodonticum s__Acidaminococcus_intestini s__Streptococcus_sobrinus s__Anaerostipes_caccae s__Enterococcus_faecium s__Eubacterium_sp_CAG_180 s__Veillonella_dispar s__Actinomyces_sp_S6_Spd3 s__Firmicutes_bacterium_CAG_238
Participant Phase Group Day
602 BAS FMT 3 9.999999e-07 0.000051 0.018276 9.999999e-07 9.999999e-07 9.999999e-07 0.000198 9.999999e-07 9.999999e-07 9.999999e-07 ... 9.999999e-07 9.999999e-07 2.247164e-04 9.999999e-07 9.999999e-07 9.999999e-07 0.005217 9.999999e-07 9.999999e-07 5.632903e-05
6 9.999999e-07 0.000263 0.017956 9.999999e-07 9.999999e-07 9.999999e-07 0.000175 9.999999e-07 9.999999e-07 9.999999e-07 ... 9.586594e-05 9.999999e-07 3.205194e-04 9.999999e-07 9.999999e-07 9.999999e-07 0.005515 9.999999e-07 9.999999e-07 9.263375e-05
ABX FMT 10 1.000001e-06 0.000001 0.000001 1.000001e-06 1.000001e-06 1.000001e-06 0.000001 1.000001e-06 1.000001e-06 2.271773e-03 ... 1.000001e-06 1.000001e-06 1.000001e-06 1.000001e-06 1.000001e-06 1.000001e-06 0.000001 1.000001e-06 1.000001e-06 1.000001e-06
13 1.000000e-06 0.000012 0.000270 1.000000e-06 1.000000e-06 1.000000e-06 0.000209 1.000000e-06 1.000000e-06 1.298585e-03 ... 1.000000e-06 1.000000e-06 1.000000e-06 1.000000e-06 1.000000e-06 5.046216e-04 0.001349 1.000000e-06 1.000000e-06 1.000000e-06
INT FMT 17 9.999999e-07 0.000027 0.000628 9.999999e-07 9.999999e-07 9.999999e-07 0.000486 9.999999e-07 9.999999e-07 9.999999e-07 ... 9.999999e-07 9.999999e-07 9.999999e-07 9.999999e-07 9.999999e-07 1.176117e-03 0.003147 9.999999e-07 9.999999e-07 9.999999e-07

5 rows × 482 columns

Participant Phase Group Day
0 602 BAS FMT 3
1 602 BAS FMT 6
2 602 ABX FMT 10
3 602 ABX FMT 13
4 602 INT FMT 17
Participant Group
0 602 FMT
9 603 FMT
18 604 FMT
27 605 FMT
36 606 FMT

Data normalization and mode-mapping

[3]:
# Normalize every participant against their own baseline: each row is divided
# by the mean of that participant's rows whose Phase is 'BAS', then log2 is
# taken of the ratio.
#
# group_keys=False is passed explicitly because the applied function returns a
# frame indexed like its input. From pandas 2.0 onwards the default
# (group_keys=True) would prepend the group key as an additional index level,
# leaving a duplicated 'Participant' level after the reset_index below.
# With group_keys=False the result keeps the original index in every pandas
# version, so dropping 'Phase' and 'Group' leaves exactly (Participant, Day).
data_normalized = (
    data_raw
    .groupby(level='Participant', group_keys=False)
    .apply(lambda x: np.log2(x / x.query("Phase == 'BAS'").mean()))
    .reset_index(level=['Phase', 'Group'], drop=True)
)
display(data_normalized.head())
s__Vagococcus_lutrae s__Asaccharobacter_celatus s__Megasphaera_elsdenii s__Leuconostoc_carnosum s__Streptococcus_agalactiae s__Tyzzerella_nexilis s__Akkermansia_muciniphila s__Alistipes_timonensis s__Peptostreptococcus_anaerobius s__Streptococcus_anginosus_group ... s__Clostridium_celatum s__Fusobacterium_periodonticum s__Acidaminococcus_intestini s__Streptococcus_sobrinus s__Anaerostipes_caccae s__Enterococcus_faecium s__Eubacterium_sp_CAG_180 s__Veillonella_dispar s__Actinomyces_sp_S6_Spd3 s__Firmicutes_bacterium_CAG_238
Participant Day
602 3 -8.872499e-09 -1.623746 0.012707 -8.872499e-09 -8.872499e-09 -8.872499e-09 0.086871 -8.872499e-09 -8.872499e-09 -8.872499e-09 ... -5.597918 -8.872499e-09 -0.278775 -8.872499e-09 -8.872499e-09 -8.872499e-09 -0.040705 -8.872499e-09 -8.872499e-09 -0.403001
6 8.872499e-09 0.744599 -0.012820 8.872499e-09 8.872499e-09 8.872499e-09 -0.092439 8.872499e-09 8.872499e-09 8.872499e-09 ... 0.985029 8.872499e-09 0.233532 8.872499e-09 8.872499e-09 8.872499e-09 0.039588 8.872499e-09 8.872499e-09 0.314658
10 1.337567e-06 -7.295634 -14.144966 1.337567e-06 1.337567e-06 1.337567e-06 -7.542312 1.337567e-06 1.337567e-06 1.114960e+01 ... -5.597916 1.337567e-06 -8.090735 1.337567e-06 1.337567e-06 1.337567e-06 -12.389628 1.337567e-06 1.337567e-06 -6.218807
13 7.558580e-07 -3.707767 -6.069695 7.558580e-07 7.558580e-07 7.558580e-07 0.164989 7.558580e-07 7.558580e-07 1.034272e+01 ... -5.597917 7.558580e-07 -8.090736 7.558580e-07 7.558580e-07 8.979058e+00 -1.991681 7.558580e-07 7.558580e-07 -6.218808
17 -1.975416e-08 -2.555618 -4.850362 -1.975416e-08 -1.975416e-08 -1.975416e-08 1.383432 -1.975416e-08 -1.975416e-08 -1.975416e-08 ... -5.597918 -1.975416e-08 -8.090737 -1.975416e-08 -1.975416e-08 1.019982e+01 -0.769899 -1.975416e-08 -1.975416e-08 -6.218808

5 rows × 482 columns

[4]:
# Convert the normalized table into a tensor; table2tensor also returns the
# label -> position mappings it used for the first and third modes.
tensor_data, mode1_map, mode3_map = table2tensor(data_normalized)

# Invert the first-mode mapping (position -> label) so that tensor rows can be
# relabelled with their original identifiers later on.
mode1_reverse_map = {position: label for label, position in mode1_map.items()}

display(tensor_data.shape)
print(mode1_map)
print(mode1_reverse_map)

(17, 482, 9)
{602: 0, 603: 1, 604: 2, 605: 3, 606: 4, 701: 5, 702: 6, 703: 7, 704: 8, 707: 9, 708: 10, 801: 11, 802: 12, 803: 13, 804: 14, 806: 15, 807: 16}
{0: 602, 1: 603, 2: 604, 3: 605, 4: 606, 5: 701, 6: 702, 7: 703, 8: 704, 9: 707, 10: 708, 11: 801, 12: 802, 13: 803, 14: 804, 15: 806, 16: 807}

Transform

The TCAM transformation is performed by calling the `.fit` and `.transform` methods of a `TCAM` object (or the combined `.fit_transform`, as used in the cell below).

The TCAM class implements scikit-learn’s transformer-mixin methods, so its API is rather similar to sklearn’s PCA.

[5]:
# --- fit the model and transform the tensor ---
tcam = TCAM(n_components=1.)    # will produce all components
transformed_data = tcam.fit_transform(tensor_data)

# explained variance ratios, expressed as percentages rounded to two decimals
rounded_expvar = np.round(100 * tcam.explained_variance_ratio_, 2)

# --- tabulate the factor scores ---
# rows are relabelled with the first-mode identifiers (subjects); each column
# name carries the factor number and its explained-variance percentage
df_tca = pd.DataFrame(transformed_data).rename(index=mode1_reverse_map)
df_tca.columns = [
    f'F{rank}:{pct}%' for rank, pct in enumerate(rounded_expvar, start=1)
]

# attach the per-participant metadata to the factor scores
df_plot = meta.merge(df_tca, left_on='Participant', right_index=True)

# --- scatter plot of the first two factors, coloured by group ---
f1, f2 = df_tca.columns[:2].tolist()
fig, ax = plt.subplots(figsize=[2, 2], dpi=500)
sns.scatterplot(
    data=df_plot,
    x=f1,
    y=f2,
    hue='Group',
    ax=ax,
    edgecolor='k',
)
ax.legend(fontsize=6, fancybox=False, framealpha=1, edgecolor='k')
plt.show()
../_images/examples_basic_example_9_0.png
[ ]: