Minimal working example¶

[1]:

import pandas as pd
import numpy as np

from mprod.dimensionality_reduction import TCAM
from mprod import table2tensor

import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')
%matplotlib inline

Load files¶

[2]:

# Abundance table; its first four columns (Participant, Phase, Group, Day)
# identify each sample and are loaded as a MultiIndex.
file_path = "./data/Suez2018.txt"
data_raw = pd.read_csv(
    file_path,
    index_col=[0, 1, 2, 3],
    dtype={'Participant': int, 'Day': int},
)

# Per-sample metadata: the index levels turned into ordinary columns.
meta_time = data_raw.index.to_frame().reset_index(drop=True)

# Per-participant metadata: remove the time-dependent columns, then collapse
# the repeated rows so each participant appears once.
meta = meta_time.drop(columns=['Day', 'Phase']).drop_duplicates()

# Preview the three tables.
display(data_raw.head(), meta_time.head(), meta.head())

				s__Vagococcus_lutrae	s__Asaccharobacter_celatus	s__Megasphaera_elsdenii	s__Leuconostoc_carnosum	s__Streptococcus_agalactiae	s__Tyzzerella_nexilis	s__Akkermansia_muciniphila	s__Alistipes_timonensis	s__Peptostreptococcus_anaerobius	s__Streptococcus_anginosus_group	...	s__Clostridium_celatum	s__Fusobacterium_periodonticum	s__Acidaminococcus_intestini	s__Streptococcus_sobrinus	s__Anaerostipes_caccae	s__Enterococcus_faecium	s__Eubacterium_sp_CAG_180	s__Veillonella_dispar	s__Actinomyces_sp_S6_Spd3	s__Firmicutes_bacterium_CAG_238
Participant	Phase	Group	Day
602	BAS	FMT	3	9.999999e-07	0.000051	0.018276	9.999999e-07	9.999999e-07	9.999999e-07	0.000198	9.999999e-07	9.999999e-07	9.999999e-07	...	9.999999e-07	9.999999e-07	2.247164e-04	9.999999e-07	9.999999e-07	9.999999e-07	0.005217	9.999999e-07	9.999999e-07	5.632903e-05
	BAS	FMT	6	9.999999e-07	0.000263	0.017956	9.999999e-07	9.999999e-07	9.999999e-07	0.000175	9.999999e-07	9.999999e-07	9.999999e-07	...	9.586594e-05	9.999999e-07	3.205194e-04	9.999999e-07	9.999999e-07	9.999999e-07	0.005515	9.999999e-07	9.999999e-07	9.263375e-05
	ABX	FMT	10	1.000001e-06	0.000001	0.000001	1.000001e-06	1.000001e-06	1.000001e-06	0.000001	1.000001e-06	1.000001e-06	2.271773e-03	...	1.000001e-06	1.000001e-06	1.000001e-06	1.000001e-06	1.000001e-06	1.000001e-06	0.000001	1.000001e-06	1.000001e-06	1.000001e-06
	ABX	FMT	13	1.000000e-06	0.000012	0.000270	1.000000e-06	1.000000e-06	1.000000e-06	0.000209	1.000000e-06	1.000000e-06	1.298585e-03	...	1.000000e-06	1.000000e-06	1.000000e-06	1.000000e-06	1.000000e-06	5.046216e-04	0.001349	1.000000e-06	1.000000e-06	1.000000e-06
	INT	FMT	17	9.999999e-07	0.000027	0.000628	9.999999e-07	9.999999e-07	9.999999e-07	0.000486	9.999999e-07	9.999999e-07	9.999999e-07	...	9.999999e-07	9.999999e-07	9.999999e-07	9.999999e-07	9.999999e-07	1.176117e-03	0.003147	9.999999e-07	9.999999e-07	9.999999e-07

5 rows × 482 columns

	Participant	Phase	Group	Day
0	602	BAS	FMT	3
1	602	BAS	FMT	6
2	602	ABX	FMT	10
3	602	ABX	FMT	13
4	602	INT	FMT	17

	Participant	Group
0	602	FMT
9	603	FMT
18	604	FMT
27	605	FMT
36	606	FMT

Data normalization and mode-mapping¶

[3]:

# Express every feature as a log2 fold change relative to the participant's own
# baseline, i.e. the mean over that participant's rows with Phase == 'BAS'.
#
# group_keys=False is passed explicitly: the applied function returns a frame
# with the same index as its input, and from pandas 2.0 onward the default
# (group_keys=True) would prepend the grouping key to that index, yielding a
# duplicated 'Participant' level. Older pandas versions behave the same way
# with or without this argument.
data_normalized = (
    data_raw
    .groupby(level='Participant', group_keys=False)
    .apply(lambda x: np.log2(x / x.query("Phase == 'BAS'").mean()))
)

# Keep only the (Participant, Day) index levels.
data_normalized = data_normalized.reset_index(level=['Phase', 'Group'], drop=True)
display(data_normalized.head())

		s__Vagococcus_lutrae	s__Asaccharobacter_celatus	s__Megasphaera_elsdenii	s__Leuconostoc_carnosum	s__Streptococcus_agalactiae	s__Tyzzerella_nexilis	s__Akkermansia_muciniphila	s__Alistipes_timonensis	s__Peptostreptococcus_anaerobius	s__Streptococcus_anginosus_group	...	s__Clostridium_celatum	s__Fusobacterium_periodonticum	s__Acidaminococcus_intestini	s__Streptococcus_sobrinus	s__Anaerostipes_caccae	s__Enterococcus_faecium	s__Eubacterium_sp_CAG_180	s__Veillonella_dispar	s__Actinomyces_sp_S6_Spd3	s__Firmicutes_bacterium_CAG_238
Participant	Day
602	3	-8.872499e-09	-1.623746	0.012707	-8.872499e-09	-8.872499e-09	-8.872499e-09	0.086871	-8.872499e-09	-8.872499e-09	-8.872499e-09	...	-5.597918	-8.872499e-09	-0.278775	-8.872499e-09	-8.872499e-09	-8.872499e-09	-0.040705	-8.872499e-09	-8.872499e-09	-0.403001
	6	8.872499e-09	0.744599	-0.012820	8.872499e-09	8.872499e-09	8.872499e-09	-0.092439	8.872499e-09	8.872499e-09	8.872499e-09	...	0.985029	8.872499e-09	0.233532	8.872499e-09	8.872499e-09	8.872499e-09	0.039588	8.872499e-09	8.872499e-09	0.314658
	10	1.337567e-06	-7.295634	-14.144966	1.337567e-06	1.337567e-06	1.337567e-06	-7.542312	1.337567e-06	1.337567e-06	1.114960e+01	...	-5.597916	1.337567e-06	-8.090735	1.337567e-06	1.337567e-06	1.337567e-06	-12.389628	1.337567e-06	1.337567e-06	-6.218807
	13	7.558580e-07	-3.707767	-6.069695	7.558580e-07	7.558580e-07	7.558580e-07	0.164989	7.558580e-07	7.558580e-07	1.034272e+01	...	-5.597917	7.558580e-07	-8.090736	7.558580e-07	7.558580e-07	8.979058e+00	-1.991681	7.558580e-07	7.558580e-07	-6.218808
	17	-1.975416e-08	-2.555618	-4.850362	-1.975416e-08	-1.975416e-08	-1.975416e-08	1.383432	-1.975416e-08	-1.975416e-08	-1.975416e-08	...	-5.597918	-1.975416e-08	-8.090737	-1.975416e-08	-1.975416e-08	1.019982e+01	-0.769899	-1.975416e-08	-1.975416e-08	-6.218808

5 rows × 482 columns

[4]:

# Convert the (Participant, Day)-indexed table into a tensor. The call also
# returns lookup dictionaries for the first and third tensor modes.
tensor_data, mode1_map, mode3_map = table2tensor(data_normalized)

# Invert the first-mode lookup so a row position in the tensor can be mapped
# back to its participant identifier.
mode1_reverse_map = dict(zip(mode1_map.values(), mode1_map.keys()))

display(tensor_data.shape)
print(mode1_map, mode1_reverse_map, sep='\n')

(17, 482, 9)

{602: 0, 603: 1, 604: 2, 605: 3, 606: 4, 701: 5, 702: 6, 703: 7, 704: 8, 707: 9, 708: 10, 801: 11, 802: 12, 803: 13, 804: 14, 806: 15, 807: 16}
{0: 602, 1: 603, 2: 604, 3: 605, 4: 606, 5: 701, 6: 702, 7: 703, 8: 704, 9: 707, 10: 708, 11: 801, 12: 802, 13: 803, 14: 804, 15: 806, 16: 807}

Transform¶

The TCAM transformation is performed by calling the `.fit` and `.transform` methods of a `TCAM` object, or `.fit_transform` to do both in a single call, as in the cell below.

The TCAM class implements scikit-learn’s transformer-mixin methods, so its API is rather similar to sklearn’s PCA.

[5]:

# Fit the model and project the tensor in one call.
# n_components=1. -- per the original author's note, this yields all components.
tcam = TCAM(n_components=1.)
transformed_data = tcam.fit_transform(tensor_data)

# Explained-variance ratios as percentages rounded to two decimals; used in
# the factor labels below.
rounded_expvar = np.round(100 * tcam.explained_variance_ratio_, 2)

# Wrap the scores in a DataFrame whose row labels are the first-mode
# identifiers (subjects) rather than positional tensor indices.
df_tca = pd.DataFrame(transformed_data).rename(index=mode1_reverse_map)
df_tca.columns = [
    f'F{rank}:{pct}%'
    for rank, pct in enumerate(rounded_expvar, start=1)
]

# Attach the per-participant metadata to the factor scores.
df_plot = pd.merge(meta, df_tca, left_on='Participant', right_index=True)

# Scatter the first two factors, coloured by study group.
f1, f2 = df_tca.columns[:2]
fig, ax = plt.subplots(figsize=(2, 2), dpi=500)
sns.scatterplot(
    data=df_plot,
    x=f1,
    y=f2,
    hue='Group',
    edgecolor='k',
    ax=ax,
)
ax.legend(fontsize=6, fancybox=False, framealpha=1, edgecolor='k')
plt.show()

../_images/examples_basic_example_9_0.png

[ ]: