multiomics

[ ]:

[1]:
#!pip install muon scanpy hvplot statsmodels scikit-learn
[7]:
pip install -e ../../STREAM2/
Obtaining file:///mnt/c/Users/jobac/Desktop/all/git/STREAM2
  Preparing metadata (setup.py) ... done
Requirement already satisfied: numpy>=1.17.0 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (1.23.5)
Requirement already satisfied: numba>=0.52.0 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (0.52.0)
Requirement already satisfied: networkx>=2.5 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (3.1)
Requirement already satisfied: pandas!=1.1,>=1.0 in /home/jo/.local/lib/python3.8/site-packages (from stream2==0.1a0) (1.3.5)
Requirement already satisfied: anndata>=0.7.4 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (0.8.0)
Requirement already satisfied: scanpy>=1.6.0 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (1.9.3)
Requirement already satisfied: shapely>=2.0.1 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (2.0.1)
Requirement already satisfied: statsmodels>=0.12.1 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (0.13.5)
Requirement already satisfied: scikit-learn>=1.2 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (1.2.2)
Requirement already satisfied: scipy>=1.4 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (1.10.1)
Requirement already satisfied: kneed>=0.7 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (0.7.0)
Requirement already satisfied: seaborn>=0.11 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (0.12.2)
Requirement already satisfied: matplotlib>=3.3 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (3.7.1)
Requirement already satisfied: plotly>=4.14.0 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (5.14.1)
Requirement already satisfied: scikit-misc>=0.1.3 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (0.2.0)
Requirement already satisfied: adjusttext>=0.7.3 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (0.8)
Requirement already satisfied: umap-learn>=0.4.6 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (0.5.3)
Requirement already satisfied: elpigraph-python>=0.3.1 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (0.3.1)
Requirement already satisfied: python-slugify>=5.0.0 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from stream2==0.1a0) (8.0.1)
Requirement already satisfied: h5py>=3 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from anndata>=0.7.4->stream2==0.1a0) (3.8.0)
Requirement already satisfied: natsort in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from anndata>=0.7.4->stream2==0.1a0) (8.3.1)
Requirement already satisfied: packaging>=20 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from anndata>=0.7.4->stream2==0.1a0) (23.1)
Requirement already satisfied: python-igraph>=0.7.1 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from elpigraph-python>=0.3.1->stream2==0.1a0) (0.10.4)
Requirement already satisfied: contourpy>=1.0.1 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from matplotlib>=3.3->stream2==0.1a0) (1.0.7)
Requirement already satisfied: cycler>=0.10 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from matplotlib>=3.3->stream2==0.1a0) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from matplotlib>=3.3->stream2==0.1a0) (4.39.3)
Requirement already satisfied: kiwisolver>=1.0.1 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from matplotlib>=3.3->stream2==0.1a0) (1.4.4)
Requirement already satisfied: pillow>=6.2.0 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from matplotlib>=3.3->stream2==0.1a0) (9.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from matplotlib>=3.3->stream2==0.1a0) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from matplotlib>=3.3->stream2==0.1a0) (2.8.2)
Requirement already satisfied: importlib-resources>=3.2.0 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from matplotlib>=3.3->stream2==0.1a0) (5.12.0)
Requirement already satisfied: llvmlite<0.36,>=0.35.0 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from numba>=0.52.0->stream2==0.1a0) (0.35.0)
Requirement already satisfied: setuptools in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from numba>=0.52.0->stream2==0.1a0) (67.6.1)
Requirement already satisfied: pytz>=2017.3 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from pandas!=1.1,>=1.0->stream2==0.1a0) (2023.3)
Requirement already satisfied: tenacity>=6.2.0 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from plotly>=4.14.0->stream2==0.1a0) (8.2.2)
Requirement already satisfied: text-unidecode>=1.3 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from python-slugify>=5.0.0->stream2==0.1a0) (1.3)
Requirement already satisfied: tqdm in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from scanpy>=1.6.0->stream2==0.1a0) (4.65.0)
Requirement already satisfied: patsy in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from scanpy>=1.6.0->stream2==0.1a0) (0.5.3)
Requirement already satisfied: joblib in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from scanpy>=1.6.0->stream2==0.1a0) (1.2.0)
Requirement already satisfied: session-info in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from scanpy>=1.6.0->stream2==0.1a0) (1.0.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from scikit-learn>=1.2->stream2==0.1a0) (3.1.0)
Requirement already satisfied: pynndescent>=0.5 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from umap-learn>=0.4.6->stream2==0.1a0) (0.5.10)
Requirement already satisfied: zipp>=3.1.0 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from importlib-resources>=3.2.0->matplotlib>=3.3->stream2==0.1a0) (3.15.0)
Requirement already satisfied: six in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from patsy->scanpy>=1.6.0->stream2==0.1a0) (1.16.0)
Requirement already satisfied: igraph==0.10.4 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from python-igraph>=0.7.1->elpigraph-python>=0.3.1->stream2==0.1a0) (0.10.4)
Requirement already satisfied: texttable>=1.6.2 in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from igraph==0.10.4->python-igraph>=0.7.1->elpigraph-python>=0.3.1->stream2==0.1a0) (1.6.7)
Requirement already satisfied: stdlib-list in /home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages (from session-info->scanpy>=1.6.0->stream2==0.1a0) (0.8.0)
Installing collected packages: stream2
  Attempting uninstall: stream2
    Found existing installation: stream2 0.1a0
    Uninstalling stream2-0.1a0:
      Successfully uninstalled stream2-0.1a0
  Running setup.py develop for stream2
Successfully installed stream2-0.1a0
Note: you may need to restart the kernel to use updated packages.
[1]:
import scanpy as sc
import muon as mu
import stream2 as st2

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import hvplot.pandas
from session_info import show
show()
/home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[1], line 11
      9 import hvplot.pandas
     10 from session_info import show
---> 11 show()

File ~/anaconda3/envs/stream2/lib/python3.8/site-packages/session_info/main.py:209, in show(na, os, cpu, jupyter, dependencies, std_lib, private, write_req_file, req_file_name, html, excludes)
    207 for mod_name in clean_modules:
    208     mod_names.append(mod_name)
--> 209     mod = sys.modules[mod_name]
    210     # Since modules use different attribute names to store version info,
    211     # try the most common ones.
    212     try:

KeyError: 'cupy'

Multi-modal trajectory inference demo

We load four assays (two scATAC and two scRNA) integrated into one anndata object.

adata_integrated.X contains gene expression (gene scores for ATAC cells) for 200 highly variable genes.

We will use this subset of measurements from hematopoietic cells as a toy example.

[2]:
adata_integrated = st2.read_h5ad('../data/multiomics/atac_rna_toy200.h5ad')

#sc.pp.neighbors(adata_integrated,n_neighbors=50,metric='cosine',use_rep='X_pca_transp_emd')
#sc.tl.umap(adata_integrated,min_dist=0.05)
sc.pl.umap(adata_integrated[::-1],color=['assay_detailed','leiden_identity_simple2',],wspace=.2)
/home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages/anndata/compat/_overloaded_dict.py:106: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
  self.data[key] = value
/home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
_images/multiomics_7_1.png

In this case we have 4 assays with same features (since we only loaded gene scores for ATAC), which fits nicely into a single anndata.

However in general it is not possible to simply concatenate assays with possibly different features and observations into a single anndata.

The solution is to use the muon package and its MuData format to store our multimodal assays

[3]:
assays = adata_integrated.obs['assay_detailed']
adatas = {assay: adata_integrated[assays==assay] for assay in assays.unique()}
mdata = mu.MuData(adatas)
mdata
/home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages/mudata/_core/mudata.py:457: UserWarning: Cannot join columns with the same name because var_names are intersecting.
  warnings.warn(
[3]:
MuData object with n_obs × n_vars = 99955 × 800
  4 modalities
    ATAC_CD:        13575 x 200
      obs:  'leiden_identity_simple2', 'label', 'assay_detailed'
      obsm: 'X_pca_transp_emd', 'X_umap'
    ATAC_days:      19206 x 200
      obs:  'leiden_identity_simple2', 'label', 'assay_detailed'
      obsm: 'X_pca_transp_emd', 'X_umap'
    RNA_CD: 22804 x 200
      obs:  'leiden_identity_simple2', 'label', 'assay_detailed'
      obsm: 'X_pca_transp_emd', 'X_umap'
    RNA_days:       44370 x 200
      obs:  'leiden_identity_simple2', 'label', 'assay_detailed'
      obsm: 'X_pca_transp_emd', 'X_umap'

In our case cells are not matched between assays so we had to perform data integration to get a joint UMAP representation We will learn a joint trajectory based on this joint representation. Let’s store it along with cell annotation at the global rather than assay level in mdata

[4]:
mdata.obsm['X_dr'] = mdata.obsm['X_umap'] = np.vstack([adata.obsm['X_umap'] for adata in mdata.mod.values()])
mdata.obsm['X_pca_transp_emd'] = np.vstack([adata.obsm['X_pca_transp_emd'] for adata in mdata.mod.values()])
mdata.obs['leiden_identity_simple2'] = np.hstack([adata.obs['leiden_identity_simple2'] for adata in mdata.mod.values()])

If your data contains matched cells, you have more choice and can learn the trajectory based on: - some RNA representation - some ATAC representation - some joint RNA+ATAC representation obtained using your favorite method e.g.: - MOFA (mu.tl.mofa(mdata)) - J-UMAP https://github.com/canzarlab/JVis-learn - UMAP tutorial for multiple views https://umap-learn.readthedocs.io/en/latest/composing_models.html

If your data contains non-matched cells then assays can be integrated into a joint representation. Some suggestions of methods to obtain it: - Harmony, Scanorama (scanpy.pp.external) - Optimal transport https://pythonot.github.io/ - Joint MDS https://github.com/BorgwardtLab/JointMDS - MultiMAP https://github.com/Teichlab/MultiMAP

Let’s work with a subsample (just to speed up trajectory inference, it’s also fine to use mdata directly)

[5]:
idx = np.random.RandomState(0).choice(mdata.shape[0],5000,replace=False)

a_s = sc.AnnData(np.zeros((5000,1)))
a_s.obsm['X_pca_transp_emd'] = mdata.obsm['X_pca_transp_emd'][idx]
a_s.obsm['X_dr'] = a_s.obsm['X_umap'] = mdata.obsm['X_umap'][idx]
a_s.obs['leiden_identity_simple2'] = mdata.obs['leiden_identity_simple2'][idx].values
/tmp/ipykernel_2000/1930611853.py:3: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.
  a_s = sc.AnnData(np.zeros((5000,1)))

Trajectory inference in 2d

We will first illustrate how to learn a trajectory in 2D before illustrating higher dimensions.

We seed graph with a minimum spanning tree. Forbid the tree to form paths connecting Neu-like to DC-like monocytes

[12]:
st2.tl.seed_graph(a_s,n_clusters=80, label_strength=2.5,
                  paths_disfavored=[['Neu-like monocytes','DC-like monocytes']],label='leiden_identity_simple2')
st2.pl.graph(a_s,color=['leiden_identity_simple2'],alpha=.5,size=10,fig_size=(8,5),key='seed_epg')
Seeding initial graph...
Clustering...
K-Means clustering ...
Calculating minimum spanning tree...
/mnt/c/Users/jobac/Desktop/all/git/STREAM2/stream2/tools/_elpigraph.py:865: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  modes[i] = scipy.stats.mode(labels[part == i]).mode[0]
/home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages/scipy/stats/_stats_py.py:112: RuntimeWarning: The input array could not be properly checked for nan values. nan values will be ignored.
  warnings.warn("The input array could not be properly "
_images/multiomics_17_2.png

We refine the graph.

We speed up computations by limiting the max number of candidate graphs considered when fitting.

We emphasize less dense regions with slightly higher point weights

[13]:
st2.tl.get_weights(a_s,bandwidth=2)
st2.tl.learn_graph(a_s,n_nodes=95,epg_lambda=0.02,epg_mu=0.05,epg_alpha=0.02,use_weights=True,verbose=1,
                   max_candidates = {"AddNode2Node": 10,"BisectEdge": 10,"ShrinkEdge": 0,})
st2.pl.graph(a_s,color=['leiden_identity_simple2'],alpha=.5,size=10,fig_size=(10,6))
Constructing tree 1 of 1 / Subset 1 of 1
Computing EPG with  95  nodes on  5000  points and  2  dimensions

BARCODE ENERGY  NNODES  NEDGES  NRIBS   NSTARS  NRAYS   NRAYS2  MSE     MSEP    FVE     FVEP    UE      UR      URN     URN2    URSD

1|13||81        1.8126  81      80      50      13      0       0       0.504   0.454   0.9915  0.9924  1.2937  0.0148  1.1968  96.9448 0
1|12||82        1.7496  82      81      53      12      0       0       0.5022  0.4546  0.9916  0.9924  1.2332  0.0143  1.1693  95.8848 0
1|11||83        1.6986  83      82      56      11      0       0       0.5013  0.4559  0.9916  0.9923  1.1829  0.0144  1.1959  99.2564 0
12||84  1.6596  84      83      58      12      0       0       0.492   0.4479  0.9917  0.9925  1.1546  0.0131  1.0982  92.2525 0
11||85  1.6202  85      84      61      11      0       0       0.4905  0.4492  0.9918  0.9925  1.1164  0.0133  1.1298  96.0343 0
10||86  1.5849  86      85      64      10      0       0       0.4853  0.4457  0.9918  0.9925  1.0868  0.0128  1.1006  94.6513 0
9||87   1.5572  87      86      67      9       0       0       0.4834  0.4453  0.9919  0.9925  1.0609  0.0129  1.1238  97.7669 0
9||88   1.5259  88      87      68      9       0       0       0.4748  0.438   0.992   0.9926  1.0391  0.0119  1.046   92.046  0
8||89   1.4977  89      88      71      8       0       0       0.4675  0.432   0.9921  0.9927  1.019   0.0112  0.9961  88.656  0
8||90   1.4824  90      89      72      8       0       0       0.4763  0.4411  0.992   0.9926  0.9918  0.0143  1.2884  115.9605        0
7||91   1.4582  91      90      75      7       0       0       0.4765  0.4426  0.992   0.9926  0.9693  0.0124  1.1254  102.415 0
7||92   1.4527  92      91      76      7       0       0       0.4855  0.453   0.9918  0.9924  0.9506  0.0166  1.5287  140.639 0
7||93   1.4382  93      92      77      7       0       0       0.4822  0.4507  0.9919  0.9924  0.9394  0.0166  1.5483  143.9908        0
7||94   1.4305  94      93      78      7       0       0       0.4874  0.456   0.9918  0.9923  0.9256  0.0175  1.6493  155.031 0
7||95   1.417   95      94      79      7       0       0       0.4883  0.4581  0.9918  0.9923  0.9113  0.0173  1.6421  156.0022        0
36.4012  seconds elapsed
_images/multiomics_19_1.png

Visualize node labels

[10]:
st2.pl.graph(a_s,color=['leiden_identity_simple2'],alpha=.1,size=10,fig_size=(10,6),show_text=1)
_images/multiomics_21_0.png

Fine tune the graph by: - adding extra plausible paths - refitting the graph while forcing some nodes to remain fixed at chosen positions - extending leaves so branches reach the end of the distribution

[11]:
a_s2 = a_s.copy()

st2.tl.add_path(a_s2,38,41,refit_graph=True,use_weights=True)

st2.tl.refit_graph(a_s2,shift_nodes_pos={
    3 : a_s2.uns['epg']['node_pos'][3] + np.array([+2, +0]),
    24 : a_s2.uns['epg']['node_pos'][24] + np.array([-.6, +.2]),
    25 : a_s2.uns['epg']['node_pos'][25] + np.array([+0., +.2]),
    8  : a_s2.uns['epg']['node_pos'][8] + np.array([-1,  +.75]),
    44 : a_s2.uns['epg']['node_pos'][44] + np.array([-.8,+.8]),
    }
)

st2.tl.add_path(a_s2,2,32,epg_lambda=.2)
st2.tl.add_path(a_s2,30,35,epg_lambda=.2,epg_mu=.2)
st2.tl.add_path(a_s2,99,[2.,9.],epg_lambda=.2,epg_mu=.2)
st2.tl.extend_leaves(a_s2,Mode="QuantDists",ControlPar=.75)
st2.pl.graph(a_s2,color=['leiden_identity_simple2'],alpha=.5,size=10,fig_size=(8,5))
Performing simulated annealing. This may take a while
Performing simulated annealing. This may take a while
Performing simulated annealing. This may take a while
Performing simulated annealing. This may take a while
Performing simulated annealing. This may take a while
Performing simulated annealing. This may take a while
Performing simulated annealing. This may take a while
(7, 2)
(112, 2)
_images/multiomics_23_1.png

Store graph computed on the subsample back to mdata with the full dataset

[16]:
mdata.uns = a_s2.uns.copy()
# this recomputes the projection of cells onto the graph
st2.tl._elpigraph._store_graph_attributes(mdata,mdata.obsm['X_dr'],'epg')

Infer pseudotime for a given root node

[17]:
st2.tl.infer_pseudotime(mdata,source=2)
st2.pl.graph(mdata,color=['epg_pseudotime'],size=.1,alpha=.3)
_images/multiomics_27_0.png

Infer pseudotime for a given path

[18]:
st2.tl.infer_pseudotime(mdata,source=66,target=1)
st2.pl.graph(mdata,color=['epg_pseudotime'],size=.1,alpha=.3)
_images/multiomics_29_0.png

Use nodes_to_include to take a longer path rather than the shortest (default)

[19]:
st2.tl.infer_pseudotime(mdata,source=66,target=1,nodes_to_include=[26])
st2.pl.graph(mdata,color=['epg_pseudotime'],size=.1,alpha=.3)
_images/multiomics_31_0.png

Let’s investigate Erythroids. We compute transition markers (that go up or down along the path) for each assay separately

method = ‘spearman’ for spearman correlation

method = ‘xi’ for non-linear XI correlation, returns values between 0-1

[20]:
source, target = 24, 115

st2.tl.infer_pseudotime(mdata,source=24,target=115)
st2.pl.graph(mdata,color=['epg_pseudotime'],size=.1,alpha=.3)

transition_markers = {}
for assay,adata in mdata.mod.items():
    adata.uns['epg']=mdata.uns['epg']
    adata.obs = adata.obs.join(mdata.obs[[k for k in mdata.obs if 'epg' in k]])
    st2.tl.infer_pseudotime(adata,source=24,target=115)
    st2.tl.detect_transition_markers(adata,fc_cutoff=0., method='spearman')

    transition_markers[assay] = adata.uns['transition_markers'][f'Path_{source}-None-{target}']
/home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages/anndata/compat/_overloaded_dict.py:106: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
  self.data[key] = value
2905 Cells are selected for Path_Source_Nodes-to-include_Target :  Path_24-None-115
Filtering out markers that are expressed in less than 5 cells ...
200 markers are being scanned ...
/home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages/anndata/compat/_overloaded_dict.py:106: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
  self.data[key] = value
3508 Cells are selected for Path_Source_Nodes-to-include_Target :  Path_24-None-115
Filtering out markers that are expressed in less than 5 cells ...
200 markers are being scanned ...
3993 Cells are selected for Path_Source_Nodes-to-include_Target :  Path_24-None-115
Filtering out markers that are expressed in less than 5 cells ...
200 markers are being scanned ...
9514 Cells are selected for Path_Source_Nodes-to-include_Target :  Path_24-None-115
Filtering out markers that are expressed in less than 5 cells ...
200 markers are being scanned ...
_images/multiomics_33_4.png

Plot results

[21]:
def plot_top_markers(transition_markers,sort_by = 'logfc',n_genes = 25):

    f,axs=plt.subplots(2,2,figsize=(10,10),sharex=True)
    axs=axs.flat
    for i,n in zip(range(len(transition_markers)),transition_markers.keys()):

        # get data
        df = transition_markers[n][sort_by].sort_values()[::-1][:n_genes]
        gene_names, scores = df.index, df.values

        # plot
        ymax, ymin = np.max(scores), np.min(scores) - 0.05*np.min(scores)
        ymax += 0.3 * (ymax - ymin)

        ax=next(axs)
        ax.set_ylim(ymin, ymax)
        ax.set_xlim(-0.9, n_genes - 0.1)
        ax.set_title(n,fontsize=25)
        if i>1: ax.set_xlabel('ranking',fontsize=20)
        if i==0 or i==2: ax.set_ylabel(sort_by,fontsize=20)

        # Making labels
        for ig, gene_name in enumerate(gene_names):
            ax.text(ig,scores[ig],gene_name,rotation='vertical',verticalalignment='bottom',horizontalalignment='center',fontsize=10,)
    plt.tight_layout(); plt.show()


plot_top_markers(transition_markers, sort_by = 'stat',n_genes = 25,)

df = pd.concat(transition_markers).reset_index().rename(columns={'level_0':'assay_detailed','level_1':'var_names'})
df.hvplot.scatter("stat", "logfc", flip_yaxis=False, logy=False, by='assay_detailed',hover_cols=["var_names"],height=500,width=800)
_images/multiomics_35_0.png
[21]:

Let’s check some of those markers found in several assays

[22]:
def scatterplot_assays(color):
    f,axs=plt.subplots(2,2,figsize=(10,10),sharex=True)
    axs=axs.flat
    for assay,adata in mdata.mod.items():
        ax=next(axs)
        sc.pl.umap(adata,color=color,ax=ax,show=False)
        ax.set_title(assay)
    plt.show()

scatterplot_assays('HBB')
_images/multiomics_37_0.png

We can try to correlate assays’ variables directly.

In our case cells are not matched across assays; this means they will have different pseudotime values

To deal with this a simple approach is to interpolate data and resample at the same pseudotime values. If your cells are matched you don’t need this step

[23]:
import scipy

interpolated = {}
for assay, adata in mdata.mod.items():
    #arrays need to be dense for this step
    if scipy.sparse.issparse(adata.X):
        adata.X=adata.X.toarray()
    interpolated[assay] = st2.tl.interpolate(adata,t_len=200,method='knn',frac=.1,n_neighbors='auto',weights='uniform')[1]

Now that we have data with the same pseudotime values we can correlate variables with same name across assays

[24]:
correlations = {}
modalities = list(mdata.mod.keys())

for i in range(mdata.n_mod-1):
    for j in range(i+1,mdata.n_mod):

        correlations[modalities[i],modalities[j]] = st2.tl.spearman_columns(interpolated[modalities[i]],
                                                                            interpolated[modalities[j]])
                                                   #or st2.tl.xicorr_columns()[0]

correlations = pd.DataFrame(correlations)
correlations.index = mdata[modalities[0]].var_names
correlations.columns = correlations.columns.to_flat_index()
sns.clustermap(correlations.dropna(),vmin=-1,vmax=1)
[24]:
<seaborn.matrix.ClusterGrid at 0x7f6142416970>
_images/multiomics_41_1.png

If variables were different across assays we could correlate all with all instead

[25]:
correlations_all = {}
modalities = list(mdata.mod.keys())
for i in range(mdata.n_mod-1):
    for j in range(i+1,mdata.n_mod):
        correlations_all[modalities[i],modalities[j]] = st2.tl.spearman_pairwise(interpolated[modalities[i]],
                                                                                 interpolated[modalities[j]])
                                                        #or st2.tl.xicorr_pairwise()[0]


mi,mj = 'RNA_CD','RNA_days'
corrs_all = pd.DataFrame(correlations_all[mi,mj])
corrs_all.index, corrs_all.columns = mdata[mi].var_names, mdata[mj].var_names
corrs_all[corrs_all.isna()]=0.
sns.clustermap(corrs_all,vmin=-1,vmax=1)
/home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
[25]:
<seaborn.matrix.ClusterGrid at 0x7f6142c8e160>
_images/multiomics_43_2.png

Let’s visualize results as scatter plots.

e.g., we can plot spearman correlation against pseudotime for two assays and color by their correlation.

[26]:
def plot_two_assays(transition_markers,mod1, mod2, plot_var='stat',color='dodgerblue',cmap='Viridis'):
    selected = [mod1,mod2]
    selected_dfs = {k:v for k,v in transition_markers.items() if k in selected}
    df = pd.concat(selected_dfs,axis=1,keys=selected).dropna()
    df.columns = df.columns.to_flat_index().map(lambda s:' '.join(s))

    if type(color)!=str:
        df = df.join(color)
        color, clabel = df[color.name], color.name
    else:
        clabel = color

    df.index.name='var_name'
    return df.hvplot.scatter(f'{mod1} {plot_var}',f'{mod2} {plot_var}', hover_cols=["var_name"],
                             color=color,cmap=cmap,clabel=clabel, hover=True,
                             xlim=(-1.05,1.05),ylim=(-1.05,1.05),height=500,width=600)

def compose_fig(plots,ncols=2):
    figure = '('+ '+'.join([f'plots[{i}]' for i in range(len(plots))]) + f').cols({ncols})'
    return eval(figure)

plots = []
for i in range(mdata.n_mod-1):
    for j in range(i+1,mdata.n_mod):
        color = correlations[modalities[i],modalities[j]].rename('correlation '+modalities[i]+' vs '+modalities[j])
        plots.append( plot_two_assays(transition_markers,modalities[i],modalities[j], plot_var='stat', color=color) )

compose_fig(plots,2)
[26]:

Trajectory inference in higher dimension

We now show how the trajectory can be learnt based on the higher dimensional PCA data instead

[27]:
a_s.obsm['X_dr'] = a_s.obsm['X_pca_transp_emd']
st2.tl.seed_graph(a_s,n_clusters=80,label_strength=2.5,paths_disfavored=[['Neu-like monocytes','DC-like monocytes']],label='leiden_identity_simple2')
st2.pl.graph(a_s,color=['leiden_identity_simple2'],alpha=.5,size=10,fig_size=(8,5),key='seed_epg')
Seeding initial graph...
Clustering...
K-Means clustering ...
Calculating minimum spanning tree...
/mnt/c/Users/jobac/Desktop/all/git/STREAM2/stream2/tools/_elpigraph.py:865: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  modes[i] = scipy.stats.mode(labels[part == i]).mode[0]
/home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages/scipy/stats/_stats_py.py:112: RuntimeWarning: The input array could not be properly checked for nan values. nan values will be ignored.
  warnings.warn("The input array could not be properly "
_images/multiomics_48_2.png

To make sense of how the graph fits to data, it can be useful to roughly represent its position on the 2D umap rather than PC axes.

We achieve this by projecting data points onto their closest node, and defining 2D node positions as the centroid of their associated points

[28]:
st2.tl.project_graph(a_s, to_basis='X_umap', key='seed_epg')
st2.pl.graph(a_s,color=['leiden_identity_simple2'],alpha=.5,size=10,fig_size=(8,5),key='seed_epg_from_X_dr_to_X_umap')
_images/multiomics_50_0.png
[29]:
st2.tl.learn_graph(a_s,n_nodes=100,epg_lambda=0.02,epg_mu=0.05,epg_alpha=0.02,verbose=1,store_evolution=True)
                   #max_candidates = {"AddNode2Node": 10,"BisectEdge": 10,"ShrinkEdge": 0,})
st2.pl.graph(a_s,color=['leiden_identity_simple2'],alpha=.5,size=10,fig_size=(8,5))
Constructing tree 1 of 1 / Subset 1 of 1
Computing EPG with  100  nodes on  5000  points and  14  dimensions

BARCODE ENERGY  NNODES  NEDGES  NRIBS   NSTARS  NRAYS   NRAYS2  MSE     MSEP    FVE     FVEP    UE      UR      URN     URN2    URSD

4|17||81        8.6429  81      80      33      17      0       0       6.072   5.9942  0.8758  0.8774  2.5113  0.0597  4.8363  391.7442        0
3|18||82        8.5494  82      81      35      18      0       0       6.0528  5.9804  0.8762  0.8777  2.4401  0.0565  4.637   380.2305        0
2|19||83        8.451   83      82      37      19      0       0       6.0332  5.9652  0.8766  0.878   2.3627  0.0551  4.5701  379.3221        0
1|20||84        8.3884  84      83      39      20      0       0       6.0106  5.9425  0.8771  0.8785  2.321   0.0568  4.7691  400.6011        0
1|19||85        8.3177  85      84      42      19      0       0       5.9905  5.9232  0.8775  0.8788  2.2717  0.0555  4.7141  400.6955        0
20||86  8.2488  86      85      44      20      0       0       5.9737  5.908   0.8778  0.8792  2.2199  0.0552  4.7432  407.9185        0
19||87  8.1799  87      86      47      19      0       0       5.9594  5.8963  0.8781  0.8794  2.1685  0.052   4.5215  393.3734        0
18||88  8.1456  88      87      50      18      0       0       5.9684  5.905   0.8779  0.8792  2.1217  0.0555  4.8835  429.7443        0
17||89  8.0872  89      88      53      17      0       0       5.9558  5.8941  0.8782  0.8794  2.0789  0.0524  4.6658  415.2524        0
16||90  8.0388  90      89      56      16      0       0       5.9549  5.8942  0.8782  0.8794  2.0317  0.0522  4.6985  422.8661        0
15||91  8.0027  91      90      59      15      0       0       5.9558  5.8958  0.8782  0.8794  1.9936  0.0533  4.8512  441.456 0
14||92  7.9597  92      91      62      14      0       0       5.9475  5.8889  0.8784  0.8796  1.9568  0.0553  5.0916  468.43  0
13||93  7.9538  93      92      65      13      0       0       5.9667  5.9098  0.878   0.8791  1.9219  0.0651  6.0563  563.238 0
13||94  7.918   94      93      66      13      0       0       5.9639  5.9085  0.878   0.8791  1.8932  0.0608  5.7199  537.666 0
12||95  7.8795  95      94      69      12      0       0       5.9659  5.9112  0.878   0.8791  1.8541  0.0595  5.6491  536.6651        0
12||96  7.8339  96      95      70      12      0       0       5.9546  5.9009  0.8782  0.8793  1.8237  0.0556  5.3356  512.2143        0
11||97  7.8004  97      96      73      11      0       0       5.9618  5.9089  0.8781  0.8791  1.7868  0.0519  5.0353  488.4195        0
11||98  7.7592  98      97      74      11      0       0       5.937   5.8848  0.8786  0.8796  1.7718  0.0504  4.9379  483.9173        0
11||99  7.73    99      98      75      11      0       0       5.9231  5.8732  0.8789  0.8799  1.7558  0.0511  5.0544  500.3827        0
11||100 7.7023  100     99      76      11      0       0       5.9142  5.8656  0.879   0.88    1.7378  0.0503  5.0299  502.9927        0
127.9997  seconds elapsed
_images/multiomics_51_1.png
[30]:
st2.tl.project_graph(a_s, to_basis='X_umap', key='epg')
st2.pl.graph(a_s,color=['leiden_identity_simple2'],alpha=.5,size=10,fig_size=(8,5),key='epg_from_X_dr_to_X_umap')
_images/multiomics_52_0.png

We can fine-tune the graph structure to: - add putative path from mixed progenitors to plasmacytoid DC - better fit paths for monocyte subtypes which are hard to capture - extend leaves of the graph to the very endpoints of the data distribution

[31]:
a_s2=a_s.copy()
st2.tl.del_path(a_s2,48,8)
st2.tl.add_path(a_s2,45,29)
st2.tl.add_path(a_s2,39,47,n_nodes=4)
st2.tl.add_path(a_s2,52,68)
st2.tl.del_path(a_s2,20,36)
st2.tl.del_path(a_s2,39,6)
st2.tl.del_path(a_s2,98,48)
st2.tl.del_path(a_s2,26,10)
st2.tl.del_path(a_s2,82,15)
st2.tl.add_path(a_s2,82,55)
st2.tl.refit_graph(a_s2,shift_nodes_pos={24 : a_s2.uns['epg']['node_pos'][24],# enforce same endpoint for monocytes
                                         26 : a_s2.uns['epg']['node_pos'][24],})
st2.tl.extend_leaves(a_s2,LeafIDs=np.array([18,12,98,45,48,24,50,32]),Mode="QuantDists",ControlPar=.7)
Performing simulated annealing. This may take a while
Performing simulated annealing. This may take a while
Performing simulated annealing. This may take a while
Performing simulated annealing. This may take a while
Performing simulated annealing. This may take a while
Performing simulated annealing. This may take a while
Performing simulated annealing. This may take a while
Performing simulated annealing. This may take a while
(8, 14)
(103, 14)
[32]:
st2.tl.project_graph(a_s2, to_basis='X_umap', key='epg')
st2.pl.graph(a_s2,color=['leiden_identity_simple2'],alpha=.5,size=10,fig_size=(16,10),show_text=1)
st2.pl.graph(a_s2,color=['leiden_identity_simple2'],alpha=.5,size=10,fig_size=(8,5),key='epg_from_X_dr_to_X_umap')
/home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages/elpigraph/utils.py:403: RuntimeWarning: invalid value encountered in divide
  proj = (np.dot(X_embed.T, R) / R.sum(axis=0)).T
_images/multiomics_55_1.png
_images/multiomics_55_2.png

Now that we have the final graph we can compute pseudotime for further analyses

[33]:
adata_integrated.uns=a_s2.uns.copy()
st2.tl._elpigraph._store_graph_attributes(adata_integrated,adata_integrated.obsm['X_pca_transp_emd'],'epg')
st2.tl.infer_pseudotime(adata_integrated,40)

We can obtain a nicer 2D representation of nodes, e.g. by recomputing a UMAP directly including them

[34]:
# dummy anndata
a=sc.AnnData(np.concatenate((adata_integrated.obsm['X_pca_transp_emd'],adata_integrated.uns['epg']['node_pos'])))
sc.pp.neighbors(a,n_neighbors=50,metric='cosine')
sc.tl.umap(a,min_dist=0.05)

# store results
adata_integrated.obsm['X_umap_w_nodes']=a.obsm['X_umap'][:len(adata_integrated)]
adata_integrated.uns['umap_nodes']=a.obsm['X_umap'][len(adata_integrated):]

# plot
ax=plt.subplot()
_=sc.pl.embedding(adata_integrated,'X_umap_w_nodes',size=3,alpha=.5,
                  color='leiden_identity_simple2',show=False,return_fig=False,ax=ax)
ax.set_title(None)
ax.plot(adata_integrated.uns['umap_nodes'][adata_integrated.uns['epg']['edge'].T,0],
        adata_integrated.uns['umap_nodes'][adata_integrated.uns['epg']['edge'].T,1], c='k',linewidth=2)
plt.show()
/tmp/ipykernel_2000/3214916952.py:2: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.
  a=sc.AnnData(np.concatenate((adata_integrated.obsm['X_pca_transp_emd'],adata_integrated.uns['epg']['node_pos'])))
/home/jo/anaconda3/envs/stream2/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
_images/multiomics_59_1.png

We can also plot various combination of PCs to get a better high-dimensional picture

[35]:
n_comps=6
f,axs=plt.subplots(n_comps,n_comps,figsize=(32,32),sharex='col',sharey='row')
for i in range(n_comps):
    for j in range(i+1,n_comps):

        _=sc.pl.embedding(adata_integrated,'X_pca_transp_emd',dimensions=(i,j),size=3,alpha=.5,
                        color='leiden_identity_simple2',show=False,return_fig=False,ax=axs[j,i])

        handles, labels = axs[j,i].get_legend_handles_labels()
        axs[j,i].get_legend().remove()
        axs[j,i].set_title(None)
        axs[j,i].plot(adata_integrated.uns['epg']['node_pos'][adata_integrated.uns['epg']['edge'].T,i],
                      adata_integrated.uns['epg']['node_pos'][adata_integrated.uns['epg']['edge'].T,j],
                      c='k',linewidth=3)

        if i==0:
            axs[j,i].set_ylabel(f'PC{j+1}',fontsize=20)
        else:
            axs[j,i].set_ylabel(None)
        if j==(n_comps-1):
            axs[j,i].set_xlabel(f'PC{i+1}',fontsize=20)
        else:
            axs[j,i].set_xlabel(None)

        if i<j:
            axs[i,i].axis('off')
            axs[i,j].axis('off')
axs[-1,-1].axis('off')
lgnd=f.legend(handles, labels, loc='center right',bbox_to_anchor=[.8, 0.5],prop={'size': 30})
for handle in lgnd.legendHandles:
    handle.set_sizes([100.0])
plt.tight_layout(pad=0,w_pad=0)
plt.show()
/tmp/ipykernel_2000/2761055043.py:30: MatplotlibDeprecationWarning: The legendHandles attribute was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use legend_handles instead.
  for handle in lgnd.legendHandles:
_images/multiomics_61_1.png

Finally we can produce STREAM plots.

[36]:
st2.pl.stream_sc(adata_integrated,40,dist_scale=.25,preference=[64],color=['leiden_identity_simple2'],fig_size=(14,8),size=1)
st2.pl.stream(adata_integrated,40,preference=[64],color=['leiden_identity_simple2'],fig_size=(14,8),dist_scale=2,factor_min_win=1.3,factor_num_win=4)
st2.pl.stream(adata_integrated,40,preference=[64],color=['epg_pseudotime'],fig_size=(14,8),dist_scale=2,factor_min_win=1.3,factor_num_win=4)
_images/multiomics_63_0.png
_images/multiomics_63_1.png
_images/multiomics_63_2.png
[ ]: