Source code for sincei.multimodalClustering

import umap
import pandas as pd
import scanpy as sc
from scanpy.neighbors import (
    _compute_connectivities_umap,
    _get_indices_distances_from_dense_matrix,
)
from scanpy._utils import get_igraph_from_adjacency
import igraph as ig
import leidenalg as la
from scipy import sparse

import sys

# own modules
from sincei.TopicModels import TOPICMODEL
from sincei._deprecated import cluster_LSA

# umap.__version__ : should be >= 0.5.1

## literature on multi-graph clustering
"""
https://github.com/crisbodnar/regularised-spectral-clustering
https://arxiv.org/pdf/2103.16534.pdf
https://arxiv.org/pdf/2010.12301.pdf
https://arxiv.org/pdf/2010.15456.pdf
https://chenannie45.github.io/SDM18_MANE.pdf
https://www.researchgate.net/post/Python_or_R_packages_for_multilayer_network
http://www.mkivela.com/pymnet/visualizing.html#visualization-tutorial
https://cran.r-project.org/web/packages/multinet/multinet.pdf
https://sciendo.com/pdf/10.2478/amcs-2019-0010
https://web.media.mit.edu/~xdong/paper/tsp14.pdf
"""


[docs]def multiModal_clustering(mode1_adata, mode2_adata, column_key="barcode_nla", nK=20): r""" Performs multi-graph clustering on matched keys(barcodes) bw two anndata objects. Parameters ---------- mode1_adata : AnnData AnnData object for mode 1 mode2_adata : AnnData AnnData object for mode 2 column_key : str Column name for the barcode in mode 1 and 2 nK : int Number of clusters to use for clustering Returns ------- multi_umap : DataFrame DataFrame with UMAP coordinates and cluster labels mode1_adata : AnnData AnnData object for mode 1 mode2_adata : AnnData AnnData object for mode 2 """ # subset RNA anndata mode1_adata = mode1_adata[[i for i, v in enumerate(mode1_adata.obs[column_key]) if v in mode2_adata.obs.index]] # re-compute distances/clusters/umap sc.pp.neighbors(mode1_adata, n_neighbors=nK) sc.tl.louvain(mode1_adata) sc.tl.umap(mode1_adata, min_dist=0.5, spread=5, init_pos="random", random_state=42) # get graph G_rna = get_igraph_from_adjacency(mode1_adata.obsp["connectivities"], directed=True) # subset chic anndata # mode2_adata=mode2_adata[[i for i,v in enumerate(mode2_adata.obs_names.to_list()) if v in set(mode1_adata.obs[column_key])]] mode2_adata = mode2_adata[mode1_adata.obs[column_key].tolist()] # re-do LSA mtx = sparse.csr_matrix(mode2_adata.X.transpose()) dat = TOPICMODEL( mtx, list(mode2_adata.obs.index), list(mode2_adata.var.index), nTopics=20, smartCode="lfu", ) dat.runLSA() cell_topic = dat.get_cell_topic() # get graph chic_umap, G_chic = cluster_LSA(cell_topic, modularityAlg="leiden", resolution=1, nk=nK) mode2_adata.obsm["X_pca"] = cell_topic mode2_adata.obsm["X_umap"] = chic_umap # leiden multi-layer clustering optimiser = la.Optimiser() part_rna = la.ModularityVertexPartition(G_rna) part_chic = la.ModularityVertexPartition(G_chic) # # part_rna = la.CPMVertexPartition(G_rna, resolution=res_layer1) # part_chic = la.CPMVertexPartition(G_chic, resolution=res_layer2) optimiser.optimise_partition_multiplex([part_rna, part_chic], layer_weights=[2, 1], n_iterations=-1) print("Detected clusters: ", set(part_chic.membership)) chic_umap["cluster_multi"] = part_chic.membership # merge RNA and ChIC UMAPs rna_umap = pd.DataFrame(mode1_adata.obsm["X_umap"], columns=["RNA_UMAP1", "RNA_UMAP2"]) rna_umap.index = mode1_adata.obs.index rna_umap["cluster_mode1"] = mode1_adata.obs.louvain rna_umap[column_key] = mode1_adata.obs[column_key] multi_umap = rna_umap.merge(chic_umap, left_index=False, right_index=True, left_on=column_key) multi_umap["cluster_mode1"] = [int(x) for x in multi_umap["cluster_mode1"].to_list()] return multi_umap, mode1_adata, mode2_adata
## run aligned UMAPs between two dfs with PCs
[docs]def umap_aligned(pca_mode1, pca_mode2, nK=15, distance_metric="eucledian"): r""" Aligns two UMAP embeddings using the UMAP AlignedUMAP class Parameters ---------- pca_mode1 : pandas.DataFrame UMAP embedding of RNA data pca_mode2 : pandas.DataFrame UMAP embedding of CHiC data nK : int Number of nearest neighbors to use for UMAP distance_metric : str Distance metric to use for UMAP Returns ------- pandas.DataFrame Aligned UMAP embedding of RNA data pandas.DataFrame Aligned UMAP embedding of CHiC data Examples -------- >>> test = Tester() >>> pca_mode1 = test.pca_mode1 >>> pca_mode2 = test.pca_mode2 >>> umap_aligned(pca_mode1, pca_mode2) ( aligned_UMAP1 aligned_UMAP2 0 -0.012995 0.001206 1 0. """ pca_mode1 = pca_mode1.loc[pca_mode2.index] keys = [x for x in range(len(pca_mode1.index))] values = [] for x in pca_mode1.index: values.append([i for i, v in enumerate(pca_mode2.index) if v == x]) values = [x[0] for x in values] relation_dict = {i: v for i, v in zip(keys, values)} chic_idx = pca_mode2.index rna_idx = pca_mode1.index pca_mode2 = pca_mode2.reset_index().drop("index", axis=1) pca_mode1 = pca_mode1.reset_index().drop("index", axis=1) # UMAP UA = umap.AlignedUMAP( spread=10, min_dist=0.01, n_neighbors=nK, metric=distance_metric, init="random", random_state=42, ) aligned_umap = UA.fit([pca_mode1, pca_mode2], relations=[relation_dict]) mode1_umap_aligned = pd.DataFrame(aligned_umap.embeddings_[0]) mode2_umap_aligned = pd.DataFrame(aligned_umap.embeddings_[1]) mode2_umap_aligned.index = chic_idx mode1_umap_aligned.index = rna_idx mode2_umap_aligned.columns = ["aligned_UMAP1", "aligned_UMAP2"] mode1_umap_aligned.columns = ["aligned_UMAP1", "aligned_UMAP2"] return mode1_umap_aligned, mode2_umap_aligned