Source code for stream2.preprocessing._variable_genes

"""Preprocess."""

import numpy as np
from scipy.sparse import (
    csr_matrix,
)
from sklearn.utils import sparsefuncs
from skmisc.loess import loess


[docs] def select_variable_genes( adata, layer="raw", span=0.3, n_top_genes=2000, ): """Select highly variable genes. This function implenments the method 'vst' in Seurat v3. Inspired by Scanpy. Parameters ---------- Returns ------- """ if layer is None: X = adata.X else: X = adata.layers[layer].astype(np.float64).copy() mean, variance = sparsefuncs.mean_variance_axis(X, axis=0) variance_expected = np.zeros(adata.shape[1], dtype=np.float64) not_const = variance > 0 model = loess( np.log10(mean[not_const]), np.log10(variance[not_const]), span=span, degree=2, ) model.fit() variance_expected[not_const] = 10**model.outputs.fitted_values N = adata.shape[0] clip_max = np.sqrt(N) clip_val = np.sqrt(variance_expected) * clip_max + mean X = csr_matrix(X) mask = X.data > clip_val[X.indices] X.data[mask] = clip_val[X.indices[mask]] squared_X_sum = np.array(X.power(2).sum(axis=0)) X_sum = np.array(X.sum(axis=0)) norm_gene_var = (1 / ((N - 1) * variance_expected)) * ( (N * np.square(mean)) + squared_X_sum - 2 * X_sum * mean ) norm_gene_var = norm_gene_var.flatten() adata.var["variances_norm"] = norm_gene_var adata.var["variances"] = variance adata.var["means"] = mean ids_top = norm_gene_var.argsort()[-n_top_genes:][::-1] adata.var["highly_variable"] = np.isin(range(adata.shape[1]), ids_top) print(f"{n_top_genes} variable genes are selected.")