Source code for stream2.preprocessing._pca

"""Principal component analysis."""

import numpy as np
from sklearn.decomposition import TruncatedSVD
from ._utils import (
    locate_elbow,
)


[docs] def pca( adata, n_components=50, algorithm="randomized", n_iter=5, random_state=2021, tol=0.0, feature=None, **kwargs, ): """perform Principal Component Analysis (PCA) Parameters ---------- adata: AnnData Annotated data matrix. n_components: `int`, optional (default: 50) Desired dimensionality of output data algorithm: `str`, optional (default: 'randomized') SVD solver to use. Choose from {'arpack', 'randomized'}. n_iter: `int`, optional (default: '5') Number of iterations for randomized SVD solver. Not used by ARPACK. tol: `float`, optional (default: 0) Tolerance for ARPACK. 0 means machine precision. Ignored by randomized SVD solver. feature: `str`, optional (default: None) Feature used to perform PCA. The data type of `.var[feature]` needs to be `bool` If None, adata.X will be used. kwargs: Other keyword arguments are passed down to `TruncatedSVD()` Returns ------- updates `adata` with the following fields: `.obsm['X_pca']` : `array` PCA transformed X. `.uns['pca']['PCs']` : `array` Principal components in feature space, representing the directions of maximum variance in the data. `.uns['pca']['variance']` : `array` The variance of the training samples transformed by a projection to each component. `.uns['pca']['variance_ratio']` : `array` Percentage of variance explained by each of the selected components. """ if feature is None: X = adata.X.copy() else: mask = adata.var[feature] X = adata[:, mask].X.copy() svd = TruncatedSVD( n_components=n_components, algorithm=algorithm, n_iter=n_iter, random_state=random_state, tol=tol, **kwargs, ) svd.fit(X) adata.obsm["X_pca"] = svd.transform(X) adata.uns["pca"] = dict() adata.uns["pca"]["n_pcs"] = n_components adata.uns["pca"]["PCs"] = svd.components_.T adata.uns["pca"]["variance"] = svd.explained_variance_ adata.uns["pca"]["variance_ratio"] = svd.explained_variance_ratio_
def select_pcs( adata, n_pcs=None, S=1, curve="convex", direction="decreasing", online=False, min_elbow=None, **kwargs, ): """select top PCs based on variance_ratio.""" if n_pcs is None: n_components = adata.obsm["X_pca"].shape[1] if min_elbow is None: min_elbow = n_components / 10 n_pcs = locate_elbow( range(n_components), adata.uns["pca"]["variance_ratio"], S=S, curve=curve, min_elbow=min_elbow, direction=direction, online=online, **kwargs, ) adata.uns["pca"]["n_pcs"] = n_pcs else: adata.uns["pca"]["n_pcs"] = n_pcs def select_pcs_features( adata, S=1, curve="convex", direction="decreasing", online=False, min_elbow=None, **kwargs, ): """select features that contribute to the top PCs. S : `float`, optional (default: 10) Sensitivity min_elbow: `int`, optional (default: 0) The minimum elbow location curve: `str`, optional (default: 'convex') Choose from {'convex','concave'} If 'concave', algorithm will detect knees, If 'convex', algorithm will detect elbows. direction: `str`, optional (default: 'decreasing') Choose from {'decreasing','increasing'} online: `bool`, optional (default: False) kneed will correct old knee points if True, kneed will return first knee if False. **kwargs: `dict`, optional Extra arguments to KneeLocator. """ n_pcs = adata.uns["pca"]["n_pcs"] n_features = adata.uns["pca"]["PCs"].shape[0] if min_elbow is None: min_elbow = n_features / 6 adata.uns["pca"]["features"] = dict() ids_features = list() for i in range(n_pcs): elbow = locate_elbow( range(n_features), np.sort( np.abs( adata.uns["pca"]["PCs"][:, i], ) )[::-1], S=S, min_elbow=min_elbow, curve=curve, direction=direction, online=online, **kwargs, ) ids_features_i = list( np.argsort(np.abs(adata.uns["pca"]["PCs"][:, i],))[ ::-1 ][:elbow] ) adata.uns["pca"]["features"][f"pc_{i}"] = ids_features_i ids_features = ids_features + ids_features_i print(f"#features selected from PC {i}: {len(ids_features_i)}") adata.var["top_pcs"] = False adata.var.loc[adata.var_names[np.unique(ids_features)], "top_pcs"] = True print(f'#features in total: {adata.var["top_pcs"].sum()}')