Source code for stream2.preprocessing._qc

"""Quality Control."""

import numpy as np
from scipy.sparse import (
    issparse,
    csr_matrix,
)
import re


def cal_qc(adata, expr_cutoff=1):
    """Calculate quality control metrics.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    expr_cutoff: `float`, optional (default: 1)
        Expression cutoff.
        If greater than expr_cutoff,the feature is considered 'expressed'
    assay: `str`, optional (default: 'rna')
            Choose from {'rna','atac'},case insensitive
    Returns
    -------
    updates `adata` with the following fields.
    n_counts: `pandas.Series` (`adata.var['n_counts']`,dtype `int`)
       The number of read count each gene has.
    n_cells: `pandas.Series` (`adata.var['n_cells']`,dtype `int`)
       The number of cells in which each gene is expressed.
    pct_cells: `pandas.Series` (`adata.var['pct_cells']`,dtype `float`)
       The percentage of cells in which each gene is expressed.
    n_counts: `pandas.Series` (`adata.obs['n_counts']`,dtype `int`)
       The number of read count each cell has.
    n_genes: `pandas.Series` (`adata.obs['n_genes']`,dtype `int`)
       The number of genes expressed in each cell.
    pct_genes: `pandas.Series` (`adata.obs['pct_genes']`,dtype `float`)
       The percentage of genes expressed in each cell.
    n_peaks: `pandas.Series` (`adata.obs['n_peaks']`,dtype `int`)
       The number of peaks expressed in each cell.
    pct_peaks: `pandas.Series` (`adata.obs['pct_peaks']`,dtype `int`)
       The percentage of peaks expressed in each cell.
    pct_mt: `pandas.Series` (`adata.obs['pct_mt']`,dtype `float`)
       the percentage of counts in mitochondrial genes
    """

    if not issparse(adata.X):
        adata.X = csr_matrix(adata.X)

    n_counts = adata.X.sum(axis=0).A1
    adata.var["n_counts"] = n_counts
    n_samples = (adata.X >= expr_cutoff).sum(axis=0).A1
    adata.var["n_samples"] = n_samples
    adata.var["pct_samples"] = n_samples / adata.shape[0]

    n_counts = adata.X.sum(axis=1).A1
    adata.obs["n_counts"] = n_counts
    n_features = (adata.X >= expr_cutoff).sum(axis=1).A1
    adata.obs["n_features"] = n_features
    adata.obs["pct_features"] = n_features / adata.shape[1]



[docs]
def cal_qc_rna(adata, expr_cutoff=1):
    """Calculate quality control metrics.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    expr_cutoff: `float`, optional (default: 1)
        Expression cutoff.
        If greater than expr_cutoff,the feature is considered 'expressed'
    assay: `str`, optional (default: 'rna')
            Choose from {'rna','atac'},case insensitive
    Returns
    -------
    updates `adata` with the following fields.
    n_counts: `pandas.Series` (`adata.var['n_counts']`,dtype `int`)
       The number of read count each gene has.
    n_cells: `pandas.Series` (`adata.var['n_cells']`,dtype `int`)
       The number of cells in which each gene is expressed.
    pct_cells: `pandas.Series` (`adata.var['pct_cells']`,dtype `float`)
       The percentage of cells in which each gene is expressed.
    n_counts: `pandas.Series` (`adata.obs['n_counts']`,dtype `int`)
       The number of read count each cell has.
    n_genes: `pandas.Series` (`adata.obs['n_genes']`,dtype `int`)
       The number of genes expressed in each cell.
    pct_genes: `pandas.Series` (`adata.obs['pct_genes']`,dtype `float`)
       The percentage of genes expressed in each cell.
    n_peaks: `pandas.Series` (`adata.obs['n_peaks']`,dtype `int`)
       The number of peaks expressed in each cell.
    pct_peaks: `pandas.Series` (`adata.obs['pct_peaks']`,dtype `int`)
       The percentage of peaks expressed in each cell.
    pct_mt: `pandas.Series` (`adata.obs['pct_mt']`,dtype `float`)
       the percentage of counts in mitochondrial genes
    """

    if not issparse(adata.X):
        adata.X = csr_matrix(adata.X)

    n_counts = adata.X.sum(axis=0).A1
    adata.var["n_counts"] = n_counts
    n_cells = (adata.X >= expr_cutoff).sum(axis=0).A1
    adata.var["n_cells"] = n_cells
    adata.var["pct_cells"] = n_cells / adata.shape[0]

    n_counts = adata.X.sum(axis=1).A1
    adata.obs["n_counts"] = n_counts
    n_features = (adata.X >= expr_cutoff).sum(axis=1).A1
    adata.obs["n_genes"] = n_features
    adata.obs["pct_genes"] = n_features / adata.shape[1]
    r = re.compile("^MT-", flags=re.IGNORECASE)
    mt_genes = list(filter(r.match, adata.var_names))
    if len(mt_genes) > 0:
        n_counts_mt = adata[:, mt_genes].X.sum(axis=1).A1
        adata.obs["pct_mt"] = n_counts_mt / n_counts
    else:
        adata.obs["pct_mt"] = 0



def cal_qc_atac(adata, expr_cutoff=1):
    """Calculate quality control metrics.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    expr_cutoff: `float`, optional (default: 1)
        Expression cutoff.
        If greater than expr_cutoff,the feature is considered 'expressed'
    assay: `str`, optional (default: 'rna')
            Choose from {'rna','atac'},case insensitive
    Returns
    -------
    updates `adata` with the following fields.
    n_counts: `pandas.Series` (`adata.var['n_counts']`,dtype `int`)
       The number of read count each gene has.
    n_cells: `pandas.Series` (`adata.var['n_cells']`,dtype `int`)
       The number of cells in which each gene is expressed.
    pct_cells: `pandas.Series` (`adata.var['pct_cells']`,dtype `float`)
       The percentage of cells in which each gene is expressed.
    n_counts: `pandas.Series` (`adata.obs['n_counts']`,dtype `int`)
       The number of read count each cell has.
    n_genes: `pandas.Series` (`adata.obs['n_genes']`,dtype `int`)
       The number of genes expressed in each cell.
    pct_genes: `pandas.Series` (`adata.obs['pct_genes']`,dtype `float`)
       The percentage of genes expressed in each cell.
    n_peaks: `pandas.Series` (`adata.obs['n_peaks']`,dtype `int`)
       The number of peaks expressed in each cell.
    pct_peaks: `pandas.Series` (`adata.obs['pct_peaks']`,dtype `int`)
       The percentage of peaks expressed in each cell.
    pct_mt: `pandas.Series` (`adata.obs['pct_mt']`,dtype `float`)
       the percentage of counts in mitochondrial genes
    """

    if not issparse(adata.X):
        adata.X = csr_matrix(adata.X)

    n_counts = adata.X.sum(axis=0).A1
    adata.var["n_counts"] = n_counts
    n_cells = (adata.X >= expr_cutoff).sum(axis=0).A1
    adata.var["n_cells"] = n_cells
    adata.var["pct_cells"] = n_cells / adata.shape[0]

    n_counts = adata.X.sum(axis=1).A1
    adata.obs["n_counts"] = n_counts
    n_features = (adata.X >= expr_cutoff).sum(axis=1).A1
    adata.obs["n_peaks"] = n_features
    adata.obs["pct_peaks"] = n_features / adata.shape[1]


def filter_samples(
    adata,
    min_n_features=1,
    max_n_features=None,
    min_pct_features=None,
    max_pct_features=None,
    min_n_counts=None,
    max_n_counts=None,
    expr_cutoff=1,
):
    """Filter out samples based on different metrics.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    min_n_features: `int`, optional (default: None)
        Minimum number of features expressed
    min_pct_features: `float`, optional (default: None)
        Minimum percentage of features expressed
    min_n_counts: `int`, optional (default: None)
        Minimum number of read count for one cell
    expr_cutoff: `float`, optional (default: 1)
        Expression cutoff.
        If greater than expr_cutoff,the gene is considered 'expressed'
    assay: `str`, optional (default: 'rna')
            Choose from {{'rna','atac'}},case insensitive
    Returns
    -------
    updates `adata` with a subset of cells that pass the filtering.
    updates `adata` with the following fields if cal_qc() was not performed.
    n_counts: `pandas.Series` (`adata.obs['n_counts']`,dtype `int`)
       The number of read count each cell has.
    n_genes: `pandas.Series` (`adata.obs['n_genes']`,dtype `int`)
       The number of genes expressed in each cell.
    pct_genes: `pandas.Series` (`adata.obs['pct_genes']`,dtype `float`)
       The percentage of genes expressed in each cell.
    n_peaks: `pandas.Series` (`adata.obs['n_peaks']`,dtype `int`)
       The number of peaks expressed in each cell.
    pct_peaks: `pandas.Series` (`adata.obs['pct_peaks']`,dtype `int`)
       The percentage of peaks expressed in each cell.
    """

    if not issparse(adata.X):
        adata.X = csr_matrix(adata.X)
    if "n_counts" in adata.obs_keys():
        n_counts = adata.obs["n_counts"]
    else:
        n_counts = np.sum(adata.X, axis=1).A
        adata.obs["n_counts"] = n_counts
    if "n_features" in adata.obs_keys():
        n_features = adata.obs["n_features"]
    else:
        n_features = np.sum(adata.X >= expr_cutoff, axis=1).A1
        adata.obs["n_features"] = n_features
    if "pct_features" in adata.obs_keys():
        pct_features = adata.obs["pct_features"]
    else:
        pct_features = n_features / adata.shape[1]
        adata.obs["pct_features"] = pct_features

    print("before filtering: ")
    print(f"{adata.shape[0]} samples, {adata.shape[1]} feature")
    if (
        sum(
            list(
                map(
                    lambda x: x is None,
                    [
                        min_n_features,
                        min_pct_features,
                        min_n_counts,
                        max_n_features,
                        max_pct_features,
                        max_n_counts,
                    ],
                )
            )
        )
        == 6
    ):
        print("No filtering")
    else:
        cell_subset = np.ones(len(adata.obs_names), dtype=bool)
        if min_n_features is not None:
            print("filter samples based on min_n_features")
            cell_subset = (n_features >= min_n_features) & cell_subset
        if max_n_features is not None:
            print("filter samples based on max_n_features")
            cell_subset = (n_features <= max_n_features) & cell_subset
        if min_pct_features is not None:
            print("filter samples based on min_pct_features")
            cell_subset = (pct_features >= min_pct_features) & cell_subset
        if max_pct_features is not None:
            print("filter samples based on max_pct_features")
            cell_subset = (pct_features <= max_pct_features) & cell_subset
        if min_n_counts is not None:
            print("filter samples based on min_n_counts")
            cell_subset = (n_counts >= min_n_counts) & cell_subset
        if max_n_counts is not None:
            print("filter samples based on max_n_counts")
            cell_subset = (n_counts <= max_n_counts) & cell_subset
        adata._inplace_subset_obs(cell_subset)
        print("after filtering out low-quality samples: ")
        print(f"{adata.shape[0]} samples, {adata.shape[1]} feature")
    return None


def filter_cells_rna(
    adata,
    min_n_genes=None,
    max_n_genes=None,
    min_pct_genes=None,
    max_pct_genes=None,
    min_n_counts=None,
    max_n_counts=None,
    expr_cutoff=1,
):
    """Filter out cells for RNA-seq based on different metrics.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    min_n_genes: `int`, optional (default: None)
        Minimum number of genes expressed
    min_pct_genes: `float`, optional (default: None)
        Minimum percentage of genes expressed
    min_n_counts: `int`, optional (default: None)
        Minimum number of read count for one cell
    expr_cutoff: `float`, optional (default: 1)
        Expression cutoff.
        If greater than expr_cutoff,the gene is considered 'expressed'
    assay: `str`, optional (default: 'rna')
            Choose from {{'rna','atac'}},case insensitive
    Returns
    -------
    updates `adata` with a subset of cells that pass the filtering.
    updates `adata` with the following fields if cal_qc() was not performed.
    n_counts: `pandas.Series` (`adata.obs['n_counts']`,dtype `int`)
       The number of read count each cell has.
    n_genes: `pandas.Series` (`adata.obs['n_genes']`,dtype `int`)
       The number of genes expressed in each cell.
    pct_genes: `pandas.Series` (`adata.obs['pct_genes']`,dtype `float`)
       The percentage of genes expressed in each cell.
    n_peaks: `pandas.Series` (`adata.obs['n_peaks']`,dtype `int`)
       The number of peaks expressed in each cell.
    pct_peaks: `pandas.Series` (`adata.obs['pct_peaks']`,dtype `int`)
       The percentage of peaks expressed in each cell.
    """

    if not issparse(adata.X):
        adata.X = csr_matrix(adata.X)
    if "n_counts" in adata.obs_keys():
        n_counts = adata.obs["n_counts"]
    else:
        n_counts = np.sum(adata.X, axis=1).A1
        adata.obs["n_counts"] = n_counts

    if "n_genes" in adata.obs_keys():
        n_genes = adata.obs["n_genes"]
    else:
        n_genes = np.sum(adata.X >= expr_cutoff, axis=1).A1
        adata.obs["n_genes"] = n_genes
    if "pct_genes" in adata.obs_keys():
        pct_genes = adata.obs["pct_genes"]
    else:
        pct_genes = n_genes / adata.shape[1]
        adata.obs["pct_genes"] = pct_genes

    print("before filtering: ")
    print(f"{adata.shape[0]} cells,  {adata.shape[1]} genes")
    if (
        sum(
            list(
                map(
                    lambda x: x is None,
                    [
                        min_n_genes,
                        min_pct_genes,
                        min_n_counts,
                        max_n_genes,
                        max_pct_genes,
                        max_n_counts,
                    ],
                )
            )
        )
        == 6
    ):
        print("No filtering")
    else:
        cell_subset = np.ones(len(adata.obs_names), dtype=bool)
        if min_n_genes is not None:
            print("filter cells based on min_n_genes")
            cell_subset = (n_genes >= min_n_genes) & cell_subset
        if max_n_genes is not None:
            print("filter cells based on max_n_genes")
            cell_subset = (n_genes <= max_n_genes) & cell_subset
        if min_pct_genes is not None:
            print("filter cells based on min_pct_genes")
            cell_subset = (pct_genes >= min_pct_genes) & cell_subset
        if max_pct_genes is not None:
            print("filter cells based on max_pct_genes")
            cell_subset = (pct_genes <= max_pct_genes) & cell_subset
        if min_n_counts is not None:
            print("filter cells based on min_n_counts")
            cell_subset = (n_counts >= min_n_counts) & cell_subset
        if max_n_counts is not None:
            print("filter cells based on max_n_counts")
            cell_subset = (n_counts <= max_n_counts) & cell_subset
        adata._inplace_subset_obs(cell_subset)
        print("after filtering out low-quality cells: ")
        print(f"{adata.shape[0]} cells,  {adata.shape[1]} genes")
    return None


def filter_cells_atac(
    adata,
    min_n_peaks=None,
    max_n_peaks=None,
    min_pct_peaks=None,
    max_pct_peaks=None,
    min_n_counts=None,
    max_n_counts=None,
    expr_cutoff=1,
):
    """Filter out cells for ATAC-seq based on different metrics.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    min_n_peaks: `int`, optional (default: None)
        Minimum number of peaks expressed
    min_pct_peaks: `float`, optional (default: None)
        Minimum percentage of peaks expressed
    min_n_counts: `int`, optional (default: None)
        Minimum number of read count for one cell
    expr_cutoff: `float`, optional (default: 1)
        Expression cutoff.
        If greater than expr_cutoff,the gene is considered 'expressed'
    assay: `str`, optional (default: 'rna')
            Choose from {{'rna','atac'}},case insensitive
    Returns
    -------
    updates `adata` with a subset of cells that pass the filtering.
    updates `adata` with the following fields if cal_qc() was not performed.
    n_counts: `pandas.Series` (`adata.obs['n_counts']`,dtype `int`)
       The number of read count each cell has.
    n_genes: `pandas.Series` (`adata.obs['n_genes']`,dtype `int`)
       The number of genes expressed in each cell.
    pct_genes: `pandas.Series` (`adata.obs['pct_genes']`,dtype `float`)
       The percentage of genes expressed in each cell.
    n_peaks: `pandas.Series` (`adata.obs['n_peaks']`,dtype `int`)
       The number of peaks expressed in each cell.
    pct_peaks: `pandas.Series` (`adata.obs['pct_peaks']`,dtype `int`)
       The percentage of peaks expressed in each cell.
    """

    if not issparse(adata.X):
        adata.X = csr_matrix(adata.X)
    if "n_counts" in adata.obs_keys():
        n_counts = adata.obs["n_counts"]
    else:
        n_counts = np.sum(adata.X, axis=1).A1
        adata.obs["n_counts"] = n_counts

    if "n_peaks" in adata.obs_keys():
        n_peaks = adata.obs["n_peaks"]
    else:
        n_peaks = np.sum(adata.X >= expr_cutoff, axis=1).A1
        adata.obs["n_peaks"] = n_peaks
    if "pct_peaks" in adata.obs_keys():
        pct_peaks = adata.obs["pct_peaks"]
    else:
        pct_peaks = n_peaks / adata.shape[1]
        adata.obs["pct_peaks"] = pct_peaks

    print("before filtering: ")
    print(f"{adata.shape[0]} cells,  {adata.shape[1]} peaks")
    if (
        sum(
            list(
                map(
                    lambda x: x is None,
                    [
                        min_n_peaks,
                        min_pct_peaks,
                        min_n_counts,
                        max_n_peaks,
                        max_pct_peaks,
                        max_n_counts,
                    ],
                )
            )
        )
        == 6
    ):
        print("No filtering")
    else:
        cell_subset = np.ones(len(adata.obs_names), dtype=bool)
        if min_n_peaks is not None:
            print("filter cells based on min_n_peaks")
            cell_subset = (n_peaks >= min_n_peaks) & cell_subset
        if max_n_peaks is not None:
            print("filter cells based on max_n_peaks")
            cell_subset = (n_peaks <= max_n_peaks) & cell_subset
        if min_pct_peaks is not None:
            print("filter cells based on min_pct_peaks")
            cell_subset = (pct_peaks >= min_pct_peaks) & cell_subset
        if max_pct_peaks is not None:
            print("filter cells based on max_pct_peaks")
            cell_subset = (pct_peaks <= max_pct_peaks) & cell_subset
        if min_n_counts is not None:
            print("filter cells based on min_n_counts")
            cell_subset = (n_counts >= min_n_counts) & cell_subset
        if max_n_counts is not None:
            print("filter cells based on max_n_counts")
            cell_subset = (n_counts <= max_n_counts) & cell_subset
        adata._inplace_subset_obs(cell_subset)
        print("after filtering out low-quality cells: ")
        print(f"{adata.shape[0]} cells,  {adata.shape[1]} peaks")
    return None



[docs]
def filter_genes(
    adata,
    min_n_cells=3,
    max_n_cells=None,
    min_pct_cells=None,
    max_pct_cells=None,
    min_n_counts=None,
    max_n_counts=None,
    expr_cutoff=1,
):
    """Filter out features based on different metrics.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    min_n_cells: `int`, optional (default: 5)
        Minimum number of cells expressing one feature
    min_pct_cells: `float`, optional (default: None)
        Minimum percentage of cells expressing one feature
    min_n_counts: `int`, optional (default: None)
        Minimum number of read count for one feature
    expr_cutoff: `float`, optional (default: 1)
        Expression cutoff.
        If greater than expr_cutoff,the feature is considered 'expressed'
    assay: `str`, optional (default: 'rna')
            Choose from {{'rna','atac'}},case insensitive

    Returns
    -------
    updates `adata` with a subset of features that pass the filtering.
    updates `adata` with the following fields if cal_qc() was not performed.
    n_counts: `pandas.Series` (`adata.var['n_counts']`,dtype `int`)
       The number of read count each gene has.
    n_cells: `pandas.Series` (`adata.var['n_cells']`,dtype `int`)
       The number of cells in which each gene is expressed.
    pct_cells: `pandas.Series` (`adata.var['pct_cells']`,dtype `float`)
       The percentage of cells in which each gene is expressed.
    """

    feature = "genes"
    if not issparse(adata.X):
        adata.X = csr_matrix(adata.X)

    if "n_counts" in adata.var_keys():
        n_counts = adata.var["n_counts"]
    else:
        n_counts = np.sum(adata.X, axis=0).A1
        adata.var["n_counts"] = n_counts
    if "n_cells" in adata.var_keys():
        n_cells = adata.var["n_cells"]
    else:
        n_cells = np.sum(adata.X >= expr_cutoff, axis=0).A1
        adata.var["n_cells"] = n_cells
    if "pct_cells" in adata.var_keys():
        pct_cells = adata.var["pct_cells"]
    else:
        pct_cells = n_cells / adata.shape[0]
        adata.var["pct_cells"] = pct_cells

    print("Before filtering: ")
    print(
        str(adata.shape[0]) + " cells, " + str(adata.shape[1]) + " " + feature
    )
    if (
        sum(
            list(
                map(
                    lambda x: x is None,
                    [
                        min_n_cells,
                        min_pct_cells,
                        min_n_counts,
                        max_n_cells,
                        max_pct_cells,
                        max_n_counts,
                    ],
                )
            )
        )
        == 6
    ):
        print("No filtering")
    else:
        feature_subset = np.ones(len(adata.var_names), dtype=bool)
        if min_n_cells is not None:
            print("Filter " + feature + " based on min_n_cells")
            feature_subset = (n_cells >= min_n_cells) & feature_subset
        if max_n_cells is not None:
            print("Filter " + feature + " based on max_n_cells")
            feature_subset = (n_cells <= max_n_cells) & feature_subset
        if min_pct_cells is not None:
            print("Filter " + feature + " based on min_pct_cells")
            feature_subset = (pct_cells >= min_pct_cells) & feature_subset
        if max_pct_cells is not None:
            print("Filter " + feature + " based on max_pct_cells")
            feature_subset = (pct_cells <= max_pct_cells) & feature_subset
        if min_n_counts is not None:
            print("Filter " + feature + " based on min_n_counts")
            feature_subset = (n_counts >= min_n_counts) & feature_subset
        if max_n_counts is not None:
            print("Filter " + feature + " based on max_n_counts")
            feature_subset = (n_counts <= max_n_counts) & feature_subset
        adata._inplace_subset_var(feature_subset)
        print("After filtering out low-expressed " + feature + ": ")
        print(
            str(adata.shape[0])
            + " cells, "
            + str(adata.shape[1])
            + " "
            + feature
        )
    return None



def filter_peaks(
    adata,
    min_n_cells=5,
    max_n_cells=None,
    min_pct_cells=None,
    max_pct_cells=None,
    min_n_counts=None,
    max_n_counts=None,
    expr_cutoff=1,
):
    """Filter out features based on different metrics.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    min_n_cells: `int`, optional (default: 5)
        Minimum number of cells expressing one feature
    min_pct_cells: `float`, optional (default: None)
        Minimum percentage of cells expressing one feature
    min_n_counts: `int`, optional (default: None)
        Minimum number of read count for one feature
    expr_cutoff: `float`, optional (default: 1)
        Expression cutoff.
        If greater than expr_cutoff,the feature is considered 'expressed'
    assay: `str`, optional (default: 'rna')
            Choose from {{'rna','atac'}},case insensitive

    Returns
    -------
    updates `adata` with a subset of features that pass the filtering.
    updates `adata` with the following fields if cal_qc() was not performed.
    n_counts: `pandas.Series` (`adata.var['n_counts']`,dtype `int`)
       The number of read count each gene has.
    n_cells: `pandas.Series` (`adata.var['n_cells']`,dtype `int`)
       The number of cells in which each gene is expressed.
    pct_cells: `pandas.Series` (`adata.var['pct_cells']`,dtype `float`)
       The percentage of cells in which each gene is expressed.
    """

    feature = "peaks"
    if not issparse(adata.X):
        adata.X = csr_matrix(adata.X)

    if "n_counts" in adata.var_keys():
        n_counts = adata.var["n_counts"]
    else:
        n_counts = np.sum(adata.X, axis=0).A1
        adata.var["n_counts"] = n_counts
    if "n_cells" in adata.var_keys():
        n_cells = adata.var["n_cells"]
    else:
        n_cells = np.sum(adata.X >= expr_cutoff, axis=0).A1
        adata.var["n_cells"] = n_cells
    if "pct_cells" in adata.var_keys():
        pct_cells = adata.var["pct_cells"]
    else:
        pct_cells = n_cells / adata.shape[0]
        adata.var["pct_cells"] = pct_cells

    print("Before filtering: ")
    print(
        str(adata.shape[0]) + " cells, " + str(adata.shape[1]) + " " + feature
    )
    if (
        sum(
            list(
                map(
                    lambda x: x is None,
                    [
                        min_n_cells,
                        min_pct_cells,
                        min_n_counts,
                        max_n_cells,
                        max_pct_cells,
                        max_n_counts,
                    ],
                )
            )
        )
        == 6
    ):
        print("No filtering")
    else:
        feature_subset = np.ones(len(adata.var_names), dtype=bool)
        if min_n_cells is not None:
            print("Filter " + feature + " based on min_n_cells")
            feature_subset = (n_cells >= min_n_cells) & feature_subset
        if max_n_cells is not None:
            print("Filter " + feature + " based on max_n_cells")
            feature_subset = (n_cells <= max_n_cells) & feature_subset
        if min_pct_cells is not None:
            print("Filter " + feature + " based on min_pct_cells")
            feature_subset = (pct_cells >= min_pct_cells) & feature_subset
        if max_pct_cells is not None:
            print("Filter " + feature + " based on max_pct_cells")
            feature_subset = (pct_cells <= max_pct_cells) & feature_subset
        if min_n_counts is not None:
            print("Filter " + feature + " based on min_n_counts")
            feature_subset = (n_counts >= min_n_counts) & feature_subset
        if max_n_counts is not None:
            print("Filter " + feature + " based on max_n_counts")
            feature_subset = (n_counts <= max_n_counts) & feature_subset
        adata._inplace_subset_var(feature_subset)
        print("After filtering out low-expressed " + feature + ": ")
        print(
            str(adata.shape[0])
            + " cells, "
            + str(adata.shape[1])
            + " "
            + feature
        )
    return None


def filter_features(
    adata,
    min_n_samples=5,
    max_n_samples=None,
    min_pct_samples=None,
    max_pct_samples=None,
    min_n_counts=None,
    max_n_counts=None,
    expr_cutoff=1,
):
    """Filter out features based on different metrics.

    Parameters
    ----------
    adata: AnnData
        Annotated data matrix.
    min_n_cells: `int`, optional (default: 5)
        Minimum number of cells expressing one feature
    min_pct_cells: `float`, optional (default: None)
        Minimum percentage of cells expressing one feature
    min_n_counts: `int`, optional (default: None)
        Minimum number of read count for one feature
    expr_cutoff: `float`, optional (default: 1)
        Expression cutoff.
        If greater than expr_cutoff,the feature is considered 'expressed'
    assay: `str`, optional (default: 'rna')
            Choose from {{'rna','atac'}},case insensitive

    Returns
    -------
    updates `adata` with a subset of features that pass the filtering.
    updates `adata` with the following fields if cal_qc() was not performed.
    n_counts: `pandas.Series` (`adata.var['n_counts']`,dtype `int`)
       The number of read count each gene has.
    n_cells: `pandas.Series` (`adata.var['n_cells']`,dtype `int`)
       The number of cells in which each gene is expressed.
    pct_cells: `pandas.Series` (`adata.var['pct_cells']`,dtype `float`)
       The percentage of cells in which each gene is expressed.
    """

    if not issparse(adata.X):
        adata.X = csr_matrix(adata.X)
    if "n_counts" in adata.var_keys():
        n_counts = adata.var["n_counts"]
    else:
        n_counts = np.sum(adata.X, axis=0).A1
        adata.var["n_counts"] = n_counts
    if "n_samples" in adata.var_keys():
        n_samples = adata.var["n_samples"]
    else:
        n_samples = np.sum(adata.X >= expr_cutoff, axis=0).A1
        adata.var["n_samples"] = n_samples
    if "pct_samples" in adata.var_keys():
        pct_samples = adata.var["pct_samples"]
    else:
        pct_samples = n_samples / adata.shape[0]
        adata.var["pct_samples"] = pct_samples

    print("Before filtering: ")
    print(f"{adata.shape[0]} samples,  {adata.shape[1]} features")

    if (
        sum(
            list(
                map(
                    lambda x: x is None,
                    [
                        min_n_samples,
                        min_pct_samples,
                        min_n_counts,
                        max_n_samples,
                        max_pct_samples,
                        max_n_counts,
                    ],
                )
            )
        )
        == 6
    ):
        print("No filtering")
    else:
        feature_subset = np.ones(len(adata.var_names), dtype=bool)
        if min_n_samples is not None:
            print("Filter features based on min_n_samples")
            feature_subset = (n_samples >= min_n_samples) & feature_subset
        if max_n_samples is not None:
            print("Filter features based on max_n_samples")
            feature_subset = (n_samples <= max_n_samples) & feature_subset
        if min_pct_samples is not None:
            print("Filter features based on min_pct_samples")
            feature_subset = (pct_samples >= min_pct_samples) & feature_subset
        if max_pct_samples is not None:
            print("Filter features based on max_pct_samples")
            feature_subset = (pct_samples <= max_pct_samples) & feature_subset
        if min_n_counts is not None:
            print("Filter features based on min_n_counts")
            feature_subset = (n_counts >= min_n_counts) & feature_subset
        if max_n_counts is not None:
            print("Filter features based on max_n_counts")
            feature_subset = (n_counts <= max_n_counts) & feature_subset
        adata._inplace_subset_var(feature_subset)
        print("After filtering out low-expressed features: ")
        print(f"{adata.shape[0]} samples,  {adata.shape[1]} features")
    return None