Source code for sincei.scCountQC

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import os
import re
import argparse
import numpy as np
import pandas as pd
from scipy import sparse, io
from itertools import compress
import copy
import scanpy as sc

# logs
import warnings
import logging

logger = logging.getLogger()
warnings.simplefilter(action="ignore", category=FutureWarning)

## own Functions
# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei"))
# sys.path.append(scriptdir)
from sincei import ParserCommon
from sincei.Utilities import gini


### ------ Functions ------
[docs]def filter_adata( adata, filter_region_dict=None, filter_cell_dict=None, bad_chrom=None, bad_cells=None, ): # 1. regions if bad_chrom: adata = adata[:, ~adata.var.chrom.isin(bad_chrom)] if filter_region_dict: for key in filter_region_dict.keys(): adata = adata[ :, (adata.var[key] >= filter_region_dict[key][0]) & (adata.var[key] <= filter_region_dict[key][1]), ] # 2. Cells if bad_cells: adata = adata[~adata.obs.index.isin(bad_cells)] if filter_cell_dict: for key in filter_cell_dict.keys(): adata = adata[ (adata.obs[key] >= filter_cell_dict[key][0]) & (adata.obs[key] <= filter_cell_dict[key][1]), :, ] return adata
""" def make_plots(adata, fname=None): # plotting import matplotlib import matplotlib.pyplot as plt matplotlib.use('Agg') matplotlib.rcParams['pdf.fonttype'] = 42 matplotlib.rcParams['svg.fonttype'] = 'none' import seaborn as sns# seaborn colormaps conflicts with deeptools colormaps ## filtering of regions plt.figure() pl1=sns.violinplot(data=adata.var, x='total_counts', y='chrom') plt.figure() pl2=sns.distplot(np.log10(adata.var['total_counts']+1)) plt.figure() pl3=sns.distplot(adata.var['mean_counts']) plt.figure() pl4=sns.distplot(adata.var['fraction_cells_with_signal']) ## cells plt.figure() pl5=sns.scatterplot(data=adata.obs, x='total_counts', y='pct_counts_in_top_100_genes') pl5.set(xscale="log") plt.figure() pl6=sns.distplot(adata.obs['fraction_regions_with_signal']) plt.figure() pl7=sns.scatterplot(data=adata.obs, x='total_counts', y='fraction_regions_with_signal') plist=[pl1, pl2, pl3, pl4, pl5, pl6, pl7] if fname: with PdfPages(fname) as pp: for plot in plist: pp.savefig(plot.figure) return plist general.add_argument('--outPlot', '-op', type=str, help='The output plot file. This describes the distribution of filtering metrics pre and post filtering') if args.outPlot: make_plots(adata, fname=args.outPlot) if cellfilter or regionfilter or badcells or badchrom: make_plots(adata_filt, fname=args.outPlot+".filtered.pdf") """
[docs]def parseArguments(): io_args = ParserCommon.inputOutputOptions(opts=["loomfile", "outFile"]) plot_args = ParserCommon.plotOptions() other_args = ParserCommon.otherOptions() parser = argparse.ArgumentParser( parents=[io_args, get_args(), plot_args, other_args], formatter_class=argparse.RawDescriptionHelpFormatter, description=""" This tool performs calculates multiple quality controls metrics on the input .loom file (output of scCountReads) and (optionally) filters the input file based on filterCellArgs/filterRegionArgs. The output is either an updated .loom object (if filtering is requested) or the filtering metrics (--outMetrics) and plots (--outPlot). """, usage="Example usage: scCountQC -i cellCounts.loom -om qc_metrics.tsv > log.txt", add_help=False, ) return parser
[docs]def get_args(): parser = argparse.ArgumentParser(add_help=False) general = parser.add_argument_group("Filtering arguments") general.add_argument( "--describe", "-d", action="store_true", help="Print a list of cell and region metrics available for QC/filtering.", ) general.add_argument( "--outMetrics", "-om", type=str, help="Prefix of the output file with calculated QC metrics. If given, the cell metrics are printed in " "<prefix>.cell.tsv and region metrics as <prefix>.region.tsv", ) general.add_argument( "--filterCellArgs", "-fc", type=str, help='List of arguments to filter cells. The format is "arg_name: minvalue, maxvalue; arg_name: minvalue, maxvalue; ...." ' "where arg_name is the QC metric for cells present in the input loom file. In order to view all available " 'cell filtering metrics, run scCountFilter with "--describe". The two arguments are supplied (minvalue, maxvalue) ' "they are used as lower and upper bounds to filter cells. Make sure they are float/integer numbers.", ) general.add_argument( "--filterRegionArgs", "-fr", type=str, help='List of arguments to filter regions. The format is "arg_name: minvalue, maxvalue; arg_name: minvalue; ...." ' "where arg_name is the QC metric for regions present in the input loom file. In order to view all available " 'cell filtering metrics, run scCountFilter with "--describe". The two arguments are supplied (minvalue, maxvalue) ' "they are used as lower and upper bounds to filter cells. Make sure they are float/integer numbers.", ) general.add_argument( "--cell_blacklist", "-cb", default=None, type=argparse.FileType("r"), help="A list of barcodes to be included for the clustering. The barcodes " "(along with sample labels) must be present in the input object.", ) general.add_argument( "--chrom_blacklist", "-chb", default=None, type=str, help="A comma separated list of chromosomes to exclude. eg. chrM, chrUn", ) return parser
[docs]def main(args=None): args = parseArguments().parse_args(args) if not args.verbose: logger.setLevel(logging.CRITICAL) warnings.filterwarnings("ignore") adata = sc.read_loom(args.input, obs_names="obs_names", var_names="var_names") ## add QC stats to the anndata object # 1. scanpy metrics # fraction of regions/genes with signal are included in the metrics (pct_dropouts/n_genes_by_counts) try: sc.pp.calculate_qc_metrics(adata, inplace=True) except IndexError: # not enough genes/regions sys.stderr.write("\n Error: Too few regions in the input file to perform QC \n") exit() # pool = multiprocessing.Pool(args.numberOfProcessors) # func=partial(gini, X=adata.X) # gini_list = pool.map(func, range(adata.shape[0]) ) gini_list = [gini(i, adata.X) for i in range(adata.shape[0])] adata.obs["gini_coefficient"] = gini_list if args.outMetrics: mat = re.sub(".txt|.tsv|.csv", "", args.outMetrics) obs_tsv = mat + ".cells.tsv" var_tsv = mat + ".regions.tsv" adata.obs.to_csv(obs_tsv, sep="\t", index_label="Cell_ID") adata.var.to_csv(var_tsv, sep="\t", index_label="Feature_ID") # if --describe is asked, only print the numeric vars and obs columns if args.describe: is_num_col = [(pd.api.types.is_float_dtype(x) | pd.api.types.is_integer_dtype(x)) for x in adata.obs.dtypes] cols = adata.obs.loc[:, is_num_col] sys.stdout.write("\n Cell metrics: \n") sys.stdout.write("Total cells: {} \n".format(adata.shape[0])) print(pd.DataFrame({"min": cols.min(), "max": cols.max()})) is_num_row = [(pd.api.types.is_float_dtype(x) | pd.api.types.is_integer_dtype(x)) for x in adata.var.dtypes] rows = adata.var.loc[:, is_num_row] sys.stdout.write("\n Feature metrics: \n") sys.stdout.write("Total features: {} \n".format(adata.shape[1])) print(pd.DataFrame({"min": rows.min(), "max": rows.max()})) exit() if args.filterCellArgs: cellfilter = dict() for x in args.filterCellArgs.strip().split(";"): key = x.strip().split(":")[0] v = x.strip().split(":")[1] value = [float(x) for x in v.strip().split(",")] cellfilter[key] = value else: cellfilter = None if args.filterRegionArgs: regionfilter = dict() for x in args.filterRegionArgs.strip().split(";"): key = x.strip().split(":")[0] v = x.strip().split(":")[1] value = [float(x) for x in v.strip().split(",")] regionfilter[key] = value else: regionfilter = None if args.cell_blacklist: ## read the barcode file with open(args.cell_blacklist, "r") as f: badcells = f.read().splitlines() f.close() else: badcells = None if args.chrom_blacklist: badchrom = args.chrom_blacklist.strip().split(",") else: badchrom = None if cellfilter or regionfilter or badcells or badchrom: sys.stdout.write("Applying filters \n") adata_filt = filter_adata( adata, filter_region_dict=regionfilter, filter_cell_dict=cellfilter, bad_chrom=badchrom, bad_cells=badcells, ) sys.stdout.write("Remaining cells: {} \n".format(adata_filt.shape[0])) sys.stdout.write("Remaining features: {} \n".format(adata_filt.shape[1])) adata_filt.write_loom(args.outFile) return 0