Source code for sincei.scCombineCounts

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import os
import argparse
import numpy as np
import pandas as pd
from scipy import sparse, io

# logs
import warnings
import logging

# Root logger (getLogger() is called without a name); main() sets its level to
# CRITICAL when args.verbose is false.
logger = logging.getLogger()
# Suppress FutureWarning messages process-wide.
warnings.simplefilter(action="ignore", category=FutureWarning)

# single-cell stuff
import anndata
import scanpy as sc


## own Functions
# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei"))
# sys.path.append(scriptdir)
from sincei import ParserCommon
from sincei.ParserCommon import smartLabel


def parseArguments():
    """Assemble the full command-line parser for scCombineCounts.

    The returned parser combines the tool-specific options defined by
    ``get_args()`` with the shared options returned by
    ``ParserCommon.otherOptions()``.

    Returns
    -------
    argparse.ArgumentParser
        Parser created with ``add_help=False`` whose options come from the
        two parent parsers.
    """
    # The default argparse formatters re-flow the description text, so only
    # its wording (not its line layout) matters for the rendered help.
    tool_description = (
        " This tool combines multiple count matrices (output of scCountReads) into one, "
        "either assuming they are different samples (multi-sample) or different measurements "
        "on the same set of cells (multi-modal). The result is a .loom file with combined counts. "
        "NOTE: it doesn't perform any 'batch effect correction' or 'integration' of data from "
        "different technologies, which requires more sophisticated methods. "
    )

    shared_options = ParserCommon.otherOptions()
    return argparse.ArgumentParser(
        parents=[get_args(), shared_options],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=tool_description,
        usage="Example usage: scCombineCounts -i sample1.loom sample2.loom -o combined.loom > log.txt",
        add_help=False,
    )
def get_args():
    """Build the parent parser holding the scCombineCounts-specific options.

    The parser is created with ``add_help=False`` so that it can be passed as
    a ``parents=`` entry to another ``argparse.ArgumentParser``.

    Options defined (all in the "General Options" group):

    * ``--input`` / ``-i``: one or more input .loom files (required).
    * ``--outFile`` / ``-o``: path of the output file (required).
    * ``--labels`` / ``-l``: optional list of sample labels.
    * ``--method`` / ``-m``: ``multi-sample`` (default) or ``multi-modal``.

    Returns
    -------
    argparse.ArgumentParser
        The configured parent parser.
    """
    parser = argparse.ArgumentParser(add_help=False)
    general = parser.add_argument_group("General Options")

    general.add_argument(
        "--input",
        "-i",
        metavar="LOOM",
        help="Input files in .loom format",
        nargs="+",
        required=True,
    )

    # The help text names the method `multi-modal`, matching the value that
    # --method actually accepts.
    general.add_argument(
        "--outFile",
        "-o",
        type=str,
        help="The file to write results to. For method: `multi-sample`, the output "
        "file is an updated .loom object, which can be used by other tools. "
        "For method: `multi-modal`, the output file is an .hdf5 file. This file can only be "
        "used by scClusterCells, to perform multi-modal clustering. ",
        required=True,
    )

    general.add_argument(
        "--labels",
        "-l",
        metavar="sample1 sample2",
        help="User defined labels instead of default labels from "
        "file names. Multiple labels have to be separated by a space, e.g. "
        "--labels sample1 sample2 sample3",
        nargs="+",
    )

    general.add_argument(
        "--method",
        "-m",
        type=str,
        choices=["multi-sample", "multi-modal"],
        default="multi-sample",
        help="How to merge the counts from the provided samples. "
        "`multi-sample`: assumes that each sample is the independent, "
        "but were counted in the same manner (i.e. on same features), therefore "
        "it looks for feature overlaps, but not for barcode overlaps. "
        "`multi-modal`: assumes that the counts were generated in 2 different ways, "
        "but from the same set of cells (for example, using a multi-omic technology), "
        "therefore it looks for the overlap of cell barcodes, but not for the overlaps "
        "of features (Default: %(default)s)",
    )

    return parser
def main(args=None):
    """Combine several .loom count matrices into one and write the result.

    Steps performed:

    1. Parse the command line with ``parseArguments()``.
    2. Unless ``args.verbose`` is set, raise the logger level to CRITICAL and
       silence warnings.
    3. Validate the requested method and the number of labels.
    4. Read every input with ``sc.read_loom`` and prefix each cell name with
       its sample label (``<label>_<cell name>``).
    5. Concatenate the samples with ``anndata.concat`` and, when every input
       carries the ``chrom``/``start``/``end`` columns, copy those columns
       from the first input onto the combined object.
    6. Report the combined shape on stdout and write ``args.outFile`` as loom.

    Parameters
    ----------
    args : list of str, optional
        Argument list handed to ``parse_args``; ``None`` lets argparse read
        ``sys.argv``.

    Returns
    -------
    int
        ``0`` on success.

    Raises
    ------
    SystemExit
        With status 1 when a method other than ``multi-sample`` is requested
        or when the number of labels differs from the number of inputs.
    """
    args = parseArguments().parse_args(args)
    if not args.verbose:
        logger.setLevel(logging.CRITICAL)
        warnings.filterwarnings("ignore")

    if args.method != "multi-sample":
        sys.stderr.write("Only multi-sample method is currently implemented\n")
        sys.exit(1)

    # Errors go to stderr: the documented usage redirects stdout to a log
    # file, which would otherwise hide this message from the terminal.
    if args.labels and len(args.input) != len(args.labels):
        sys.stderr.write("The number of labels does not match the number of input files.\n")
        sys.exit(1)
    if not args.labels:
        # Derive labels from the input file names.
        args.labels = [smartLabel(x) for x in args.input]

    adata_list = [sc.read_loom(x, obs_names="obs_names", var_names="var_names") for x in args.input]

    ## prefix cell names with the sample label and check for chrom, start, end
    var_cols = ["chrom", "start", "end"]
    has_coords = []
    for label, ad in zip(args.labels, adata_list):
        ad.obs_names = ["_".join([label, cell]) for cell in ad.obs_names.to_list()]
        has_coords.append(all(col in ad.var.columns for col in var_cols))

    ## keep the chrom, start, end from the first sample if present in all
    adata = anndata.concat(adata_list)
    if all(has_coords):
        adata.var = adata.var.join(adata_list[0].var[var_cols])
    else:
        sys.stderr.write(
            "WARNING: Not all input files contain the 'chrom', 'start', 'end' information. "
            "The output will lack these fields. This might cause an error in some downstream tools\n"
        )

    sys.stdout.write("Combined cells: {} \n".format(adata.shape[0]))
    sys.stdout.write("Combined features: {} \n".format(adata.shape[1]))
    adata.write_loom(args.outFile)

    return 0