#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os
import argparse
import numpy as np
import pandas as pd
from scipy import sparse, io
# logs
import warnings
import logging
# Root logger (no name passed); main() raises its level to CRITICAL unless
# the parsed arguments have `verbose` set.
logger = logging.getLogger()
# Always hide FutureWarning messages, regardless of verbosity.
warnings.simplefilter(action="ignore", category=FutureWarning)
# single-cell stuff
import anndata
import scanpy as sc
## own Functions
# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei"))
# sys.path.append(scriptdir)
from sincei import ParserCommon
from sincei.ParserCommon import smartLabel
def parseArguments():
    """Build the complete command-line parser for scCombineCounts.

    The parser inherits the tool-specific options from ``get_args()`` and the
    options returned by ``ParserCommon.otherOptions()``.

    Returns
    -------
    argparse.ArgumentParser
        The combined parser. It is created with ``add_help=False``
        (presumably the help option comes from one of the parent parsers;
        verify against ``ParserCommon.otherOptions``).
    """
    # Built in the same order as before: shared options first, then tool options.
    shared_options = ParserCommon.otherOptions()
    tool_options = get_args()

    tool_description = """
This tool combines multiple count matrices (output of scCountReads) into one, either assuming they are different samples (multi-sample)
or different measurements on the same set of cells (multi-modal). The result is a .loom file with combined counts. NOTE: it doesn't perform
any 'batch effect correction' or 'integration' of data from different technologies, which requires more sophisticated methods.
"""
    example_usage = "Example usage: scCombineCounts -i sample1.loom sample2.loom -o combined.loom > log.txt"

    return argparse.ArgumentParser(
        parents=[tool_options, shared_options],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=tool_description,
        usage=example_usage,
        add_help=False,
    )
def get_args():
    """Create the parent parser holding the scCombineCounts-specific options.

    The parser is built with ``add_help=False`` so that it can be passed in
    the ``parents`` list of another ``argparse.ArgumentParser``.

    Options defined (all in the "General Options" group):

    * ``--input`` / ``-i``: one or more input files (required).
    * ``--outFile`` / ``-o``: output file name (required).
    * ``--labels`` / ``-l``: optional sample labels, one per input.
    * ``--method`` / ``-m``: ``multi-sample`` (default) or ``multi-modal``.

    Returns
    -------
    argparse.ArgumentParser
        Parser containing only the options listed above.
    """
    parser = argparse.ArgumentParser(add_help=False)
    general = parser.add_argument_group("General Options")

    general.add_argument(
        "--input",
        "-i",
        metavar="LOOM",
        help="Input files in .loom format",
        nargs="+",
        required=True,
    )

    # The help text names the method `multi-modal`, matching the value that
    # --method actually accepts.
    general.add_argument(
        "--outFile",
        "-o",
        type=str,
        help="The file to write results to. For method: `multi-sample`, the output "
        "file is an updated .loom object, which can be used by other tools. "
        "For method: `multi-modal`, the output file is an .hdf5 file. This file can only be "
        "used by scClusterCells, to perform multi-modal clustering. ",
        required=True,
    )

    general.add_argument(
        "--labels",
        "-l",
        metavar="sample1 sample2",
        help="User defined labels instead of default labels from "
        "file names. Multiple labels have to be separated by a space, e.g. "
        "--labels sample1 sample2 sample3",
        nargs="+",
    )

    general.add_argument(
        "--method",
        "-m",
        type=str,
        choices=["multi-sample", "multi-modal"],
        default="multi-sample",
        help="How to merge the counts from the provided samples. "
        "`multi-sample`: assumes that each sample is the independent, "
        "but were counted in the same manner (i.e. on same features), therefore "
        "it looks for feature overlaps, but not for barcode overlaps. "
        "`multi-modal`: assumes that the counts were generated in 2 different ways, "
        "but from the same set of cells (for example, using a multi-omic technology), "
        "therefore it looks for the overlap of cell barcodes, but not for the overlaps "
        "of features (Default: %(default)s)",
    )

    return parser
def main(args=None):
    """Run scCombineCounts: merge several ``.loom`` count files into one.

    Every input is read with ``sc.read_loom``, its cell names are prefixed
    with the sample label (``<label>_<cell name>``), and all samples are
    concatenated with ``anndata.concat``. When every input carries the
    ``chrom``, ``start`` and ``end`` feature columns, those columns are taken
    from the first input and joined onto the combined features. The result is
    written to ``args.outFile`` with ``write_loom``, and the combined number
    of cells and features is reported on stdout.

    Parameters
    ----------
    args : list of str, optional
        Command-line arguments, forwarded to ``parse_args``.

    Returns
    -------
    int
        ``0`` on success.

    Raises
    ------
    SystemExit
        With status 1 when a method other than ``multi-sample`` is requested,
        or when the number of ``--labels`` differs from the number of inputs.
    """
    args = parseArguments().parse_args(args)
    # `verbose` is not defined by get_args(); presumably it comes from
    # ParserCommon.otherOptions() -- verify there.
    if not args.verbose:
        logger.setLevel(logging.CRITICAL)
        warnings.filterwarnings("ignore")

    # Diagnostics go to stderr, newline-terminated, so they stay visible when
    # stdout is redirected to a log file and do not run into later output.
    if args.method != "multi-sample":
        sys.stderr.write("Only multi-sample method is currently implemented\n")
        sys.exit(1)

    if args.labels and len(args.input) != len(args.labels):
        sys.stderr.write("The number of labels does not match the number of input files.\n")
        sys.exit(1)

    if not args.labels:
        # No labels given: derive one per input file with smartLabel.
        args.labels = [smartLabel(path) for path in args.input]

    adata_list = [sc.read_loom(path, obs_names="obs_names", var_names="var_names") for path in args.input]

    # Prefix every cell name with its sample label, and record whether each
    # sample provides the chrom/start/end feature columns.
    var_cols = ["chrom", "start", "end"]
    has_coords = []
    for label, ad in zip(args.labels, adata_list):
        ad.obs_names = ["_".join([label, cell]) for cell in ad.obs_names.to_list()]
        has_coords.append(all(col in ad.var.columns for col in var_cols))

    adata = anndata.concat(adata_list)

    # Keep the chrom, start, end of the first sample when all samples have them.
    if all(has_coords):
        var_df = adata_list[0].var[var_cols]
        adata.var = adata.var.join(var_df)
    else:
        sys.stderr.write(
            "WARNING: Not all input files contain the 'chrom', 'start', 'end' information. "
            "The output will lack these fields. This might cause an error in some downstream tools\n"
        )

    sys.stdout.write("Combined cells: {} \n".format(adata.shape[0]))
    sys.stdout.write("Combined features: {} \n".format(adata.shape[1]))
    adata.write_loom(args.outFile)

    return 0