#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import argparse
import numpy as np
from scipy import sparse, io
import re
import pandas as pd
import anndata as ad
from deeptools import parserCommon
from deeptools.utilities import smartLabels
# logs
import warnings
import logging
logger = logging.getLogger()
warnings.simplefilter(action="ignore", category=FutureWarning)
# own functions
# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei"))
# sys.path.append(scriptdir)
from sincei import ReadCounter as countR
from sincei import ParserCommon
old_settings = np.seterr(all="ignore")
[docs]def parseArguments(args=None):
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description="""
``scCountReads`` computes the read coverages per cell barcode for genomic regions in the provided BAM file(s).
The analysis can be performed for the entire genome by running the program in 'bins' mode.
If you want to count the read coverage for specific regions only, use the ``features`` mode instead.
The standard output of ``scCountReads`` is a ".loom" file with counts, along with rowName (features) and colNames (cell barcodes).
A detailed sub-commands help is available by typing:
scCountReads bins -h
scCountReads features -h
""",
epilog="example usages:\n"
"scCountReads bins --bamfiles file1.bam file2.bam --barcodes whitelist.txt -o results \n\n"
"scCountReads features --BED selection.bed --bamfiles file1.bam file2.bam --barcodes whitelist.txt \n"
"-o results"
" \n\n",
conflict_handler="resolve",
)
subparsers = parser.add_subparsers(
title="commands",
dest="command",
description="subcommands",
help="subcommands",
metavar="",
)
read_args = ParserCommon.readOptions(suppress_args=["filterRNAstrand"])
filter_args = ParserCommon.filterOptions()
other_args = ParserCommon.otherOptions()
# bins mode options
subparsers.add_parser(
"bins",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
parents=[
ParserCommon.inputOutputOptions(
opts=["bamfiles", "barcodes", "outFilePrefix", "BED"],
requiredOpts=["barcodes", "outFilePrefix"],
suppress_args=["BED"],
),
parserCommon.gtf_options(suppress=True),
ParserCommon.bamOptions(default_opts={"binSize": 10000, "distanceBetweenBins": 0}),
read_args,
filter_args,
get_args(),
other_args,
],
help="The reads are counted in bins of equal size. The bin size and distance between bins can be adjusted.",
add_help=False,
usage="%(prog)s -bs 100000 --bamfiles file1.bam file2.bam -o results \n",
)
# BED file arguments
subparsers.add_parser(
"features",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
parents=[
ParserCommon.inputOutputOptions(
opts=["bamfiles", "barcodes", "outFilePrefix", "BED"],
requiredOpts=["barcodes", "outFilePrefix", "BED"],
),
parserCommon.gtf_options(),
ParserCommon.bamOptions(
suppress_args=["binSize", "distanceBetweenBins"],
default_opts={"binSize": 10000, "distanceBetweenBins": 0},
),
read_args,
filter_args,
get_args(),
other_args,
],
help="The user provides a BED/GTF file containing all regions "
"that should be counted. A common use would be to count scRNA-seq reads on Genes.",
usage="%(prog)s --BED selection.bed --bamfiles file1.bam file2.bam --barcodes whitelist.txt -o results\n",
add_help=False,
)
return parser
[docs]def get_args():
parser = argparse.ArgumentParser(add_help=False)
optional = parser.add_argument_group("Misc arguments")
optional.add_argument(
"--genomeChunkSize",
type=int,
default=None,
help="Manually specify the size of the genome provided to each processor. "
"The default value of None specifies that this is determined by read "
"density of the BAM file.",
)
optional.add_argument(
"--outFileFormat",
type=str,
default="loom",
choices=["loom", "mtx"],
help="Output file format. Default is to write an anndata object of name "
"<prefix>.loom, which can either be opened in scanpy, or by downstream tools. "
'"mtx" refers to the MatrixMarket sparse-matrix format. The output in this case would be '
"<prefix>.counts.mtx, along with <prefix>.rownames.txt and <prefix>.colnames.txt",
)
return parser
[docs]def main(args=None):
"""
1. get read counts at different positions either
all of same length or from genomic regions from the BED file
2. save data for further plotting
"""
args, newlabels = ParserCommon.validateInputs(parseArguments().parse_args(args))
if not args.verbose:
logger.setLevel(logging.CRITICAL)
warnings.filterwarnings("ignore")
if "BED" in args:
bed_regions = args.BED
else:
bed_regions = None
## create row/colNames
if args.outFileFormat == "mtx":
mtxFile = args.outFilePrefix + ".counts.mtx"
rowNamesFile = args.outFilePrefix + ".rownames.txt"
colNamesFile = args.outFilePrefix + ".colnames.txt"
else:
rowNamesFile = None
stepSize = args.binSize + args.distanceBetweenBins
c = countR.CountReadsPerBin(
args.bamfiles,
binLength=args.binSize,
stepSize=stepSize,
barcodes=args.barcodes,
cellTag=args.cellTag,
groupTag=args.groupTag,
groupLabels=newlabels,
motifFilter=args.motifFilter,
genome2bit=args.genome2bit,
GCcontentFilter=args.GCcontentFilter,
numberOfSamples=None,
genomeChunkSize=args.genomeChunkSize,
numberOfProcessors=args.numberOfProcessors,
verbose=args.verbose,
region=args.region,
bedFile=bed_regions,
blackListFileName=args.blackListFileName,
extendReads=args.extendReads,
minMappingQuality=args.minMappingQuality,
duplicateFilter=args.duplicateFilter,
center_read=args.centerReads,
samFlag_include=args.samFlagInclude,
samFlag_exclude=args.samFlagExclude,
minFragmentLength=args.minFragmentLength,
maxFragmentLength=args.maxFragmentLength,
zerosToNans=False,
out_file_for_raw_data=rowNamesFile,
)
num_reads_per_bin, regionList = c.run(allArgs=args)
sys.stderr.write("Number of bins/features " "found: {}\n".format(num_reads_per_bin.shape[0]))
if num_reads_per_bin.shape[0] < 1:
exit(
"ERROR: too few non zero bins/features found.\n"
"If using --region please check that this "
"region is covered by reads.\n"
)
## write mtx/rownames if asked
if args.outFileFormat == "mtx":
f = open(colNamesFile, "w")
f.write("\n".join(newlabels))
f.write("\n")
f.close()
## write the matrix as .mtx
sp = sparse.csr_matrix(num_reads_per_bin)
io.mmwrite(mtxFile, sp, field="integer")
else:
# write anndata
adata = ad.AnnData(num_reads_per_bin.T)
adata.obs = pd.DataFrame(
{
"sample": [x.split("::")[-2] for x in newlabels],
"barcodes": [x.split("::")[-1] for x in newlabels],
},
index=newlabels,
)
rows = list(regionList)
adata.var = pd.DataFrame(
{
"chrom": [x.split("_")[0] for x in rows],
"start": [x.split("_")[1] for x in rows],
"end": [y.split("::")[0] for y in [x.split("_")[2] for x in rows]],
"name": [x.split("::")[1] for x in rows],
},
index=rows,
)
# export as loom
adata.write_loom(args.outFilePrefix + ".loom")