#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import sys
import os
from deeptools import parserCommon, bamHandler, utilities
from deeptools.mapReduce import mapReduce
from deeptools.utilities import smartLabels
from deeptoolsintervals import GTF
import numpy as np
import py2bit
import pandas as pd
# logs
import warnings
import logging
logger = logging.getLogger()
warnings.simplefilter(action="ignore", category=FutureWarning)
## own functions
# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei"))
# sys.path.append(scriptdir)
from sincei.Utilities import *
from sincei import ParserCommon
def parseArguments():
    """Build the command-line argument parser for scFilterStats.

    Combines the shared sincei option groups (input/output, BAM handling,
    read filtering, read-level options and misc options) as parents of a
    single parser.

    Returns
    -------
    argparse.ArgumentParser
        Fully configured parser; ``add_help=False`` because the parent
        option groups already provide ``-h/--help``.
    """
    io_args = ParserCommon.inputOutputOptions(opts=["bamfiles", "barcodes", "outFile"], requiredOpts=["barcodes"])
    bam_args = ParserCommon.bamOptions(
        suppress_args=["region", "groupTag"],
        default_opts={"binSize": 100000, "distanceBetweenBins": 1000000},
    )
    filter_args = ParserCommon.filterOptions()
    read_args = ParserCommon.readOptions(
        suppress_args=[
            "minFragmentLength",
            "maxFragmentLength",
            "extendReads",
            "centerReads",
        ]
    )
    other_args = ParserCommon.otherOptions()
    parser = argparse.ArgumentParser(
        parents=[io_args, bam_args, filter_args, read_args, other_args],
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""
This tool estimates the number of reads that would be filtered given a set of settings and prints this to the terminal.
Further, it tracks the number of singleton reads. The following metrics will always be tracked regardless of what you specify (the order output also matches this):
 * Total reads (including unmapped)
 * Mapped reads
 * Reads in blacklisted regions (--blackListFileName)
The following metrics are estimated according to the --binSize and --distanceBetweenBins parameters
 * Estimated mapped reads filtered (the total number of mapped reads filtered for any reason)
 * Alignments with a below threshold MAPQ (--minMappingQuality)
 * Alignments with at least one missing flag (--samFlagInclude)
 * Alignments with undesirable flags (--samFlagExclude)
 * Duplicates determined by sincei (--duplicateFilter)
 * Duplicates marked externally (e.g., by picard)
 * Singletons (paired-end reads with only one mate aligning)
 * Wrong strand (due to --filterRNAstrand)
The sum of these may be more than the total number of reads. Note that alignments are sampled from bins of size --binSize spaced --distanceBetweenBins apart.
""",
        usage="Example usage: scFilterStats.py -b sample1.bam sample2.bam -bc barcodes.txt > log.txt",
        add_help=False,
    )
    return parser
def getFiltered_worker(arglist):
    """Tally, per cell barcode, how many sampled alignments fail each filter.

    mapReduce worker: scans one genomic bin in every BAM file and counts,
    for each barcode, the alignments that would be removed by each filtering
    criterion (MAPQ, SAM flags, blacklist, duplicates, singletons, strand,
    motif, GC content, aligned fraction). A read failing several filters
    increments each matching per-filter counter, but ``nFiltered`` at most
    once per read.

    Parameters
    ----------
    arglist : tuple
        ``(chrom, start, end, args)`` where ``args`` is the parsed argparse
        namespace (needs ``bamfiles``, ``barcodes``, ``cellTag`` and the
        filter options referenced below).

    Returns
    -------
    list of numpy.ndarray
        One array per BAM file with shape ``(len(args.barcodes), 13)``;
        columns follow the order of the tuple assembled at the end of this
        function (matching ``colLabels`` in ``main``).
    """
    chrom, start, end, args = arglist
    # Fix the bounds
    if end - start > args.binSize and end - start > args.distanceBetweenBins:
        end -= args.distanceBetweenBins
    if end <= start:
        end = start + 1

    ## open genome if needed (kept open for the whole bin, closed at the end)
    twoBitGenome = None
    if args.genome2bit:
        twoBitGenome = py2bit.open(args.genome2bit, True)

    ## open blacklist file
    blackList = None
    if args.blackListFileName is not None:
        blackList = GTF(args.blackListFileName)

    # Membership is tested once per read; a set makes that O(1) instead of
    # scanning the barcode list every time.
    barcode_set = set(args.barcodes)

    o = []
    for fname in args.bamfiles:
        fh = bamHandler.openBam(fname)
        chromUse = utilities.mungeChromosome(chrom, fh.references)
        prev_pos = set()
        lpos = None

        ## dicts keyed by barcode
        # per-filter metrics
        blacklisted = {}
        minMapq = {}
        samFlagInclude = {}
        samFlagExclude = {}
        internalDupes = {}
        externalDupes = {}
        singletons = {}
        filterRNAstrand = {}
        filterMotifs = {}
        filterGC = {}
        minAlignedFraction = {}
        # trackers
        nFiltered = {}
        total = {}  # This is only used to estimate the percentage affected
        filtered = {}
        for b in args.barcodes:
            total[b] = 0  # This is only used to estimate the percentage affected
            nFiltered[b] = 0
            blacklisted[b] = 0
            minMapq[b] = 0
            samFlagInclude[b] = 0
            samFlagExclude[b] = 0
            internalDupes[b] = 0
            externalDupes[b] = 0
            singletons[b] = 0
            filterRNAstrand[b] = 0
            filterMotifs[b] = 0
            filterGC[b] = 0
            minAlignedFraction[b] = 0

        for read in fh.fetch(chromUse, start, end):
            try:
                bc = read.get_tag(args.cellTag)
            except KeyError:
                # read has no cell tag at all
                continue
            # also keep a counter for barcodes not in whitelist?
            if bc not in barcode_set:
                continue

            filtered[bc] = 0
            if read.pos < start:
                # ensure that we never double count (in case distanceBetweenBins == 0)
                continue

            if read.flag & 4:
                # Ignore unmapped reads, they were counted already
                continue

            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered[bc] = 1
                minMapq[bc] += 1
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered[bc] = 1
                samFlagInclude[bc] += 1
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered[bc] = 1
                samFlagExclude[bc] += 1
            if args.minAlignedFraction:
                if not checkAlignedFraction(read, args.minAlignedFraction):
                    filtered[bc] = 1
                    minAlignedFraction[bc] += 1

            ## reads in blacklisted regions
            if blackList and blackList.findOverlaps(
                chrom,
                read.reference_start,
                read.reference_start + read.infer_query_length(always=False) - 1,
            ):
                filtered[bc] = 1
                blacklisted[bc] += 1

            ## Duplicates (sincei's own duplicate detection, position-batched)
            if args.duplicateFilter:
                tup = getDupFilterTuple(read, bc, args.duplicateFilter)
                if lpos is not None and lpos == read.reference_start and tup in prev_pos:
                    filtered[bc] = 1
                    internalDupes[bc] += 1
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add(tup)

            ## Duplicates marked externally (e.g., by picard)
            if read.is_duplicate:
                filtered[bc] = 1
                externalDupes[bc] += 1
            if read.is_paired and read.mate_is_unmapped:
                filtered[bc] = 1
                singletons[bc] += 1

            ## remove reads with low/high GC content
            if args.GCcontentFilter:
                if not checkGCcontent(read, args.GCcontentFilter[0], args.GCcontentFilter[1]):
                    filtered[bc] = 1
                    filterGC[bc] += 1

            ## remove reads that don't pass the motif filter
            # NOTE(review): this assumes --genome2bit was supplied whenever
            # --motifFilter is used; otherwise twoBitGenome is None here.
            if args.motifFilter:
                test = [checkMotifs(read, chrom, twoBitGenome, m[0], m[1]) for m in args.motifFilter]
                # if none given motif found, return true
                if not any(test):
                    filtered[bc] = 1
                    filterMotifs[bc] += 1

            # filterRNAstrand
            if args.filterRNAstrand:
                if read.is_paired:
                    if args.filterRNAstrand == "forward":
                        if read.flag & 144 == 128 or read.flag & 96 == 64:
                            pass
                        else:
                            filtered[bc] = 1
                            filterRNAstrand[bc] += 1
                    elif args.filterRNAstrand == "reverse":
                        if read.flag & 144 == 144 or read.flag & 96 == 96:
                            pass
                        else:
                            filtered[bc] = 1
                            filterRNAstrand[bc] += 1
                else:
                    if args.filterRNAstrand == "forward":
                        if read.flag & 16 == 16:
                            pass
                        else:
                            filtered[bc] = 1
                            filterRNAstrand[bc] += 1
                    elif args.filterRNAstrand == "reverse":
                        if read.flag & 16 == 0:
                            pass
                        else:
                            filtered[bc] = 1
                            filterRNAstrand[bc] += 1

            total[bc] += 1
            nFiltered[bc] += filtered[bc]
        fh.close()

        # first make a tuple where each entry is a dict of barcodes:value
        tup = (
            total,
            nFiltered,
            blacklisted,
            minMapq,
            samFlagInclude,
            samFlagExclude,
            internalDupes,
            externalDupes,
            singletons,
            filterRNAstrand,
            filterMotifs,
            filterGC,
            minAlignedFraction,
        )
        # now simplify it: dict with key = barcode, value = tuple of stats
        merged = {}
        for b in args.barcodes:
            merged[b] = tuple(d[b] for d in tup)
        # Now convert it to an array: rows = barcodes, columns = the 13 stats
        out = np.stack([v for k, v in merged.items()])
        o.append(out)

    # release the 2bit handle opened above (the original leaked it)
    if twoBitGenome is not None:
        twoBitGenome.close()
    return o
def main(args=None):
    """Entry point: estimate per-cell read-filtering statistics.

    Validates inputs, checks that each BAM carries the expected cell tag,
    samples bins via mapReduce with :func:`getFiltered_worker`, and writes a
    per-cell table where every column after ``Total_sampled`` is expressed
    as a percentage of the sampled reads.

    Parameters
    ----------
    args : list of str, optional
        Command-line arguments; ``None`` means ``sys.argv[1:]``.

    Returns
    -------
    int
        0 on success. Exits with status 1 if ``--groupTag`` is supplied
        (not implemented for this tool).
    """
    args, rowLabels = ParserCommon.validateInputs(parseArguments().parse_args(args))
    if not args.verbose:
        logger.setLevel(logging.CRITICAL)
        warnings.filterwarnings("ignore")

    for bam in args.bamfiles:
        x = bamHandler.openBam(bam, returnStats=True, nThreads=args.numberOfProcessors)[0]
        # NOTE(review): chrom_sizes from the last BAM is what reaches
        # mapReduce below — assumes all BAMs share the same reference.
        chrom_sizes = list(zip(x.references, x.lengths))
        checkBAMtag(x, bam, args.cellTag)
        if args.groupTag:
            checkBAMtag(x, bam, args.groupTag)
            sys.stderr.write(
                "--groupTag is not implemented for scFilterStats yet! "
                "Please split your BAM file by {} and re-run scFilterStats.\n".format(args.groupTag)
            )
            sys.exit(1)
        x.close()

    # Get the remaining metrics
    res = mapReduce(
        [args],
        getFiltered_worker,
        chrom_sizes,
        genomeChunkLength=args.binSize + args.distanceBetweenBins,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
    )

    ## res is a list of per-bin results; summing collapses them so that the
    ## final array has nrows = bamfiles*barcodes, ncol = No. of stats
    final_array = np.asarray(res).sum(axis=0)

    ## final row/col names (bamnames_barcode)
    colLabels = [
        "Total_sampled",
        "Filtered",
        "Blacklisted",
        "Low_MAPQ",
        "Missing_Flags",
        "Excluded_Flags",
        "Internal_Duplicates",
        "Marked_Duplicates",
        "Singletons",
        "Wrong_strand",
        "Wrong_motif",
        "Unwanted_GC_content",
        "Low_aligned_fraction",
    ]
    final_df = pd.DataFrame(data=np.concatenate(final_array), index=rowLabels, columns=colLabels)
    final_df.index.name = "Cell_ID"
    ## since stats are approximate, present results as %
    final_df.iloc[:, 1:] = final_df.iloc[:, 1:].div(final_df.Total_sampled, axis=0) * 100
    if args.outFile is not None:
        final_df.to_csv(args.outFile, sep="\t")
    else:
        print(final_df)
    return 0