Source code for sincei.scFilterStats

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import sys
import os

from deeptools import parserCommon, bamHandler, utilities
from deeptools.mapReduce import mapReduce
from deeptools.utilities import smartLabels
from deeptoolsintervals import GTF

import numpy as np
import py2bit
import pandas as pd

# logs
import warnings
import logging

logger = logging.getLogger()
warnings.simplefilter(action="ignore", category=FutureWarning)

## own functions
# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei"))
# sys.path.append(scriptdir)
from sincei.Utilities import *
from sincei import ParserCommon


def parseArguments():
    io_args = ParserCommon.inputOutputOptions(opts=["bamfiles", "barcodes", "outFile"], requiredOpts=["barcodes"])
    bam_args = ParserCommon.bamOptions(
        suppress_args=["region", "groupTag"],
        default_opts={"binSize": 100000, "distanceBetweenBins": 1000000},
    )
    filter_args = ParserCommon.filterOptions()
    read_args = ParserCommon.readOptions(
        suppress_args=[
            "minFragmentLength",
            "maxFragmentLength",
            "extendReads",
            "centerReads",
        ]
    )
    other_args = ParserCommon.otherOptions()

    parser = argparse.ArgumentParser(
        parents=[io_args, bam_args, filter_args, read_args, other_args],
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""
This tool estimates the number of reads that would be filtered given a set of
settings and prints this to the terminal. Further, it tracks the number of
singleton reads.

The following metrics will always be tracked regardless of what you specify
(the output order also matches this):

 * Total reads (including unmapped)
 * Mapped reads
 * Reads in blacklisted regions (--blackListFileName)

The following metrics are estimated according to the --binSize and
--distanceBetweenBins parameters:

 * Estimated mapped reads filtered (the total number of mapped reads filtered for any reason)
 * Alignments with a below-threshold MAPQ (--minMappingQuality)
 * Alignments with at least one missing flag (--samFlagInclude)
 * Alignments with undesirable flags (--samFlagExclude)
 * Duplicates determined by sincei (--duplicateFilter)
 * Duplicates marked externally (e.g., by picard)
 * Singletons (paired-end reads with only one mate aligning)
 * Wrong strand (due to --filterRNAstrand)

The sum of these may be more than the total number of reads. Note that
alignments are sampled from bins of size --binSize spaced
--distanceBetweenBins apart.
""",
        usage="Example usage: scFilterStats.py -b sample1.bam sample2.bam -bc barcodes.txt > log.txt",
        add_help=False,
    )

    return parser
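
# ---------------------------------------------------------------------------
# A minimal usage sketch (illustrative only, not executed by this module):
# the parser built above can be used to inspect the effective sampling
# settings before a run. The BAM and barcode file names are placeholders.
#
#     parser = parseArguments()
#     args = parser.parse_args(["-b", "sample1.bam", "-bc", "barcodes.txt"])
#     print(args.binSize, args.distanceBetweenBins)  # -> 100000 1000000
# ---------------------------------------------------------------------------
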
def getFiltered_worker(arglist):
    chrom, start, end, args = arglist
    # Fix the bounds
    if end - start > args.binSize and end - start > args.distanceBetweenBins:
        end -= args.distanceBetweenBins
    if end <= start:
        end = start + 1

    ## open genome if needed
    if args.genome2bit:
        twoBitGenome = py2bit.open(args.genome2bit, True)

    ## open blacklist file
    blackList = None
    if args.blackListFileName is not None:
        blackList = GTF(args.blackListFileName)

    o = []
    for fname in args.bamfiles:
        fh = bamHandler.openBam(fname)
        chromUse = utilities.mungeChromosome(chrom, fh.references)
        prev_pos = set()
        lpos = None

        ## dicts with barcodes as keys
        # metrics
        blacklisted = {}
        minMapq = {}
        samFlagInclude = {}
        samFlagExclude = {}
        internalDupes = {}
        externalDupes = {}
        singletons = {}
        filterRNAstrand = {}
        filterMotifs = {}
        filterGC = {}
        minAlignedFraction = {}
        # trackers
        nFiltered = {}
        total = {}  # This is only used to estimate the percentage affected
        filtered = {}

        for b in args.barcodes:
            total[b] = 0  # This is only used to estimate the percentage affected
            nFiltered[b] = 0
            blacklisted[b] = 0
            minMapq[b] = 0
            samFlagInclude[b] = 0
            samFlagExclude[b] = 0
            internalDupes[b] = 0
            externalDupes[b] = 0
            singletons[b] = 0
            filterRNAstrand[b] = 0
            filterMotifs[b] = 0
            filterGC[b] = 0
            minAlignedFraction[b] = 0

        for read in fh.fetch(chromUse, start, end):
            try:
                bc = read.get_tag(args.cellTag)
            except KeyError:
                continue
            # also keep a counter for barcodes not in whitelist?
            if bc not in args.barcodes:
                continue
            filtered[bc] = 0

            if read.pos < start:
                # ensure that we never double count (in case distanceBetweenBins == 0)
                continue

            if read.flag & 4:
                # Ignore unmapped reads, they were counted already
                continue

            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                filtered[bc] = 1
                minMapq[bc] += 1
            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
                filtered[bc] = 1
                samFlagInclude[bc] += 1
            if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                filtered[bc] = 1
                samFlagExclude[bc] += 1

            if args.minAlignedFraction:
                if not checkAlignedFraction(read, args.minAlignedFraction):
                    filtered[bc] = 1
                    minAlignedFraction[bc] += 1

            ## reads in blacklisted regions
            if blackList and blackList.findOverlaps(
                chrom,
                read.reference_start,
                read.reference_start + read.infer_query_length(always=False) - 1,
            ):
                filtered[bc] = 1
                blacklisted[bc] += 1

            ## Duplicates
            if args.duplicateFilter:
                tup = getDupFilterTuple(read, bc, args.duplicateFilter)
                if lpos is not None and lpos == read.reference_start and tup in prev_pos:
                    filtered[bc] = 1
                    internalDupes[bc] += 1
                if lpos != read.reference_start:
                    prev_pos.clear()
                lpos = read.reference_start
                prev_pos.add(tup)
            if read.is_duplicate:
                filtered[bc] = 1
                externalDupes[bc] += 1
            if read.is_paired and read.mate_is_unmapped:
                filtered[bc] = 1
                singletons[bc] += 1

            ## remove reads with low/high GC content
            if args.GCcontentFilter:
                if not checkGCcontent(read, args.GCcontentFilter[0], args.GCcontentFilter[1]):
                    filtered[bc] = 1
                    filterGC[bc] += 1

            ## remove reads that don't pass the motif filter
            if args.motifFilter:
                test = [checkMotifs(read, chrom, twoBitGenome, m[0], m[1]) for m in args.motifFilter]
                # filter the read if none of the given motifs matched
                if not any(test):
                    filtered[bc] = 1
                    filterMotifs[bc] += 1

            # filterRNAstrand
            if args.filterRNAstrand:
                if read.is_paired:
                    if args.filterRNAstrand == "forward":
                        if read.flag & 144 == 128 or read.flag & 96 == 64:
                            pass
                        else:
                            filtered[bc] = 1
                            filterRNAstrand[bc] += 1
                    elif args.filterRNAstrand == "reverse":
                        if read.flag & 144 == 144 or read.flag & 96 == 96:
                            pass
                        else:
                            filtered[bc] = 1
                            filterRNAstrand[bc] += 1
                else:
                    if args.filterRNAstrand == "forward":
                        if read.flag & 16 == 16:
                            pass
                        else:
                            filtered[bc] = 1
                            filterRNAstrand[bc] += 1
                    elif args.filterRNAstrand == "reverse":
                        if read.flag & 16 == 0:
                            pass
                        else:
                            filtered[bc] = 1
                            filterRNAstrand[bc] += 1

            total[bc] += 1
            nFiltered[bc] += filtered[bc]
        fh.close()

        # first make a tuple where each entry is a dict of barcode:value
        tup = (
            total,
            nFiltered,
            blacklisted,
            minMapq,
            samFlagInclude,
            samFlagExclude,
            internalDupes,
            externalDupes,
            singletons,
            filterRNAstrand,
            filterMotifs,
            filterGC,
            minAlignedFraction,
        )
        # now simplify it: a dict with each key = barcode, value = tuple of stats
        merged = {}
        for b in args.barcodes:
            merged[b] = tuple(d[b] for d in tup)
        # Now convert it to an array with rows = len(barcodes), columns = len(stats) [13]
        out = np.stack([v for k, v in merged.items()])
        o.append(out)
    return o
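
# ---------------------------------------------------------------------------
# Note on the --filterRNAstrand bit arithmetic used above (an illustrative
# sketch; the constants come from the SAM flag specification): bit 16 marks a
# reverse-strand read, 32 a reverse-strand mate, 64 read1 and 128 read2.
# For example, for a paired-end read kept by the "forward" setting:
#
#     flag = 1 + 2 + 32 + 128       # paired, proper pair, mate reverse, read2
#     assert flag & 144 == 128      # read2 itself maps to the forward strand
#     assert not (flag & 96 == 64)  # and it is not a read1 with a forward mate
# ---------------------------------------------------------------------------
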
def main(args=None):
    args, rowLabels = ParserCommon.validateInputs(parseArguments().parse_args(args))
    if not args.verbose:
        logger.setLevel(logging.CRITICAL)
        warnings.filterwarnings("ignore")

    # chrom_sizes is taken from the (last) BAM; all BAMs are assumed to share a genome
    for bam in args.bamfiles:
        x = bamHandler.openBam(bam, returnStats=True, nThreads=args.numberOfProcessors)[0]
        chrom_sizes = list(zip(x.references, x.lengths))
        checkBAMtag(x, bam, args.cellTag)
        if args.groupTag:
            checkBAMtag(x, bam, args.groupTag)
            sys.stderr.write(
                "--groupTag is not implemented for scFilterStats yet! "
                "Please split your BAM file by {} and re-run scFilterStats.\n".format(args.groupTag)
            )
            sys.exit(1)
        x.close()

    # Get the remaining metrics
    res = mapReduce(
        [args],
        getFiltered_worker,
        chrom_sizes,
        genomeChunkLength=args.binSize + args.distanceBetweenBins,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
    )

    ## res is a list (one entry per genome chunk) of np.arrays of shape (len(barcodes), 13)
    ## the final output is an array with nrows = bamfiles * barcodes, ncols = No. of stats
    final_array = np.asarray(res).sum(axis=0)

    ## get final row/col names (bamname_barcode)
    colLabels = [
        "Total_sampled",
        "Filtered",
        "Blacklisted",
        "Low_MAPQ",
        "Missing_Flags",
        "Excluded_Flags",
        "Internal_Duplicates",
        "Marked_Duplicates",
        "Singletons",
        "Wrong_strand",
        "Wrong_motif",
        "Unwanted_GC_content",
        "Low_aligned_fraction",
    ]

    final_df = pd.DataFrame(data=np.concatenate(final_array), index=rowLabels, columns=colLabels)
    final_df.index.name = "Cell_ID"
    ## since stats are approximate, present results as %
    final_df.iloc[:, 1:] = final_df.iloc[:, 1:].div(final_df.Total_sampled, axis=0) * 100

    if args.outFile is not None:
        final_df.to_csv(args.outFile, sep="\t")
    else:
        print(final_df)

    return 0
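
# Entry-point guard (an assumed convenience for running this file directly;
# the installed scFilterStats console script invokes main() via the package
# entry points):
if __name__ == "__main__":
    main()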