Source code for sincei.scBAMops

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import pysam
import os
import sys
import pandas as pd
import py2bit
from deeptools import parserCommon
from deeptools.bamHandler import openBam
from deeptools.mapReduce import mapReduce
from deeptools.utilities import getTLen, smartLabels, getTempFileName

# logs
import warnings
import logging

logger = logging.getLogger()
warnings.simplefilter(action="ignore", category=FutureWarning)

# own functions
# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei"))
# sys.path.append(scriptdir)

from sincei import ParserCommon
from sincei.Utilities import checkMotifs, checkGCcontent, getDupFilterTuple
from sincei._version import __version__

## UPDATE: add group tag to BAM file based on a 2-columns mapping file (barcode -> group)


[docs]def parseArguments():
    internalParser = get_args()
    ioParser = ParserCommon.inputOutputOptions(opts=["bamfile", "groupInfo", "outFile"])
    bamParser = ParserCommon.bamOptions(suppress_args=["binSize", "distanceBetweenBins"])
    filterParser = ParserCommon.filterOptions()
    readParser = ParserCommon.readOptions(suppress_args=["extendReads", "centerReads"])
    otherParser = ParserCommon.otherOptions()
    parser = argparse.ArgumentParser(
        parents=[
            ioParser,
            internalParser,
            bamParser,
            filterParser,
            readParser,
            otherParser,
        ],
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="This tool filters alignments in a BAM/CRAM file according the the specified parameters. It can optionally output to BEDPE format.",
        usage="Example usage: scBAMops -b sample1.bam -o sample1.filtered.bam --minMappingQuality 10 --filterMetrics log.txt",
        add_help=False,
    )
    return parser


[docs]def get_args():
    parser = argparse.ArgumentParser(add_help=False)

    mod = parser.add_argument_group("Read modifiction arguments")
    mod.add_argument(
        "--shift",
        nargs="+",
        type=int,
        help='Shift the left and right end of a read (for BAM files) or a fragment (for BED files). A positive value shift an end to the right (on the + strand) and a negative value shifts a fragment to the left. Either 2 or 4 integers can be provided. For example, "2 -3" will shift the left-most fragment end two bases to the right and the right-most end 3 bases to the left. If 4 integers are provided, then the first and last two refer to fragments whose read 1 is on the left or right, respectively. Consequently, it is possible to take strand into consideration for strand-specific protocols. A fragment whose length falls below 1 due to shifting will not be written to the output. See the online documentation for graphical examples. Note that non-properly-paired reads will be filtered.',
    )

    mod.add_argument(
        "--ATACshift",
        action="store_true",
        help="Shift the produced BAM file or BEDPE regions as commonly done for ATAC-seq. This is equivalent to --shift 4 -5 5 -4.",
    )

    output = parser.add_argument_group("Optional output arguments")
    output.add_argument(
        "--BED",
        action="store_true",
        help="Instead of producing BAM files, write output in BEDPE format (as defined by MACS2). Note that only reads/fragments passing filtering criterion are written in BEDPE format.",
    )

    output.add_argument(
        "--filterMetrics",
        metavar="FILE.log",
        help="The number of entries in total and filtered are saved to this file",
    )

    output.add_argument(
        "--filteredOutReads",
        metavar="filtered.bam",
        help="If desired, all reads NOT passing the filtering criteria can be written to this file.",
    )

    output.add_argument(
        "--outTagName",
        metavar="STR",
        type=str,
        help="In case where you want to group the BAM files based on user-defined --groupInfo, specify the output BAM tag which would contain the group name.",
        default=None,
    )

    return parser


[docs]def shiftRead(b, chromDict, args):
    if not b.is_proper_pair:
        return None
    tLen = getTLen(b, notAbs=True)
    start = b.pos
    end = start + b.query_alignment_end
    if b.is_reverse and not b.is_read2:
        end -= args.shift[2]
        deltaTLen = args.shift[3] - args.shift[2]
    elif b.is_reverse and b.is_read2:
        end += args.shift[1]
        deltaTLen = args.shift[1] - args.shift[0]
    elif not b.is_reverse and not b.is_read2:
        start += args.shift[0]
        deltaTLen = args.shift[1] - args.shift[0]
    else:
        start -= args.shift[3]
        deltaTLen = args.shift[3] - args.shift[2]

    # Sanity check
    if end - start < 1:
        if b.is_reverse:
            start = end - 1
        else:
            end = start + 1
    if start < 0:
        start = 0
    if end > chromDict[b.reference_name]:
        end = chromDict[b.reference_name]
    if end - start < 1:
        return None

    # create a new read
    b2 = pysam.AlignedSegment()
    b2.query_name = b.query_name
    b2.flag = b.flag
    b2.reference_id = b.reference_id
    b2.reference_start = start
    b2.mapping_quality = b.mapping_quality
    b2.cigar = ((0, end - start),)  # Returned cigar is only matches
    if tLen < 0:
        b2.template_length = tLen - deltaTLen
    else:
        b2.template_length = tLen + deltaTLen
    b2.next_reference_id = b.next_reference_id
    b2.next_reference_start = b.next_reference_start
    if b.is_proper_pair:
        if b2.is_read2 and b2.is_reverse:
            b2.next_reference_start += args.shift[0]
        elif not b2.is_read2 and b2.is_reverse:
            b2.next_reference_start -= args.shift[3]

    return b2


[docs]def filterWorker(arglist):
    chrom, start, end, args, chromDict = arglist
    fh = openBam(args.bamfile)
    # open 2 bit if needed
    if args.genome2bit:
        twoBitGenome = py2bit.open(args.genome2bit, True)
        if chrom not in twoBitGenome.chroms().keys():
            raise NameError("chromosome {} not found in 2bit file".format(chrom))

    mode = "wbu"
    oname = getTempFileName(suffix=".bam")
    ofh = pysam.AlignmentFile(oname, mode=mode, template=fh)

    if args.filteredOutReads:
        onameFiltered = getTempFileName(suffix=".bam")
        ofiltered = pysam.AlignmentFile(onameFiltered, mode=mode, template=fh)
    else:
        onameFiltered = None
        ofiltered = None

    prev_pos = set()
    lpos = None

    nFiltered = 0
    total = 0
    for read in fh.fetch(chrom, start, end):
        if read.pos < start:
            # ensure that we never double count (in case distanceBetweenBins == 0)
            continue
        total += 1

        try:
            bc = read.get_tag(args.cellTag)
        except KeyError:
            nFiltered += 1
            continue
        if isinstance(args.groupInfo, pd.DataFrame):
            smpl = read.get_tag(args.groupTag)
            try:
                tag_to_add = args.groupInfo.loc[
                    (args.groupInfo["sample"] == smpl) & (args.groupInfo["barcode"] == bc),
                    "cluster",
                ].values.item()
                read.set_tag(args.outTagName, tag_to_add, value_type="Z", replace=True)
            except:
                if args.verbose:
                    sys.stderr.write("Encountered read tags not in groupInfo file, skipped..")
                nFiltered += 1
                if ofiltered:
                    ofiltered.write(read)
                continue

        ## -- begin filtering -- ##
        if read.flag & 4:
            # Ignore unmapped reads, they were counted already
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.minMappingQuality and read.mapq < args.minMappingQuality:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue
        if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        tLen = getTLen(read)
        if args.minFragmentLength > 0 and tLen < args.minFragmentLength:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue
        if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength:
            nFiltered += 1
            if ofiltered:
                ofiltered.write(read)
            continue

        if args.duplicateFilter:
            tup = getDupFilterTuple(read, bc, args.duplicateFilter)
            if lpos is not None and lpos == read.reference_start and tup in prev_pos:
                nFiltered += 1
                if ofiltered:
                    ofiltered.write(read)
                continue
            if lpos != read.reference_start:
                prev_pos.clear()
            lpos = read.reference_start
            prev_pos.add(tup)

        # remove reads with low/high GC content
        if args.GCcontentFilter:
            if not checkGCcontent(read, args.GCcontentFilter[0], args.GCcontentFilter[1]):
                nFiltered += 1
                if ofiltered:
                    ofiltered.write(read)
                continue

        # remove reads that don't pass the motif filter
        if args.motifFilter:
            if not checkMotifs(read, chrom, twoBitGenome, args.motifFilter[0], args.motifFilter[1]):
                nFiltered += 1
                if ofiltered:
                    ofiltered.write(read)
                continue

        # filterRNAstrand
        if args.filterRNAstrand:
            if read.is_paired:
                if args.filterRNAstrand == "forward":
                    if read.flag & 144 == 128 or read.flag & 96 == 64:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
                elif args.filterRNAstrand == "reverse":
                    if read.flag & 144 == 144 or read.flag & 96 == 96:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
            else:
                if args.filterRNAstrand == "forward":
                    if read.flag & 16 == 16:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue
                elif args.filterRNAstrand == "reverse":
                    if read.flag & 16 == 0:
                        pass
                    else:
                        nFiltered += 1
                        if ofiltered:
                            ofiltered.write(read)
                        continue

        if args.shift:
            read = shiftRead(read, chromDict, args)
            if not read:
                continue

        # Read survived filtering
        ofh.write(read)

    # The results from the workers will get sorted, so get the TID
    tid = fh.get_tid(chrom)

    ofh.close()
    if ofiltered:
        ofiltered.close()
    fh.close()
    if args.genome2bit:
        twoBitGenome.close()

    return tid, start, total, nFiltered, oname, onameFiltered


[docs]def convertBED(oname, tmpFiles, chromDict):
    """
    Stores results in BEDPE format, which is:
    chromosome	frag_leftend	frag_rightend

    The fragment ends can be shifted
    """
    ofile = open(oname, "w")
    for tmpFile in tmpFiles:
        fh = pysam.AlignmentFile(tmpFile)

        for b in fh.fetch(until_eof=True):
            tLen = getTLen(b, notAbs=True)
            if tLen > 0:
                start = b.pos
                end = start + tLen
                if end > chromDict[b.reference_name]:
                    end = chromDict[b.reference_name]
                if end - start < 1:
                    continue
                ofile.write("{}\t{}\t{}\n".format(b.reference_name, start, end))
        fh.close()
        os.unlink(tmpFile)
    ofile.close()


[docs]def main(args=None):
    args = parseArguments().parse_args(args)
    if not args.verbose:
        logger.setLevel(logging.CRITICAL)
        warnings.filterwarnings("ignore")
    # grouping and tagging the output alignments
    if args.groupInfo:
        if not args.outTagName:
            sys.stderr.write(
                "--groupInfo provided without --outTagName. By default, groups will be added to the 'RG' tag in the output BAM file."
            )
            args.outTagName = "RG"

        df = pd.read_csv(args.groupInfo, sep="\t", index_col=None, comment="#")
        if len(df.columns) == 3:
            df.columns = ["sample", "barcode", "cluster"]
            df.index = df[["sample", "barcode"]].apply(lambda x: "::".join(x), axis=1)
            args.groupInfo = df
        else:
            sys.stderr.write(
                "Error: The input to --groupInfo must be a 3-column tsv file with <sample> <barcode> and <group>"
            )

    # shifting the output alignments
    if args.shift:
        if len(args.shift) not in [2, 4]:
            sys.exit("The --shift option can accept either 2 or 4 values only.")
        if len(args.shift) == 2:
            args.shift.extend([-args.shift[1], -args.shift[0]])
    elif args.ATACshift:
        args.shift = [4, -5, 5, -4]

    bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True, nThreads=args.numberOfProcessors)
    total = mapped + unmapped
    chrom_sizes = [(x, y) for x, y in zip(bam.references, bam.lengths)]
    chromDict = {x: y for x, y in zip(bam.references, bam.lengths)}

    # if motifFilter is given, convert the input to list
    if args.motifFilter:
        if not args.genome2bit:
            print("MotifFilter asked but genome (2bit) file not provided.")
            sys.exit(1)
        else:
            args.motifFilter = args.motifFilter.strip(" ").split(",")

    if args.GCcontentFilter:
        gc = args.GCcontentFilter.strip(" ").split(",")
        args.GCcontentFilter = [float(x) for x in gc]

    # Filter, writing the results to a bunch of temporary files
    res = mapReduce(
        [args, chromDict],
        filterWorker,
        chrom_sizes,
        region=args.region,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
    )

    res = sorted(res)  # The temp files are now in order for concatenation
    nFiltered = sum([x[3] for x in res])
    totalSeen = sum([x[2] for x in res])  # The * contig isn't queried

    tmpFiles = [x[4] for x in res]
    if not args.BED:
        arguments = ["-o", args.outFile]
        arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
        pysam.samtools.cat(*arguments)
        for tmpFile in tmpFiles:
            os.unlink(tmpFile)
    else:
        convertBED(args.outFile, tmpFiles, chromDict)

    if args.filteredOutReads:
        tmpFiles = [x[5] for x in res]
        if not args.BED:
            arguments = ["-o", args.filteredOutReads]
            arguments.extend(tmpFiles)  # [..., *someList] isn't available in python 2.7
            pysam.samtools.cat(*arguments)
            for tmpFile in tmpFiles:
                os.unlink(tmpFile)
        else:
            convertBED(args.outFile, tmpFiles, chromDict, args)

    if args.filterMetrics:
        sampleName = args.bamfile
        if args.label:
            sampleName = args.label
        if args.smartLabels:
            sampleName = smartLabels([args.bamfile])[0]

        of = open(args.filterMetrics, "w")
        of.write("#bamFilterReads --filterMetrics\n")
        of.write("#File\tReads Remaining\tTotal Initial Reads\n")
        of.write("{}\t{}\t{}\n".format(sampleName, totalSeen - nFiltered, total))
        of.close()

    return 0