Source code for sincei.scBAMops

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import pysam
import os
import sys
import pandas as pd
import py2bit
from deeptools import parserCommon
from deeptools.bamHandler import openBam
from deeptools.mapReduce import mapReduce
from deeptools.utilities import getTLen, smartLabels, getTempFileName

# logs
import warnings
import logging

logger = logging.getLogger()
warnings.simplefilter(action="ignore", category=FutureWarning)

# own functions
# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei"))
# sys.path.append(scriptdir)

from sincei import ParserCommon
from sincei.Utilities import checkMotifs, checkGCcontent, getDupFilterTuple
from sincei._version import __version__

## UPDATE: add group tag to BAM file based on a 2-columns mapping file (barcode -> group)


[docs]def parseArguments(): internalParser = get_args() ioParser = ParserCommon.inputOutputOptions(opts=["bamfile", "groupInfo", "outFile"]) bamParser = ParserCommon.bamOptions(suppress_args=["binSize", "distanceBetweenBins"]) filterParser = ParserCommon.filterOptions() readParser = ParserCommon.readOptions(suppress_args=["extendReads", "centerReads"]) otherParser = ParserCommon.otherOptions() parser = argparse.ArgumentParser( parents=[ ioParser, internalParser, bamParser, filterParser, readParser, otherParser, ], formatter_class=argparse.RawDescriptionHelpFormatter, description="This tool filters alignments in a BAM/CRAM file according the the specified parameters. It can optionally output to BEDPE format.", usage="Example usage: scBAMops -b sample1.bam -o sample1.filtered.bam --minMappingQuality 10 --filterMetrics log.txt", add_help=False, ) return parser
[docs]def get_args(): parser = argparse.ArgumentParser(add_help=False) mod = parser.add_argument_group("Read modifiction arguments") mod.add_argument( "--shift", nargs="+", type=int, help='Shift the left and right end of a read (for BAM files) or a fragment (for BED files). A positive value shift an end to the right (on the + strand) and a negative value shifts a fragment to the left. Either 2 or 4 integers can be provided. For example, "2 -3" will shift the left-most fragment end two bases to the right and the right-most end 3 bases to the left. If 4 integers are provided, then the first and last two refer to fragments whose read 1 is on the left or right, respectively. Consequently, it is possible to take strand into consideration for strand-specific protocols. A fragment whose length falls below 1 due to shifting will not be written to the output. See the online documentation for graphical examples. Note that non-properly-paired reads will be filtered.', ) mod.add_argument( "--ATACshift", action="store_true", help="Shift the produced BAM file or BEDPE regions as commonly done for ATAC-seq. This is equivalent to --shift 4 -5 5 -4.", ) output = parser.add_argument_group("Optional output arguments") output.add_argument( "--BED", action="store_true", help="Instead of producing BAM files, write output in BEDPE format (as defined by MACS2). Note that only reads/fragments passing filtering criterion are written in BEDPE format.", ) output.add_argument( "--filterMetrics", metavar="FILE.log", help="The number of entries in total and filtered are saved to this file", ) output.add_argument( "--filteredOutReads", metavar="filtered.bam", help="If desired, all reads NOT passing the filtering criteria can be written to this file.", ) output.add_argument( "--outTagName", metavar="STR", type=str, help="In case where you want to group the BAM files based on user-defined --groupInfo, specify the output BAM tag which would contain the group name.", default=None, ) return parser
[docs]def shiftRead(b, chromDict, args): if not b.is_proper_pair: return None tLen = getTLen(b, notAbs=True) start = b.pos end = start + b.query_alignment_end if b.is_reverse and not b.is_read2: end -= args.shift[2] deltaTLen = args.shift[3] - args.shift[2] elif b.is_reverse and b.is_read2: end += args.shift[1] deltaTLen = args.shift[1] - args.shift[0] elif not b.is_reverse and not b.is_read2: start += args.shift[0] deltaTLen = args.shift[1] - args.shift[0] else: start -= args.shift[3] deltaTLen = args.shift[3] - args.shift[2] # Sanity check if end - start < 1: if b.is_reverse: start = end - 1 else: end = start + 1 if start < 0: start = 0 if end > chromDict[b.reference_name]: end = chromDict[b.reference_name] if end - start < 1: return None # create a new read b2 = pysam.AlignedSegment() b2.query_name = b.query_name b2.flag = b.flag b2.reference_id = b.reference_id b2.reference_start = start b2.mapping_quality = b.mapping_quality b2.cigar = ((0, end - start),) # Returned cigar is only matches if tLen < 0: b2.template_length = tLen - deltaTLen else: b2.template_length = tLen + deltaTLen b2.next_reference_id = b.next_reference_id b2.next_reference_start = b.next_reference_start if b.is_proper_pair: if b2.is_read2 and b2.is_reverse: b2.next_reference_start += args.shift[0] elif not b2.is_read2 and b2.is_reverse: b2.next_reference_start -= args.shift[3] return b2
[docs]def filterWorker(arglist): chrom, start, end, args, chromDict = arglist fh = openBam(args.bamfile) # open 2 bit if needed if args.genome2bit: twoBitGenome = py2bit.open(args.genome2bit, True) if chrom not in twoBitGenome.chroms().keys(): raise NameError("chromosome {} not found in 2bit file".format(chrom)) mode = "wbu" oname = getTempFileName(suffix=".bam") ofh = pysam.AlignmentFile(oname, mode=mode, template=fh) if args.filteredOutReads: onameFiltered = getTempFileName(suffix=".bam") ofiltered = pysam.AlignmentFile(onameFiltered, mode=mode, template=fh) else: onameFiltered = None ofiltered = None prev_pos = set() lpos = None nFiltered = 0 total = 0 for read in fh.fetch(chrom, start, end): if read.pos < start: # ensure that we never double count (in case distanceBetweenBins == 0) continue total += 1 try: bc = read.get_tag(args.cellTag) except KeyError: nFiltered += 1 continue if isinstance(args.groupInfo, pd.DataFrame): smpl = read.get_tag(args.groupTag) try: tag_to_add = args.groupInfo.loc[ (args.groupInfo["sample"] == smpl) & (args.groupInfo["barcode"] == bc), "cluster", ].values.item() read.set_tag(args.outTagName, tag_to_add, value_type="Z", replace=True) except: if args.verbose: sys.stderr.write("Encountered read tags not in groupInfo file, skipped..") nFiltered += 1 if ofiltered: ofiltered.write(read) continue ## -- begin filtering -- ## if read.flag & 4: # Ignore unmapped reads, they were counted already nFiltered += 1 if ofiltered: ofiltered.write(read) continue if args.minMappingQuality and read.mapq < args.minMappingQuality: nFiltered += 1 if ofiltered: ofiltered.write(read) continue if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude: nFiltered += 1 if ofiltered: ofiltered.write(read) continue if args.samFlagExclude and read.flag & args.samFlagExclude != 0: nFiltered += 1 if ofiltered: ofiltered.write(read) continue tLen = getTLen(read) if args.minFragmentLength > 0 and tLen < args.minFragmentLength: nFiltered += 1 if ofiltered: ofiltered.write(read) continue if args.maxFragmentLength > 0 and tLen > args.maxFragmentLength: nFiltered += 1 if ofiltered: ofiltered.write(read) continue if args.duplicateFilter: tup = getDupFilterTuple(read, bc, args.duplicateFilter) if lpos is not None and lpos == read.reference_start and tup in prev_pos: nFiltered += 1 if ofiltered: ofiltered.write(read) continue if lpos != read.reference_start: prev_pos.clear() lpos = read.reference_start prev_pos.add(tup) # remove reads with low/high GC content if args.GCcontentFilter: if not checkGCcontent(read, args.GCcontentFilter[0], args.GCcontentFilter[1]): nFiltered += 1 if ofiltered: ofiltered.write(read) continue # remove reads that don't pass the motif filter if args.motifFilter: if not checkMotifs(read, chrom, twoBitGenome, args.motifFilter[0], args.motifFilter[1]): nFiltered += 1 if ofiltered: ofiltered.write(read) continue # filterRNAstrand if args.filterRNAstrand: if read.is_paired: if args.filterRNAstrand == "forward": if read.flag & 144 == 128 or read.flag & 96 == 64: pass else: nFiltered += 1 if ofiltered: ofiltered.write(read) continue elif args.filterRNAstrand == "reverse": if read.flag & 144 == 144 or read.flag & 96 == 96: pass else: nFiltered += 1 if ofiltered: ofiltered.write(read) continue else: if args.filterRNAstrand == "forward": if read.flag & 16 == 16: pass else: nFiltered += 1 if ofiltered: ofiltered.write(read) continue elif args.filterRNAstrand == "reverse": if read.flag & 16 == 0: pass else: nFiltered += 1 if ofiltered: ofiltered.write(read) continue if args.shift: read = shiftRead(read, chromDict, args) if not read: continue # Read survived filtering ofh.write(read) # The results from the workers will get sorted, so get the TID tid = fh.get_tid(chrom) ofh.close() if ofiltered: ofiltered.close() fh.close() if args.genome2bit: twoBitGenome.close() return tid, start, total, nFiltered, oname, onameFiltered
[docs]def convertBED(oname, tmpFiles, chromDict): """ Stores results in BEDPE format, which is: chromosome frag_leftend frag_rightend The fragment ends can be shifted """ ofile = open(oname, "w") for tmpFile in tmpFiles: fh = pysam.AlignmentFile(tmpFile) for b in fh.fetch(until_eof=True): tLen = getTLen(b, notAbs=True) if tLen > 0: start = b.pos end = start + tLen if end > chromDict[b.reference_name]: end = chromDict[b.reference_name] if end - start < 1: continue ofile.write("{}\t{}\t{}\n".format(b.reference_name, start, end)) fh.close() os.unlink(tmpFile) ofile.close()
[docs]def main(args=None): args = parseArguments().parse_args(args) if not args.verbose: logger.setLevel(logging.CRITICAL) warnings.filterwarnings("ignore") # grouping and tagging the output alignments if args.groupInfo: if not args.outTagName: sys.stderr.write( "--groupInfo provided without --outTagName. By default, groups will be added to the 'RG' tag in the output BAM file." ) args.outTagName = "RG" df = pd.read_csv(args.groupInfo, sep="\t", index_col=None, comment="#") if len(df.columns) == 3: df.columns = ["sample", "barcode", "cluster"] df.index = df[["sample", "barcode"]].apply(lambda x: "::".join(x), axis=1) args.groupInfo = df else: sys.stderr.write( "Error: The input to --groupInfo must be a 3-column tsv file with <sample> <barcode> and <group>" ) # shifting the output alignments if args.shift: if len(args.shift) not in [2, 4]: sys.exit("The --shift option can accept either 2 or 4 values only.") if len(args.shift) == 2: args.shift.extend([-args.shift[1], -args.shift[0]]) elif args.ATACshift: args.shift = [4, -5, 5, -4] bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True, nThreads=args.numberOfProcessors) total = mapped + unmapped chrom_sizes = [(x, y) for x, y in zip(bam.references, bam.lengths)] chromDict = {x: y for x, y in zip(bam.references, bam.lengths)} # if motifFilter is given, convert the input to list if args.motifFilter: if not args.genome2bit: print("MotifFilter asked but genome (2bit) file not provided.") sys.exit(1) else: args.motifFilter = args.motifFilter.strip(" ").split(",") if args.GCcontentFilter: gc = args.GCcontentFilter.strip(" ").split(",") args.GCcontentFilter = [float(x) for x in gc] # Filter, writing the results to a bunch of temporary files res = mapReduce( [args, chromDict], filterWorker, chrom_sizes, region=args.region, blackListFileName=args.blackListFileName, numberOfProcessors=args.numberOfProcessors, verbose=args.verbose, ) res = sorted(res) # The temp files are now in order for concatenation nFiltered = sum([x[3] for x in res]) totalSeen = sum([x[2] for x in res]) # The * contig isn't queried tmpFiles = [x[4] for x in res] if not args.BED: arguments = ["-o", args.outFile] arguments.extend(tmpFiles) # [..., *someList] isn't available in python 2.7 pysam.samtools.cat(*arguments) for tmpFile in tmpFiles: os.unlink(tmpFile) else: convertBED(args.outFile, tmpFiles, chromDict) if args.filteredOutReads: tmpFiles = [x[5] for x in res] if not args.BED: arguments = ["-o", args.filteredOutReads] arguments.extend(tmpFiles) # [..., *someList] isn't available in python 2.7 pysam.samtools.cat(*arguments) for tmpFile in tmpFiles: os.unlink(tmpFile) else: convertBED(args.outFile, tmpFiles, chromDict, args) if args.filterMetrics: sampleName = args.bamfile if args.label: sampleName = args.label if args.smartLabels: sampleName = smartLabels([args.bamfile])[0] of = open(args.filterMetrics, "w") of.write("#bamFilterReads --filterMetrics\n") of.write("#File\tReads Remaining\tTotal Initial Reads\n") of.write("{}\t{}\t{}\n".format(sampleName, totalSeen - nFiltered, total)) of.close() return 0