Source code for sincei.GetStats

import sys
import os

from deeptools import parserCommon, bamHandler, utilities
from deeptools.mapReduce import mapReduce
from deeptoolsintervals import GTF

import numpy as np
import py2bit
import pandas as pd

## own functions
scriptdir = os.path.join(os.path.abspath(os.pardir), "sincei")
from sincei.Utilities import *


def getStats_worker(arglist):
    r"""Computes statistics for each read in a BAM file.

    This function computes statistics for each read in a BAM file.

    Parameters
    ----------
    bamfiles : list
        List containing the names of indexed BAM files, e.g. ['file1.bam', 'file2.bam'].
    binSize : int
        Length of the window/bin. This value is overruled by ``bedFile`` if present.
    barcodes : list
        List of barcodes to count the reads from.
    numberOfSamples : int
        Total number of samples. The genome is divided into ``numberOfSamples`` bins,
        each with a window/bin length equal to ``binLength``. This value is overruled
        by ``stepSize`` if present, and by ``bedFile``, in which case the number of
        samples and bins are defined in the BED file.
    numberOfProcessors : int
        Number of processors to use. Default is 4.
    verbose : bool
        Output messages. Default: False.
    region : str
        Region to limit the computation, in the form chrom:start.
    """
    chrom, start, end, args = arglist

    # Fix the bounds
    if end - start > args.binSize and end - start > args.distanceBetweenBins:
        end -= args.distanceBetweenBins
    if end <= start:
        end = start + 1

    ## open genome if needed
    if args.genome2bit:
        twoBitGenome = py2bit.open(args.genome2bit, True)

    ## open blacklist file
    blackList = None
    if args.blackListFileName is not None:
        blackList = GTF(args.blackListFileName)

    o = []
    for fname in args.bamfiles:
        fh = bamHandler.openBam(fname)
        chromUse = utilities.mungeChromosome(chrom, fh.references)

        prev_pos = set()
        lpos = None
        ## initiate a list with all values to keep per read:
        ## [chrom, barcode_in_list, position, is_duplicate, distance_to_previous,
        ##  GC_content, is_reverse, read_ID]
        info_list = []
        for read in fh.fetch(chromUse, start, end):
            ## general filtering
            if read.pos < start:
                # ensure that we never double count (in case distanceBetweenBins == 0)
                continue
            if read.flag & 4:
                # Ignore unmapped reads, they were counted already
                continue
            if args.minMappingQuality and read.mapq < args.minMappingQuality:
                continue
            if args.minAlignedFraction:
                if not checkAlignedFraction(read, args.minAlignedFraction):
                    continue
            if blackList and blackList.findOverlaps(
                chrom,
                read.reference_start,
                read.reference_start + read.infer_query_length(always=False) - 1,
            ):
                continue

            if args.motifFilter:
                test = [checkMotifs(read, chrom, twoBitGenome, m[0], m[1]) for m in args.motifFilter]
                # skip the read if none of the given motifs is found
                if not any(test):
                    continue

            # now collect info
            info = [None for n in range(0, 8)]
            info[0] = chromUse
            if args.barcodes is not None:
                try:
                    bc = read.get_tag(args.cellTag)
                    if bc in args.barcodes:
                        info[1] = True
                    else:
                        info[1] = False
                except KeyError:
                    continue
            else:
                info[1] = False
                bc = None
            info[2] = read.reference_start

            ## Duplicates
            tup = getDupFilterTuple(read, bc, args.duplicateFilter)
            if lpos is not None and lpos == read.reference_start and tup in prev_pos:
                info[3] = True  # read is a duplicate
                info[4] = 0
            else:
                info[3] = False
                if lpos != read.reference_start:
                    prev_pos.clear()
                ## add distance to last read
                if lpos is None:
                    info[4] = 0
                else:
                    ## also add the distance of this read to the previous position
                    info[4] = abs(float(lpos - read.reference_start))
                lpos = read.reference_start
                prev_pos.add(tup)

            info[5] = checkGCcontent(read, args.GCcontentFilter[0], args.GCcontentFilter[1], returnGC=True)

            # filterRNAstrand
            info[6] = read.is_reverse

            if args.getReadID:
                info[7] = read.query_name

            info_list.append(info)

        fh.close()
        # out is an array with row = len(barcode) [384], column = len(stats) [11]
        o.append(info_list)
    return o
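
A minimal usage sketch (not part of the module): ``getStats_worker`` is typically dispatched per genomic chunk, e.g. via the imported ``mapReduce`` helper, but it can also be called directly with a ``[chrom, start, end, args]`` list. The ``argparse.Namespace`` below sets only the attributes the worker accesses; the BAM file name, filter thresholds, and the ``cellTag``/``duplicateFilter`` values are placeholders and assumptions, not the tool's documented defaults.

    import argparse

    from sincei.GetStats import getStats_worker

    # Placeholder settings; only attributes read by getStats_worker are defined.
    args = argparse.Namespace(
        bamfiles=["sample1.bam"],   # indexed BAM file(s); placeholder name
        binSize=100000,
        distanceBetweenBins=0,
        genome2bit=None,            # 2bit genome, only needed when motifFilter is set
        blackListFileName=None,
        minMappingQuality=10,
        minAlignedFraction=None,
        motifFilter=None,
        barcodes=None,              # or a list of cell barcodes to keep
        cellTag="BC",               # assumed barcode tag name
        duplicateFilter="start_bc", # assumed value; passed through to getDupFilterTuple
        GCcontentFilter=(0.2, 0.8),
        getReadID=False,
    )

    # One worker call covers a single genomic chunk; the result is a list with one
    # entry per BAM file, each holding the per-read records described above.
    per_file_stats = getStats_worker(["chr1", 0, 100000, args])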