Source code for sincei.scBulkCoverage

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import sys
import os
import itertools
import numpy as np
import pandas as pd
from deeptools.getScaleFactor import get_scale_factor
from deeptools.bamHandler import openBam

# logs
import warnings
import logging

logger = logging.getLogger()
warnings.simplefilter(action="ignore", category=FutureWarning)

## own Functions
# scriptdir=os.path.abspath(os.path.join(__file__, "../../sincei"))
# sys.path.append(scriptdir)

from sincei import ParserCommon
from sincei import WriteBedGraph

debug = 0


def parseArguments():
    io_args = ParserCommon.inputOutputOptions(opts=["bamfiles", "groupInfo", "outFilePrefix"])
    bam_args = ParserCommon.bamOptions(default_opts={"binSize": 100})
    read_args = ParserCommon.readOptions()
    filter_args = ParserCommon.filterOptions()
    other_args = ParserCommon.otherOptions()
    parser = argparse.ArgumentParser(
        parents=[io_args, get_args(), bam_args, filter_args, read_args, other_args],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This tool takes alignments of reads or fragments "
        "as input (BAM files), along with cell grouping information (such as "
        "barcode -> batch, or barcode -> cluster) as a tsv file, and generates a "
        "coverage track (bigWig or bedGraph) per group as output. "
        "The coverage is calculated as the number of reads per bin, "
        "where bins are short consecutive counting windows of a defined "
        "size. It is possible to extend/change the length of the reads "
        "to better reflect the actual fragment length. *scBulkCoverage* "
        "offers normalization per cluster using different methods.\n",
        usage="An example usage is: "
        "$ scBulkCoverage -b file1.bam file2.bam --labels file1 file2 -g scClusterCells_output.tsv -o coverage.bw",
        add_help=False,
    )
    return parser
def get_args():
    parser = argparse.ArgumentParser(add_help=False)
    optional = parser.add_argument_group("Coverage-related Options")
    optional.add_argument(
        "--outFileFormat",
        "-of",
        help='Output file type. Either "bigwig" or "bedgraph".',
        choices=["bigwig", "bedgraph"],
        default="bigwig",
    )
    optional.add_argument(
        "--normalizeUsing",
        "-n",
        help="How to normalize the pseudo-bulk counts. Options are "
        '"CPM": normalize each bin to the counts per million mapped reads in that group.\n'
        '"Frequency": binarize the coverage per bin and normalize to the total no. of cells per group.\n'
        '"Mean": get the mean signal per bin across cells in each group.\n'
        '"None": simply return the sum of coverage per group.',
        choices=["CPM", "Frequency", "Mean", "None"],
        default="CPM",
    )
    optional.add_argument(
        "--ignoreForNormalization",
        "-ig",
        help="Chromosomes to skip while calculating normalization factors",
        nargs="+",
        default=None,
    )
    optional.add_argument(
        "--normalizeByReference",
        "-nr",
        help="NOT IMPLEMENTED: Normalize each group of cells by a reference group "
        "(which must be present in the --groupInfo file). "
        "Note that the --normalizeUsing method is applied beforehand.",
        choices=["ratio", "log2_ratio", "difference", "None"],
        default=None,
    )
    optional.add_argument(
        "--scaleFactor",
        help="The computed scaling factor (or 1, if not applicable) will "
        "be multiplied by this. (Default: %(default)s)",
        default=1.0,
        type=float,
        required=False,
    )
    optional.add_argument(
        "--MNase",
        help="Determine nucleosome positions from MNase-seq/CUT&RUN data. "
        "Only 3 nucleotides at the center of each fragment are counted. "
        "The fragment ends are defined by the two mate reads. Only fragment lengths "
        "between 130 and 200 bp are considered, to avoid dinucleosomes or other artifacts. "
        "By default, any fragments smaller or larger than this are ignored. To "
        "override this, use the --minFragmentLength and --maxFragmentLength options, "
        "which default to 130 and 200 if not otherwise specified in the presence "
        "of --MNase. *NOTE*: Requires paired-end data. A bin size of 1 is recommended.",
        action="store_true",
    )
    optional.add_argument(
        "--Offset",
        help="Uses this offset inside of each read as the signal. This is useful in "
        "cases like RiboSeq or GROseq, where the signal is 12, 15 or 0 bases past the "
        "start of the read. This can be paired with the --filterRNAstrand option. "
        "Note that negative values indicate offsets from the end of each read. A value "
        "of 1 indicates the first base of the alignment (taking alignment orientation "
        "into account). Likewise, a value of -1 is the last base of the alignment. An "
        "offset of 0 is not permitted. If two values are specified, then they will be "
        "used to specify a range of positions. Note that specifying something like "
        "--Offset 5 -1 will result in the 5th through last position being used, which "
        "is equivalent to trimming 4 bases from the 5-prime end of alignments. Note "
        "that if you specify --centerReads, the centering will be performed before the "
        "offset.",
        metavar="INT",
        type=int,
        nargs="+",
        required=False,
    )
    return parser
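
# --- Editor's usage sketch (assumptions, not part of the source) ---
# Two hedged examples combining the options defined above. File names and
# offset values are hypothetical; --Offset, --MNase, --outFileFormat come from
# get_args() above, while --binSize, --labels and --filterRNAstrand are assumed
# to be provided by ParserCommon (as in deepTools):
#   $ scBulkCoverage -b s1.bam s2.bam --labels s1 s2 -g clusters.tsv \
#       -o nucleosomes --MNase --binSize 1
#   $ scBulkCoverage -b s1.bam --labels s1 -g clusters.tsv \
#       -o psites --Offset 12 --filterRNAstrand forward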
def main(args=None):
    args = ParserCommon.validateInputs(
        parseArguments().parse_args(args), process_barcodes=False
    )  # newlabels are discarded, since they are re-created from the groupInfo file
    global debug
    if args.verbose:
        sys.stderr.write("Specified --scaleFactor: {}\n".format(args.scaleFactor))
        debug = 1
    else:
        debug = 0
        logger.setLevel(logging.CRITICAL)
        warnings.filterwarnings("ignore")

    ## read the group info file (TODO: write a validator)
    df_err = """ *Error*: No. of columns in the --groupInfo file not recognized.
    Please provide either a 3-column (sample, barcode, group) or a
    4-column (sample::barcode, umap1, umap2, group) file"""

    ## if the no. of columns is 3, expect "sample", "barcode", "cluster"; if 4, expect sample::bc, umap1, umap2, cluster
    df = pd.read_csv(args.groupInfo, sep="\t", index_col=None, comment="#")
    if len(df.columns) == 3:
        df.columns = ["sample", "barcode", "cluster"]
    elif len(df.columns) == 4:
        df.columns = ["Cell_ID", "umap1", "umap2", "cluster"]
        try:
            df[["sample", "barcode"]] = df.Cell_ID.str.split("::", expand=True)
        except Exception:
            sys.exit(df_err)
    else:
        sys.exit(df_err)
    df.index = df[["sample", "barcode"]].apply(lambda x: "::".join(x), axis=1)
    # barcodes = groupInfo.barcode.unique().tolist()

    ## match the sample labels with groupInfo labels before proceeding
    ## create a new DF with user-provided bam+labels (this would match the counts)

    # check that sample names in df and --labels match
    labels_not_in_df = set(args.labels).difference(set(df["sample"]))
    df_not_in_labels = set(df["sample"]).difference(set(args.labels))
    if bool(df_not_in_labels):
        sys.stderr.write(
            "Some (or all) of the samples indicated in the groupInfo file "
            "are absent from the bam file labels! \n"
            "Mismatched samples are: {} \n".format(df_not_in_labels)
        )
        df = df.loc[df["sample"].isin(set(args.labels)),]
    elif bool(labels_not_in_df):
        sys.stderr.write(
            "Some (or all) of the samples indicated in --labels "
            "are absent from the groupInfo file! \n"
            "Mismatched samples are: {} \n".format(labels_not_in_df)
        )

    # make another df containing the union of all barcodes, in the same order per sample
    barcodes = df["barcode"].unique().tolist()
    sm = list(itertools.chain.from_iterable(itertools.repeat(lab, len(barcodes)) for lab in args.labels))
    bc = barcodes * len(args.labels)
    groupInfo = pd.DataFrame({"sample": sm, "barcode": bc})
    groupInfo.index = groupInfo[["sample", "barcode"]].apply(lambda x: "::".join(x), axis=1)
    groupInfo = pd.merge(
        groupInfo,
        df["cluster"],
        how="left",
        left_index=True,
        right_index=True,
        sort=False,
    )
    groupInfo = groupInfo.reset_index()[["sample", "barcode", "cluster"]]
    groupInfo["cluster"] = groupInfo["cluster"].astype("category")
    # re-construct new labels (sample+bc)
    newlabels = ["::".join([x, y]) for x, y in zip(groupInfo["sample"], groupInfo["barcode"])]

    # Normalization options
    scale_factor = args.scaleFactor
    func_args = {"scaleFactor": args.scaleFactor}
    coverageAsFrequency = False
    if not args.ignoreForNormalization:
        args.ignoreForNormalization = []
    if args.normalizeUsing == "None":
        args.normalizeUsing = None  # For the sake of sanity
    elif args.normalizeUsing == "Frequency":
        coverageAsFrequency = True
        args.normalizeUsing = "Mean"  # the mean of binarized counts gives the frequency

    # This fixes issue #520, where --extendReads wasn't honored if --filterRNAstrand was used
    if args.filterRNAstrand and not args.Offset:
        args.Offset = [1, -1]

    if args.MNase:
        # check that the library is paired-end, using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length

        fraglengths = [
            get_read_and_fragment_length(
                file,
                return_lengths=False,
                blackListFileName=args.blackListFileName,
                numberOfProcessors=args.numberOfProcessors,
                verbose=args.verbose,
            )
            for file in args.bamfiles
        ]
        if any([x[0] is None for x in fraglengths]):
            sys.exit("*Error*: For the --MNase function a paired-end library is required.")

        # Set some default fragment length bounds
        if args.minFragmentLength == 0:
            args.minFragmentLength = 130
        if args.maxFragmentLength == 0:
            args.maxFragmentLength = 200

        wr = CenterFragment(
            args.bamfiles,
            binLength=args.binSize,
            stepSize=args.binSize,
            barcodes=barcodes,
            clusterInfo=groupInfo,
            cellTag=args.cellTag,
            groupTag=args.groupTag,
            groupLabels=newlabels,
            motifFilter=args.motifFilter,
            genome2bit=args.genome2bit,
            GCcontentFilter=args.GCcontentFilter,
            region=args.region,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            extendReads=args.extendReads,
            minMappingQuality=args.minMappingQuality,
            duplicateFilter=args.duplicateFilter,
            center_read=args.centerReads,
            zerosToNans=False,
            samFlag_include=args.samFlagInclude,
            samFlag_exclude=args.samFlagExclude,
            minFragmentLength=args.minFragmentLength,
            maxFragmentLength=args.maxFragmentLength,
            chrsToSkip=args.ignoreForNormalization,
            binarizeCoverage=coverageAsFrequency,
            verbose=args.verbose,
        )

    elif args.Offset:
        if len(args.Offset) > 1:
            if args.Offset[0] == 0:
                sys.exit(
                    "*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment."
                )
            if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
                sys.exit("*Error*: The right-side bound is less than the left-side bound. This is inappropriate.")
        else:
            if args.Offset[0] == 0:
                sys.exit(
                    "*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment."
                )
        wr = OffsetFragment(
            args.bamfiles,
            binLength=args.binSize,
            stepSize=args.binSize,
            barcodes=barcodes,
            clusterInfo=groupInfo,
            cellTag=args.cellTag,
            groupTag=args.groupTag,
            groupLabels=newlabels,
            motifFilter=args.motifFilter,
            genome2bit=args.genome2bit,
            GCcontentFilter=args.GCcontentFilter,
            region=args.region,
            numberOfProcessors=args.numberOfProcessors,
            extendReads=args.extendReads,
            minMappingQuality=args.minMappingQuality,
            duplicateFilter=args.duplicateFilter,
            center_read=args.centerReads,
            zerosToNans=False,
            samFlag_include=args.samFlagInclude,
            samFlag_exclude=args.samFlagExclude,
            minFragmentLength=args.minFragmentLength,
            maxFragmentLength=args.maxFragmentLength,
            chrsToSkip=args.ignoreForNormalization,
            binarizeCoverage=coverageAsFrequency,
            verbose=args.verbose,
        )
        wr.filter_strand = args.filterRNAstrand
        wr.Offset = args.Offset
    else:
        wr = WriteBedGraph.WriteBedGraph(
            args.bamfiles,
            binLength=args.binSize,
            stepSize=args.binSize,
            barcodes=barcodes,
            clusterInfo=groupInfo,
            cellTag=args.cellTag,
            groupTag=args.groupTag,
            groupLabels=newlabels,
            motifFilter=args.motifFilter,
            genome2bit=args.genome2bit,
            GCcontentFilter=args.GCcontentFilter,
            region=args.region,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            extendReads=args.extendReads,
            minMappingQuality=args.minMappingQuality,
            duplicateFilter=args.duplicateFilter,
            center_read=args.centerReads,
            zerosToNans=False,
            samFlag_include=args.samFlagInclude,
            samFlag_exclude=args.samFlagExclude,
            minFragmentLength=args.minFragmentLength,
            maxFragmentLength=args.maxFragmentLength,
            chrsToSkip=args.ignoreForNormalization,
            binarizeCoverage=coverageAsFrequency,
            verbose=args.verbose,
        )

    wr.run(
        WriteBedGraph.scaleCoverage,
        func_args,
        args.outFilePrefix,
        blackListFileName=args.blackListFileName,
        normUsing=args.normalizeUsing,
        format=args.outFileFormat,
        smoothLength=None,
    )
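
# --- Editor's note (illustrative sketch, not part of the source) ---
# The --groupInfo tsv parsed by main() has either 3 columns:
#       file1<TAB>ACGTACGTACGTACGT<TAB>cluster0
# or 4 columns, where the first is sample::barcode and the middle two are UMAP coordinates:
#       file1::ACGTACGTACGTACGT<TAB>1.23<TAB>-0.45<TAB>cluster0
# In both cases main() indexes the DataFrame by "sample::barcode", which is also
# how the per-group labels (newlabels) are rebuilt. Sample names, barcodes and
# coordinates above are hypothetical.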
class OffsetFragment(WriteBedGraph.WriteBedGraph):
    """
    Class to redefine the get_fragment_from_read for the --Offset case
    """
    def filterStrand(self, read, rv):
        """
        A generic read filtering function that gets used by everything
        in this class. rv is returned if the strand is correct,
        otherwise [(None, None)]
        """
        # Filter by RNA strand, if desired
        if read.is_paired:
            if self.filter_strand == "forward":
                if read.flag & 144 == 128 or read.flag & 96 == 64:
                    return rv
            elif self.filter_strand == "reverse":
                if read.flag & 144 == 144 or read.flag & 96 == 96:
                    return rv
            else:
                return rv
        else:
            if self.filter_strand == "forward":
                if read.flag & 16 == 16:
                    return rv
            elif self.filter_strand == "reverse":
                if read.flag & 16 == 0:
                    return rv
            else:
                return rv

        return [(None, None)]
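
    # --- Editor's note (worked flag values, not part of the source) ---
    # The masks above decode SAM flag bits: 144 == 128|16 (read2 + read-reverse)
    # and 96 == 64|32 (read1 + mate-reverse). For a typical dUTP-style pair:
    #   flags 83/163  ->  83 & 96 == 64 and 163 & 144 == 128  ->  "forward"
    #   flags 99/147  ->  99 & 96 == 96 and 147 & 144 == 144  ->  "reverse"
    # so both mates of a fragment always land in the same strand class.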
    def get_fragment_from_read_list(self, read, offset):
        """
        Return the range of exons from the 0th through 1st bases, inclusive. Positions are 1-based.
        """
        rv = [(None, None)]
        blocks = read.get_blocks()
        blockLen = sum([x[1] - x[0] for x in blocks])

        if self.defaultFragmentLength != "read length":
            if self.is_proper_pair(read, self.maxPairedFragmentLength):
                if read.is_reverse:
                    foo = (read.next_reference_start, read.reference_start)
                    if foo[0] < foo[1]:
                        blocks.insert(0, foo)
                else:
                    foo = (
                        read.reference_end,
                        read.reference_end + abs(read.template_length) - read.infer_query_length(),
                    )
                    if foo[0] < foo[1]:
                        blocks.append(foo)
            # Extend using the default fragment length
            else:
                if read.is_reverse:
                    foo = (
                        read.reference_start - self.defaultFragmentLength + read.infer_query_length(),
                        read.reference_start,
                    )
                    if foo[0] < 0:
                        foo = (0, foo[1])
                    if foo[0] < foo[1]:
                        blocks.insert(0, foo)
                else:
                    foo = (
                        read.reference_end,
                        read.reference_end + self.defaultFragmentLength - read.infer_query_length(),
                    )
                    if foo[0] < foo[1]:
                        blocks.append(foo)

        stretch = []
        # For the sake of simplicity, convert [(10, 20), (30, 40)] to [10, 11, 12, 13, ..., 40]
        # Then subset accordingly
        for block in blocks:
            stretch.extend(range(block[0], block[1]))
        if read.is_reverse:
            stretch = stretch[::-1]

        # Handle --centerReads
        if self.center_read:
            _ = (len(stretch) - blockLen) // 2
            stretch = stretch[_ : _ + blockLen]

        # Subset by --Offset
        try:
            foo = stretch[offset[0] : offset[1]]
        except Exception:
            return rv

        if len(foo) == 0:
            return rv
        if read.is_reverse:
            foo = foo[::-1]

        # Convert the stretch back to a list of tuples
        foo = np.array(foo)
        d = foo[1:] - foo[:-1]
        idx = np.argwhere(d > 1).flatten().tolist()  # This now holds the interval bounds as a list
        idx.append(-1)
        last = 0
        rv = []
        for i in idx:
            rv.append((foo[last].astype("int"), foo[i].astype("int") + 1))
            last = i + 1

        # Handle strand filtering, if needed
        return self.filterStrand(read, rv)
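
    # --- Editor's worked example (not part of the source) ---
    # For blocks [(10, 20), (30, 40)], stretch becomes [10..19, 30..39].
    # With offset == [4, None] (i.e. --Offset 5 -1 after the 1-based tweak in
    # get_fragment_from_read below), the slice keeps [14..19, 30..39], and the
    # diff/argwhere re-packing yields [(14, 20), (30, 40)]: 4 bases trimmed
    # from the 5' end, exactly as the --Offset help text describes.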
    def get_fragment_from_read(self, read):
        """
        This is mostly a wrapper for self.get_fragment_from_read_list(),
        which needs a list and for the offsets to be tweaked by 1.
        """
        offset = [x for x in self.Offset]
        if len(offset) > 1:
            if offset[0] > 0:
                offset[0] -= 1
            if offset[1] < 0:
                offset[1] += 1
        else:
            if offset[0] > 0:
                offset[0] -= 1
                offset = [offset[0], offset[0] + 1]
            else:
                if offset[0] < -1:
                    offset = [offset[0], offset[0] + 1]
                else:
                    offset = [offset[0], None]
        if offset[1] == 0:
            # -1 gets switched to 0, which screws things up
            offset = (offset[0], None)
        return self.get_fragment_from_read_list(read, offset)
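
    # --- Editor's note (worked offset conversions, not part of the source) ---
    # The 1-based user offsets become Python slice bounds here:
    #   --Offset 1     -> [0, 1]      (first base of the alignment)
    #   --Offset -1    -> [-1, None]  (last base)
    #   --Offset 5 -1  -> [4, None]   (5th through last base)
    #   --Offset 12 12 -> [11, 12]    (a single position, e.g. a P-site)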
class CenterFragment(WriteBedGraph.WriteBedGraph):
    """
    Class to redefine the get_fragment_from_read for the --MNase case

    The coverage of the fragment is defined as the 2 or 3 basepairs at the
    center of the fragment length.
    """
    def get_fragment_from_read(self, read):
        """
        Takes a proper-pair fragment of high quality, limited to a certain
        length, and outputs the center.
        """
        fragment_start = fragment_end = None

        # only paired forward reads are considered
        # Fragments have already been filtered according to length
        if read.is_proper_pair and not read.is_reverse and 1 < abs(read.tlen):
            # integer division keeps the positions as ints in Python 3
            # if the distance between mates is even, return the two bases at the center
            if read.tlen % 2 == 0:
                fragment_start = read.pos + read.tlen // 2 - 1
                fragment_end = fragment_start + 2
            # if the distance is odd, return the three bases at the center
            else:
                fragment_start = read.pos + read.tlen // 2 - 1
                fragment_end = fragment_start + 3

        return [(fragment_start, fragment_end)]
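
    # --- Editor's worked example (hypothetical coordinates, not part of the source) ---
    # For a forward read with pos = 1000 and tlen = 150 (even), the method
    # returns (1074, 1076): the two center bases. With tlen = 151 (odd) it
    # returns (1074, 1077): the three center bases.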