Source code for sincei.RegionQuery

from itertools import compress


## get overlap of GTF object (deeptoolsintervals) with anndata object (from sincei)
## output: dict (region->gene mapping)

[docs]
def get_gtf_adata_olaps(adata, gtf):
    r"""Get overlaps between AnnData features and GTF regions.

    Parameters
    ----------
    adata : AnnData
        AnnData with regions to overlap.

    gtf : GTF
        GTF object with regions to overlap.

    Returns
    -------
    dict
        Dictionary with overlaps for each feature in adata.

    Examples
    --------
    >>> test = Tester()
    >>> gtf = GTF(test.gtfFile)
    >>> adata = sc.read_10x_mtx(test.input_matrix_dir, var_names='gene_symbols', cache=True)
    >>> olaps=get_gtf_adata_olaps(adata, gtf)
    >>> olaps['Gm37381']
    [('ENSMUSG00000064372', '+'), ('ENSMUSG00000064372', '-')]
    """
    var = adata.var
    olaps = dict.fromkeys(var.index)
    for i, key in enumerate(var.index):
        try:
            chrom, start, end = (
                var["chrom"][i],
                int(var["start"][i]),
                int(var["end"][i]),
            )
            ol = gtf.findOverlaps(chrom, start, end, includeStrand=True)
            if ol:
                genelist = [(x[2], x[5]) for x in ol]
                olaps[key] = genelist
        except ValueError:
            olaps[key] = None
            continue
    return olaps



## Search for bins by gene name, return either the first bin (promoter) or all overlapping bins

[docs]
def get_bins_by_gene(dict, gene, firstBin=False):
    r"""
    Returns the bins for a given gene.

    Parameters
    ----------
    dict : dict
        Dictionary of bins and genes.
    gene : str
        Gene name.
    firstBin : bool
        If true, return only the first bin of the gene.

    Returns
    -------
    list
        List of bins.

    Examples
    --------
    >>> dict = {'chr1_1': [('gene1', '+'), ('gene2', '-')], 'chr1_2': [('gene1', '+')]}
    >>> get_bins_by_gene(dict, 'gene1')
    ['chr1_1', 'chr1_2']
    >>> get_bins_by_gene(dict, 'gene1', firstBin=True)
    'chr1_1'
    """
    klist = []
    for k, v in dict.items():
        if v:
            vlist = [x[0] for x in v]  # overlapping genes
            slist = [x[1] for x in v]  # overlapping gene strands
            match = [x.lower() == gene.lower() for x in vlist]
            if any(match):
                klist.append(k)
                # get strand of the gene
                strand = list(compress(slist, match))[0]
            else:
                strand = None

    # if firstBin is asked, sort the bins by start pos and
    # return only the firstBin by strand
    if klist and firstBin:
        spos = [x.split("_")[1] for x in klist]
        if strand == "+":
            first_bin = spos.index(min(spos))
        else:
            first_bin = spos.index(max(spos))
        return klist[first_bin]
    else:
        return klist