Source code for sincei.RegionQuery

from itertools import compress


## get overlap of GTF object (deeptoolsintervals) with anndata object (from sincei)
## output: dict (region->gene mapping)
[docs]def get_gtf_adata_olaps(adata, gtf): r"""Get overlaps between adata and gtf Parameters ---------- adata : AnnData Annotated data matrix. gtf : GTF GTF object Returns ------- dict Dictionary with overlaps for each gene in adata. Examples -------- >>> test = Tester() >>> gtf = GTF(test.gtfFile) >>> adata = sc.read_10x_mtx(test.input_matrix_dir, var_names='gene_symbols', cache=True) >>> olaps=get_gtf_adata_olaps(adata, gtf) >>> olaps['Gm37381'] [('ENSMUSG00000064372', '+'), ('ENSMUSG00000064372', '-')] """ var = adata.var olaps = dict.fromkeys(var.index) for i, key in enumerate(var.index): try: chrom, start, end = ( var["chrom"][i], int(var["start"][i]), int(var["end"][i]), ) ol = gtf.findOverlaps(chrom, start, end, includeStrand=True) if ol: genelist = [(x[2], x[5]) for x in ol] olaps[key] = genelist except ValueError: olaps[key] = None continue return olaps
## Search for bins by gene name, return either the first bin (promoter) or all overlapping bins
[docs]def get_bins_by_gene(dict, gene, firstBin=False): r""" Returns the bins for a given gene. Parameters ---------- dict : dict Dictionary of bins and genes. gene : str Gene name. firstBin : bool If true, return only the first bin of the gene. Returns ------- list List of bins. Examples -------- >>> dict = {'chr1_1': [('gene1', '+'), ('gene2', '-')], 'chr1_2': [('gene1', '+')]} >>> get_bins_by_gene(dict, 'gene1') ['chr1_1', 'chr1_2'] >>> get_bins_by_gene(dict, 'gene1', firstBin=True) 'chr1_1' """ klist = [] for k, v in dict.items(): if v: vlist = [x[0] for x in v] # overlapping genes slist = [x[1] for x in v] # overlapping gene strands match = [x.lower() == gene.lower() for x in vlist] if any(match): klist.append(k) # get strand of the gene strand = list(compress(slist, match))[0] else: strand = None # if firstBin is asked, sort the bins by start pos and # return only the firstBin by strand if klist and firstBin: spos = [x.split("_")[1] for x in klist] if strand == "+": first_bin = spos.index(min(spos)) else: first_bin = spos.index(max(spos)) return klist[first_bin] else: return klist