From 67a8931b099a40764fe5451af1ce67e4c35f63be Mon Sep 17 00:00:00 2001
From: Nezar Abdennur <nabdennur@gmail.com>
Date: Mon, 12 Feb 2024 02:21:43 -0500
Subject: [PATCH] Add exact summaries and zoom table fetcher

---
 bbi/_bbi.py  |  38 +++++---
 bbi/cbbi.pxd |   6 +-
 bbi/cbbi.pyx | 259 +++++++++++++++++++++++++++++++++++++++------------
 3 files changed, 230 insertions(+), 73 deletions(-)

diff --git a/bbi/_bbi.py b/bbi/_bbi.py
index 0e4eec4..c6b7b4f 100644
--- a/bbi/_bbi.py
+++ b/bbi/_bbi.py
@@ -25,7 +25,7 @@ def chromsizes(inFile):
 
     Returns
     -------
-    OrderedDict (str -> int)
+    dict[str, int]
 
     """
     with cbbi.open(inFile) as f:
@@ -44,7 +44,7 @@ def zooms(inFile):
 
     Returns
     -------
-    list of int
+    list[int]
 
     """
     with cbbi.open(inFile) as f:
@@ -70,26 +70,38 @@ def info(inFile):
 
 
 @documented_by(cbbi.BBIFile.fetch)
-def fetch(inFile, chrom, start, end, bins=-1, missing=0.0, oob=np.nan, summary="mean"):
+def fetch(
+    inFile,
+    chrom,
+    start,
+    end,
+    bins=-1,
+    missing=0.0,
+    oob=np.nan,
+    summary="mean",
+    exact=False,
+):
     with cbbi.open(inFile) as f:
-        return f.fetch(chrom, start, end, bins, missing, oob, summary)
+        return f.fetch(chrom, start, end, bins, missing, oob, summary, exact)
 
 
 @documented_by(cbbi.BBIFile.stackup)
 def stackup(
-    inFile, chroms, starts, ends, bins=-1, missing=0.0, oob=np.nan, summary="mean"
+    inFile,
+    chroms,
+    starts,
+    ends,
+    bins=-1,
+    missing=0.0,
+    oob=np.nan,
+    summary="mean",
+    exact=False,
 ):
     with cbbi.open(inFile) as f:
-        return f.stackup(chroms, starts, ends, bins, missing, oob, summary)
+        return f.stackup(chroms, starts, ends, bins, missing, oob, summary, exact)
 
 
 @documented_by(cbbi.BBIFile.fetch_intervals)
-def fetch_intervals(
-    inFile,
-    chrom,
-    start,
-    end,
-    iterator=True
-):
+def fetch_intervals(inFile, chrom, start, end, iterator=True):
     with cbbi.open(inFile) as f:
         return f.fetch_intervals(chrom, start, end, iterator)
diff --git a/bbi/cbbi.pxd b/bbi/cbbi.pxd
index 2425eea..fda70f8 100644
--- a/bbi/cbbi.pxd
+++ b/bbi/cbbi.pxd
@@ -256,7 +256,7 @@ cdef extern from "bigWig.h":
         int start,
         int end,
         bbiSummaryType summaryType,
-        double defaultVal);
+        double defaultVal)
     boolean bigWigSummaryArray(
         bbiFile *bwf,
         char *chrom,
@@ -312,7 +312,7 @@ cdef extern from "bigBed.h":
         bits32 start,
         bits32 end,
         int summarySize,
-        bbiSummaryElement *summary);
+        bbiSummaryElement *summary)
     char *bbiCachedChromLookup(
         bbiFile *bbi,
         int chromId,
@@ -320,7 +320,7 @@ cdef extern from "bigBed.h":
         char *chromBuf,
         int chromBufSize)
     char *bigBedAutoSqlText(bbiFile *bbi)
-    asObject *bigBedAs(bbiFile *bbi);
+    asObject *bigBedAs(bbiFile *bbi)
     asObject *bigBedAsOrDefault(bbiFile *bbi)
     asObject *bigBedFileAsObjOrDefault(char *fileName)
 
diff --git a/bbi/cbbi.pyx b/bbi/cbbi.pyx
index c2573d3..75307f8 100644
--- a/bbi/cbbi.pyx
+++ b/bbi/cbbi.pyx
@@ -9,7 +9,7 @@ import numpy as np
 
 from libc.math cimport sqrt
 from .cbbi cimport (
-    asObject, bbiSumMean, bbiSumMax, bbiSumMin, bbiSumCoverage, 
+    asObject, bbiSumMean, bbiSumMax, bbiSumMin, bbiSumCoverage,
     bbiSumStandardDeviation, bbiSumSum
 )
 
@@ -117,7 +117,6 @@ def is_bbi(inFile):
     Returns
     -------
     bool
-
     """
     return _read_sig(inFile) > 0
 
@@ -134,7 +133,6 @@ def is_bigwig(inFile):
     Returns
     -------
     bool
-
     """
     return _read_sig(inFile) == bigWigSig
 
@@ -151,7 +149,6 @@ def is_bigbed(inFile):
     Returns
     -------
     bool
-
     """
     return _read_sig(inFile) == bigBedSig
 
@@ -168,7 +165,6 @@ def open(str inFile):
     Returns
     -------
     BBIFile
-
     """
     return BBIFile(inFile)
 
@@ -179,7 +175,6 @@ cdef class BBIFile:
 
     The resource may be a bigWig or a bigBed file.
     BigBed AutoSql schemas are supported.
-
     """
     cdef bbiFile *bbi
     cdef bits32 sig
@@ -395,15 +390,16 @@ cdef class BBIFile:
         int bins=-1,
         double missing=0.0,
         double oob=np.nan,
-        str summary='mean'
+        str summary='mean',
+        bint exact=False,
     ):
         """
-        Read the signal data in a bbi file overlapping a genomic query interval
+        Read the signal data in a BBI file overlapping a genomic query interval
         into a numpy array.
 
-        If a number of bins is requested, this will interpolate the file's 
-        stored summary data from the closest available zoom level. Otherwise, 
-        the data is returned at base pair resolution (default).
+        If a number of bins is requested, this will return a summary statistic
+        for each bin. Otherwise, the data is returne unsummarized at base pair
+        resolution (default).
 
         Parameters
         ----------
@@ -414,19 +410,23 @@ cdef class BBIFile:
             track is not truncated but treated as out of bounds.
         end : int
             End coordinate. If end is less than zero, the end is set to the
-            chromosome size. If end is greater than the chromosome size, the 
+            chromosome size. If end is greater than the chromosome size, the
             end of the track is not truncated but treated as out of bounds.
         bins : int, optional
             Number of bins to divide the query interval into for coarsegraining.
             Default (-1) means no summarization (i.e., 1 bp bins).
-        missing : float, optional
+        missing : float, optional [default: 0.0]
             Fill-in value for unreported data in valid regions. Default is 0.
-        oob : float, optional
+        oob : float, optional [default: NaN]
             Fill-in value for out-of-bounds regions. Default is NaN.
-        summary : str, optional
-            Summary statistic to use if summarizing. Options are 'mean', 'min',
-            'max', 'cov' (coverage), and 'std' (standard deviation). Default is
-            'mean'.
+        summary : str, optional [default: 'mean']
+            Summary statistic to use if ``bins`` is specified. Options are
+            'mean', 'sum', 'min', 'max', 'cov' (coverage), and 'std'
+            (standard deviation). Default is 'mean'.
+        exact : bool, optional [default: False]
+            If True and ``bins`` is specified, return exact summary statistic
+            values instead of interpolating from the optimal zoom level.
+            Default is False.
 
         Returns
         -------
@@ -434,18 +434,25 @@ cdef class BBIFile:
 
         Notes
         -----
-        A BigWig file encodes a step function, and the value at a base is given 
-        by the quantitative "value" field of the unique interval that contains 
-        that base.
-
-        A BigBed file encodes a collection of (possibly overlapping) intervals,
-        and the "signal value" at a base is given by the coverage (i.e., pileup) 
-        of the intervals that overlap that base.
+        Values returned for a BigWig file are derived from its quantitative
+        "value" field. A BigWig file encodes a step function, and the value at
+        a base is given by the signal value of the unique interval that
+        contains that base.
+
+        Values returned for a BigBed file are derived from the coverage
+        (i.e., pileup) of the intervals. A BigBed file encodes a collection
+        of (possibly overlapping) intervals which may or may not be associated
+        with quantitative scores. The "value" at given base used here
+        summarizes the number of intervals overlapping that base, not a score.
+
+        If a number of bins is requested and ``exact`` is False, the summary
+        data is interpolated from the closest available zoom level. If you
+        need accurate summary data and are okay with small trade-off in speed,
+        set ``exact`` to True.
 
         See Also
         --------
         stackup : Stack signal tracks from many query intervals into a matrix
-
         """
         if self.bbi == NULL:
             raise OSError("File closed")
@@ -495,7 +502,7 @@ cdef class BBIFile:
                         set(BBI_SUMMARY_TYPES.keys())))
             array_query_summarized(
                 out, bins, self.bbi, fetcher,
-                chromName, start, end, chromSize, oob, summary_type)
+                chromName, start, end, chromSize, oob, summary_type, exact)
         else:
             array_query_full(
                 out, bins, self.bbi, fetcher,
@@ -511,7 +518,8 @@ cdef class BBIFile:
         int bins=-1,
         double missing=0.0,
         double oob=np.nan,
-        str summary='mean'
+        str summary='mean',
+        bint exact=False,
     ):
         """
         Vertically stack signal tracks from many query intervals into a matrix.
@@ -525,13 +533,13 @@ cdef class BBIFile:
             track is not truncated but treated as out of bounds.
         ends : array-like of int
             End coordinates. If end is less than zero, the end is set to the
-            chromosome size. If end is greater than the chromosome size, the 
+            chromosome size. If end is greater than the chromosome size, the
             end of the track is not truncated but treated as out of bounds.
         bins : int
             Number of bins to summarize the data. Default (-1) means no
             aggregation: each bin is 1 base pair and the query intervals must
-            have the same length. If bins > 0, the input query intervals can 
-            have different lengths, and the data will be rescaled to the same 
+            have the same length. If bins > 0, the input query intervals can
+            have different lengths, and the data will be rescaled to the same
             number of bins.
         missing : float
             Fill-in value for unreported data in valid regions. Default is 0.
@@ -541,6 +549,10 @@ cdef class BBIFile:
             Summary statistic to use if summarizing. Options are 'mean', 'min',
             'max', 'cov' (coverage), and 'std' (standard deviation). Default is
             'mean'.
+        exact : bool, optional [default: False]
+            If True and ``bins`` is specified, return exact summary statistic
+            values instead of interpolating from the optimal zoom level.
+            Default is False.
 
         Returns
         -------
@@ -548,14 +560,20 @@ cdef class BBIFile:
 
         Notes
         -----
-        Intervals must have the same length, unless ``bins`` is specified, in 
-        which case the data from each interval will be interpolated to the same 
-        number of bins (e.g., gene bodies).
+        If not summarizing, the input intervals must have the same length.
+
+        If a number of bins is requested, intervals may have variable lengths
+        and the data from each interval will be rescaled to the same number
+        of bins (e.g., aligning gene bodies).
+
+        If a number of bins is requested and ``exact`` is False, the summary
+        data is interpolated from the closest available zoom level. If you
+        need accurate summary data and are okay with small trade-off in
+        speed, set ``exact`` to True.
 
         See Also
         --------
         fetch : Fetch the signal track of a single interval
-
         """
         if self.bbi == NULL:
             raise OSError("File closed")
@@ -620,13 +638,14 @@ cdef class BBIFile:
                         )
                 array_query_summarized(
                     out[i, :], bins, self.bbi, fetcher,
-                    chromName, start, end, chromSize, oob, summary_type
+                    chromName, start, end, chromSize, oob, summary_type,
+                    exact
                 )
         return out
 
     def fetch_intervals(self, str chrom, int start, int end, bint iterator=False):
         """
-        Return an iterator or data frame of feature intervals overlapping a 
+        Return an iterator or data frame of feature intervals overlapping a
         specified genomic query interval.
 
         Parameters
@@ -639,15 +658,14 @@ cdef class BBIFile:
             End coordinate. If end is less than zero, the end is set to the
             chromosome size.
         iterator : bool, optional
-            If True, return an iterator that provides records as tuples. For 
-            bigBeds, extra BED fields will be returned as unparsed strings. If 
-            False, return a DataFrame with all columns parsed according to the 
+            If True, return an iterator that provides records as tuples. For
+            bigBeds, extra BED fields will be returned as unparsed strings. If
+            False, return a DataFrame with all columns parsed according to the
             file's autosql schema.
 
         Returns
         -------
         pandas.DataFrame or BigWigIntervalIterator or BigBedIntervalIterator
-
         """
         if self.bbi == NULL:
             raise OSError("File closed")
@@ -700,9 +718,53 @@ cdef class BBIFile:
 
         return df
 
+    def fetch_summaries(self, str chrom, int start, int end, int zoom=0):
+        """
+        Return a DataFrame of complete summary statistic bins at a specific
+        zoom level overlapping a genomic query interval.
+        """
+        if self.bbi == NULL:
+            raise OSError("File closed")
+
+        cdef bytes chromName = chrom.encode('ascii')
+
+        if zoom < 0 or zoom >= len(self.zooms):
+            raise ValueError(f"Zoom level {zoom} does not exist")
+
+        cdef bbiZoomLevel *zoomObj = NULL
+        cdef bbiZoomLevel *level = self.bbi.levelList
+        cdef int i = 0
+        while level != NULL:
+            if zoom == i:
+                zoomObj = level
+                break
+            level = level.next
+            i += 1
+
+        return get_zoom_table(self.bbi, zoomObj, chromName, start, end)
+
+    def best_zoom(self, str chrom, int start, int end, int bins):
+        cdef int baseSize = end - start
+        cdef int stepSize = baseSize // bins
+        cdef int zoomLevel = stepSize // 2
+        if zoomLevel < 0:
+            zoomLevel = 0
+
+        cdef bbiZoomLevel *zoomObj = bbiBestZoom(self.bbi.levelList, zoomLevel)
+        if zoomObj == NULL:
+            return None
+
+        cdef int binsize = zoomObj.reductionLevel
+        cdef int i = 0
+        for i, resolution in enumerate(self.zooms):
+            if resolution == binsize:
+                break
+        return i
+
+
 
 cdef class BigWigIntervalIterator:
-    
+
     cdef str chrom
     cdef int valid_start
     cdef int valid_end
@@ -733,7 +795,7 @@ cdef class BigWigIntervalIterator:
             self.chrom, self.interval.start, self.interval.end, self.interval.val
         )
         self.interval = self.interval.next
-        
+
         return out
 
     def __dealloc__(self):
@@ -744,7 +806,7 @@ cdef class BigWigIntervalIterator:
 
 
 cdef class BigBedIntervalIterator:
-    
+
     cdef str chrom
     cdef int valid_start
     cdef int valid_end
@@ -757,7 +819,7 @@ cdef class BigBedIntervalIterator:
         self.chrom = chromName.decode('ascii')
         self.valid_start = validStart
         self.valid_end = validEnd
-        
+
         # interval list is allocated out of lm
         self.lm = lmInit(0)
         self.interval = bigBedIntervalQuery(
@@ -781,7 +843,7 @@ cdef class BigBedIntervalIterator:
             restTup = ()
 
         out = (self.chrom, self.interval.start, self.interval.end, *restTup)
-        self.interval = self.interval.next        
+        self.interval = self.interval.next
         return out
 
     def __dealloc__(self):
@@ -791,6 +853,84 @@ cdef class BigBedIntervalIterator:
             lmCleanup(&self.lm)
 
 
+cdef get_zoom_table(
+    bbiFile *bbi,
+    bbiZoomLevel *zoomObj,
+    bytes chromName,
+    int start,
+    int end,
+):
+    """
+    Get the summary elements overlapping the query interval for the given zoom level
+    """
+    cdef int chromId = bbiChromId(bbi, chromName)
+    cdef int chromSize = bbiChromSize(bbi, chromName)
+    if chromSize == 0:
+        raise KeyError(f"Chromosome not found: {chromName}")
+
+    # Clip the query range
+    cdef int validStart = start
+    cdef int validEnd = end
+    if start < 0:
+        validStart = 0
+    if end > chromSize:
+        validEnd = chromSize
+
+    # Find appropriate zoom-level summary data and populate a list of summaries
+    # summList is allocated and freed
+    cdef bbiSummary *summList = bbiSummariesInRegion(
+        zoomObj, bbi, chromId, validStart, validEnd
+    )
+    cdef int summ_start
+    cdef int summ_end
+    cdef int summ_validCount
+    cdef float summ_min
+    cdef float summ_max
+    cdef float summ_sum
+    cdef float summ_sumSquares
+
+    cdef list summary_bins = []
+    cdef bbiSummary *summ = summList
+    while summ != NULL:
+        summ_start = summ.start
+        summ_end = summ.end
+        summ_validCount = summ.validCount
+        summ_min = summ.minVal
+        summ_max = summ.maxVal
+        summ_sum = summ.sumData
+        summ_sumSquares = summ.sumSquares
+        summary_bins.append(
+            (
+                chromName.decode('ascii'),
+                summ_start,
+                summ_end,
+                summ_validCount,
+                summ_min,
+                summ_max,
+                summ_sum,
+                summ_sumSquares,
+            )
+        )
+        summ = summ.next
+    slFreeList(&summList)
+
+    import pandas as pd
+
+    return pd.DataFrame(
+        summary_bins,
+        columns=[
+            "chrom",
+            "start",
+            "end",
+            "validCount",
+            "min",
+            "max",
+            "sum",
+            "sumSquares",
+        ],
+    )
+
+
 cdef inline void array_query_full(
     np.ndarray[np.double_t, ndim=1] out,
     int nbins,
@@ -855,7 +995,8 @@ cdef inline void array_query_summarized(
     int end,
     int chromSize,
     double oob,
-    bbiSummaryType summaryType
+    bbiSummaryType summaryType,
+    boolean exact
 ):
 
     # Clip the query range
@@ -866,16 +1007,20 @@ cdef inline void array_query_summarized(
     if end > chromSize:
         validEnd = chromSize
 
-    # Get the closest zoom level less than what we're looking for
-    cdef int baseSize = end - start
-    cdef int stepSize = baseSize // nbins
-    cdef int zoomLevel = stepSize // 2
-    if zoomLevel < 0:
-        zoomLevel = 0
-    cdef bbiZoomLevel *zoomObj = bbiBestZoom(bbi.levelList, zoomLevel)
-
-    # Create and populate summary elements
-    # elements is allocated
+    # If not exact, get the closest zoom level less than what we're looking for
+    cdef int baseSize
+    cdef int stepSize
+    cdef int zoomLevel
+    cdef bbiZoomLevel *zoomObj = NULL
+    baseSize = end - start
+    stepSize = baseSize // nbins
+    if not exact:
+        zoomLevel = stepSize // 2
+        if zoomLevel < 0:
+            zoomLevel = 0
+        zoomObj = bbiBestZoom(bbi.levelList, zoomLevel)
+
+    # Allocate the elements array and create and populate summary elements
     cdef boolean result = False
     cdef bbiSummaryElement *elements = NULL
     AllocArray(elements, nbins)