From de56fe029f8d73000f0b58ec9d8508645d2b5dcd Mon Sep 17 00:00:00 2001
From: Josh Dillon <jsdillon@berkeley.edu>
Date: Tue, 24 Dec 2024 15:06:09 -0800
Subject: [PATCH 1/4] Rewrite abscal.match_times() to be much faster when
 dealing with very long lists of modelfiles

---
 hera_cal/abscal.py | 79 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 57 insertions(+), 22 deletions(-)

diff --git a/hera_cal/abscal.py b/hera_cal/abscal.py
index 085b1d577..5b090b62d 100644
--- a/hera_cal/abscal.py
+++ b/hera_cal/abscal.py
@@ -2538,28 +2538,63 @@ def match_times(datafile, modelfiles, filetype='uvh5', atol=1e-5):
     Returns:
         matched_modelfiles : type=list, list of modelfiles that overlap w/ datafile in LST
     """
-    # get lst arrays
-    data_dlst, data_dtime, data_lsts, data_times = io.get_file_times(datafile, filetype=filetype)
-    model_dlsts, model_dtimes, model_lsts, model_times = io.get_file_times(modelfiles, filetype=filetype)
-
-    # shift model files relative to first file & first index if needed
-    for ml in model_lsts:
-        ml[ml < model_lsts[0][0]] += 2 * np.pi
-        # also ensure that ml is increasing
-        ml[ml < ml[0]] += 2 * np.pi
-    # get model start and stop, buffering by dlst / 2
-    model_starts = np.asarray([ml[0] - md / 2.0 for ml, md in zip(model_lsts, model_dlsts)])
-    model_ends = np.asarray([ml[-1] + md / 2.0 for ml, md in zip(model_lsts, model_dlsts)])
-
-    # shift data relative to model if needed
-    data_lsts[data_lsts < model_starts[0]] += 2 * np.pi
-    # make sure monotonically increasing.
-    data_lsts = np.unwrap(data_lsts)
-    # select model files
-    match = np.asarray(modelfiles)[(model_starts < data_lsts[-1] + atol)
-                                   & (model_ends > data_lsts[0] - atol)]
-
-    return match
+    # get first modelfile time arrays
+    m0_dlst, m0_dtime, m0_lsts, m0_times = io.get_file_times(modelfiles[0], filetype=filetype)
+
+    # get data time arrays
+    data_dlst, _, data_lsts, _ = io.get_file_times(datafile, filetype=filetype)
+
+    def unwrap(lsts, branch_cut):
+        lsts[lsts < branch_cut] += 2 * np.pi
+        lsts[lsts < lsts[0]] += 2 * np.pi  # also ensure that it's increasing internally
+    unwrap(data_lsts, m0_lsts[0] - m0_dlst / 2)  # shift data relative to model
+
+    def lst_overlap(m_lsts, m_dlst):
+        if (m_lsts[0] - m_dlst / 2 < data_lsts[-1] + atol) & (m_lsts[-1] + m_dlst / 2 > data_lsts[0] - atol):
+            return 'during'
+        elif (m_lsts[0] - m_dlst / 2 < data_lsts[-1] + atol) & (m_lsts[-1] + m_dlst / 2 < data_lsts[0] - atol):
+            return 'before'
+        else:
+            return 'after'
+
+    # binary search to find a matching file
+    idx = len(modelfiles) // 2
+    step_size = max([len(modelfiles) // 2, 1])
+    indicies_tried = set([])
+    while True:
+        # get lsts for model file at idx
+        if (idx >= len(modelfiles)) or (idx < 0) or (idx in indicies_tried):
+            return []  # no match found
+        m_dlst, _, m_lsts, _ = io.get_file_times(modelfiles[idx], filetype=filetype)
+        indicies_tried.add(idx)
+        unwrap(m_lsts, m0_lsts[0])
+
+        # figure out whether this model file overlaps with the data
+        overlap = lst_overlap(m_lsts, m_dlst)
+        if overlap == 'during':
+            break
+        else:
+            step_size = max([abs(step_size) // 2, 1]) * (1 if overlap == 'before' else -1)
+        idx += step_size
+
+    # now loop over neighboring files, going forward and backwards simultaneously, looking for additional overlaps
+    match = set([modelfiles[idx]])
+    for i in range(1, len(modelfiles)):
+        m1_dlst, _, m1_lsts, _ = io.get_file_times(modelfiles[(idx - i) % len(modelfiles)], filetype=filetype)
+        unwrap(m1_lsts, m0_lsts[0])
+        m2_dlst, _, m2_lsts, _ = io.get_file_times(modelfiles[(idx + i) % len(modelfiles)], filetype=filetype)
+        unwrap(m2_lsts, m0_lsts[0])
+        overlap1 = lst_overlap(m1_lsts, m1_dlst)
+        overlap2 = lst_overlap(m2_lsts, m2_dlst)
+        if overlap1 == 'during':
+            match.add(modelfiles[(idx - i) % len(modelfiles)])
+        if overlap2 == 'during':
+            match.add(modelfiles[(idx + i) % len(modelfiles)])
+        if (overlap1 != 'during') and (overlap2 != 'during'):
+            break
+
+    # return match in the same order as the files appeared in modelfiles
+    return sorted(match, key=modelfiles.index)
 
 
 def cut_bls(datacontainer, bls=None, min_bl_cut=None, max_bl_cut=None, inplace=False):

From a19876483c6f9cdcd65972a478ed20231f3560bc Mon Sep 17 00:00:00 2001
From: Josh Dillon <jsdillon@berkeley.edu>
Date: Fri, 27 Dec 2024 12:01:33 -0800
Subject: [PATCH 2/4] handle phase wraps in lst_overlap() subfunction

---
 hera_cal/abscal.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/hera_cal/abscal.py b/hera_cal/abscal.py
index 5b090b62d..f22c24248 100644
--- a/hera_cal/abscal.py
+++ b/hera_cal/abscal.py
@@ -2550,9 +2550,12 @@ def unwrap(lsts, branch_cut):
     unwrap(data_lsts, m0_lsts[0] - m0_dlst / 2)  # shift data relative to model
 
     def lst_overlap(m_lsts, m_dlst):
-        if (m_lsts[0] - m_dlst / 2 < data_lsts[-1] + atol) & (m_lsts[-1] + m_dlst / 2 > data_lsts[0] - atol):
-            return 'during'
-        elif (m_lsts[0] - m_dlst / 2 < data_lsts[-1] + atol) & (m_lsts[-1] + m_dlst / 2 < data_lsts[0] - atol):
+        def intervals_overlap(s1, e1, s2, e2):
+            return (s1 < e2) and (e1 > s2)
+        for shift in [0, -2 * np.pi, 2 * np.pi]:
+            if intervals_overlap(m_lsts[0] - m_dlst / 2 + shift, m_lsts[-1] + m_dlst / 2 + shift, data_lsts[0] - atol, data_lsts[-1] + atol):
+                return 'during'
+        if (m_lsts[-1] + m_dlst / 2) < (data_lsts[0] - atol):
             return 'before'
         else:
             return 'after'

From 1c92491599ef3858aa00b7b5b0ee02eff32bfc90 Mon Sep 17 00:00:00 2001
From: Steven Murray <steven.g.murray@asu.edu>
Date: Fri, 17 Jan 2025 00:35:26 +0100
Subject: [PATCH 3/4] fix: edge case where data file straddles phase wrap

---
 hera_cal/abscal.py | 11 +++++++++++
 hera_cal/io.py     |  3 +++
 2 files changed, 14 insertions(+)

diff --git a/hera_cal/abscal.py b/hera_cal/abscal.py
index f22c24248..193730c15 100644
--- a/hera_cal/abscal.py
+++ b/hera_cal/abscal.py
@@ -2549,6 +2549,17 @@ def unwrap(lsts, branch_cut):
         lsts[lsts < lsts[0]] += 2 * np.pi  # also ensure that it's increasing internally
     unwrap(data_lsts, m0_lsts[0] - m0_dlst / 2)  # shift data relative to model
 
+    # There is an edge-case in which the first data lst is just below the first model lst
+    # but after the last model lst, and the other data lsts are above the first model lst.
+    # In this case, we will keep thinking we are too low in LST, so we move up til we've
+    # exhausted all the model files, but we should have been moving down.
+    mlast_dlst, _, mlast_lsts, _ = io.get_file_times(modelfiles[0], filetype=filetype)
+    if (
+        data_lsts[0] > mlast_lsts[-1] + mlast_dlst / 2
+        and data_lsts[-1] - 2 * np.pi > m0_lsts[0] - m0_dlst / 2
+    ):
+        data_lsts -= 2 * np.pi
+
     def lst_overlap(m_lsts, m_dlst):
         def intervals_overlap(s1, e1, s2, e2):
             return (s1 < e2) and (e1 > s2)
diff --git a/hera_cal/io.py b/hera_cal/io.py
index a1dbf9286..f86d6e0a0 100644
--- a/hera_cal/io.py
+++ b/hera_cal/io.py
@@ -1795,6 +1795,9 @@ def get_file_times(filepaths, filetype='uvh5'):
                 time_array = time_array[baseline_array == most_common_bl_num]
                 lst_array = lst_array[baseline_array == most_common_bl_num]
 
+                # Handle phase wraps
+                lst_array[lst_array < lst_array[0]] += 2 * np.pi
+
                 # figure out dtime and dlst, handling the case where a diff cannot be done.
                 if len(time_array) > 1:
                     int_time = np.median(np.diff(time_array))

From 13744fc6e9fcbb1b305f4f44376099fa7ee9c002 Mon Sep 17 00:00:00 2001
From: Josh Dillon <jsdillon@berkeley.edu>
Date: Tue, 21 Jan 2025 11:54:08 -0800
Subject: [PATCH 4/4] add tests

---
 hera_cal/tests/test_abscal.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/hera_cal/tests/test_abscal.py b/hera_cal/tests/test_abscal.py
index 584cceea6..cc479eb4a 100644
--- a/hera_cal/tests/test_abscal.py
+++ b/hera_cal/tests/test_abscal.py
@@ -234,9 +234,17 @@ def test_match_times(self):
         # test basic execution
         relevant_mfiles = abscal.match_times(dfiles[0], mfiles, filetype='miriad')
         assert len(relevant_mfiles) == 2
+        relevant_mfiles = abscal.match_times(dfiles[0], mfiles[0:1], filetype='miriad')
+        assert len(relevant_mfiles) == 1
+        relevant_mfiles = abscal.match_times(dfiles[0], mfiles[1:2], filetype='miriad')
+        assert len(relevant_mfiles) == 1
         # test basic execution
         relevant_mfiles = abscal.match_times(dfiles[1], mfiles, filetype='miriad')
         assert len(relevant_mfiles) == 1
+        relevant_mfiles = abscal.match_times(dfiles[1], mfiles[1:2], filetype='miriad')
+        assert len(relevant_mfiles) == 1
+        relevant_mfiles = abscal.match_times(dfiles[1], mfiles[0:1], filetype='miriad')
+        assert len(relevant_mfiles) == 0
         # test no overlap
         mfiles = sorted(glob.glob(os.path.join(DATA_PATH, 'zen.2457698.40355.xx.HH.uvcA')))
         relevant_mfiles = abscal.match_times(dfiles[0], mfiles, filetype='miriad')