Split resolve_duplicates into 2: now there's select_nth_duplicate too
Also fix a bug when the same segid is passed multiple times in an
iterable by adding a `set()` conversion.
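
For context, a minimal sketch (not part of the commit; the segid value is made up) of the indexing problem that the `set()` conversion addresses:

    import pandas as pd

    segids = [648518346494405175, 648518346494405175]  # hypothetical segid passed twice

    # Without deduplication the Series index holds the segid twice, so every
    # later label-based lookup or assignment on that segid touches two rows.
    anchor_points = pd.Series(index=segids, dtype=object)
    print(len(anchor_points))  # 2

    # The fix: deduplicate the iterable before building the index.
    anchor_points = pd.Series(index=set(segids), dtype=object)
    print(len(anchor_points))  # 1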
jasper-tms committed May 18, 2023
1 parent 7d19a93 commit 368c819
Showing 1 changed file with 16 additions and 22 deletions.
38 changes: 16 additions & 22 deletions fanc/lookup.py
@@ -215,7 +215,7 @@ def segids_from_pts(pts,
 anchor_point_sources = ['somas_dec2022', 'peripheral_nerves', 'neck_connective']
 def anchor_point(segid, source_tables=anchor_point_sources,
                  timestamp='now', resolve_duplicates=False,
-                 slow_mode=False) -> np.ndarray:
+                 select_nth_duplicate: int = 0, slow_mode=False) -> np.ndarray:
     """
     Return a representative "anchor" point for each of the given
     segment ID(s).
@@ -240,14 +240,16 @@ def anchor_point(segid, source_tables=anchor_point_sources,
         If 'now', use the current time.
         If datetime, use that time.
-    resolve_duplicates: bool or int (default False)
-        This argument specifies what to do if multiple anchor points are found
-        within a single table for a single segment ID. Possible values and behaviors:
-        - False: an exception will be raised instead of trying to resolve anything.
-        - True: the point with the smallest x coordinate will be used.
-        - An integer from 1 to the number of points found: The Nth point (1-indexed!!)
-          will be used, when the points are sorted by x coordinate. Setting
-          `resolve_duplicates=1` gives the same behavior as `resolve_duplicates=True`.
+    resolve_duplicates: bool (default False)
+    select_nth_duplicate: int (default 0)
+        If multiple anchor points are found within a single table for a
+        single segment ID, raise an error (if resolve_duplicates==False) or
+        select one of the points (if resolve_duplicates==True) by sorting the
+        points by x coordinate and then picking the Nth one, with N given by
+        select_nth_duplicate. (N is zero-indexed, so the default value of
+        select_nth_duplicate=0 results in selection of the point with the
+        smallest x coordinate). select_nth_duplicate is ignored if
+        resolve_duplicates==False.

     Returns
     -------
@@ -274,7 +276,7 @@ def anchor_point(segid, source_tables=anchor_point_sources,
                          ' Use updated IDs or provide the timestamp where'
                          ' the ID(s) is valid.')
-    anchor_points = pd.Series(index=segid, dtype=object)
+    anchor_points = pd.Series(index=set(segid), dtype=object)

     for table in source_tables:
         unanchored_ids = anchor_points[anchor_points.isna()].index.values
@@ -300,19 +302,11 @@ def anchor_point(segid, source_tables=anchor_point_sources,
                                      f' {seg} in table "{table}":\n'
                                      f'{np.vstack(point.pt_position)}.\nSet'
                                      ' resolve_duplicates to choose one.')
-                if resolve_duplicates is True:
-                    # default behavior is to resolve duplicates by taking the
-                    # first point (the one with the smallest x coordinate)
-                    resolve_duplicates = 1
-                if isinstance(resolve_duplicates, int):
-                    # change from 1-indexing (which the user can specify)
-                    # to 0-indexing (which we must use with iloc)
-                    resolve_duplicates -= 1
-                if resolve_duplicates >= len(point):
-                    raise ValueError('resolve_duplicates is greater than'
-                                     f' {len(point)}, the number of points found'
+                if select_nth_duplicate >= len(point):
+                    raise ValueError('select_nth_duplicate is too large given'
+                                     f' that {len(point)} points were found'
                                      f' for segid {seg} in table "{table}".')
-                anchor_points.loc[seg] = point['pt_position'].iloc[resolve_duplicates]
+                anchor_points.loc[seg] = point['pt_position'].iloc[select_nth_duplicate]
             else:
                 anchor_points.loc[seg] = point['pt_position'].iloc[0]
         if not any(anchor_points.isna()):
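
For orientation, a minimal usage sketch of the updated signature (the segment ID is hypothetical, and it assumes `anchor_point` is importable as `fanc.lookup.anchor_point`):

    from fanc import lookup

    segids = [648518346481082458]  # hypothetical segment ID

    # Default: raise if any source table holds more than one point for a segid.
    pt = lookup.anchor_point(segids)

    # Resolve duplicates by sorting candidate points by x coordinate and taking
    # the first one (select_nth_duplicate is zero-indexed, default 0).
    pt = lookup.anchor_point(segids, resolve_duplicates=True)

    # Or take the point with the second-smallest x coordinate instead.
    pt = lookup.anchor_point(segids, resolve_duplicates=True, select_nth_duplicate=1)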