Skip to content

Commit

Permalink
feat: make it possible to only load the row id column (#1949)
Browse files Browse the repository at this point in the history
Co-authored-by: Will Jones <[email protected]>
  • Loading branch information
westonpace and wjones127 authored Feb 13, 2024
1 parent 4873151 commit 8b07f90
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 3 deletions.
2 changes: 0 additions & 2 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1897,8 +1897,6 @@ def offset(self, n: Optional[int] = None) -> ScannerBuilder:
return self

def columns(self, cols: Optional[list[str]] = None) -> ScannerBuilder:
# NOTE(review): this is a diff hunk — the file header above reports
# "0 additions & 2 deletions", so the two lines below appear to be REMOVED
# by this commit. Dropping the empty-list -> None coercion is what lets
# columns=[] mean "select no columns" (row-id-only scans); verify against
# the rendered post-commit file.
if cols is not None and len(cols) == 0:
cols = None
# Store the requested projection on the builder; None selects all columns.
self._columns = cols
# Return self to keep the builder-style call chain going.
return self

Expand Down
20 changes: 19 additions & 1 deletion python/python/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ def test_roundtrip_types(tmp_path: Path):
"dict": pa.array(["a", "b", "a"], pa.dictionary(pa.int8(), pa.string())),
# PyArrow doesn't support creating large_string dictionaries easily.
"large_dict": pa.DictionaryArray.from_arrays(
pa.array([0, 1, 1], pa.int8()), pa.array(["foo", "bar"], pa.large_string())
pa.array([0, 1, 1], pa.int8()),
pa.array(["foo", "bar"], pa.large_string()),
),
"list": pa.array([["a", "b"], ["c", "d"], ["e", "f"]], pa.list_(pa.string())),
"large_list": pa.array(
Expand Down Expand Up @@ -1205,6 +1206,23 @@ def test_scan_with_batch_size(tmp_path: Path):
assert df["a"].iloc[0] == idx * 16


def test_scan_no_columns(tmp_path: Path):
    """Scanning with an empty projection yields only ``_rowid`` when
    ``with_row_id=True``, and raises ``ValueError`` otherwise."""
    dataset_uri = tmp_path / "dataset"
    frame = pd.DataFrame({"a": range(100)})
    dataset = lance.write_dataset(frame, dataset_uri)

    # columns=[] can be used to get just the row ids
    row_id_only = pa.schema([pa.field("_rowid", pa.uint64())])
    for batch in dataset.scanner(columns=[], with_row_id=True).to_batches():
        assert batch.schema == row_id_only

    # if with_row_id is not True then columns=[] is an error
    with pytest.raises(ValueError, match="no columns were selected"):
        dataset.scanner(columns=[]).to_table()


def test_scan_prefilter(tmp_path: Path):
base_dir = tmp_path / "dataset"
vecs = pa.array(
Expand Down
8 changes: 8 additions & 0 deletions rust/lance/src/dataset/scanner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -656,6 +656,14 @@ impl Scanner {
/// 4. Limit / Offset
/// 5. Take remaining columns / Projection
pub async fn create_plan(&self) -> Result<Arc<dyn ExecutionPlan>> {
if self.projections.fields.is_empty() && !self.with_row_id {
return Err(Error::InvalidInput {
source:
"no columns were selected and with_row_id is false, there is nothing to scan"
.into(),
location: location!(),
});
}
// Scalar indices are only used when prefiltering
// TODO: Should we use them when postfiltering if there is no vector search?
let use_scalar_index = self.prefilter || self.nearest.is_none();
Expand Down

0 comments on commit 8b07f90

Please sign in to comment.