Skip to content

Commit

Permalink
feat: make it possible to only load the row id column (#1949)
Browse files Browse the repository at this point in the history
Co-authored-by: Will Jones <[email protected]>
  • Loading branch information
westonpace and wjones127 authored Feb 13, 2024
1 parent 4873151 commit 8b07f90
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 3 deletions.
2 changes: 0 additions & 2 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1897,8 +1897,6 @@ def offset(self, n: Optional[int] = None) -> ScannerBuilder:
return self

def columns(self, cols: Optional[list[str]] = None) -> ScannerBuilder:
# NOTE(review): this is a diff hunk — the file header above reports
# "0 additions & 2 deletions", so the two lines below appear to be REMOVED
# by this commit. Dropping the empty-list -> None coercion is what lets
# columns=[] mean "select no columns" (row-id-only scans); verify against
# the rendered post-commit file.
if cols is not None and len(cols) == 0:
cols = None
# Store the requested projection on the builder; None selects all columns.
self._columns = cols
# Return self to keep the builder-style call chain going.
return self

Expand Down
20 changes: 19 additions & 1 deletion python/python/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ def test_roundtrip_types(tmp_path: Path):
"dict": pa.array(["a", "b", "a"], pa.dictionary(pa.int8(), pa.string())),
# PyArrow doesn't support creating large_string dictionaries easily.
"large_dict": pa.DictionaryArray.from_arrays(
pa.array([0, 1, 1], pa.int8()), pa.array(["foo", "bar"], pa.large_string())
pa.array([0, 1, 1], pa.int8()),
pa.array(["foo", "bar"], pa.large_string()),
),
"list": pa.array([["a", "b"], ["c", "d"], ["e", "f"]], pa.list_(pa.string())),
"large_list": pa.array(
Expand Down Expand Up @@ -1205,6 +1206,23 @@ def test_scan_with_batch_size(tmp_path: Path):
assert df["a"].iloc[0] == idx * 16


def test_scan_no_columns(tmp_path: Path):
    """Scanning with an empty projection yields only ``_rowid`` when
    ``with_row_id=True``, and raises ``ValueError`` otherwise."""
    dataset_uri = tmp_path / "dataset"
    frame = pd.DataFrame({"a": range(100)})
    dataset = lance.write_dataset(frame, dataset_uri)

    # columns=[] can be used to get just the row ids
    row_id_only = pa.schema([pa.field("_rowid", pa.uint64())])
    for batch in dataset.scanner(columns=[], with_row_id=True).to_batches():
        assert batch.schema == row_id_only

    # if with_row_id is not True then columns=[] is an error
    with pytest.raises(ValueError, match="no columns were selected"):
        dataset.scanner(columns=[]).to_table()


def test_scan_prefilter(tmp_path: Path):
base_dir = tmp_path / "dataset"
vecs = pa.array(
Expand Down
8 changes: 8 additions & 0 deletions rust/lance/src/dataset/scanner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -656,6 +656,14 @@ impl Scanner {
/// 4. Limit / Offset
/// 5. Take remaining columns / Projection
pub async fn create_plan(&self) -> Result<Arc<dyn ExecutionPlan>> {
if self.projections.fields.is_empty() && !self.with_row_id {
return Err(Error::InvalidInput {
source:
"no columns were selected and with_row_id is false, there is nothing to scan"
.into(),
location: location!(),
});
}
// Scalar indices are only used when prefiltering
// TODO: Should we use them when postfiltering if there is no vector search?
let use_scalar_index = self.prefilter || self.nearest.is_none();
Expand Down

0 comments on commit 8b07f90

Please sign in to comment.