Skip to content

Commit

Permalink
fix: shard batch iterator can reads partial batches (#1889)
Browse files Browse the repository at this point in the history
  • Loading branch information
eddyxu authored Jan 30, 2024
1 parent 628f7a3 commit 085b4d9
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 1 deletion.
2 changes: 1 addition & 1 deletion python/python/lance/_dataset/sharded_batch_iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def _gen_ranges():
total,
self._world_size * self._batch_size,
):
yield start, start + self._batch_size
yield start, min(start + self._batch_size, total)

return self._ds._ds.take_scan(
_gen_ranges(),
Expand Down
19 changes: 19 additions & 0 deletions python/python/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1416,3 +1416,22 @@ def test_sharded_iterator_batches(tmp_path: Path):
for j in range(i, i + BATCH_SIZE)
]
)


def test_sharded_iterator_non_full_batch(tmp_path: Path):
arr = pa.array(range(1186))
tbl = pa.table({"a": arr})

ds = lance.write_dataset(tbl, tmp_path)
shard_datast = ShardedBatchIterator(
ds,
1,
2,
columns=["a"],
batch_size=100,
granularity="batch",
)
batches = pa.concat_arrays([b["a"] for b in shard_datast])

# Can read partial batches
assert len(set(range(1100, 1186)) - set(batches.to_pylist())) == 0

0 comments on commit 085b4d9

Please sign in to comment.