Skip to content

Commit

Permalink
feat(python): expose drop_columns() in Python (#1904)
Browse files Browse the repository at this point in the history
Renames the Rust method `drop()` to `drop_columns()` for clarity and
for consistency with `add_columns()` and `alter_columns()`.

Closes #1076
Related #1674
  • Loading branch information
wjones127 authored Feb 3, 2024
1 parent e022688 commit 5d898a7
Show file tree
Hide file tree
Showing 7 changed files with 217 additions and 307 deletions.
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.5
rev: v0.2.0
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
args: [--preview, --fix, --exit-non-zero-on-fix]
- id: ruff-format
args: []
args: [--preview]
8 changes: 4 additions & 4 deletions python/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,16 @@ format: format-python
.PHONY: format

format-python:
ruff format python
ruff --fix python
ruff format --preview python
ruff --preview --fix python
.PHONY: format-python

lint: lint-python lint-rust
.PHONY: lint

lint-python:
ruff format --check python
ruff python
ruff format --preview --check python
ruff --preview python
.PHONY: lint-python

lint-rust:
Expand Down
31 changes: 31 additions & 0 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,37 @@ def merge(

self._ds.merge(reader, left_on, right_on)

def drop_columns(self, columns: List[str]):
    """Remove the named columns from this dataset.

    Parameters
    ----------
    columns : list of str
        Names of the columns to remove. Each entry may be a top-level
        column name (e.g. "a") or a nested column reference (e.g. "a.b.c").

    Notes
    -----
    Only the dataset metadata is changed; the column data itself stays in
    the underlying storage. To remove the data, subsequently call
    ``compact_files`` to rewrite the data without the removed columns and
    then call ``cleanup_files`` to remove the old files.

    Examples
    --------
    >>> import lance
    >>> import pyarrow as pa
    >>> table = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    >>> dataset = lance.write_dataset(table, "example")
    >>> dataset.drop_columns(["a"])
    >>> dataset.to_table().to_pandas()
       b
    0  a
    1  b
    2  c
    """
    self._ds.drop_columns(columns)
    # The set of indices may differ after the drop, so reset the stored
    # ``_list_indices_res`` value.
    self._list_indices_res = None

def delete(self, predicate: Union[str, pa.compute.Expression]):
"""
Delete rows from the dataset.
Expand Down
50 changes: 50 additions & 0 deletions python/python/tests/test_schema_evolution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright (c) 2024. Lance Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

import lance
import pyarrow as pa
import pyarrow.compute as pc
import pytest


def test_drop_columns(tmp_path: Path):
    """Check schema and index bookkeeping as columns are dropped one by one."""
    vector_dim = 32
    num_rows = 512
    flat_values = pc.random(num_rows * vector_dim).cast("float32")
    vectors = pa.FixedSizeListArray.from_arrays(flat_values, vector_dim)
    table = pa.table({
        "a": vectors,
        "b": range(num_rows),
        "c": range(num_rows),
    })
    dataset = lance.write_dataset(table, tmp_path)
    dataset.create_index("a", "IVF_PQ", num_partitions=2, num_sub_vectors=1)

    # Dropping a non-indexed column leaves the index on "a" in place.
    dataset.drop_columns(["b"])
    schema_without_b = pa.schema({
        "a": pa.list_(pa.float32(), vector_dim),
        "c": pa.int64(),
    })
    assert dataset.schema == schema_without_b
    assert len(dataset.list_indices()) == 1

    # Dropping the indexed vector column removes its index as well.
    dataset.drop_columns(["a"])
    schema_only_c = pa.schema({"c": pa.int64()})
    assert dataset.schema == schema_only_c
    assert len(dataset.list_indices()) == 0

    # Dropping the last remaining column is rejected.
    with pytest.raises(ValueError):
        dataset.drop_columns(["c"])
13 changes: 13 additions & 0 deletions python/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1003,6 +1003,19 @@ impl Dataset {
RT.block_on(None, self.ds.validate())?
.map_err(|err| PyIOError::new_err(err.to_string()))
}

/// Drop the given columns from the dataset.
///
/// The drop is applied to a clone of the current dataset; `self.ds` is only
/// replaced once the operation has succeeded.
///
/// # Errors
///
/// A `lance::Error::InvalidInput` is surfaced as a Python `ValueError`
/// carrying the error's source message; any other Lance error is surfaced
/// as a Python `IOError`.
fn drop_columns(&mut self, columns: Vec<&str>) -> PyResult<()> {
    let mut updated = self.ds.as_ref().clone();
    let outcome = RT.block_on(None, updated.drop_columns(&columns))?;
    if let Err(err) = outcome {
        return Err(match err {
            lance::Error::InvalidInput { source, .. } => {
                PyValueError::new_err(source.to_string())
            }
            other => PyIOError::new_err(other.to_string()),
        });
    }
    self.ds = Arc::new(updated);
    Ok(())
}
}

impl Dataset {
Expand Down
Loading

0 comments on commit 5d898a7

Please sign in to comment.