Skip to content

Commit

Permalink
Remove pyarrow dependency (#582)
Browse files Browse the repository at this point in the history
For #581
  • Loading branch information
kylebarron authored Aug 8, 2024
1 parent abcb5d4 commit 708f8af
Show file tree
Hide file tree
Showing 25 changed files with 1,233 additions and 1,122 deletions.
20 changes: 12 additions & 8 deletions lonboard/_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,29 +5,32 @@
from typing import Dict, List, Optional

import click
import pyarrow as pa
import pyarrow.parquet as pq
from arro3.core import Table
from pyproj import CRS

from lonboard import viz
from lonboard._constants import EXTENSION_NAME


def read_pyogrio(path: Path) -> pa.Table:
def read_pyogrio(path: Path) -> Table:
"""Read path using pyogrio and convert field metadata to geoarrow
Args:
path: Path to file readable by pyogrio
"""
try:
from pyogrio.raw import read_arrow
from pyogrio.raw import open_arrow
except ImportError as e:
raise ImportError(
"pyogrio is a required dependency for the CLI. "
"Install with `pip install pyogrio`."
) from e

meta, table = read_arrow(path)
with open_arrow(path, use_pyarrow=False) as source:
meta, stream = source
table = Table.from_arrow(stream)

# The `geometry_name` key always exists but can be an empty string. In the case of
# an empty string, we want to default to `wkb_geometry`
geometry_column_name = meta.get("geometry_name") or "wkb_geometry"
Expand All @@ -53,10 +56,10 @@ def read_pyogrio(path: Path) -> pa.Table:

new_field = field.with_name("geometry").with_metadata(metadata)
new_schema = schema.set(geometry_column_index, new_field)
return pa.Table.from_arrays(table.columns, schema=new_schema)
return table.with_schema(new_schema)


def read_geoparquet(path: Path):
def read_geoparquet(path: Path) -> Table:
"""Read GeoParquet file at path using pyarrow
Args:
Expand All @@ -67,7 +70,8 @@ def read_geoparquet(path: Path):
if not geo_meta:
raise ValueError("Expected geo metadata in Parquet file")

table = file.read()
pyarrow_table = file.read()
table = Table.from_arrow(pyarrow_table)

geo_meta = json.loads(geo_meta)
geometry_column_name = geo_meta["primary_column"]
Expand All @@ -86,7 +90,7 @@ def read_geoparquet(path: Path):

new_field = table.schema.field(geometry_column_index).with_metadata(metadata)
new_schema = table.schema.set(geometry_column_index, new_field)
return pa.Table.from_arrays(table.columns, schema=new_schema)
return table.with_schema(new_schema)


@click.command()
Expand Down
63 changes: 39 additions & 24 deletions lonboard/_geoarrow/_duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,18 @@

import json
import re
from typing import TYPE_CHECKING, Optional, Union
from typing import TYPE_CHECKING, List, Optional, Union

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc
from arro3.compute import struct_field
from arro3.core import (
Array,
ChunkedArray,
Field,
Table,
fixed_size_list_array,
list_array,
)

from lonboard._constants import EXTENSION_NAME

Expand All @@ -29,7 +36,7 @@ def from_duckdb(
*,
con: Optional[duckdb.DuckDBPyConnection] = None,
crs: Optional[Union[str, pyproj.CRS]] = None,
) -> pa.Table:
) -> Table:
geom_col_idxs = [
i for i, t in enumerate(rel.types) if str(t) in DUCKDB_SPATIAL_TYPES
]
Expand Down Expand Up @@ -89,9 +96,9 @@ def _from_geometry(
con: Optional[duckdb.DuckDBPyConnection] = None,
geom_col_idx: int,
crs: Optional[Union[str, pyproj.CRS]] = None,
) -> pa.Table:
) -> Table:
other_col_names = [name for i, name in enumerate(rel.columns) if i != geom_col_idx]
non_geo_table = rel.select(*other_col_names).arrow()
non_geo_table = Table.from_arrow(rel.select(*other_col_names).arrow())
geom_col_name = rel.columns[geom_col_idx]

# A poor-man's string interpolation check
Expand All @@ -102,9 +109,11 @@ def _from_geometry(
), f"Expected geometry column name to match regex: {re_match}"

if con is not None:
geom_table = con.sql(f"""
geom_table = Table.from_arrow(
con.sql(f"""
SELECT ST_AsWKB( {geom_col_name} ) as {geom_col_name} FROM rel;
""").arrow()
)
else:
import duckdb

Expand All @@ -119,7 +128,9 @@ def _from_geometry(
SELECT ST_AsWKB( {geom_col_name} ) as {geom_col_name} FROM rel;
"""
try:
geom_table = duckdb.execute(sql).arrow()
geom_table = Table.from_arrow(
duckdb.execute(sql, connection=duckdb.default_connection).arrow()
)
except duckdb.CatalogException as err:
msg = (
"Could not coerce type GEOMETRY to WKB.\n"
Expand All @@ -140,8 +151,8 @@ def _from_geoarrow(
extension_type: EXTENSION_NAME,
geom_col_idx: int,
crs: Optional[Union[str, pyproj.CRS]] = None,
) -> pa.Table:
table = rel.arrow()
) -> Table:
table = Table.from_arrow(rel.arrow())
metadata = _make_geoarrow_field_metadata(extension_type, crs)
geom_field = table.schema.field(geom_col_idx).with_metadata(metadata)
return table.set_column(geom_col_idx, geom_field, table.column(geom_col_idx))
Expand All @@ -152,21 +163,24 @@ def _from_box2d(
*,
geom_col_idx: int,
crs: Optional[Union[str, pyproj.CRS]] = None,
) -> pa.Table:
table = rel.arrow()
) -> Table:
table = Table.from_arrow(rel.arrow())
geom_col = table.column(geom_col_idx)

polygon_array = _convert_box2d_to_geoarrow_polygon_array(geom_col)
polygon_chunks: List[Array] = []
for geom_chunk in geom_col.chunks:
polygon_array = _convert_box2d_to_geoarrow_polygon_array(geom_chunk)
polygon_chunks.append(polygon_array)

metadata = _make_geoarrow_field_metadata(EXTENSION_NAME.POLYGON, crs)
prev_field = table.schema.field(geom_col_idx)
geom_field = pa.field(prev_field.name, polygon_array.type, metadata=metadata)
return table.set_column(geom_col_idx, geom_field, polygon_array)
geom_field = Field(prev_field.name, polygon_chunks[0].type, metadata=metadata)
return table.set_column(geom_col_idx, geom_field, ChunkedArray(polygon_chunks))


def _convert_box2d_to_geoarrow_polygon_array(
geom_col: pa.StructArray,
) -> pa.ListArray:
geom_col: Array,
) -> Array:
"""
This is a manual conversion of the duckdb box_2d type to a GeoArrow Polygon array.
Expand All @@ -176,10 +190,10 @@ def _convert_box2d_to_geoarrow_polygon_array(
# Extract the bounding box columns from the Arrow struct
# NOTE: this assumes that the box ordering is minx, miny, maxx, maxy
# Not sure whether the positional ordering or the named fields is more stable
min_x = pc.struct_field(geom_col, 0)
min_y = pc.struct_field(geom_col, 1)
max_x = pc.struct_field(geom_col, 2)
max_y = pc.struct_field(geom_col, 3)
min_x = struct_field(geom_col, 0)
min_y = struct_field(geom_col, 1)
max_x = struct_field(geom_col, 2)
max_y = struct_field(geom_col, 3)

# Provision memory for the output coordinates. For closed polygons, each input box
# becomes 5 coordinates.
Expand Down Expand Up @@ -208,9 +222,10 @@ def _convert_box2d_to_geoarrow_polygon_array(
geom_offsets = np.arange(0, len(ring_offsets), dtype=np.int32)

# Construct the final PolygonArray
coords = pa.FixedSizeListArray.from_arrays(coords.ravel("C"), 2)
ring_array = pa.ListArray.from_arrays(ring_offsets, coords)
polygon_array = pa.ListArray.from_arrays(geom_offsets, ring_array)
flat_coords: Array = Array.from_numpy(coords.ravel("C"))
coords = fixed_size_list_array(flat_coords, 2)
ring_array = list_array(Array.from_numpy(ring_offsets), coords)
polygon_array = list_array(Array.from_numpy(geom_offsets), ring_array)
return polygon_array


Expand Down
4 changes: 2 additions & 2 deletions lonboard/_geoarrow/crs.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import json
from typing import Optional

import pyarrow as pa
from arro3.core import Field


# Note: According to the spec, if the metadata key exists, its value should never be
# `null` or an empty dict, but we still check for those to be safe
def get_field_crs(field: pa.Field) -> Optional[str]:
def get_field_crs(field: Field) -> Optional[str]:
extension_metadata_value = field.metadata.get(b"ARROW:extension:metadata")
if not extension_metadata_value:
return None
Expand Down
Loading

0 comments on commit 708f8af

Please sign in to comment.