Skip to content

Commit

Permalink
Ensure _from_data accepts column objects only (#1415)
Browse files Browse the repository at this point in the history
rapidsai/cudf#16285 makes `_from_data` explicitly require every value in `data.values()` to be a `ColumnBase`. This PR either ensures the values are columns or, when they are not, goes through the normal `GeoDataFrame`/`DataFrame` constructor instead.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Bradley Dice (https://github.com/bdice)

URL: #1415
  • Loading branch information
mroeschke authored Jul 25, 2024
1 parent 2bbb904 commit 7f76560
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 22 deletions.
4 changes: 2 additions & 2 deletions python/cuspatial/cuspatial/core/binpreds/contains.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.

from math import ceil, sqrt

Expand Down Expand Up @@ -110,7 +110,7 @@ def _brute_force_contains_properly(points, polygons):
width=len(polygons.polygons.part_offset) - 1,
)
)
final_result = DataFrame._from_data(
final_result = DataFrame(
{
name: result[name].astype("bool")
for name in reversed(result.columns)
Expand Down
33 changes: 20 additions & 13 deletions python/cuspatial/cuspatial/core/geodataframe.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION
from typing import Dict, Tuple, TypeVar, Union
from __future__ import annotations

from typing import Any, Dict, TypeVar, Union

import pandas as pd
from geopandas import GeoDataFrame as gpGeoDataFrame
from geopandas.geoseries import is_geometry_type as gp_is_geometry_type

import cudf
from cudf.core.column import as_column
from cudf.core.copy_types import BooleanMask, GatherMap

from cuspatial.core._column.geocolumn import GeoColumn, GeoMeta
Expand Down Expand Up @@ -41,7 +44,7 @@ def __init__(
column = GeoColumn(adapter._get_geotuple(), pandas_meta)
self._data[col] = column
else:
self._data[col] = data[col]
self._data[col] = as_column(data[col])
elif isinstance(data, dict):
for key in data.keys():
try:
Expand Down Expand Up @@ -137,7 +140,9 @@ def _copy_type_metadata(

return type_copied

def _split_out_geometry_columns(self) -> Tuple:
def _split_out_geometry_columns(
self,
) -> tuple[GeoDataFrame, cudf.DataFrame]:
"""
Break the geometry columns and non-geometry columns into
separate dataframes and return them separated.
Expand All @@ -154,18 +159,20 @@ def _split_out_geometry_columns(self) -> Tuple:
)
return (geo_columns, data_columns)

def _recombine_columns(self, geo_columns, data_columns):
def _recombine_columns(
self, geo_columns: GeoDataFrame, data_columns: cudf.DataFrame
) -> dict[Any, GeoSeries | cudf.Series]:
"""
Combine a GeoDataFrame of only geometry columns with a DataFrame
of non-geometry columns in the same order as the columns in `self`
"""
columns_mask = pd.Series(self.columns)
geocolumn_mask = pd.Series(
[isinstance(self[col], GeoSeries) for col in self.columns]
columns_mask = self.columns
geocolumn_mask = (
isinstance(self[col], GeoSeries) for col in columns_mask
)
return {
name: (geo_columns[name] if mask else data_columns[name])
for name, mask in zip(columns_mask.values, geocolumn_mask.values)
for name, mask in zip(columns_mask, geocolumn_mask)
}

def _slice(self: T, arg: slice) -> T:
Expand All @@ -190,15 +197,15 @@ def _apply_boolean_mask(self, mask: BooleanMask, keep_index=True) -> T:
{name: geo_columns[name][mask.column] for name in geo_columns}
)

res = self.__class__._from_data(self._recombine_columns(geo, data))
res = self.__class__(self._recombine_columns(geo, data))
if keep_index:
res.index = data.index
return res

def _gather(self, gather_map: GatherMap, keep_index=True):
geo_data, cudf_data = self._split_out_geometry_columns()
geo_data, df = self._split_out_geometry_columns()
# gather cudf columns
df = cudf.DataFrame._from_data(data=cudf_data, index=self.index)
df.index = self.index

cudf_gathered = df._gather(gather_map, keep_index=keep_index)

Expand All @@ -210,7 +217,7 @@ def _gather(self, gather_map: GatherMap, keep_index=True):
geo_gathered = GeoDataFrame(gathered)

# combine
result = GeoDataFrame._from_data(
result = GeoDataFrame(
self._recombine_columns(geo_gathered, cudf_gathered)
)
result.index = geo_gathered.index
Expand Down Expand Up @@ -294,7 +301,7 @@ def reset_index(
# Reset the index of the GeoDataFrame to match the
# cudf DataFrame and recombine.
geo_data.index = cudf_reindexed.index
result = GeoDataFrame._from_data(
result = GeoDataFrame(
recombiner._recombine_columns(geo_data, cudf_reindexed)
)
result.index = geo_data.index
Expand Down
4 changes: 2 additions & 2 deletions python/cuspatial/cuspatial/core/spatial/join.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.

import warnings

Expand Down Expand Up @@ -87,7 +87,7 @@ def point_in_polygon(points: GeoSeries, polygons: GeoSeries):
)

result.columns = polygons.index[::-1]
return DataFrame._from_data(
return DataFrame(
{
name: result[name].astype("bool")
for name in reversed(result.columns)
Expand Down
11 changes: 6 additions & 5 deletions python/cuspatial/cuspatial/core/spatial/nearest_points.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import cupy as cp
# Copyright (c) 2024, NVIDIA CORPORATION.

import cudf
from cudf.core.column import as_column
Expand Down Expand Up @@ -57,7 +57,7 @@ def pairwise_point_linestring_nearest_points(
"segment_id": cudf.Series([], dtype="i4"),
"geometry": GeoSeries([]),
}
return GeoDataFrame._from_data(data)
return GeoDataFrame(data)

if not contains_only_points(points):
raise ValueError("`points` must contain only point geometries.")
Expand Down Expand Up @@ -97,11 +97,12 @@ def pairwise_point_linestring_nearest_points(
as_column(linestrings.lines.geometry_offset),
)

point_on_linestring = GeoColumn._from_points_xy(point_on_linestring_xy)
nearest_points_on_linestring = GeoSeries(point_on_linestring)
nearest_points_on_linestring = GeoColumn._from_points_xy(
point_on_linestring_xy
)

if not point_geometry_id:
point_geometry_id = cp.zeros(len(points), dtype=cp.int32)
point_geometry_id = as_column(0, length=len(points), dtype="int32")

data = {
"point_geometry_id": point_geometry_id,
Expand Down

0 comments on commit 7f76560

Please sign in to comment.