Ensure _from_data accepts column objects only (#1415)

rapidsai/cudf#16285 makes `_from_data` explicitly require every value in `data.values()` to be a `ColumnBase`. This PR either ensures the values are columns or, when they are not, goes through the normal `GeoDataFrame`/`DataFrame` constructor instead. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Mark Harris (https://github.com/harrism) - Bradley Dice (https://github.com/bdice) URL: #1415
rapidsai · Jul 25, 2024 · 7f76560 · 7f76560
1 parent 2bbb904
commit 7f76560
Show file tree

Hide file tree

Showing 4 changed files with 30 additions and 22 deletions.
diff --git a/python/cuspatial/cuspatial/core/binpreds/contains.py b/python/cuspatial/cuspatial/core/binpreds/contains.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 from math import ceil, sqrt
 
@@ -110,7 +110,7 @@ def _brute_force_contains_properly(points, polygons):
             width=len(polygons.polygons.part_offset) - 1,
         )
     )
-    final_result = DataFrame._from_data(
+    final_result = DataFrame(
         {
             name: result[name].astype("bool")
             for name in reversed(result.columns)

diff --git a/python/cuspatial/cuspatial/core/geodataframe.py b/python/cuspatial/cuspatial/core/geodataframe.py
@@ -1,11 +1,14 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION
-from typing import Dict, Tuple, TypeVar, Union
+from __future__ import annotations
+
+from typing import Any, Dict, TypeVar, Union
 
 import pandas as pd
 from geopandas import GeoDataFrame as gpGeoDataFrame
 from geopandas.geoseries import is_geometry_type as gp_is_geometry_type
 
 import cudf
+from cudf.core.column import as_column
 from cudf.core.copy_types import BooleanMask, GatherMap
 
 from cuspatial.core._column.geocolumn import GeoColumn, GeoMeta
@@ -41,7 +44,7 @@ def __init__(
                     column = GeoColumn(adapter._get_geotuple(), pandas_meta)
                     self._data[col] = column
                 else:
-                    self._data[col] = data[col]
+                    self._data[col] = as_column(data[col])
         elif isinstance(data, dict):
             for key in data.keys():
                 try:
@@ -137,7 +140,9 @@ def _copy_type_metadata(
 
         return type_copied
 
-    def _split_out_geometry_columns(self) -> Tuple:
+    def _split_out_geometry_columns(
+        self,
+    ) -> tuple[GeoDataFrame, cudf.DataFrame]:
         """
         Break the geometry columns and non-geometry columns into
         separate dataframes and return them separated.
@@ -154,18 +159,20 @@ def _split_out_geometry_columns(self) -> Tuple:
         )
         return (geo_columns, data_columns)
 
-    def _recombine_columns(self, geo_columns, data_columns):
+    def _recombine_columns(
+        self, geo_columns: GeoDataFrame, data_columns: cudf.DataFrame
+    ) -> dict[Any, GeoSeries | cudf.Series]:
         """
         Combine a GeoDataFrame of only geometry columns with a DataFrame
         of non-geometry columns in the same order as the columns in `self`
         """
-        columns_mask = pd.Series(self.columns)
-        geocolumn_mask = pd.Series(
-            [isinstance(self[col], GeoSeries) for col in self.columns]
+        columns_mask = self.columns
+        geocolumn_mask = (
+            isinstance(self[col], GeoSeries) for col in columns_mask
         )
         return {
             name: (geo_columns[name] if mask else data_columns[name])
-            for name, mask in zip(columns_mask.values, geocolumn_mask.values)
+            for name, mask in zip(columns_mask, geocolumn_mask)
         }
 
     def _slice(self: T, arg: slice) -> T:
@@ -190,15 +197,15 @@ def _apply_boolean_mask(self, mask: BooleanMask, keep_index=True) -> T:
             {name: geo_columns[name][mask.column] for name in geo_columns}
         )
 
-        res = self.__class__._from_data(self._recombine_columns(geo, data))
+        res = self.__class__(self._recombine_columns(geo, data))
         if keep_index:
             res.index = data.index
         return res
 
     def _gather(self, gather_map: GatherMap, keep_index=True):
-        geo_data, cudf_data = self._split_out_geometry_columns()
+        geo_data, df = self._split_out_geometry_columns()
         # gather cudf columns
-        df = cudf.DataFrame._from_data(data=cudf_data, index=self.index)
+        df.index = self.index
 
         cudf_gathered = df._gather(gather_map, keep_index=keep_index)
 
@@ -210,7 +217,7 @@ def _gather(self, gather_map: GatherMap, keep_index=True):
         geo_gathered = GeoDataFrame(gathered)
 
         # combine
-        result = GeoDataFrame._from_data(
+        result = GeoDataFrame(
             self._recombine_columns(geo_gathered, cudf_gathered)
         )
         result.index = geo_gathered.index
@@ -294,7 +301,7 @@ def reset_index(
             # Reset the index of the GeoDataFrame to match the
             # cudf DataFrame and recombine.
             geo_data.index = cudf_reindexed.index
-            result = GeoDataFrame._from_data(
+            result = GeoDataFrame(
                 recombiner._recombine_columns(geo_data, cudf_reindexed)
             )
             result.index = geo_data.index

diff --git a/python/cuspatial/cuspatial/core/spatial/join.py b/python/cuspatial/cuspatial/core/spatial/join.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 import warnings
 
@@ -87,7 +87,7 @@ def point_in_polygon(points: GeoSeries, polygons: GeoSeries):
     )
 
     result.columns = polygons.index[::-1]
-    return DataFrame._from_data(
+    return DataFrame(
         {
             name: result[name].astype("bool")
             for name in reversed(result.columns)

diff --git a/python/cuspatial/cuspatial/core/spatial/nearest_points.py b/python/cuspatial/cuspatial/core/spatial/nearest_points.py
@@ -1,4 +1,4 @@
-import cupy as cp
+# Copyright (c) 2024, NVIDIA CORPORATION.
 
 import cudf
 from cudf.core.column import as_column
@@ -57,7 +57,7 @@ def pairwise_point_linestring_nearest_points(
             "segment_id": cudf.Series([], dtype="i4"),
             "geometry": GeoSeries([]),
         }
-        return GeoDataFrame._from_data(data)
+        return GeoDataFrame(data)
 
     if not contains_only_points(points):
         raise ValueError("`points` must contain only point geometries.")
@@ -97,11 +97,12 @@ def pairwise_point_linestring_nearest_points(
         as_column(linestrings.lines.geometry_offset),
     )
 
-    point_on_linestring = GeoColumn._from_points_xy(point_on_linestring_xy)
-    nearest_points_on_linestring = GeoSeries(point_on_linestring)
+    nearest_points_on_linestring = GeoColumn._from_points_xy(
+        point_on_linestring_xy
+    )
 
     if not point_geometry_id:
-        point_geometry_id = cp.zeros(len(points), dtype=cp.int32)
+        point_geometry_id = as_column(0, length=len(points), dtype="int32")
 
     data = {
         "point_geometry_id": point_geometry_id,