From a4024ff2c6529443b6ba701daae6592a4608e3d9 Mon Sep 17 00:00:00 2001 From: Rosie Wood Date: Thu, 21 Nov 2024 13:39:00 +0000 Subject: [PATCH 01/15] add dataclasses --- mapreader/spot_text/dataclasses.py | 16 + mapreader/spot_text/deepsolo_runner.py | 4 +- mapreader/spot_text/dptext_detr_runner.py | 66 +--- mapreader/spot_text/maptext_runner.py | 4 +- mapreader/spot_text/rec_runner_base.py | 346 ----------------- mapreader/spot_text/runner_base.py | 438 ++++++++++++++++++++-- 6 files changed, 426 insertions(+), 448 deletions(-) create mode 100644 mapreader/spot_text/dataclasses.py delete mode 100644 mapreader/spot_text/rec_runner_base.py diff --git a/mapreader/spot_text/dataclasses.py b/mapreader/spot_text/dataclasses.py new file mode 100644 index 00000000..4b155f40 --- /dev/null +++ b/mapreader/spot_text/dataclasses.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from shapely.geometry import Polygon + +# Detection only + + +@dataclass +class Prediction: + geometry: Polygon + score: float + text: str = None + patch_id: str | None = None + crs: str | None = None diff --git a/mapreader/spot_text/deepsolo_runner.py b/mapreader/spot_text/deepsolo_runner.py index ffa61cbc..e4ac9e35 100644 --- a/mapreader/spot_text/deepsolo_runner.py +++ b/mapreader/spot_text/deepsolo_runner.py @@ -20,10 +20,10 @@ import torch from deepsolo.config import get_cfg -from .rec_runner_base import RecRunner +from .runner_base import DetRecRunner -class DeepSoloRunner(RecRunner): +class DeepSoloRunner(DetRecRunner): def __init__( self, patch_df: pd.DataFrame | gpd.GeoDataFrame | str | pathlib.Path, diff --git a/mapreader/spot_text/dptext_detr_runner.py b/mapreader/spot_text/dptext_detr_runner.py index d5badcd7..9b17f1c5 100644 --- a/mapreader/spot_text/dptext_detr_runner.py +++ b/mapreader/spot_text/dptext_detr_runner.py @@ -21,10 +21,11 @@ from dptext_detr.config import get_cfg from shapely import MultiPolygon, Polygon -from .runner_base import 
Runner +from .dataclasses import Prediction +from .runner_base import DetRunner -class DPTextDETRRunner(Runner): +class DPTextDETRRunner(DetRunner): def __init__( self, patch_df: pd.DataFrame | gpd.GeoDataFrame | str | pathlib.Path, @@ -71,7 +72,7 @@ def __init__( # setup the predictor self.predictor = DefaultPredictor(cfg) - def get_patch_predictions( + def _get_patch_predictions( self, outputs: dict, return_dataframe: bool = False, @@ -107,7 +108,7 @@ def get_patch_predictions( self._deduplicate(image_id, min_ioa=min_ioa) if return_dataframe: - return self._dict_to_dataframe(self.patch_predictions, geo=False) + return self._dict_to_dataframe(self.patch_predictions) return self.patch_predictions def _post_process(self, image_id, scores, pred_classes, bd_pnts): @@ -122,59 +123,6 @@ def _post_process(self, image_id, scores, pred_classes, bd_pnts): score = f"{score:.2f}" - self.patch_predictions[image_id].append([polygon, score]) - - @staticmethod - def _dict_to_dataframe( - preds: dict, - geo: bool = False, - parent: bool = False, - ) -> pd.DataFrame: - """Convert the predictions dictionary to a pandas DataFrame. - - Parameters - ---------- - preds : dict - A dictionary of predictions. - geo : bool, optional - Whether the dictionary is georeferenced coords (or pixel bounds), by default True - parent : bool, optional - Whether the dictionary is at the parent level, by default False - - Returns - ------- - pd.DataFrame - A pandas DataFrame containing the predictions. 
- """ - if geo: - columns = ["geometry", "crs", "score"] - else: - columns = ["geometry", "score"] - - if parent: - columns.append("patch_id") - - preds_df = pd.concat( - pd.DataFrame( - preds[k], - index=np.full(len(preds[k]), k), - columns=columns, + self.patch_predictions[image_id].append( + Prediction(geometry=polygon, score=score) ) - for k in preds.keys() - ) - - if geo: - # get the crs (should be the same for all) - if not preds_df["crs"].nunique() == 1: - raise ValueError("[ERROR] Multiple crs found in the predictions.") - crs = preds_df["crs"].unique()[0] - - preds_df = gpd.GeoDataFrame( - preds_df, - geometry="geometry", - crs=crs, - ) - - preds_df.index.name = "image_id" - preds_df.reset_index(inplace=True) - return preds_df diff --git a/mapreader/spot_text/maptext_runner.py b/mapreader/spot_text/maptext_runner.py index 0291496b..b27a67dd 100644 --- a/mapreader/spot_text/maptext_runner.py +++ b/mapreader/spot_text/maptext_runner.py @@ -20,10 +20,10 @@ import torch from maptextpipeline.config import get_cfg -from .rec_runner_base import RecRunner +from .runner_base import DetRecRunner -class MapTextRunner(RecRunner): +class MapTextRunner(DetRecRunner): def __init__( self, patch_df: pd.DataFrame | gpd.GeoDataFrame | str | pathlib.Path, diff --git a/mapreader/spot_text/rec_runner_base.py b/mapreader/spot_text/rec_runner_base.py deleted file mode 100644 index 8078cd9e..00000000 --- a/mapreader/spot_text/rec_runner_base.py +++ /dev/null @@ -1,346 +0,0 @@ -from __future__ import annotations - -import pathlib -import re - -import geopandas as gpd -import matplotlib.patches as patches -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import xyzservices as xyz -from PIL import Image -from shapely import LineString, MultiPolygon, Polygon - -from .runner_base import Runner - - -class RecRunner(Runner): - def get_patch_predictions( - self, - outputs: dict, - return_dataframe: bool = False, - min_ioa: float = 0.7, - ) -> dict | pd.DataFrame: 
- """Post process the model outputs to get patch predictions. - - Parameters - ---------- - outputs : dict - The outputs from the model. - return_dataframe : bool, optional - Whether to return the predictions as a pandas DataFrame, by default False - min_ioa : float, optional - The minimum intersection over area to consider two polygons the same, by default 0.7 - - Returns - ------- - dict or pd.DataFrame - A dictionary containing the patch predictions or a DataFrame if `as_dataframe` is True. - """ - # key for predictions - image_id = outputs["image_id"] - self.patch_predictions[image_id] = [] - - # get instances - instances = outputs["instances"].to("cpu") - ctrl_pnts = instances.ctrl_points.numpy() - scores = instances.scores.tolist() - recs = instances.recs - bd_pts = np.asarray(instances.bd) - - self._post_process(image_id, ctrl_pnts, scores, recs, bd_pts) - self._deduplicate(image_id, min_ioa=min_ioa) - - if return_dataframe: - return self._dict_to_dataframe(self.patch_predictions, geo=False) - return self.patch_predictions - - def _process_ctrl_pnt(self, pnt): - points = pnt.reshape(-1, 2) - return points - - def _post_process(self, image_id, ctrl_pnts, scores, recs, bd_pnts, alpha=0.4): - for ctrl_pnt, score, rec, bd in zip(ctrl_pnts, scores, recs, bd_pnts): - # draw polygons - if bd is not None: - bd = np.hsplit(bd, 2) - bd = np.vstack([bd[0], bd[1][::-1]]) - polygon = Polygon(bd).buffer(0) - - if isinstance(polygon, MultiPolygon): - polygon = polygon.convex_hull - - # draw center lines - line = self._process_ctrl_pnt(ctrl_pnt) - line = LineString(line) - - # draw text - text = self._ctc_decode_recognition(rec) - if self.voc_size == 37: - text = text.upper() - # text = "{:.2f}: {}".format(score, text) - text = f"{text}" - score = f"{score:.2f}" - - self.patch_predictions[image_id].append([polygon, text, score]) - - @staticmethod - def _dict_to_dataframe( - preds: dict, - geo: bool = False, - parent: bool = False, - ) -> pd.DataFrame: - """Convert the 
predictions dictionary to a pandas DataFrame. - - Parameters - ---------- - preds : dict - A dictionary of predictions. - geo : bool, optional - Whether the dictionary is georeferenced coords (or pixel bounds), by default True - parent : bool, optional - Whether the dictionary is at parent level, by default False - - Returns - ------- - pd.DataFrame - A pandas DataFrame containing the predictions. - """ - if geo: - columns = ["geometry", "crs", "text", "score"] - else: - columns = ["geometry", "text", "score"] - - if parent: - columns.append("patch_id") - - if len(preds.keys()): - preds_df = pd.concat( - pd.DataFrame( - preds[k], - index=np.full(len(preds[k]), k), - columns=columns, - ) - for k in preds.keys() - ) - else: - preds_df = pd.DataFrame(columns=columns) # empty dataframe - - if geo: - # get the crs (should be the same for all) - if not preds_df["crs"].nunique() == 1: - raise ValueError("[ERROR] Multiple crs found in the predictions.") - crs = preds_df["crs"].unique()[0] - - preds_df = gpd.GeoDataFrame( - preds_df, - geometry="geometry", - crs=crs, - ) - - preds_df.index.name = "image_id" - preds_df.reset_index(inplace=True) # reset index to get image_id as a column - return preds_df - - def search_preds( - self, search_text: str, ignore_case: bool = True, return_dataframe: bool = False - ) -> dict | pd.DataFrame: - """Search the predictions for specific text. Accepts regex. - - Parameters - ---------- - search_text : str - The text to search for. Can be a regex pattern. - ignore_case : bool, optional - Whether to ignore case when searching, by default True. - return_dataframe : bool, optional - Whether to return the results as a pandas DataFrame, by default False. - - Returns - ------- - dict | pd.DataFrame - A dictionary containing the search results or a DataFrame if `return_dataframe` is True. - - Raises - ------ - ValueError - If no parent predictions are found. 
- """ - # reset the search results - self.search_results = {} - - # whether to ignore case - kwargs = {"flags": re.IGNORECASE} if ignore_case else {} - - if self.parent_predictions == {}: - raise ValueError( - "[ERROR] No parent predictions found. You may need to run `convert_to_parent_pixel_bounds()`." - ) - - for image_id, preds in self.parent_predictions.items(): - for instance in preds: - # ["geometry", "text", "score"] - if re.search(search_text, instance[1], **kwargs): - if image_id in self.search_results: - self.search_results[image_id].append(instance) - else: - self.search_results[image_id] = [instance] - - if return_dataframe: - return self._dict_to_dataframe(self.search_results, parent=True) - return self.search_results - - def show_search_results( - self, - parent_id: str, - figsize: tuple | None = (10, 10), - border_color: str | None = "r", - text_color: str | None = "b", - ) -> None: - """Show the search results on an image. - - Parameters - ---------- - parent_id : str - The image ID to show the predictions on (must be parent level). - figsize : tuple | None, optional - The size of the figure, by default (10, 10) - border_color : str | None, optional - The color of the border of the polygons, by default "r" - text_color : str | None, optional - The color of the text, by default "b". - - Raises - ------ - ValueError - If the image ID is not found in the patch or parent predictions. 
- """ - if parent_id in self.parent_predictions.keys(): - image_path = self.parent_df.loc[parent_id, "image_path"] - else: - raise ValueError(f"[ERROR] {parent_id} not found in parent predictions.") - - img = Image.open(image_path) - - fig, ax = plt.subplots(figsize=figsize) - ax.axis("off") - - # check if grayscale - if len(img.getbands()) == 1: - ax.imshow(img, cmap="gray", vmin=0, vmax=255, zorder=1) - else: - ax.imshow(img, zorder=1) - ax.set_title(parent_id) - - preds = self.search_results - - for instance in preds[parent_id]: - # Instance is: - # - [geometry, text, score] for det/rec - polygon = np.array(instance[0].exterior.coords.xy) - center = instance[0].centroid.coords.xy - patch = patches.Polygon(polygon.T, edgecolor=border_color, facecolor="none") - ax.add_patch(patch) - ax.text( - center[0][0], center[1][0], instance[1], fontsize=8, color=text_color - ) - - fig.show() - - def _get_geo_search_results(self): - """Convert search results to georeferenced search results. - - Returns - ------- - dict - Dictionary containing georeferenced search results. - """ - self.check_georeferencing() - if not self.georeferenced: - raise ValueError( - "[ERROR] Cannot convert to coordinates as parent_df does not have 'coordinates' column." 
- ) - - geo_search_results = {} - - for parent_id, prediction in self.search_results.items(): - if parent_id not in geo_search_results.keys(): - geo_search_results[parent_id] = [] - - for instance in prediction: - polygon = instance[0] - - xx, yy = (np.array(i) for i in polygon.exterior.xy) - xx = ( - xx * self.parent_df.loc[parent_id, "dlon"] - + self.parent_df.loc[parent_id, "coordinates"][0] - ) - yy = ( - self.parent_df.loc[parent_id, "coordinates"][3] - - yy * self.parent_df.loc[parent_id, "dlat"] - ) - - crs = self.parent_df.loc[parent_id, "crs"] - - parent_polygon_geo = Polygon(zip(xx, yy)).buffer(0) - geo_search_results[parent_id].append( - [parent_polygon_geo, crs, *instance[1:]] - ) - - return geo_search_results - - def explore_search_results( - self, - parent_id: str, - xyz_url: str | None = None, - style_kwargs: dict | None = None, - ): - self.check_georeferencing() - if not self.georeferenced: - raise ValueError( - "[ERROR] This method only works for georeferenced results. Please ensure parent_df has 'coordinates' column and run `convert_to_coords` first." - ) - - if parent_id not in self.geo_predictions.keys(): - raise ValueError(f"[ERROR] {parent_id} not found in geo predictions.") - - if style_kwargs is None: - style_kwargs = {"fillOpacity": 0.2} - - if xyz_url: - tiles = xyz.TileProvider(name=xyz_url, url=xyz_url, attribution=xyz_url) - else: - tiles = xyz.providers.OpenStreetMap.Mapnik - - geo_search_results = self._get_geo_search_results() - geo_df = self._dict_to_dataframe(geo_search_results, geo=True, parent=True) - - return geo_df[geo_df["image_id"] == parent_id].explore( - tiles=tiles, - style_kwds=style_kwargs, - ) - - def save_search_results_to_geojson( - self, - save_path: str | pathlib.Path, - ) -> None: - """Convert the search results to georeferenced search results and save them to a GeoJSON file. - - Parameters - ---------- - save_path : str | pathlib.Path - The path to save the GeoJSON file. 
- - Raises - ------ - ValueError - If no search results are found. - """ - if self.search_results == {}: - raise ValueError("[ERROR] No results to save!") - - geo_search_results = self._get_geo_search_results() - - geo_df = self._dict_to_dataframe(geo_search_results, geo=True, parent=True) - geo_df.to_file(save_path, driver="GeoJSON", engine="pyogrio") diff --git a/mapreader/spot_text/runner_base.py b/mapreader/spot_text/runner_base.py index 24d94e03..7ca8ff91 100644 --- a/mapreader/spot_text/runner_base.py +++ b/mapreader/spot_text/runner_base.py @@ -12,16 +12,18 @@ import pandas as pd import xyzservices as xyz from PIL import Image -from shapely import Polygon +from shapely import LineString, MultiPolygon, Polygon from tqdm.auto import tqdm from mapreader import MapImages from mapreader.utils.load_frames import load_from_csv, load_from_geojson +from .dataclasses import Prediction -class Runner: + +class DetRunner: def __init__() -> None: - """Initialise the Runner class.""" + """Initialise the DetRunner class.""" # empty in the base class def _load_df( @@ -110,6 +112,52 @@ def _add_coord_increments(self): parent_df, _ = maps.convert_images() self.parent_df = parent_df + @staticmethod + def _dict_to_dataframe( + preds: dict, + ) -> pd.DataFrame: + """Convert the predictions dictionary to a pandas DataFrame. + + Parameters + ---------- + preds : dict + A dictionary of predictions. + + Returns + ------- + pd.DataFrame + A pandas DataFrame containing the predictions. 
+ """ + + if len(preds): + preds_df = pd.concat( + pd.DataFrame( + preds[k], + index=np.full(len(preds[k]), k), + ) + for k in preds.keys() + ) + # drop empty cols + preds_df.dropna(inplace=True, axis=1) + + if "crs" in preds_df.columns: + # get the crs (should be the same for all) + if not preds_df["crs"].nunique() == 1: + raise ValueError("[ERROR] Multiple crs found in the predictions.") + crs = preds_df["crs"].unique()[0] + + preds_df = gpd.GeoDataFrame( + preds_df, + geometry="geometry", + crs=crs, + ) + else: + preds_df = pd.DataFrame() # empty dataframe + + preds_df.index.name = "image_id" + preds_df.reset_index(inplace=True) # reset index to get image_id as a column + return preds_df + def run_all( self, return_dataframe: bool = False, @@ -166,9 +214,7 @@ def run_on_images( _ = self.run_on_image(img_path, return_outputs=False, min_ioa=min_ioa) if return_dataframe: - return self._dict_to_dataframe( - self.patch_predictions, geo=False, parent=False - ) + return self._dict_to_dataframe(self.patch_predictions) return self.patch_predictions def run_on_image( @@ -208,16 +254,13 @@ def run_on_image( if return_outputs: return outputs - self.get_patch_predictions(outputs, min_ioa=min_ioa) - - if return_dataframe: - return self._dict_to_dataframe( - self.patch_predictions, geo=False, parent=False - ) - return self.patch_predictions + patch_predictions = self._get_patch_predictions( + outputs, return_dataframe=return_dataframe, min_ioa=min_ioa + ) + return patch_predictions def _deduplicate(self, image_id, min_ioa=0.7): - polygons = [instance[0] for instance in self.patch_predictions[image_id]] + polygons = [instance.geometry for instance in self.patch_predictions[image_id]] def calc_ioa(polygons, i, j): return polygons[i].intersection(polygons[j]).area / polygons[i].area @@ -272,6 +315,7 @@ def convert_to_parent_pixel_bounds( dict or pd.DataFrame A dictionary of predictions for each parent image or a DataFrame if `return_dataframe` is True. 
""" + self.parent_predictions = {} # reset parent predictions for image_id, prediction in self.patch_predictions.items(): parent_id = self.patch_df.loc[image_id, "parent_id"] @@ -279,7 +323,7 @@ def convert_to_parent_pixel_bounds( self.parent_predictions[parent_id] = [] for instance in prediction: - polygon = instance[0] + polygon = instance.geometry xx, yy = (np.array(i) for i in polygon.exterior.xy) xx = xx + self.patch_df.loc[image_id, "pixel_bounds"][0] # add min_x @@ -287,7 +331,12 @@ def convert_to_parent_pixel_bounds( parent_polygon = Polygon(zip(xx, yy)).buffer(0) self.parent_predictions[parent_id].append( - [parent_polygon, *instance[1:], image_id] + Prediction( + geometry=parent_polygon, + score=instance.score, + text=instance.text, + patch_id=image_id, + ) ) if deduplicate: @@ -295,16 +344,12 @@ def convert_to_parent_pixel_bounds( self._deduplicate_parent_level(parent_id, min_ioa=min_ioa) if return_dataframe: - return self._dict_to_dataframe( - self.parent_predictions, geo=False, parent=True - ) + return self._dict_to_dataframe(self.parent_predictions) return self.parent_predictions def _deduplicate_parent_level(self, image_id, min_ioa=0.7): # get parent predictions for selected parent image - parent_preds = np.array(self.parent_predictions[image_id]) - - all_patches = parent_preds[:, -1] + all_patches = [pred.patch_id for pred in self.parent_predictions[image_id]] patches = np.unique(all_patches).tolist() for patch_i, patch_j in combinations(patches, 2): @@ -322,11 +367,14 @@ def _deduplicate_parent_level(self, image_id, min_ioa=0.7): # get polygons that overlap with the patch intersection polygons = [] - for i, pred in enumerate(parent_preds): - if pred[-1] in [patch_i, patch_j] and pred[0].intersects( - intersection - ): - polygons.append([i, pred[0]]) + for i, pred in enumerate(np.array(self.parent_predictions[image_id])): + if pred is None: + continue + elif pred.patch_id in [ + patch_i, + patch_j, + ] and pred.geometry.intersects(intersection): + 
polygons.append([i, pred.geometry]) def calc_ioa(polygons, i, j): return ( @@ -389,12 +437,14 @@ def convert_to_coords( print("[INFO] Converting patch pixel bounds to parent pixel bounds.") _ = self.convert_to_parent_pixel_bounds() + self.geo_predictions = {} # reset geo predictions + for parent_id, prediction in self.parent_predictions.items(): if parent_id not in self.geo_predictions.keys(): self.geo_predictions[parent_id] = [] for instance in prediction: - polygon = instance[0] + polygon = instance.geometry xx, yy = (np.array(i) for i in polygon.exterior.xy) xx = ( @@ -410,16 +460,23 @@ def convert_to_coords( parent_polygon_geo = Polygon(zip(xx, yy)).buffer(0) self.geo_predictions[parent_id].append( - [parent_polygon_geo, crs, *instance[1:]] + Prediction( + geometry=parent_polygon_geo, + score=instance.score, + text=instance.text, + patch_id=instance.patch_id, + crs=crs, + ) ) if return_dataframe: - return self._dict_to_dataframe(self.geo_predictions, geo=True, parent=True) + return self._dict_to_dataframe(self.geo_predictions) return self.geo_predictions def save_to_geojson( self, save_path: str | pathlib.Path, + centroid: bool = False, ) -> None: """Save the georeferenced predictions to a GeoJSON file. @@ -427,15 +484,26 @@ def save_to_geojson( ---------- save_path : str | pathlib.Path, optional Path to save the GeoJSON file + centroid : bool, optional + Whether to save the centroid of the polygons as the geometry column, by default False. + Note: The original polygon will stil be saved as a separate column. """ if self.geo_predictions == {}: raise ValueError( "[ERROR] No georeferenced predictions found. Run `convert_to_coords` first." 
) - geo_df = self._dict_to_dataframe(self.geo_predictions, geo=True, parent=True) + geo_df = self._dict_to_dataframe(self.geo_predictions) + + if centroid: + geo_df["polygon"] = geo_df["geometry"].to_wkt() + geo_df["geometry"] = geo_df["geometry"].apply(self._polygon_to_centroid) + geo_df.to_file(save_path, driver="GeoJSON", engine="pyogrio") + def _polygon_to_centroid(self, polygon): + return polygon.centroid + def show_predictions( self, image_id: str, @@ -488,17 +556,17 @@ def show_predictions( ax.set_title(image_id) for instance in preds[image_id]: - # Instance is: - # - [geometry, text, score] for det/rec - # - [geometry, score] for det only - polygon = np.array(instance[0].exterior.coords.xy) - center = instance[0].centroid.coords.xy + polygon = np.array(instance.geometry.exterior.coords.xy) + center = instance.geometry.centroid.coords.xy patch = patches.Polygon(polygon.T, edgecolor=border_color, facecolor="none") ax.add_patch(patch) ax.text( - center[0][0], center[1][0], instance[1], fontsize=8, color=text_color + x=center[0][0], + y=center[1][0], + s=instance.text if instance.text is not None else instance.score, + fontsize=8, + color=text_color, ) - fig.show() def explore_predictions( @@ -526,9 +594,301 @@ def explore_predictions( else: tiles = xyz.providers.OpenStreetMap.Mapnik - preds_df = self._dict_to_dataframe(self.geo_predictions, geo=True, parent=True) + preds_df = self._dict_to_dataframe(self.geo_predictions) return preds_df[preds_df["image_id"] == parent_id].explore( tiles=tiles, style_kwds=style_kwargs, ) + + +class DetRecRunner(DetRunner): + def _get_patch_predictions( + self, + outputs: dict, + return_dataframe: bool = False, + min_ioa: float = 0.7, + ) -> dict | pd.DataFrame: + """Post process the model outputs to get patch predictions. + + Parameters + ---------- + outputs : dict + The outputs from the model. 
+ return_dataframe : bool, optional + Whether to return the predictions as a pandas DataFrame, by default False + min_ioa : float, optional + The minimum intersection over area to consider two polygons the same, by default 0.7 + + Returns + ------- + dict or pd.DataFrame + A dictionary containing the patch predictions or a DataFrame if `as_dataframe` is True. + """ + # key for predictions + image_id = outputs["image_id"] + self.patch_predictions[image_id] = [] + + # get instances + instances = outputs["instances"].to("cpu") + ctrl_pnts = instances.ctrl_points.numpy() + scores = instances.scores.tolist() + recs = instances.recs + bd_pts = np.asarray(instances.bd) + + self._post_process(image_id, ctrl_pnts, scores, recs, bd_pts) + self._deduplicate(image_id, min_ioa=min_ioa) + + if return_dataframe: + return self._dict_to_dataframe(self.patch_predictions) + return self.patch_predictions + + def _process_ctrl_pnt(self, pnt): + points = pnt.reshape(-1, 2) + return points + + def _post_process(self, image_id, ctrl_pnts, scores, recs, bd_pnts): + for ctrl_pnt, score, rec, bd in zip(ctrl_pnts, scores, recs, bd_pnts): + # draw polygons + if bd is not None: + bd = np.hsplit(bd, 2) + bd = np.vstack([bd[0], bd[1][::-1]]) + polygon = Polygon(bd).buffer(0) + + if isinstance(polygon, MultiPolygon): + polygon = polygon.convex_hull + + # draw center lines + line = self._process_ctrl_pnt(ctrl_pnt) + line = LineString(line) + + # draw text + text = self._ctc_decode_recognition(rec) + if self.voc_size == 37: + text = text.upper() + # text = "{:.2f}: {}".format(score, text) + text = f"{text}" + score = f"{score:.2f}" + + self.patch_predictions[image_id].append( + Prediction(geometry=polygon, score=score, text=text) + ) + + def search_preds( + self, search_text: str, ignore_case: bool = True, return_dataframe: bool = False + ) -> dict | pd.DataFrame: + """Search the predictions for specific text. Accepts regex. + + Parameters + ---------- + search_text : str + The text to search for. 
Can be a regex pattern. + ignore_case : bool, optional + Whether to ignore case when searching, by default True. + return_dataframe : bool, optional + Whether to return the results as a pandas DataFrame, by default False. + + Returns + ------- + dict | pd.DataFrame + A dictionary containing the search results or a DataFrame if `return_dataframe` is True. + + Raises + ------ + ValueError + If no parent predictions are found. + """ + # reset the search results + self.search_results = {} + + # whether to ignore case + kwargs = {"flags": re.IGNORECASE} if ignore_case else {} + + if self.parent_predictions == {}: + raise ValueError( + "[ERROR] No parent predictions found. You may need to run `convert_to_parent_pixel_bounds()`." + ) + + for image_id, preds in self.parent_predictions.items(): + for instance in preds: + if re.search(search_text, instance.text, **kwargs): + if image_id in self.search_results: + self.search_results[image_id].append(instance) + else: + self.search_results[image_id] = [instance] + + if return_dataframe: + return self._dict_to_dataframe(self.search_results) + return self.search_results + + def show_search_results( + self, + parent_id: str, + figsize: tuple | None = (10, 10), + border_color: str | None = "r", + text_color: str | None = "b", + ) -> None: + """Show the search results on an image. + + Parameters + ---------- + parent_id : str + The image ID to show the predictions on (must be parent level). + figsize : tuple | None, optional + The size of the figure, by default (10, 10) + border_color : str | None, optional + The color of the border of the polygons, by default "r" + text_color : str | None, optional + The color of the text, by default "b". + + Raises + ------ + ValueError + If the image ID is not found in the patch or parent predictions. 
+ """ + if parent_id in self.parent_predictions.keys(): + image_path = self.parent_df.loc[parent_id, "image_path"] + else: + raise ValueError(f"[ERROR] {parent_id} not found in parent predictions.") + + img = Image.open(image_path) + + fig, ax = plt.subplots(figsize=figsize) + ax.axis("off") + + # check if grayscale + if len(img.getbands()) == 1: + ax.imshow(img, cmap="gray", vmin=0, vmax=255, zorder=1) + else: + ax.imshow(img, zorder=1) + ax.set_title(parent_id) + + preds = self.search_results + + for instance in preds[parent_id]: + # Instance is: + # - [geometry, text, score] for det/rec + polygon = np.array(instance.geometry.exterior.coords.xy) + center = instance.geometry.centroid.coords.xy + patch = patches.Polygon(polygon.T, edgecolor=border_color, facecolor="none") + ax.add_patch(patch) + ax.text( + x=center[0][0], + y=center[1][0], + s=instance.text, + fontsize=8, + color=text_color, + ) + + fig.show() + + def _get_geo_search_results(self): + """Convert search results to georeferenced search results. + + Returns + ------- + dict + Dictionary containing georeferenced search results. + """ + self.check_georeferencing() + if not self.georeferenced: + raise ValueError( + "[ERROR] Cannot convert to coordinates as parent_df does not have 'coordinates' column." 
+ ) + + geo_search_results = {} + + for parent_id, prediction in self.search_results.items(): + if parent_id not in geo_search_results.keys(): + geo_search_results[parent_id] = [] + + for instance in prediction: + polygon = instance.geometry + + xx, yy = (np.array(i) for i in polygon.exterior.xy) + xx = ( + xx * self.parent_df.loc[parent_id, "dlon"] + + self.parent_df.loc[parent_id, "coordinates"][0] + ) + yy = ( + self.parent_df.loc[parent_id, "coordinates"][3] + - yy * self.parent_df.loc[parent_id, "dlat"] + ) + + crs = self.parent_df.loc[parent_id, "crs"] + + parent_polygon_geo = Polygon(zip(xx, yy)).buffer(0) + geo_search_results[parent_id].append( + Prediction( + geometry=parent_polygon_geo, + score=instance.score, + text=instance.score, + patch_id=instance.patch_id, + crs=crs, + ) + ) + + return geo_search_results + + def explore_search_results( + self, + parent_id: str, + xyz_url: str | None = None, + style_kwargs: dict | None = None, + ): + self.check_georeferencing() + if not self.georeferenced: + raise ValueError( + "[ERROR] This method only works for georeferenced results. Please ensure parent_df has 'coordinates' column and run `convert_to_coords` first." + ) + + if parent_id not in self.geo_predictions.keys(): + raise ValueError(f"[ERROR] {parent_id} not found in geo predictions.") + + if style_kwargs is None: + style_kwargs = {"fillOpacity": 0.2} + + if xyz_url: + tiles = xyz.TileProvider(name=xyz_url, url=xyz_url, attribution=xyz_url) + else: + tiles = xyz.providers.OpenStreetMap.Mapnik + + geo_search_results = self._get_geo_search_results() + geo_df = self._dict_to_dataframe(geo_search_results) + + return geo_df[geo_df["image_id"] == parent_id].explore( + tiles=tiles, + style_kwds=style_kwargs, + ) + + def save_search_results_to_geojson( + self, + save_path: str | pathlib.Path, + centroid: bool = False, + ) -> None: + """Convert the search results to georeferenced search results and save them to a GeoJSON file. 
+ + Parameters + ---------- + save_path : str | pathlib.Path + The path to save the GeoJSON file. + centroid : bool, optional + Whether to save the centroid of the polygons as the geometry column, by default False. + Note: The original polygon will stil be saved as a separate column. + + Raises + ------ + ValueError + If no search results are found. + """ + if self.search_results == {}: + raise ValueError("[ERROR] No results to save!") + + geo_search_results = self._get_geo_search_results() + geo_df = self._dict_to_dataframe(geo_search_results) + + if centroid: + geo_df["polygon"] = geo_df["geometry"].to_wkt() + geo_df["geometry"] = geo_df["geometry"].apply(self._polygon_to_centroid) + + geo_df.to_file(save_path, driver="GeoJSON", engine="pyogrio") From a7c3e84b8c43e0c4907c79684b5625707b9e6514 Mon Sep 17 00:00:00 2001 From: Rosie Wood Date: Thu, 21 Nov 2024 13:39:11 +0000 Subject: [PATCH 02/15] update tests --- tests/test_text_spotting/test_deepsolo_runner.py | 9 ++++++--- tests/test_text_spotting/test_dptext_runner.py | 6 +++--- tests/test_text_spotting/test_maptext_runner.py | 6 +++--- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/tests/test_text_spotting/test_deepsolo_runner.py b/tests/test_text_spotting/test_deepsolo_runner.py index 22a9713d..9247c46c 100644 --- a/tests/test_text_spotting/test_deepsolo_runner.py +++ b/tests/test_text_spotting/test_deepsolo_runner.py @@ -13,6 +13,7 @@ from mapreader import DeepSoloRunner from mapreader.load import MapImages +from mapreader.spot_text.dataclasses import Prediction # use cloned DeepSolo path if running in github actions DEEPSOLO_PATH = ( @@ -142,8 +143,9 @@ def test_deepsolo_run_all(init_runner, mock_response): assert isinstance(out, dict) assert "patch-0-0-800-40-#mapreader_text.png#.png" in out.keys() assert isinstance(out["patch-0-0-800-40-#mapreader_text.png#.png"], list) + assert isinstance(out["patch-0-0-800-40-#mapreader_text.png#.png"][0], Prediction) # dataframe - out = 
runner._dict_to_dataframe(runner.patch_predictions, geo=False, parent=False) + out = runner._dict_to_dataframe(runner.patch_predictions) assert isinstance(out, pd.DataFrame) assert set(out.columns) == set(["image_id", "geometry", "text", "score"]) assert "patch-0-0-800-40-#mapreader_text.png#.png" in out["image_id"].values @@ -156,8 +158,9 @@ def test_deepsolo_convert_to_parent(runner_run_all, mock_response): assert isinstance(out, dict) assert "mapreader_text.png" in out.keys() assert isinstance(out["mapreader_text.png"], list) + assert isinstance(out["mapreader_text.png"][0], Prediction) # dataframe - out = runner._dict_to_dataframe(runner.parent_predictions, geo=False, parent=True) + out = runner._dict_to_dataframe(runner.parent_predictions) assert isinstance(out, pd.DataFrame) assert set(out.columns) == set( ["image_id", "patch_id", "geometry", "text", "score"] @@ -173,7 +176,7 @@ def test_deepsolo_convert_to_parent_coords(runner_run_all, mock_response): assert "mapreader_text.png" in out.keys() assert isinstance(out["mapreader_text.png"], list) # dataframe - out = runner._dict_to_dataframe(runner.geo_predictions, geo=True, parent=True) + out = runner._dict_to_dataframe(runner.geo_predictions) assert isinstance(out, gpd.GeoDataFrame) assert set(out.columns) == set( ["image_id", "patch_id", "geometry", "crs", "text", "score"] diff --git a/tests/test_text_spotting/test_dptext_runner.py b/tests/test_text_spotting/test_dptext_runner.py index cf62baa8..d8e4199b 100644 --- a/tests/test_text_spotting/test_dptext_runner.py +++ b/tests/test_text_spotting/test_dptext_runner.py @@ -144,7 +144,7 @@ def test_dptext_run_all(init_runner, mock_response): assert "patch-0-0-800-40-#mapreader_text.png#.png" in out.keys() assert isinstance(out["patch-0-0-800-40-#mapreader_text.png#.png"], list) # dataframe - out = runner._dict_to_dataframe(runner.patch_predictions, geo=False, parent=False) + out = runner._dict_to_dataframe(runner.patch_predictions) assert isinstance(out, 
pd.DataFrame) assert set(out.columns) == set(["image_id", "geometry", "score"]) assert "patch-0-0-800-40-#mapreader_text.png#.png" in out["image_id"].values @@ -158,7 +158,7 @@ def test_dptext_convert_to_parent(runner_run_all, mock_response): assert "mapreader_text.png" in out.keys() assert isinstance(out["mapreader_text.png"], list) # dataframe - out = runner._dict_to_dataframe(runner.parent_predictions, geo=False, parent=True) + out = runner._dict_to_dataframe(runner.parent_predictions) assert isinstance(out, pd.DataFrame) assert set(out.columns) == set(["image_id", "patch_id", "geometry", "score"]) assert "mapreader_text.png" in out["image_id"].values @@ -172,7 +172,7 @@ def test_dptext_convert_to_parent_coords(runner_run_all, mock_response): assert "mapreader_text.png" in out.keys() assert isinstance(out["mapreader_text.png"], list) # dataframe - out = runner._dict_to_dataframe(runner.geo_predictions, geo=True, parent=True) + out = runner._dict_to_dataframe(runner.geo_predictions) assert isinstance(out, gpd.GeoDataFrame) assert set(out.columns) == set(["image_id", "patch_id", "geometry", "crs", "score"]) assert "mapreader_text.png" in out["image_id"].values diff --git a/tests/test_text_spotting/test_maptext_runner.py b/tests/test_text_spotting/test_maptext_runner.py index 2952013c..9c593872 100644 --- a/tests/test_text_spotting/test_maptext_runner.py +++ b/tests/test_text_spotting/test_maptext_runner.py @@ -144,7 +144,7 @@ def test_maptext_run_all(init_runner, mock_response): assert "patch-0-0-800-40-#mapreader_text.png#.png" in out.keys() assert isinstance(out["patch-0-0-800-40-#mapreader_text.png#.png"], list) # dataframe - out = runner._dict_to_dataframe(runner.patch_predictions, geo=False, parent=False) + out = runner._dict_to_dataframe(runner.patch_predictions) assert isinstance(out, pd.DataFrame) assert set(out.columns) == set(["image_id", "geometry", "text", "score"]) assert "patch-0-0-800-40-#mapreader_text.png#.png" in out["image_id"].values @@ -158,7 
+158,7 @@ def test_maptext_convert_to_parent(runner_run_all, mock_response): assert "mapreader_text.png" in out.keys() assert isinstance(out["mapreader_text.png"], list) # dataframe - out = runner._dict_to_dataframe(runner.parent_predictions, geo=False, parent=True) + out = runner._dict_to_dataframe(runner.parent_predictions) assert isinstance(out, pd.DataFrame) assert set(out.columns) == set( ["image_id", "patch_id", "geometry", "text", "score"] @@ -174,7 +174,7 @@ def test_maptext_convert_to_parent_coords(runner_run_all, mock_response): assert "mapreader_text.png" in out.keys() assert isinstance(out["mapreader_text.png"], list) # dataframe - out = runner._dict_to_dataframe(runner.geo_predictions, geo=True, parent=True) + out = runner._dict_to_dataframe(runner.geo_predictions) assert isinstance(out, gpd.GeoDataFrame) assert set(out.columns) == set( ["image_id", "patch_id", "geometry", "crs", "text", "score"] From 821f455cc2096ea8f56ca2d6febe60a55713f153 Mon Sep 17 00:00:00 2001 From: Rosie Wood Date: Fri, 22 Nov 2024 15:51:58 +0000 Subject: [PATCH 03/15] update dataclasses, add load_predictions method for loading from geojson --- mapreader/spot_text/dataclasses.py | 26 ++++-- mapreader/spot_text/runner_base.py | 131 ++++++++++++++++++++++++----- 2 files changed, 129 insertions(+), 28 deletions(-) diff --git a/mapreader/spot_text/dataclasses.py b/mapreader/spot_text/dataclasses.py index 4b155f40..97098e35 100644 --- a/mapreader/spot_text/dataclasses.py +++ b/mapreader/spot_text/dataclasses.py @@ -4,13 +4,27 @@ from shapely.geometry import Polygon -# Detection only +@dataclass(frozen=True) +class PatchPrediction: + pixel_geometry: Polygon + score: float + text: str = None -@dataclass -class Prediction: - geometry: Polygon + +@dataclass(frozen=True) +class ParentPrediction: + pixel_geometry: Polygon score: float + patch_id: str + text: str = None + + +@dataclass(frozen=True) +class GeoPrediction: + pixel_geometry: Polygon + score: float + patch_id: str + geometry: 
Polygon + crs: str text: str = None - patch_id: str | None = None - crs: str | None = None diff --git a/mapreader/spot_text/runner_base.py b/mapreader/spot_text/runner_base.py index 7ca8ff91..77c1ee71 100644 --- a/mapreader/spot_text/runner_base.py +++ b/mapreader/spot_text/runner_base.py @@ -12,13 +12,13 @@ import pandas as pd import xyzservices as xyz from PIL import Image -from shapely import LineString, MultiPolygon, Polygon +from shapely import LineString, MultiPolygon, Polygon, from_wkt from tqdm.auto import tqdm from mapreader import MapImages from mapreader.utils.load_frames import load_from_csv, load_from_geojson -from .dataclasses import Prediction +from .dataclasses import GeoPrediction, ParentPrediction, PatchPrediction class DetRunner: @@ -260,7 +260,9 @@ def run_on_image( return patch_predictions def _deduplicate(self, image_id, min_ioa=0.7): - polygons = [instance.geometry for instance in self.patch_predictions[image_id]] + polygons = [ + instance.pixel_geometry for instance in self.patch_predictions[image_id] + ] def calc_ioa(polygons, i, j): return polygons[i].intersection(polygons[j]).area / polygons[i].area @@ -323,7 +325,7 @@ def convert_to_parent_pixel_bounds( self.parent_predictions[parent_id] = [] for instance in prediction: - polygon = instance.geometry + polygon = instance.pixel_geometry xx, yy = (np.array(i) for i in polygon.exterior.xy) xx = xx + self.patch_df.loc[image_id, "pixel_bounds"][0] # add min_x @@ -331,8 +333,8 @@ def convert_to_parent_pixel_bounds( parent_polygon = Polygon(zip(xx, yy)).buffer(0) self.parent_predictions[parent_id].append( - Prediction( - geometry=parent_polygon, + ParentPrediction( + pixel_geometry=parent_polygon, score=instance.score, text=instance.text, patch_id=image_id, @@ -373,8 +375,8 @@ def _deduplicate_parent_level(self, image_id, min_ioa=0.7): elif pred.patch_id in [ patch_i, patch_j, - ] and pred.geometry.intersects(intersection): - polygons.append([i, pred.geometry]) + ] and 
pred.pixel_geometry.intersects(intersection): + polygons.append([i, pred.pixel_geometry]) def calc_ioa(polygons, i, j): return ( @@ -444,7 +446,7 @@ def convert_to_coords( self.geo_predictions[parent_id] = [] for instance in prediction: - polygon = instance.geometry + polygon = instance.pixel_geometry xx, yy = (np.array(i) for i in polygon.exterior.xy) xx = ( @@ -456,15 +458,16 @@ def convert_to_coords( - yy * self.parent_df.loc[parent_id, "dlat"] ) + parent_polygon_geo = Polygon(zip(xx, yy)).buffer(0) crs = self.parent_df.loc[parent_id, "crs"] - parent_polygon_geo = Polygon(zip(xx, yy)).buffer(0) self.geo_predictions[parent_id].append( - Prediction( - geometry=parent_polygon_geo, + GeoPrediction( + pixel_geometry=instance.pixel_geometry, score=instance.score, text=instance.text, patch_id=instance.patch_id, + geometry=parent_polygon_geo, crs=crs, ) ) @@ -556,8 +559,8 @@ def show_predictions( ax.set_title(image_id) for instance in preds[image_id]: - polygon = np.array(instance.geometry.exterior.coords.xy) - center = instance.geometry.centroid.coords.xy + polygon = np.array(instance.pixel_geometry.exterior.coords.xy) + center = instance.pixel_geometry.centroid.coords.xy patch = patches.Polygon(polygon.T, edgecolor=border_color, facecolor="none") ax.add_patch(patch) ax.text( @@ -601,6 +604,91 @@ def explore_predictions( style_kwds=style_kwargs, ) + def load_predictions( + self, + path_save: str | pathlib.Path, + ): + """Load georeferenced text predictions from a GeoJSON file. + + Parameters + ---------- + path_save : str | pathlib.Path + The path to the GeoJSON file. + + Raises + ------ + ValueError + If the path does not point to a GeoJSON file. + + Note + ---- + This will overwrite any existing predictions! 
+ """ + if re.search(r"\..*?json$", str(path_save)): + preds_df = load_from_geojson(path_save, engine="pyogrio") + else: + raise ValueError("[ERROR] ``path_save`` must be a path to a geojson file.") + + # convert pixel_geometry to shapely geometry + preds_df["pixel_geometry"] = preds_df["pixel_geometry"].apply( + lambda x: from_wkt(x) + ) + + self.geo_predictions = {} + self.parent_predictions = {} + + for image_id in preds_df.index.unique(): + if image_id not in self.geo_predictions.keys(): + self.geo_predictions[image_id] = [] + if image_id not in self.parent_predictions.keys(): + self.parent_predictions[image_id] = [] + + for _, v in preds_df[preds_df.index == image_id].iterrows(): + self.geo_predictions[image_id].append( + GeoPrediction( + pixel_geometry=v.pixel_geometry, + score=v.score, + text=v.text, + patch_id=v.patch_id, + geometry=v.geometry, + crs=v.crs, + ) + ) + self.parent_predictions[image_id].append( + ParentPrediction( + pixel_geometry=v.pixel_geometry, + score=v.score, + text=v.text, + patch_id=v.patch_id, + ) + ) + + self.patch_predictions = {} # reset patch predictions + + for _, prediction in self.parent_predictions.items(): + for instance in prediction: + if instance.patch_id not in self.patch_predictions.keys(): + self.patch_predictions[instance.patch_id] = [] + + polygon = instance.pixel_geometry + + xx, yy = (np.array(i) for i in polygon.exterior.xy) + xx = ( + xx - self.patch_df.loc[instance.patch_id, "pixel_bounds"][0] + ) # add min_x + yy = ( + yy - self.patch_df.loc[instance.patch_id, "pixel_bounds"][1] + ) # add min_y + + patch_polygon = Polygon(zip(xx, yy)).buffer(0) + self.patch_predictions[instance.patch_id].append( + PatchPrediction( + pixel_geometry=patch_polygon, + score=instance.score, + text=instance.text, + ) + ) + class DetRecRunner(DetRunner): def _get_patch_predictions( @@ -671,7 +759,7 @@ def _post_process(self, image_id, ctrl_pnts, scores, recs, bd_pnts): score = f"{score:.2f}" self.patch_predictions[image_id].append( - 
Prediction(geometry=polygon, score=score, text=text) + PatchPrediction(pixel_geometry=polygon, score=score, text=text) ) def search_preds( @@ -766,10 +854,8 @@ def show_search_results( preds = self.search_results for instance in preds[parent_id]: - # Instance is: - # - [geometry, text, score] for det/rec - polygon = np.array(instance.geometry.exterior.coords.xy) - center = instance.geometry.centroid.coords.xy + polygon = np.array(instance.pixel_geometry.exterior.coords.xy) + center = instance.pixel_geometry.centroid.coords.xy patch = patches.Polygon(polygon.T, edgecolor=border_color, facecolor="none") ax.add_patch(patch) ax.text( @@ -803,7 +889,7 @@ def _get_geo_search_results(self): geo_search_results[parent_id] = [] for instance in prediction: - polygon = instance.geometry + polygon = instance.pixel_geometry xx, yy = (np.array(i) for i in polygon.exterior.xy) xx = ( @@ -819,11 +905,12 @@ def _get_geo_search_results(self): parent_polygon_geo = Polygon(zip(xx, yy)).buffer(0) geo_search_results[parent_id].append( - Prediction( - geometry=parent_polygon_geo, + GeoPrediction( + pixel_geometry=instance.pixel_geometry, score=instance.score, text=instance.score, patch_id=instance.patch_id, + geometry=parent_polygon_geo, crs=crs, ) ) From a4127b768f2cae275e3a13af917806538204094f Mon Sep 17 00:00:00 2001 From: Rosie Wood Date: Wed, 27 Nov 2024 13:36:28 +0000 Subject: [PATCH 04/15] fix imports --- mapreader/spot_text/dptext_detr_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mapreader/spot_text/dptext_detr_runner.py b/mapreader/spot_text/dptext_detr_runner.py index 9b17f1c5..8ed24b6a 100644 --- a/mapreader/spot_text/dptext_detr_runner.py +++ b/mapreader/spot_text/dptext_detr_runner.py @@ -21,7 +21,7 @@ from dptext_detr.config import get_cfg from shapely import MultiPolygon, Polygon -from .dataclasses import Prediction +from .dataclasses import PatchPrediction from .runner_base import DetRunner @@ -124,5 +124,5 @@ def _post_process(self, 
image_id, scores, pred_classes, bd_pnts): score = f"{score:.2f}" self.patch_predictions[image_id].append( - Prediction(geometry=polygon, score=score) + PatchPrediction(geometry=polygon, score=score) ) From b7e3b0b46ceb10459221362c9f88adaa94880d38 Mon Sep 17 00:00:00 2001 From: Rosie Wood Date: Wed, 27 Nov 2024 13:50:30 +0000 Subject: [PATCH 05/15] fix deepsolo tests --- .../test_deepsolo_runner.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/tests/test_text_spotting/test_deepsolo_runner.py b/tests/test_text_spotting/test_deepsolo_runner.py index 9247c46c..d935ec0b 100644 --- a/tests/test_text_spotting/test_deepsolo_runner.py +++ b/tests/test_text_spotting/test_deepsolo_runner.py @@ -13,7 +13,7 @@ from mapreader import DeepSoloRunner from mapreader.load import MapImages -from mapreader.spot_text.dataclasses import Prediction +from mapreader.spot_text.dataclasses import ParentPrediction, PatchPrediction # use cloned DeepSolo path if running in github actions DEEPSOLO_PATH = ( @@ -143,11 +143,20 @@ def test_deepsolo_run_all(init_runner, mock_response): assert isinstance(out, dict) assert "patch-0-0-800-40-#mapreader_text.png#.png" in out.keys() assert isinstance(out["patch-0-0-800-40-#mapreader_text.png#.png"], list) - assert isinstance(out["patch-0-0-800-40-#mapreader_text.png#.png"][0], Prediction) + assert isinstance( + out["patch-0-0-800-40-#mapreader_text.png#.png"][0], PatchPrediction + ) # dataframe out = runner._dict_to_dataframe(runner.patch_predictions) assert isinstance(out, pd.DataFrame) - assert set(out.columns) == set(["image_id", "geometry", "text", "score"]) + assert set(out.columns) == set( + [ + "image_id", + "pixel_geometry", + "text", + "score", + ] + ) assert "patch-0-0-800-40-#mapreader_text.png#.png" in out["image_id"].values @@ -158,12 +167,12 @@ def test_deepsolo_convert_to_parent(runner_run_all, mock_response): assert isinstance(out, dict) assert "mapreader_text.png" in out.keys() assert 
isinstance(out["mapreader_text.png"], list) - assert isinstance(out["mapreader_text.png"][0], Prediction) + assert isinstance(out["mapreader_text.png"][0], ParentPrediction) # dataframe out = runner._dict_to_dataframe(runner.parent_predictions) assert isinstance(out, pd.DataFrame) assert set(out.columns) == set( - ["image_id", "patch_id", "geometry", "text", "score"] + ["image_id", "patch_id", "pixel_geometry", "text", "score"] ) assert "mapreader_text.png" in out["image_id"].values @@ -179,7 +188,7 @@ def test_deepsolo_convert_to_parent_coords(runner_run_all, mock_response): out = runner._dict_to_dataframe(runner.geo_predictions) assert isinstance(out, gpd.GeoDataFrame) assert set(out.columns) == set( - ["image_id", "patch_id", "geometry", "crs", "text", "score"] + ["image_id", "patch_id", "pixel_geometry", "geometry", "crs", "text", "score"] ) assert "mapreader_text.png" in out["image_id"].values assert out.crs == runner.parent_df.crs @@ -230,7 +239,7 @@ def test_deepsolo_save_to_geojson(runner_run_all, tmp_path, mock_response): gdf = gpd.read_file(f"{tmp_path}/text.geojson") assert isinstance(gdf, gpd.GeoDataFrame) assert set(gdf.columns) == set( - ["image_id", "patch_id", "geometry", "crs", "text", "score"] + ["image_id", "patch_id", "pixel_geometry", "geometry", "crs", "text", "score"] ) @@ -244,7 +253,7 @@ def test_deepsolo_search_preds(runner_run_all, mock_response): out = runner.search_preds("map", ignore_case=True, return_dataframe=True) assert isinstance(out, pd.DataFrame) assert set(out.columns) == set( - ["image_id", "patch_id", "geometry", "text", "score"] + ["image_id", "patch_id", "pixel_geometry", "text", "score"] ) assert "mapreader_text.png" in out["image_id"].values out = runner.search_preds("somethingelse", ignore_case=True, return_dataframe=True) @@ -267,7 +276,7 @@ def test_deepsolo_save_search_results(runner_run_all, tmp_path, mock_response): gdf = gpd.read_file(f"{tmp_path}/search_results.geojson") assert isinstance(gdf, gpd.GeoDataFrame) 
assert set(gdf.columns) == set( - ["image_id", "patch_id", "geometry", "crs", "text", "score"] + ["image_id", "patch_id", "pixel_geometry", "geometry", "crs", "text", "score"] ) assert "mapreader_text.png" in gdf["image_id"].values From db3c5c507bd89574fab33e532640013c190c3f47 Mon Sep 17 00:00:00 2001 From: Rosie Wood Date: Wed, 27 Nov 2024 14:02:31 +0000 Subject: [PATCH 06/15] add/update tests --- .../test_deepsolo_runner.py | 7 +++++- .../test_text_spotting/test_dptext_runner.py | 22 +++++++++++++++---- .../test_text_spotting/test_maptext_runner.py | 22 ++++++++++++++----- 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/tests/test_text_spotting/test_deepsolo_runner.py b/tests/test_text_spotting/test_deepsolo_runner.py index d935ec0b..eb22c8ed 100644 --- a/tests/test_text_spotting/test_deepsolo_runner.py +++ b/tests/test_text_spotting/test_deepsolo_runner.py @@ -13,7 +13,11 @@ from mapreader import DeepSoloRunner from mapreader.load import MapImages -from mapreader.spot_text.dataclasses import ParentPrediction, PatchPrediction +from mapreader.spot_text.dataclasses import ( + GeoPrediction, + ParentPrediction, + PatchPrediction, +) # use cloned DeepSolo path if running in github actions DEEPSOLO_PATH = ( @@ -184,6 +188,7 @@ def test_deepsolo_convert_to_parent_coords(runner_run_all, mock_response): assert isinstance(out, dict) assert "mapreader_text.png" in out.keys() assert isinstance(out["mapreader_text.png"], list) + assert isinstance(out["mapreader_text.png"][0], GeoPrediction) # dataframe out = runner._dict_to_dataframe(runner.geo_predictions) assert isinstance(out, gpd.GeoDataFrame) diff --git a/tests/test_text_spotting/test_dptext_runner.py b/tests/test_text_spotting/test_dptext_runner.py index d8e4199b..0c42fea0 100644 --- a/tests/test_text_spotting/test_dptext_runner.py +++ b/tests/test_text_spotting/test_dptext_runner.py @@ -13,6 +13,11 @@ from mapreader import DPTextDETRRunner from mapreader.load import MapImages +from 
mapreader.spot_text.dataclasses import ( + GeoPrediction, + ParentPrediction, + PatchPrediction, +) # use cloned DPText-DETR path if running in github actions DPTEXT_DETR_PATH = ( @@ -143,10 +148,13 @@ def test_dptext_run_all(init_runner, mock_response): assert isinstance(out, dict) assert "patch-0-0-800-40-#mapreader_text.png#.png" in out.keys() assert isinstance(out["patch-0-0-800-40-#mapreader_text.png#.png"], list) + assert isinstance( + out["patch-0-0-800-40-#mapreader_text.png#.png"][0], PatchPrediction + ) # dataframe out = runner._dict_to_dataframe(runner.patch_predictions) assert isinstance(out, pd.DataFrame) - assert set(out.columns) == set(["image_id", "geometry", "score"]) + assert set(out.columns) == set(["image_id", "pixel_geometry", "score"]) assert "patch-0-0-800-40-#mapreader_text.png#.png" in out["image_id"].values @@ -157,10 +165,11 @@ def test_dptext_convert_to_parent(runner_run_all, mock_response): assert isinstance(out, dict) assert "mapreader_text.png" in out.keys() assert isinstance(out["mapreader_text.png"], list) + assert isinstance(out["mapreader_text.png"][0], ParentPrediction) # dataframe out = runner._dict_to_dataframe(runner.parent_predictions) assert isinstance(out, pd.DataFrame) - assert set(out.columns) == set(["image_id", "patch_id", "geometry", "score"]) + assert set(out.columns) == set(["image_id", "patch_id", "pixel_geometry", "score"]) assert "mapreader_text.png" in out["image_id"].values @@ -171,10 +180,13 @@ def test_dptext_convert_to_parent_coords(runner_run_all, mock_response): assert isinstance(out, dict) assert "mapreader_text.png" in out.keys() assert isinstance(out["mapreader_text.png"], list) + assert isinstance(out["mapreader_text.png"][0], GeoPrediction) # dataframe out = runner._dict_to_dataframe(runner.geo_predictions) assert isinstance(out, gpd.GeoDataFrame) - assert set(out.columns) == set(["image_id", "patch_id", "geometry", "crs", "score"]) + assert set(out.columns) == set( + ["image_id", "patch_id", 
"pixel_geometry", "geometry", "crs", "score"] + ) assert "mapreader_text.png" in out["image_id"].values assert out.crs == runner.parent_df.crs @@ -223,4 +235,6 @@ def test_dptext_save_to_geojson(runner_run_all, tmp_path, mock_response): assert os.path.exists(f"{tmp_path}/text.geojson") gdf = gpd.read_file(f"{tmp_path}/text.geojson") assert isinstance(gdf, gpd.GeoDataFrame) - assert set(gdf.columns) == set(["image_id", "patch_id", "geometry", "crs", "score"]) + assert set(gdf.columns) == set( + ["image_id", "patch_id", "pixel_geometry", "geometry", "crs", "score"] + ) diff --git a/tests/test_text_spotting/test_maptext_runner.py b/tests/test_text_spotting/test_maptext_runner.py index 9c593872..6bd20b08 100644 --- a/tests/test_text_spotting/test_maptext_runner.py +++ b/tests/test_text_spotting/test_maptext_runner.py @@ -13,6 +13,11 @@ from mapreader import MapTextRunner from mapreader.load import MapImages +from mapreader.spot_text.dataclasses import ( + GeoPrediction, + ParentPrediction, + PatchPrediction, +) # use cloned MapTextPipeline path if running in github actions MAPTEXTPIPELINE_PATH = ( @@ -143,10 +148,13 @@ def test_maptext_run_all(init_runner, mock_response): assert isinstance(out, dict) assert "patch-0-0-800-40-#mapreader_text.png#.png" in out.keys() assert isinstance(out["patch-0-0-800-40-#mapreader_text.png#.png"], list) + assert isinstance( + out["patch-0-0-800-40-#mapreader_text.png#.png"][0], PatchPrediction + ) # dataframe out = runner._dict_to_dataframe(runner.patch_predictions) assert isinstance(out, pd.DataFrame) - assert set(out.columns) == set(["image_id", "geometry", "text", "score"]) + assert set(out.columns) == set(["image_id", "pixel_geometry", "text", "score"]) assert "patch-0-0-800-40-#mapreader_text.png#.png" in out["image_id"].values @@ -157,11 +165,12 @@ def test_maptext_convert_to_parent(runner_run_all, mock_response): assert isinstance(out, dict) assert "mapreader_text.png" in out.keys() assert isinstance(out["mapreader_text.png"], 
list) + assert isinstance(out["mapreader_text.png"][0], ParentPrediction) # dataframe out = runner._dict_to_dataframe(runner.parent_predictions) assert isinstance(out, pd.DataFrame) assert set(out.columns) == set( - ["image_id", "patch_id", "geometry", "text", "score"] + ["image_id", "patch_id", "pixel_geometry", "text", "score"] ) assert "mapreader_text.png" in out["image_id"].values @@ -173,11 +182,12 @@ def test_maptext_convert_to_parent_coords(runner_run_all, mock_response): assert isinstance(out, dict) assert "mapreader_text.png" in out.keys() assert isinstance(out["mapreader_text.png"], list) + assert isinstance(out["mapreader_text.png"][0], GeoPrediction) # dataframe out = runner._dict_to_dataframe(runner.geo_predictions) assert isinstance(out, gpd.GeoDataFrame) assert set(out.columns) == set( - ["image_id", "patch_id", "geometry", "crs", "text", "score"] + ["image_id", "patch_id", "pixel_geometry", "geometry", "crs", "text", "score"] ) assert "mapreader_text.png" in out["image_id"].values assert out.crs == runner.parent_df.crs @@ -228,7 +238,7 @@ def test_maptext_save_to_geojson(runner_run_all, tmp_path, mock_response): gdf = gpd.read_file(f"{tmp_path}/text.geojson") assert isinstance(gdf, gpd.GeoDataFrame) assert set(gdf.columns) == set( - ["image_id", "patch_id", "geometry", "crs", "text", "score"] + ["image_id", "patch_id", "pixel_geometry", "geometry", "crs", "text", "score"] ) @@ -242,7 +252,7 @@ def test_maptext_search_preds(runner_run_all, mock_response): out = runner.search_preds("map", ignore_case=True, return_dataframe=True) assert isinstance(out, pd.DataFrame) assert set(out.columns) == set( - ["image_id", "patch_id", "geometry", "text", "score"] + ["image_id", "patch_id", "pixel_geometry", "text", "score"] ) assert "mapreader_text.png" in out["image_id"].values out = runner.search_preds("somethingelse", ignore_case=True, return_dataframe=True) @@ -265,7 +275,7 @@ def test_maptext_save_search_results(runner_run_all, tmp_path, mock_response): gdf 
= gpd.read_file(f"{tmp_path}/search_results.geojson") assert isinstance(gdf, gpd.GeoDataFrame) assert set(gdf.columns) == set( - ["image_id", "patch_id", "geometry", "crs", "text", "score"] + ["image_id", "patch_id", "pixel_geometry", "geometry", "crs", "text", "score"] ) assert "mapreader_text.png" in gdf["image_id"].values From 3af480b8d7bab9559aa3ef6dd171c9d99ef79bfe Mon Sep 17 00:00:00 2001 From: Rosie Wood Date: Fri, 29 Nov 2024 09:36:14 +0000 Subject: [PATCH 07/15] fix naming error --- mapreader/spot_text/dptext_detr_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mapreader/spot_text/dptext_detr_runner.py b/mapreader/spot_text/dptext_detr_runner.py index 8ed24b6a..c8bbb16b 100644 --- a/mapreader/spot_text/dptext_detr_runner.py +++ b/mapreader/spot_text/dptext_detr_runner.py @@ -124,5 +124,5 @@ def _post_process(self, image_id, scores, pred_classes, bd_pnts): score = f"{score:.2f}" self.patch_predictions[image_id].append( - PatchPrediction(geometry=polygon, score=score) + PatchPrediction(pixel_geometry=polygon, score=score) ) From 807e715984385e16ac640922c6317b5c3a456e27 Mon Sep 17 00:00:00 2001 From: Rosie Wood Date: Wed, 4 Dec 2024 13:31:52 +0000 Subject: [PATCH 08/15] add save to /load from csv --- mapreader/spot_text/runner_base.py | 134 ++++++++++++++++++++++++----- 1 file changed, 113 insertions(+), 21 deletions(-) diff --git a/mapreader/spot_text/runner_base.py b/mapreader/spot_text/runner_base.py index 77c1ee71..352c7378 100644 --- a/mapreader/spot_text/runner_base.py +++ b/mapreader/spot_text/runner_base.py @@ -16,7 +16,7 @@ from tqdm.auto import tqdm from mapreader import MapImages -from mapreader.utils.load_frames import load_from_csv, load_from_geojson +from mapreader.utils.load_frames import eval_dataframe, load_from_csv, load_from_geojson from .dataclasses import GeoPrediction, ParentPrediction, PatchPrediction @@ -478,18 +478,18 @@ def convert_to_coords( def save_to_geojson( self, - save_path: str | pathlib.Path, + 
path_save: str | pathlib.Path, centroid: bool = False, ) -> None: """Save the georeferenced predictions to a GeoJSON file. Parameters ---------- - save_path : str | pathlib.Path, optional + path_save : str | pathlib.Path, optional Path to save the GeoJSON file centroid : bool, optional - Whether to save the centroid of the polygons as the geometry column, by default False. - Note: The original polygon will stil be saved as a separate column. + Whether to convert the polygons to centroids, by default False. + NOTE: The original polygon will still be saved as a separate column """ if self.geo_predictions == {}: raise ValueError( @@ -500,12 +500,61 @@ def save_to_geojson( if centroid: geo_df["polygon"] = geo_df["geometry"].to_wkt() - geo_df["geometry"] = geo_df["geometry"].apply(self._polygon_to_centroid) + geo_df["geometry"] = geo_df["geometry"].centroid - geo_df.to_file(save_path, driver="GeoJSON", engine="pyogrio") + geo_df.to_file(path_save, driver="GeoJSON", engine="pyogrio") - def _polygon_to_centroid(self, polygon): - return polygon.centroid + def save_to_csv( + self, + path_save: str | pathlib.Path, + centroid: bool = False, + ) -> None: + """Saves the patch, parent and georeferenced predictions to CSV files. + + Parameters + ---------- + path_save : str | pathlib.Path + The path to save the CSV files. Files will be saved as `patch_predictions.csv`, `parent_predictions.csv` and `geo_predictions.csv`. + centroid : bool, optional + Whether to convert polygons to centroids, by default False. + NOTE: The original polygon will still be saved as a separate column. + + Note + ---- + Use the `save_to_geojson` method to save georeferenced predictions to a GeoJSON file. 
+ """ + if self.patch_predictions == {}: # implies no parent or geo predictions + raise ValueError("[ERROR] No patch predictions found.") + + if not os.path.exists(path_save): + os.makedirs(path_save) + + print("[INFO] Saving patch predictions.") + patch_df = self._dict_to_dataframe(self.patch_predictions) + if centroid: + patch_df["polygon"] = patch_df["pixel_geometry"] + patch_df["pixel_geometry"] = patch_df["pixel_geometry"].apply( + lambda x: x.centroid + ) + patch_df.to_csv(f"{path_save}/patch_predictions.csv") + + if self.parent_predictions != {}: + print("[INFO] Saving parent predictions.") + parent_df = self._dict_to_dataframe(self.parent_predictions) + if centroid: + parent_df["polygon"] = parent_df["pixel_geometry"] + parent_df["pixel_geometry"] = parent_df["pixel_geometry"].apply( + lambda x: x.centroid + ) + parent_df.to_csv(f"{path_save}/parent_predictions.csv") + + if self.geo_predictions != {}: + print("[INFO] Saving geo predictions.") + geo_df = self._dict_to_dataframe(self.geo_predictions) + if centroid: + geo_df["polygon"] = geo_df["geometry"] + geo_df["geometry"] = geo_df["geometry"].centroid + geo_df.to_csv(f"{path_save}/geo_predictions.csv") def show_predictions( self, @@ -604,15 +653,15 @@ def explore_predictions( style_kwds=style_kwargs, ) - def load_predictions( + def load_geo_predictions( self, - path_save: str | pathlib.Path, + load_path: str | pathlib.Path, ): """Load georeferenced text predictions from a GeoJSON file. Parameters ---------- - path_save : str | pathlib.Path + load_path : str | pathlib.Path The path to the GeoJSON file. Raises @@ -624,10 +673,10 @@ def load_predictions( ---- This will overwrite any existing predictions! 
""" - if re.search(r"\..*?json$", str(path_save)): - preds_df = load_from_geojson(path_save, engine="pyogrio") + if re.search(r"\..*?json$", str(load_path)): + preds_df = load_from_geojson(load_path, engine="pyogrio") else: - raise ValueError("[ERROR] ``path_save`` must be a path to a geojson file.") + raise ValueError("[ERROR] ``load_path`` must be a path to a geojson file.") # convert pixel_geometry to shapely geometry preds_df["pixel_geometry"] = preds_df["pixel_geometry"].apply( @@ -648,7 +697,7 @@ def load_predictions( GeoPrediction( pixel_geometry=v.pixel_geometry, score=v.score, - text=v.text, + text=v.text if "text" in v.index else None, patch_id=v.patch_id, geometry=v.geometry, crs=v.crs, @@ -658,7 +707,7 @@ def load_predictions( ParentPrediction( pixel_geometry=v.pixel_geometry, score=v.score, - text=v.text, + text=v.text if "text" in v.index else None, patch_id=v.patch_id, ) ) @@ -689,6 +738,49 @@ def load_predictions( ) ) + def load_patch_predictions( + self, + patch_preds: str | pathlib.Path | pd.DataFrame, + ) -> None: + if not isinstance(patch_preds, pd.DataFrame): + if re.search(r"\..*?csv$", str(patch_preds)): + patch_preds = pd.read_csv(patch_preds, index_col=0) + patch_preds = eval_dataframe(patch_preds) + else: + raise ValueError( + "[ERROR] ``patch_preds`` must be a pandas DataFrame or path to a CSV file." 
+ ) + + # if we have a polygon column, this implies the pixel_geometry column is the centroid + if "polygon" in patch_preds.columns: + patch_preds["pixel_geometry"] = patch_preds["polygon"] + patch_preds.drop(columns=["polygon"], inplace=True) + + # convert pixel_geometry to shapely geometry + patch_preds["pixel_geometry"] = patch_preds["pixel_geometry"].apply( + lambda x: from_wkt(x) + ) + + self.patch_predictions = {} # reset patch predictions + + for image_id in patch_preds["image_id"].unique(): + if image_id not in self.patch_predictions.keys(): + self.patch_predictions[image_id] = [] + + for _, v in patch_preds[patch_preds["image_id"] == image_id].iterrows(): + self.patch_predictions[image_id].append( + PatchPrediction( + pixel_geometry=v.pixel_geometry, + score=v.score, + text=v.text if "text" in v.index else None, + ) + ) + + self.geo_predictions = {} + self.parent_predictions = {} + + self.convert_to_parent_pixel_bounds() + class DetRecRunner(DetRunner): def _get_patch_predictions( @@ -950,14 +1042,14 @@ def explore_search_results( def save_search_results_to_geojson( self, - save_path: str | pathlib.Path, + path_save: str | pathlib.Path, centroid: bool = False, ) -> None: """Convert the search results to georeferenced search results and save them to a GeoJSON file. Parameters ---------- - save_path : str | pathlib.Path + path_save : str | pathlib.Path The path to save the GeoJSON file. centroid : bool, optional Whether to save the centroid of the polygons as the geometry column, by default False. 
@@ -976,6 +1068,6 @@ def save_search_results_to_geojson( if centroid: geo_df["polygon"] = geo_df["geometry"].to_wkt() - geo_df["geometry"] = geo_df["geometry"].apply(self._polygon_to_centroid) + geo_df["geometry"] = geo_df["geometry"].centroid - geo_df.to_file(save_path, driver="GeoJSON", engine="pyogrio") + geo_df.to_file(path_save, driver="GeoJSON", engine="pyogrio") From a46f4810b66a4fdb8e5a4f0a7f6a60cfe05642ad Mon Sep 17 00:00:00 2001 From: Rosie Wood Date: Wed, 4 Dec 2024 13:44:02 +0000 Subject: [PATCH 09/15] update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7325e09b..7a1a5d5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,11 @@ The following table shows which versions of MapReader are compatible with which _Add new changes here_ +## Added + +- Added ablity to save and reload text predictions ([#536](https://github.com/maps-as-data/MapReader/pull/536) +- Added minimal dataclasses for text predictions ([#536](https://github.com/maps-as-data/MapReader/pull/536) + ## [v1.6.1](https://github.com/Living-with-machines/MapReader/releases/tag/v1.6.1) (2024-11-18) ### Added From 13ba2fca4eface2d9585755f93b7c90cad8935db Mon Sep 17 00:00:00 2001 From: Rosie Wood Date: Wed, 4 Dec 2024 17:09:09 +0000 Subject: [PATCH 10/15] add tests deepsolo --- mapreader/spot_text/runner_base.py | 28 ++- .../test_deepsolo_runner.py | 222 ++++++++++++++++++ 2 files changed, 239 insertions(+), 11 deletions(-) diff --git a/mapreader/spot_text/runner_base.py b/mapreader/spot_text/runner_base.py index 352c7378..ccc299cf 100644 --- a/mapreader/spot_text/runner_base.py +++ b/mapreader/spot_text/runner_base.py @@ -500,7 +500,9 @@ def save_to_geojson( if centroid: geo_df["polygon"] = geo_df["geometry"].to_wkt() - geo_df["geometry"] = geo_df["geometry"].centroid + geo_df["geometry"] = ( + geo_df["geometry"].to_crs("27700").centroid.to_crs(geo_df.crs) + ) geo_df.to_file(path_save, driver="GeoJSON", engine="pyogrio") 
@@ -553,7 +555,9 @@ def save_to_csv( geo_df = self._dict_to_dataframe(self.geo_predictions) if centroid: geo_df["polygon"] = geo_df["geometry"] - geo_df["geometry"] = geo_df["geometry"].centroid + geo_df["geometry"] = ( + geo_df["geometry"].to_crs("27700").centroid.to_crs(geo_df.crs) + ) geo_df.to_csv(f"{path_save}/geo_predictions.csv") def show_predictions( @@ -751,15 +755,15 @@ def load_patch_predictions( "[ERROR] ``patch_preds`` must be a pandas DataFrame or path to a CSV file." ) - # if we have a polygon column, this implies the pixel_geometry column is the centroid - if "polygon" in patch_preds.columns: - patch_preds["pixel_geometry"] = patch_preds["polygon"] - patch_preds.drop(columns=["polygon"], inplace=True) + # if we have a polygon column, this implies the pixel_geometry column is the centroid + if "polygon" in patch_preds.columns: + patch_preds["pixel_geometry"] = patch_preds["polygon"] + patch_preds.drop(columns=["polygon"], inplace=True) - # convert pixel_geometry to shapely geometry - patch_preds["pixel_geometry"] = patch_preds["pixel_geometry"].apply( - lambda x: from_wkt(x) - ) + # convert pixel_geometry to shapely geometry + patch_preds["pixel_geometry"] = patch_preds["pixel_geometry"].apply( + lambda x: from_wkt(x) + ) self.patch_predictions = {} # reset patch predictions @@ -1068,6 +1072,8 @@ def save_search_results_to_geojson( if centroid: geo_df["polygon"] = geo_df["geometry"].to_wkt() - geo_df["geometry"] = geo_df["geometry"].centroid + geo_df["geometry"] = ( + geo_df["geometry"].to_crs("27700").centroid.to_crs(geo_df.crs) + ) geo_df.to_file(path_save, driver="GeoJSON", engine="pyogrio") diff --git a/tests/test_text_spotting/test_deepsolo_runner.py b/tests/test_text_spotting/test_deepsolo_runner.py index eb22c8ed..ea9ed6c2 100644 --- a/tests/test_text_spotting/test_deepsolo_runner.py +++ b/tests/test_text_spotting/test_deepsolo_runner.py @@ -5,11 +5,13 @@ import pickle import geopandas as gpd +import numpy as np import pandas as pd import 
pytest from deepsolo.config import get_cfg from detectron2.engine import DefaultPredictor from detectron2.structures.instances import Instances +from shapely import Polygon from mapreader import DeepSoloRunner from mapreader.load import MapImages @@ -44,6 +46,7 @@ def init_dataframes(sample_dir, tmp_path): maps.add_metadata(f"{sample_dir}/mapreader_text_metadata.csv") maps.patchify_all(patch_size=800, path_save=tmp_path) maps.check_georeferencing() + assert maps.georeferenced parent_df, patch_df = maps.convert_images() return parent_df, patch_df @@ -140,6 +143,80 @@ def test_deepsolo_init_tsv(init_dataframes, tmp_path): assert isinstance(runner.patch_df.iloc[0]["coordinates"], tuple) +def test_deepsolo_init_geojson(init_dataframes, tmp_path, mock_response): + parent_df, patch_df = init_dataframes + parent_df.to_file(f"{tmp_path}/parent_df.geojson", driver="GeoJSON") + patch_df.to_file(f"{tmp_path}/patch_df.geojson", driver="GeoJSON") + runner = DeepSoloRunner( + f"{tmp_path}/patch_df.geojson", + parent_df=f"{tmp_path}/parent_df.geojson", + cfg_file=f"{DEEPSOLO_PATH}/configs/R_50/IC15/finetune_150k_tt_mlt_13_15_textocr.yaml", + ) + assert isinstance(runner, DeepSoloRunner) + assert isinstance(runner.predictor, DefaultPredictor) + assert isinstance(runner.parent_df.iloc[0]["geometry"], Polygon) + out = runner.run_all() + assert isinstance(out, dict) + assert "patch-0-0-800-40-#mapreader_text.png#.png" in out.keys() + assert isinstance(out["patch-0-0-800-40-#mapreader_text.png#.png"], list) + assert isinstance( + out["patch-0-0-800-40-#mapreader_text.png#.png"][0], PatchPrediction + ) + + +def test_deepsolo_init_errors(init_dataframes): + parent_df, patch_df = init_dataframes + with pytest.raises(ValueError, match="path to a CSV/TSV/etc or geojson"): + DeepSoloRunner( + patch_df="fake_file.txt", + parent_df=parent_df, + cfg_file=f"{DEEPSOLO_PATH}/configs/R_50/IC15/finetune_150k_tt_mlt_13_15_textocr.yaml", + ) + with pytest.raises(ValueError, match="path to a 
CSV/TSV/etc or geojson"): + DeepSoloRunner( + patch_df=patch_df, + parent_df="fake_file.txt", + cfg_file=f"{DEEPSOLO_PATH}/configs/R_50/IC15/finetune_150k_tt_mlt_13_15_textocr.yaml", + ) + with pytest.raises(ValueError, match="path to a CSV/TSV/etc or geojson"): + DeepSoloRunner( + patch_df=np.array([1, 2, 3]), + parent_df=parent_df, + ) + with pytest.raises(ValueError, match="path to a CSV/TSV/etc or geojson"): + DeepSoloRunner( + patch_df=patch_df, + parent_df=np.array([1, 2, 3]), + ) + + +def test_check_georeferencing(init_dataframes): + parent_df, patch_df = init_dataframes + runner = DeepSoloRunner( + patch_df, + parent_df=parent_df, + cfg_file=f"{DEEPSOLO_PATH}/configs/R_50/IC15/finetune_150k_tt_mlt_13_15_textocr.yaml", + ) + runner.check_georeferencing() + assert runner.georeferenced + + runner = DeepSoloRunner( + patch_df, + parent_df=parent_df.drop(columns=["dlat", "dlon"]), + cfg_file=f"{DEEPSOLO_PATH}/configs/R_50/IC15/finetune_150k_tt_mlt_13_15_textocr.yaml", + ) + runner.check_georeferencing() + assert runner.georeferenced + + runner = DeepSoloRunner( + patch_df, + parent_df=parent_df.drop(columns=["coordinates"]), + cfg_file=f"{DEEPSOLO_PATH}/configs/R_50/IC15/finetune_150k_tt_mlt_13_15_textocr.yaml", + ) + runner.check_georeferencing() + assert not runner.georeferenced + + def test_deepsolo_run_all(init_runner, mock_response): runner = init_runner # dict @@ -246,6 +323,151 @@ def test_deepsolo_save_to_geojson(runner_run_all, tmp_path, mock_response): assert set(gdf.columns) == set( ["image_id", "patch_id", "pixel_geometry", "geometry", "crs", "text", "score"] ) + runner.save_to_geojson(f"{tmp_path}/text_centroid.geojson", centroid=True) + assert os.path.exists(f"{tmp_path}/text_centroid.geojson") + gdf_centroid = gpd.read_file(f"{tmp_path}/text_centroid.geojson") + assert isinstance(gdf_centroid, gpd.GeoDataFrame) + assert set(gdf_centroid.columns) == set( + [ + "image_id", + "patch_id", + "pixel_geometry", + "geometry", + "crs", + "text", + "score", 
+ "polygon", + ] + ) + + +def test_deepsolo_load_geo_predictions(runner_run_all, tmp_path): + runner = runner_run_all + _ = runner.convert_to_coords() + runner.save_to_geojson(f"{tmp_path}/text.geojson") + runner.geo_predictions = {} + runner.load_geo_predictions(f"{tmp_path}/text.geojson") + assert len(runner.geo_predictions) + assert "mapreader_text.png" in runner.geo_predictions.keys() + assert isinstance(runner.geo_predictions["mapreader_text.png"], list) + assert isinstance(runner.geo_predictions["mapreader_text.png"][0], GeoPrediction) + + +def test_deepsolo_load_geo_predictions_errors(runner_run_all, tmp_path): + runner = runner_run_all + with pytest.raises(ValueError, match="must be a path to a geojson file"): + runner.load_geo_predictions("fakefile.csv") + + +def test_deepsolo_save_to_csv_polygon(runner_run_all, tmp_path, mock_response): + runner = runner_run_all + # patch + runner.save_to_csv(tmp_path) + assert os.path.exists(f"{tmp_path}/patch_predictions.csv") + # parent + _ = runner.convert_to_parent_pixel_bounds() + runner.save_to_csv(tmp_path) + assert os.path.exists(f"{tmp_path}/patch_predictions.csv") + assert os.path.exists(f"{tmp_path}/parent_predictions.csv") + # geo + _ = runner.convert_to_coords() + runner.save_to_csv(tmp_path) + assert os.path.exists(f"{tmp_path}/patch_predictions.csv") + assert os.path.exists(f"{tmp_path}/parent_predictions.csv") + assert os.path.exists(f"{tmp_path}/geo_predictions.csv") + + +def test_deepsolo_save_to_csv_centroid(runner_run_all, tmp_path, mock_response): + runner = runner_run_all + # patch + runner.save_to_csv(tmp_path, centroid=True) + assert os.path.exists(f"{tmp_path}/patch_predictions.csv") + # parent + _ = runner.convert_to_parent_pixel_bounds() + runner.save_to_csv(tmp_path, centroid=True) + assert os.path.exists(f"{tmp_path}/patch_predictions.csv") + assert os.path.exists(f"{tmp_path}/parent_predictions.csv") + # geo + _ = runner.convert_to_coords() + runner.save_to_csv(tmp_path, centroid=True) + 
assert os.path.exists(f"{tmp_path}/patch_predictions.csv") + assert os.path.exists(f"{tmp_path}/parent_predictions.csv") + assert os.path.exists(f"{tmp_path}/geo_predictions.csv") + + +def test_deepsolo_save_to_csv_errors(runner_run_all, tmp_path, mock_response): + runner = runner_run_all + runner.patch_predictions = {} + with pytest.raises(ValueError, match="No patch predictions found"): + runner.save_to_csv(tmp_path) + + +def test_deepsolo_load_patch_predictions(runner_run_all, tmp_path): + runner = runner_run_all + _ = runner.convert_to_coords() + assert len(runner.geo_predictions) # this will be empty after reloading + runner.save_to_csv(tmp_path) + runner.load_patch_predictions(f"{tmp_path}/patch_predictions.csv") + assert len(runner.patch_predictions) + assert len(runner.geo_predictions) == 0 + assert ( + "patch-0-0-800-40-#mapreader_text.png#.png" in runner.patch_predictions.keys() + ) + assert isinstance( + runner.patch_predictions["patch-0-0-800-40-#mapreader_text.png#.png"], list + ) + assert isinstance( + runner.patch_predictions["patch-0-0-800-40-#mapreader_text.png#.png"][0], + PatchPrediction, + ) + + +def test_deepsolo_load_patch_predictions_dataframe(runner_run_all): + runner = runner_run_all + patch_preds = runner._dict_to_dataframe(runner.patch_predictions) + _ = runner.convert_to_coords() + assert len(runner.geo_predictions) # this will be empty after reloading + runner.load_patch_predictions(patch_preds) + assert len(runner.patch_predictions) + assert len(runner.geo_predictions) == 0 + assert ( + "patch-0-0-800-40-#mapreader_text.png#.png" in runner.patch_predictions.keys() + ) + assert isinstance( + runner.patch_predictions["patch-0-0-800-40-#mapreader_text.png#.png"], list + ) + assert isinstance( + runner.patch_predictions["patch-0-0-800-40-#mapreader_text.png#.png"][0], + PatchPrediction, + ) + + +def test_deepsolo_load_patch_predictions_centroid(runner_run_all, tmp_path): + runner = runner_run_all + _ = runner.convert_to_coords() + assert 
len(runner.geo_predictions) + runner.save_to_csv(tmp_path, centroid=True) + runner.load_patch_predictions(f"{tmp_path}/patch_predictions.csv") + assert len(runner.patch_predictions) + assert len(runner.geo_predictions) == 0 + assert ( + "patch-0-0-800-40-#mapreader_text.png#.png" in runner.patch_predictions.keys() + ) + assert isinstance( + runner.patch_predictions["patch-0-0-800-40-#mapreader_text.png#.png"], list + ) + assert isinstance( + runner.patch_predictions["patch-0-0-800-40-#mapreader_text.png#.png"][0], + PatchPrediction, + ) + + +def test_deepsolo_load_patch_predictions_errors(runner_run_all, tmp_path): + runner = runner_run_all + with pytest.raises( + ValueError, match="must be a pandas DataFrame or path to a CSV file" + ): + runner.load_patch_predictions("fake_file.geojson") def test_deepsolo_search_preds(runner_run_all, mock_response): From 31e02c17e86ced5303597436eb75c4fd370d8e64 Mon Sep 17 00:00:00 2001 From: Rosie Wood Date: Thu, 5 Dec 2024 09:04:48 +0000 Subject: [PATCH 11/15] add test for saving search results to geojson --- .../test_deepsolo_runner.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/test_text_spotting/test_deepsolo_runner.py b/tests/test_text_spotting/test_deepsolo_runner.py index ea9ed6c2..b08f20a1 100644 --- a/tests/test_text_spotting/test_deepsolo_runner.py +++ b/tests/test_text_spotting/test_deepsolo_runner.py @@ -323,6 +323,11 @@ def test_deepsolo_save_to_geojson(runner_run_all, tmp_path, mock_response): assert set(gdf.columns) == set( ["image_id", "patch_id", "pixel_geometry", "geometry", "crs", "text", "score"] ) + + +def test_deepsolo_save_to_geojson_centroid(runner_run_all, tmp_path, mock_response): + runner = runner_run_all + _ = runner.convert_to_coords() runner.save_to_geojson(f"{tmp_path}/text_centroid.geojson", centroid=True) assert os.path.exists(f"{tmp_path}/text_centroid.geojson") gdf_centroid = gpd.read_file(f"{tmp_path}/text_centroid.geojson") @@ -508,6 +513,32 @@ def 
test_deepsolo_save_search_results(runner_run_all, tmp_path, mock_response): assert "mapreader_text.png" in gdf["image_id"].values +def test_deepsolo_save_search_results_centroid(runner_run_all, tmp_path, mock_response): + runner = runner_run_all + _ = runner.convert_to_parent_pixel_bounds() + out = runner.search_preds("map", ignore_case=True) + assert isinstance(out, dict) + runner.save_search_results_to_geojson( + f"{tmp_path}/search_results_centroid.geojson", centroid=True + ) + assert os.path.exists(f"{tmp_path}/search_results_centroid.geojson") + gdf = gpd.read_file(f"{tmp_path}/search_results_centroid.geojson") + assert isinstance(gdf, gpd.GeoDataFrame) + assert set(gdf.columns) == set( + [ + "image_id", + "patch_id", + "pixel_geometry", + "geometry", + "crs", + "text", + "score", + "polygon", + ] + ) + assert "mapreader_text.png" in gdf["image_id"].values + + def test_deepsolo_save_search_results_errors(runner_run_all, tmp_path, mock_response): runner = runner_run_all with pytest.raises(ValueError, match="No results to save"): From c9b9529e419e98cf3ceaf445c8303ccc2c94e743 Mon Sep 17 00:00:00 2001 From: Rosie Wood Date: Thu, 5 Dec 2024 11:38:25 +0000 Subject: [PATCH 12/15] update maptext and dptxext detr tests --- .../test_text_spotting/test_dptext_runner.py | 226 ++++++++++++++++++ .../test_text_spotting/test_maptext_runner.py | 76 ++++++ 2 files changed, 302 insertions(+) diff --git a/tests/test_text_spotting/test_dptext_runner.py b/tests/test_text_spotting/test_dptext_runner.py index 0c42fea0..71327ab5 100644 --- a/tests/test_text_spotting/test_dptext_runner.py +++ b/tests/test_text_spotting/test_dptext_runner.py @@ -5,11 +5,13 @@ import pickle import geopandas as gpd +import numpy as np import pandas as pd import pytest from detectron2.engine import DefaultPredictor from detectron2.structures.instances import Instances from dptext_detr.config import get_cfg +from shapely import Polygon from mapreader import DPTextDETRRunner from mapreader.load import 
MapImages @@ -44,6 +46,7 @@ def init_dataframes(sample_dir, tmp_path): maps.add_metadata(f"{sample_dir}/mapreader_text_metadata.csv") maps.patchify_all(patch_size=800, path_save=tmp_path) maps.check_georeferencing() + assert maps.georeferenced parent_df, patch_df = maps.convert_images() return parent_df, patch_df @@ -141,6 +144,80 @@ def test_dptext_init_tsv(init_dataframes, tmp_path): assert isinstance(runner.patch_df.iloc[0]["coordinates"], tuple) +def test_dptext_init_geojson(init_dataframes, tmp_path, mock_response): + parent_df, patch_df = init_dataframes + parent_df.to_file(f"{tmp_path}/parent_df.geojson", driver="GeoJSON") + patch_df.to_file(f"{tmp_path}/patch_df.geojson", driver="GeoJSON") + runner = DPTextDETRRunner( + f"{tmp_path}/patch_df.geojson", + parent_df=f"{tmp_path}/parent_df.geojson", + cfg_file=f"{DPTEXT_DETR_PATH}/configs/DPText_DETR/ArT/R_50_poly.yaml", + ) + assert isinstance(runner, DPTextDETRRunner) + assert isinstance(runner.predictor, DefaultPredictor) + assert isinstance(runner.parent_df.iloc[0]["geometry"], Polygon) + out = runner.run_all() + assert isinstance(out, dict) + assert "patch-0-0-800-40-#mapreader_text.png#.png" in out.keys() + assert isinstance(out["patch-0-0-800-40-#mapreader_text.png#.png"], list) + assert isinstance( + out["patch-0-0-800-40-#mapreader_text.png#.png"][0], PatchPrediction + ) + + +def test_dptext_init_errors(init_dataframes): + parent_df, patch_df = init_dataframes + with pytest.raises(ValueError, match="path to a CSV/TSV/etc or geojson"): + DPTextDETRRunner( + patch_df="fake_file.txt", + parent_df=parent_df, + cfg_file=f"{DPTEXT_DETR_PATH}/configs/DPText_DETR/ArT/R_50_poly.yaml", + ) + with pytest.raises(ValueError, match="path to a CSV/TSV/etc or geojson"): + DPTextDETRRunner( + patch_df=patch_df, + parent_df="fake_file.txt", + cfg_file=f"{DPTEXT_DETR_PATH}/configs/DPText_DETR/ArT/R_50_poly.yaml", + ) + with pytest.raises(ValueError, match="path to a CSV/TSV/etc or geojson"): + DPTextDETRRunner( + 
patch_df=np.array([1, 2, 3]), + parent_df=parent_df, + ) + with pytest.raises(ValueError, match="path to a CSV/TSV/etc or geojson"): + DPTextDETRRunner( + patch_df=patch_df, + parent_df=np.array([1, 2, 3]), + ) + + +def test_dptext_check_georeferencing(init_dataframes): + parent_df, patch_df = init_dataframes + runner = DPTextDETRRunner( + patch_df, + parent_df=parent_df, + cfg_file=f"{DPTEXT_DETR_PATH}/configs/DPText_DETR/ArT/R_50_poly.yaml", + ) + runner.check_georeferencing() + assert runner.georeferenced + + runner = DPTextDETRRunner( + patch_df, + parent_df=parent_df.drop(columns=["dlat", "dlon"]), + cfg_file=f"{DPTEXT_DETR_PATH}/configs/DPText_DETR/ArT/R_50_poly.yaml", + ) + runner.check_georeferencing() + assert runner.georeferenced + + runner = DPTextDETRRunner( + patch_df, + parent_df=parent_df.drop(columns=["coordinates"]), + cfg_file=f"{DPTEXT_DETR_PATH}/configs/DPText_DETR/ArT/R_50_poly.yaml", + ) + runner.check_georeferencing() + assert not runner.georeferenced + + def test_dptext_run_all(init_runner, mock_response): runner = init_runner # dict @@ -238,3 +315,152 @@ def test_dptext_save_to_geojson(runner_run_all, tmp_path, mock_response): assert set(gdf.columns) == set( ["image_id", "patch_id", "pixel_geometry", "geometry", "crs", "score"] ) + + +def test_dptext_save_to_geojson_centroid(runner_run_all, tmp_path, mock_response): + runner = runner_run_all + _ = runner.convert_to_coords() + runner.save_to_geojson(f"{tmp_path}/text_centroid.geojson", centroid=True) + assert os.path.exists(f"{tmp_path}/text_centroid.geojson") + gdf_centroid = gpd.read_file(f"{tmp_path}/text_centroid.geojson") + assert isinstance(gdf_centroid, gpd.GeoDataFrame) + assert set(gdf_centroid.columns) == set( + [ + "image_id", + "patch_id", + "pixel_geometry", + "geometry", + "crs", + "score", + "polygon", + ] + ) + + +def test_dptext_load_geo_predictions(runner_run_all, tmp_path): + runner = runner_run_all + _ = runner.convert_to_coords() + 
runner.save_to_geojson(f"{tmp_path}/text.geojson") + runner.geo_predictions = {} + runner.load_geo_predictions(f"{tmp_path}/text.geojson") + assert len(runner.geo_predictions) + assert "mapreader_text.png" in runner.geo_predictions.keys() + assert isinstance(runner.geo_predictions["mapreader_text.png"], list) + assert isinstance(runner.geo_predictions["mapreader_text.png"][0], GeoPrediction) + + +def test_dptext_load_geo_predictions_errors(runner_run_all, tmp_path): + runner = runner_run_all + with pytest.raises(ValueError, match="must be a path to a geojson file"): + runner.load_geo_predictions("fakefile.csv") + + +def test_dptext_save_to_csv_polygon(runner_run_all, tmp_path, mock_response): + runner = runner_run_all + # patch + runner.save_to_csv(tmp_path) + assert os.path.exists(f"{tmp_path}/patch_predictions.csv") + # parent + _ = runner.convert_to_parent_pixel_bounds() + runner.save_to_csv(tmp_path) + assert os.path.exists(f"{tmp_path}/patch_predictions.csv") + assert os.path.exists(f"{tmp_path}/parent_predictions.csv") + # geo + _ = runner.convert_to_coords() + runner.save_to_csv(tmp_path) + assert os.path.exists(f"{tmp_path}/patch_predictions.csv") + assert os.path.exists(f"{tmp_path}/parent_predictions.csv") + assert os.path.exists(f"{tmp_path}/geo_predictions.csv") + + +def test_dptext_save_to_csv_centroid(runner_run_all, tmp_path, mock_response): + runner = runner_run_all + # patch + runner.save_to_csv(tmp_path, centroid=True) + assert os.path.exists(f"{tmp_path}/patch_predictions.csv") + # parent + _ = runner.convert_to_parent_pixel_bounds() + runner.save_to_csv(tmp_path, centroid=True) + assert os.path.exists(f"{tmp_path}/patch_predictions.csv") + assert os.path.exists(f"{tmp_path}/parent_predictions.csv") + # geo + _ = runner.convert_to_coords() + runner.save_to_csv(tmp_path, centroid=True) + assert os.path.exists(f"{tmp_path}/patch_predictions.csv") + assert os.path.exists(f"{tmp_path}/parent_predictions.csv") + assert 
os.path.exists(f"{tmp_path}/geo_predictions.csv") + + +def test_dptext_save_to_csv_errors(runner_run_all, tmp_path, mock_response): + runner = runner_run_all + runner.patch_predictions = {} + with pytest.raises(ValueError, match="No patch predictions found"): + runner.save_to_csv(tmp_path) + + +def test_dptext_load_patch_predictions(runner_run_all, tmp_path): + runner = runner_run_all + _ = runner.convert_to_coords() + assert len(runner.geo_predictions) # this will be empty after reloading + runner.save_to_csv(tmp_path) + runner.load_patch_predictions(f"{tmp_path}/patch_predictions.csv") + assert len(runner.patch_predictions) + assert len(runner.geo_predictions) == 0 + assert ( + "patch-0-0-800-40-#mapreader_text.png#.png" in runner.patch_predictions.keys() + ) + assert isinstance( + runner.patch_predictions["patch-0-0-800-40-#mapreader_text.png#.png"], list + ) + assert isinstance( + runner.patch_predictions["patch-0-0-800-40-#mapreader_text.png#.png"][0], + PatchPrediction, + ) + + +def test_dptext_load_patch_predictions_dataframe(runner_run_all): + runner = runner_run_all + patch_preds = runner._dict_to_dataframe(runner.patch_predictions) + _ = runner.convert_to_coords() + assert len(runner.geo_predictions) # this will be empty after reloading + runner.load_patch_predictions(patch_preds) + assert len(runner.patch_predictions) + assert len(runner.geo_predictions) == 0 + assert ( + "patch-0-0-800-40-#mapreader_text.png#.png" in runner.patch_predictions.keys() + ) + assert isinstance( + runner.patch_predictions["patch-0-0-800-40-#mapreader_text.png#.png"], list + ) + assert isinstance( + runner.patch_predictions["patch-0-0-800-40-#mapreader_text.png#.png"][0], + PatchPrediction, + ) + + +def test_dptext_load_patch_predictions_centroid(runner_run_all, tmp_path): + runner = runner_run_all + _ = runner.convert_to_coords() + assert len(runner.geo_predictions) + runner.save_to_csv(tmp_path, centroid=True) + 
runner.load_patch_predictions(f"{tmp_path}/patch_predictions.csv") + assert len(runner.patch_predictions) + assert len(runner.geo_predictions) == 0 + assert ( + "patch-0-0-800-40-#mapreader_text.png#.png" in runner.patch_predictions.keys() + ) + assert isinstance( + runner.patch_predictions["patch-0-0-800-40-#mapreader_text.png#.png"], list + ) + assert isinstance( + runner.patch_predictions["patch-0-0-800-40-#mapreader_text.png#.png"][0], + PatchPrediction, + ) + + +def test_dptext_load_patch_predictions_errors(runner_run_all, tmp_path): + runner = runner_run_all + with pytest.raises( + ValueError, match="must be a pandas DataFrame or path to a CSV file" + ): + runner.load_patch_predictions("fake_file.geojson") diff --git a/tests/test_text_spotting/test_maptext_runner.py b/tests/test_text_spotting/test_maptext_runner.py index 6bd20b08..0a4ff5bc 100644 --- a/tests/test_text_spotting/test_maptext_runner.py +++ b/tests/test_text_spotting/test_maptext_runner.py @@ -5,11 +5,13 @@ import pickle import geopandas as gpd +import numpy as np import pandas as pd import pytest from detectron2.engine import DefaultPredictor from detectron2.structures.instances import Instances from maptextpipeline.config import get_cfg +from shapely import Polygon from mapreader import MapTextRunner from mapreader.load import MapImages @@ -141,6 +143,80 @@ def test_maptext_init_tsv(init_dataframes, tmp_path): assert isinstance(runner.patch_df.iloc[0]["coordinates"], tuple) +def test_maptext_init_geojson(init_dataframes, tmp_path, mock_response): + parent_df, patch_df = init_dataframes + parent_df.to_file(f"{tmp_path}/parent_df.geojson", driver="GeoJSON") + patch_df.to_file(f"{tmp_path}/patch_df.geojson", driver="GeoJSON") + runner = MapTextRunner( + f"{tmp_path}/patch_df.geojson", + parent_df=f"{tmp_path}/parent_df.geojson", + cfg_file=f"{MAPTEXTPIPELINE_PATH}/configs/ViTAEv2_S/rumsey/test.yaml", + ) + assert isinstance(runner, MapTextRunner) + assert isinstance(runner.predictor, 
DefaultPredictor) + assert isinstance(runner.parent_df.iloc[0]["geometry"], Polygon) + out = runner.run_all() + assert isinstance(out, dict) + assert "patch-0-0-800-40-#mapreader_text.png#.png" in out.keys() + assert isinstance(out["patch-0-0-800-40-#mapreader_text.png#.png"], list) + assert isinstance( + out["patch-0-0-800-40-#mapreader_text.png#.png"][0], PatchPrediction + ) + + +def test_maptext_init_errors(init_dataframes): + parent_df, patch_df = init_dataframes + with pytest.raises(ValueError, match="path to a CSV/TSV/etc or geojson"): + MapTextRunner( + patch_df="fake_file.txt", + parent_df=parent_df, + cfg_file=f"{MAPTEXTPIPELINE_PATH}/configs/ViTAEv2_S/rumsey/test.yaml", + ) + with pytest.raises(ValueError, match="path to a CSV/TSV/etc or geojson"): + MapTextRunner( + patch_df=patch_df, + parent_df="fake_file.txt", + cfg_file=f"{MAPTEXTPIPELINE_PATH}/configs/ViTAEv2_S/rumsey/test.yaml", + ) + with pytest.raises(ValueError, match="path to a CSV/TSV/etc or geojson"): + MapTextRunner( + patch_df=np.array([1, 2, 3]), + parent_df=parent_df, + ) + with pytest.raises(ValueError, match="path to a CSV/TSV/etc or geojson"): + MapTextRunner( + patch_df=patch_df, + parent_df=np.array([1, 2, 3]), + ) + + +def test_maptext_check_georeferencing(init_dataframes): + parent_df, patch_df = init_dataframes + runner = MapTextRunner( + patch_df, + parent_df=parent_df, + cfg_file=f"{MAPTEXTPIPELINE_PATH}/configs/ViTAEv2_S/rumsey/test.yaml", + ) + runner.check_georeferencing() + assert runner.georeferenced + + runner = MapTextRunner( + patch_df, + parent_df=parent_df.drop(columns=["dlat", "dlon"]), + cfg_file=f"{MAPTEXTPIPELINE_PATH}/configs/ViTAEv2_S/rumsey/test.yaml", + ) + runner.check_georeferencing() + assert runner.georeferenced + + runner = MapTextRunner( + patch_df, + parent_df=parent_df.drop(columns=["coordinates"]), + cfg_file=f"{MAPTEXTPIPELINE_PATH}/configs/ViTAEv2_S/rumsey/test.yaml", + ) + runner.check_georeferencing() + assert not runner.georeferenced + + def 
test_maptext_run_all(init_runner, mock_response): runner = init_runner # dict From aaf8f7c8c86a3e68963f8533aa237adc6354a245 Mon Sep 17 00:00:00 2001 From: Rosie Wood Date: Fri, 6 Dec 2024 10:29:14 +0000 Subject: [PATCH 13/15] update docs --- .../step-by-step-guide/6-spot-text.rst | 73 ++++++++++++++++--- 1 file changed, 63 insertions(+), 10 deletions(-) diff --git a/docs/source/using-mapreader/step-by-step-guide/6-spot-text.rst b/docs/source/using-mapreader/step-by-step-guide/6-spot-text.rst index bdca45a2..84c5730e 100644 --- a/docs/source/using-mapreader/step-by-step-guide/6-spot-text.rst +++ b/docs/source/using-mapreader/step-by-step-guide/6-spot-text.rst @@ -223,7 +223,7 @@ You can do this by setting the ``deduplicate`` argument and passing a ``min_ioa` This will help resolve any issues with predictions being cut-off at the edges of patches since the overlap should help find the full piece of text. -Again, to view the predictions, you can use the ``show`` method. +Again, to view the predictions, you can use the ``show_predictions`` method. You should pass a parent image ID as the ``image_id`` argument: .. code-block:: python @@ -244,11 +244,6 @@ As above, use the ``border_color``, ``text_color`` and ``figsize`` arguments to figsize = (20, 20), ) -You can save your predictions to a csv file using the pandas ``to_csv`` method: - -.. code-block:: python - - parent_preds_df.to_csv("text_preds.csv") Geo-reference ------------- @@ -282,7 +277,11 @@ Or, if your maps are taken from a tilelayer, you can specify the URL of the tile You can also pass in a dictionary of ``style_kwargs`` to customize the appearance of the map. Refer to the `geopandas explore documentation `__ for more information on the available options. 
-Again, you can save your georeferenced predictions to a csv file (as shown above), or, you can save them to a geojson file for loading into GIS software: + +Saving +------ + +You can save your georeferenced predictions to a geojson file for loading into GIS software using the ``save_to_geojson`` method: .. code-block:: python @@ -290,6 +289,58 @@ Again, you can save your georeferenced predictions to a csv file (as shown above This will save the predictions to a geojson file, with each text prediction as a separate feature. +By default, the geometry column will contain the polygon representing the bounding box of your text. +If instead you would like to save just the centroid of this polygon, you can set the ``centroid`` argument: + +.. code-block:: python + + my_runner.save_to_geojson("text_preds.geojson", centroid=True) + +This will save the centroid of the bounding box as the geometry column and create a "polygon" column containing the original polygon. + +At any point, you can also save your patch, parent and georeferenced predictions to CSV files using the ``save_to_csv`` method: + +.. code-block:: python + + my_runner.save_to_csv("my_preds/") + +This will create a folder called "my_preds" and save the patch, parent and georeferenced predictions to CSV files within it. + +As above, you can use the ``centroid`` argument to save the centroid of the bounding box instead of the full polygon. + + +Loading +------- + +If you have saved your predictions and want to reload them into a runner, you use either of the ``load_geo_predictions`` or ``load_patch_predictions`` methods. + +.. note:: These methods will overwrite any existing predictions in the runner. So if you want to keep your existing predictions, you should save them to a file first! + +The ``load_geo_predictions`` method is used to load georeferenced predictions from a geojson file: + +.. 
code-block:: python + + my_runner.load_geo_predictions("text_preds.geojson") + +Loading this file will populate the patch, parent and georeferenced predictions in the runner. + +The ``load_patch_predictions`` method is used to load patch predictions from a CSV file or pandas DataFrame. +To load a CSV file, you can use: + +.. code-block:: python + + my_runner.load_patch_predictions("my_preds/patch_predictions.csv") + +Or, to load a pandas DataFrame, you can use: + +.. code-block:: python + + my_runner.load_patch_predictions(patch_preds_df) + +This will populate the patch and parent predictions in the runner but not the georeferenced predictions (in case you do not have georeferencing information). +If you do want to convert these to georeferenced predictions, you can use the ``convert_to_coords`` method as shown above. + + Search predictions ------------------ @@ -364,8 +415,10 @@ If your maps are georeferenced, you can also save your search results using the my_runner.save_search_results_to_geojson("search_results.geojson") -This will save the search results to a geojson file, with each search result as a separate feature. +This will save the search results to a geojson file, with each search result as a separate feature which can be loaded into GIS software for further analysis/exploration. -These can then be loaded into GIS software for further analysis/exploration. +If, however, your maps are not georeferenced, you will need to save the search results to a csv file using the pandas ``to_csv`` method: + +.. code-block:: python -If your maps are not georeferenced, you can save the search results to a csv file using the pandas ``to_csv`` method (as shown above). 
+ search_results_df.to_csv("search_results.csv") From 5ca580ab36c3414e7a7587ca426deb7d7df5fe63 Mon Sep 17 00:00:00 2001 From: Rosie Wood Date: Thu, 19 Dec 2024 09:22:38 +0000 Subject: [PATCH 14/15] Minor fixes/address review comments --- CHANGELOG.md | 2 +- .../step-by-step-guide/6-spot-text.rst | 24 ++-- mapreader/spot_text/runner_base.py | 114 +++++++++++++----- .../test_deepsolo_runner.py | 50 ++++---- .../test_text_spotting/test_dptext_runner.py | 34 +++--- .../test_text_spotting/test_maptext_runner.py | 12 +- 6 files changed, 142 insertions(+), 94 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a1a5d5b..cabd1d60 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ _Add new changes here_ ## Added -- Added ablity to save and reload text predictions ([#536](https://github.com/maps-as-data/MapReader/pull/536) +- Added ability to save and reload text predictions ([#536](https://github.com/maps-as-data/MapReader/pull/536) - Added minimal dataclasses for text predictions ([#536](https://github.com/maps-as-data/MapReader/pull/536) ## [v1.6.1](https://github.com/Living-with-machines/MapReader/releases/tag/v1.6.1) (2024-11-18) diff --git a/docs/source/using-mapreader/step-by-step-guide/6-spot-text.rst b/docs/source/using-mapreader/step-by-step-guide/6-spot-text.rst index 84c5730e..2b417d63 100644 --- a/docs/source/using-mapreader/step-by-step-guide/6-spot-text.rst +++ b/docs/source/using-mapreader/step-by-step-guide/6-spot-text.rst @@ -248,7 +248,7 @@ As above, use the ``border_color``, ``text_color`` and ``figsize`` arguments to Geo-reference ------------- -If you maps are georeferenced in your ``parent_df``, you can also convert the pixel bounds to georeferenced coordinates using the ``convert_to_coords`` method: +If you maps are georeferenced in your ``parent_df``, you can also convert the pixel coordinates to georeferenced coordinates using the ``convert_to_coords`` method: .. 
code-block:: python @@ -281,11 +281,11 @@ Refer to the `geopandas explore documentation None: + """ + Save the georeferenced predictions to a GeoJSON file. + + Parameters + ---------- + path_save : str | pathlib.Path, optional + Path to save the GeoJSON file + centroid : bool, optional + Whether to convert the polygons to centroids, by default False. + NOTE: The original polygon will still be saved as a separate column + """ + print( + "[WARNING] This method is deprecated and will soon be removed. Use `to_geojson` instead." + ) + self.to_geojson(path_save, centroid) + + def to_geojson( + self, + path_save: str | pathlib.Path, + centroid: bool = False, ) -> None: """Save the georeferenced predictions to a GeoJSON file. @@ -506,7 +527,7 @@ def save_to_geojson( geo_df.to_file(path_save, driver="GeoJSON", engine="pyogrio") - def save_to_csv( + def to_csv( self, path_save: str | pathlib.Path, centroid: bool = False, @@ -858,7 +879,7 @@ def _post_process(self, image_id, ctrl_pnts, scores, recs, bd_pnts): PatchPrediction(pixel_geometry=polygon, score=score, text=text) ) - def search_preds( + def search_predictions( self, search_text: str, ignore_case: bool = True, return_dataframe: bool = False ) -> dict | pd.DataFrame: """Search the predictions for specific text. Accepts regex. @@ -1044,36 +1065,63 @@ def explore_search_results( style_kwds=style_kwargs, ) - def save_search_results_to_geojson( - self, - path_save: str | pathlib.Path, - centroid: bool = False, - ) -> None: - """Convert the search results to georeferenced search results and save them to a GeoJSON file. - - Parameters - ---------- - path_save : str | pathlib.Path - The path to save the GeoJSON file. - centroid : bool, optional - Whether to save the centroid of the polygons as the geometry column, by default False. - Note: The original polygon will stil be saved as a separate column. - - Raises - ------ - ValueError - If no search results are found. 
- """ - if self.search_results == {}: - raise ValueError("[ERROR] No results to save!") - - geo_search_results = self._get_geo_search_results() - geo_df = self._dict_to_dataframe(geo_search_results) - if centroid: - geo_df["polygon"] = geo_df["geometry"].to_wkt() - geo_df["geometry"] = ( - geo_df["geometry"].to_crs("27700").centroid.to_crs(geo_df.crs) - ) +def save_search_results_to_geojson( + self, + path_save: str | pathlib.Path, + centroid: bool = False, +) -> None: + """Convert the search results to georeferenced search results and save them to a GeoJSON file. + + Parameters + ---------- + path_save : str | pathlib.Path + The path to save the GeoJSON file. + centroid : bool, optional + Whether to save the centroid of the polygons as the geometry column, by default False. + Note: The original polygon will stil be saved as a separate column. + + Raises + ------ + ValueError + If no search results are found. + """ + print( + "[WARNING] This method is deprecated and will soon be removed. Use `search_results_to_geojson` instead." + ) + self.search_results_to_geojson(path_save, centroid) + + +def search_results_to_geojson( + self, + path_save: str | pathlib.Path, + centroid: bool = False, +) -> None: + """Convert the search results to georeferenced search results and save them to a GeoJSON file. + + Parameters + ---------- + path_save : str | pathlib.Path + The path to save the GeoJSON file. + centroid : bool, optional + Whether to save the centroid of the polygons as the geometry column, by default False. + Note: The original polygon will stil be saved as a separate column. + + Raises + ------ + ValueError + If no search results are found. 
+ """ + if self.search_results == {}: + raise ValueError("[ERROR] No results to save!") + + geo_search_results = self._get_geo_search_results() + geo_df = self._dict_to_dataframe(geo_search_results) + + if centroid: + geo_df["polygon"] = geo_df["geometry"].to_wkt() + geo_df["geometry"] = ( + geo_df["geometry"].to_crs("27700").centroid.to_crs(geo_df.crs) + ) - geo_df.to_file(path_save, driver="GeoJSON", engine="pyogrio") + geo_df.to_file(path_save, driver="GeoJSON", engine="pyogrio") diff --git a/tests/test_text_spotting/test_deepsolo_runner.py b/tests/test_text_spotting/test_deepsolo_runner.py index b08f20a1..01a24c8a 100644 --- a/tests/test_text_spotting/test_deepsolo_runner.py +++ b/tests/test_text_spotting/test_deepsolo_runner.py @@ -44,7 +44,7 @@ def init_dataframes(sample_dir, tmp_path): """ maps = MapImages(f"{sample_dir}/mapreader_text.png") maps.add_metadata(f"{sample_dir}/mapreader_text_metadata.csv") - maps.patchify_all(patch_size=800, path_save=tmp_path) + maps.patchify_all(patch_size=800, path_=tmp_path) maps.check_georeferencing() assert maps.georeferenced parent_df, patch_df = maps.convert_images() @@ -279,7 +279,7 @@ def test_deepsolo_convert_to_parent_coords(runner_run_all, mock_response): def test_deepsolo_deduplicate(sample_dir, tmp_path, mock_response): maps = MapImages(f"{sample_dir}/mapreader_text.png") maps.add_metadata(f"{sample_dir}/mapreader_text_metadata.csv") - maps.patchify_all(patch_size=800, path_save=tmp_path, overlap=0.5) + maps.patchify_all(patch_size=800, path_=tmp_path, overlap=0.5) maps.check_georeferencing() parent_df, patch_df = maps.convert_images() runner = DeepSoloRunner( @@ -313,10 +313,10 @@ def test_deepsolo_run_on_image(init_runner, mock_response): assert isinstance(out["instances"], Instances) -def test_deepsolo_save_to_geojson(runner_run_all, tmp_path, mock_response): +def test_deepsolo_to_geojson(runner_run_all, tmp_path, mock_response): runner = runner_run_all _ = runner.convert_to_coords() - 
runner.save_to_geojson(f"{tmp_path}/text.geojson") + runner.to_geojson(f"{tmp_path}/text.geojson") assert os.path.exists(f"{tmp_path}/text.geojson") gdf = gpd.read_file(f"{tmp_path}/text.geojson") assert isinstance(gdf, gpd.GeoDataFrame) @@ -325,10 +325,10 @@ def test_deepsolo_save_to_geojson(runner_run_all, tmp_path, mock_response): ) -def test_deepsolo_save_to_geojson_centroid(runner_run_all, tmp_path, mock_response): +def test_deepsolo_to_geojson_centroid(runner_run_all, tmp_path, mock_response): runner = runner_run_all _ = runner.convert_to_coords() - runner.save_to_geojson(f"{tmp_path}/text_centroid.geojson", centroid=True) + runner.to_geojson(f"{tmp_path}/text_centroid.geojson", centroid=True) assert os.path.exists(f"{tmp_path}/text_centroid.geojson") gdf_centroid = gpd.read_file(f"{tmp_path}/text_centroid.geojson") assert isinstance(gdf_centroid, gpd.GeoDataFrame) @@ -349,7 +349,7 @@ def test_deepsolo_save_to_geojson_centroid(runner_run_all, tmp_path, mock_respon def test_deepsolo_load_geo_predictions(runner_run_all, tmp_path): runner = runner_run_all _ = runner.convert_to_coords() - runner.save_to_geojson(f"{tmp_path}/text.geojson") + runner.to_geojson(f"{tmp_path}/text.geojson") runner.geo_predictions = {} runner.load_geo_predictions(f"{tmp_path}/text.geojson") assert len(runner.geo_predictions) @@ -364,54 +364,54 @@ def test_deepsolo_load_geo_predictions_errors(runner_run_all, tmp_path): runner.load_geo_predictions("fakefile.csv") -def test_deepsolo_save_to_csv_polygon(runner_run_all, tmp_path, mock_response): +def test_deepsolo_to_csv_polygon(runner_run_all, tmp_path, mock_response): runner = runner_run_all # patch - runner.save_to_csv(tmp_path) + runner.to_csv(tmp_path) assert os.path.exists(f"{tmp_path}/patch_predictions.csv") # parent _ = runner.convert_to_parent_pixel_bounds() - runner.save_to_csv(tmp_path) + runner.to_csv(tmp_path) assert os.path.exists(f"{tmp_path}/patch_predictions.csv") assert os.path.exists(f"{tmp_path}/parent_predictions.csv") 
# geo _ = runner.convert_to_coords() - runner.save_to_csv(tmp_path) + runner.to_csv(tmp_path) assert os.path.exists(f"{tmp_path}/patch_predictions.csv") assert os.path.exists(f"{tmp_path}/parent_predictions.csv") assert os.path.exists(f"{tmp_path}/geo_predictions.csv") -def test_deepsolo_save_to_csv_centroid(runner_run_all, tmp_path, mock_response): +def test_deepsolo_to_csv_centroid(runner_run_all, tmp_path, mock_response): runner = runner_run_all # patch - runner.save_to_csv(tmp_path, centroid=True) + runner.to_csv(tmp_path, centroid=True) assert os.path.exists(f"{tmp_path}/patch_predictions.csv") # parent _ = runner.convert_to_parent_pixel_bounds() - runner.save_to_csv(tmp_path, centroid=True) + runner.to_csv(tmp_path, centroid=True) assert os.path.exists(f"{tmp_path}/patch_predictions.csv") assert os.path.exists(f"{tmp_path}/parent_predictions.csv") # geo _ = runner.convert_to_coords() - runner.save_to_csv(tmp_path, centroid=True) + runner.to_csv(tmp_path, centroid=True) assert os.path.exists(f"{tmp_path}/patch_predictions.csv") assert os.path.exists(f"{tmp_path}/parent_predictions.csv") assert os.path.exists(f"{tmp_path}/geo_predictions.csv") -def test_deepsolo_save_to_csv_errors(runner_run_all, tmp_path, mock_response): +def test_deepsolo_to_csv_errors(runner_run_all, tmp_path, mock_response): runner = runner_run_all runner.patch_predictions = {} with pytest.raises(ValueError, match="No patch predictions found"): - runner.save_to_csv(tmp_path) + runner.to_csv(tmp_path) def test_deepsolo_load_patch_predictions(runner_run_all, tmp_path): runner = runner_run_all _ = runner.convert_to_coords() assert len(runner.geo_predictions) # this will be empty after reloading - runner.save_to_csv(tmp_path) + runner.to_csv(tmp_path) runner.load_patch_predictions(f"{tmp_path}/patch_predictions.csv") assert len(runner.patch_predictions) assert len(runner.geo_predictions) == 0 @@ -451,7 +451,7 @@ def test_deepsolo_load_patch_predictions_centroid(runner_run_all, tmp_path): runner 
= runner_run_all _ = runner.convert_to_coords() assert len(runner.geo_predictions) - runner.save_to_csv(tmp_path, centroid=True) + runner.to_csv(tmp_path, centroid=True) runner.load_patch_predictions(f"{tmp_path}/patch_predictions.csv") assert len(runner.patch_predictions) assert len(runner.geo_predictions) == 0 @@ -498,12 +498,12 @@ def test_deepsolo_search_preds_errors(runner_run_all, mock_response): runner.search_preds("maps", ignore_case=True) -def test_deepsolo_save_search_results(runner_run_all, tmp_path, mock_response): +def test_deepsolo_search_results(runner_run_all, tmp_path, mock_response): runner = runner_run_all _ = runner.convert_to_parent_pixel_bounds() out = runner.search_preds("map", ignore_case=True) assert isinstance(out, dict) - runner.save_search_results_to_geojson(f"{tmp_path}/search_results.geojson") + runner.search_results_to_geojson(f"{tmp_path}/search_results.geojson") assert os.path.exists(f"{tmp_path}/search_results.geojson") gdf = gpd.read_file(f"{tmp_path}/search_results.geojson") assert isinstance(gdf, gpd.GeoDataFrame) @@ -513,12 +513,12 @@ def test_deepsolo_save_search_results(runner_run_all, tmp_path, mock_response): assert "mapreader_text.png" in gdf["image_id"].values -def test_deepsolo_save_search_results_centroid(runner_run_all, tmp_path, mock_response): +def test_deepsolo_search_results_centroid(runner_run_all, tmp_path, mock_response): runner = runner_run_all _ = runner.convert_to_parent_pixel_bounds() out = runner.search_preds("map", ignore_case=True) assert isinstance(out, dict) - runner.save_search_results_to_geojson( + runner.search_results_to_geojson( f"{tmp_path}/search_results_centroid.geojson", centroid=True ) assert os.path.exists(f"{tmp_path}/search_results_centroid.geojson") @@ -539,7 +539,7 @@ def test_deepsolo_save_search_results_centroid(runner_run_all, tmp_path, mock_re assert "mapreader_text.png" in gdf["image_id"].values -def test_deepsolo_save_search_results_errors(runner_run_all, tmp_path, mock_response): 
+def test_deepsolo_search_results_errors(runner_run_all, tmp_path, mock_response): runner = runner_run_all with pytest.raises(ValueError, match="No results to save"): - runner.save_search_results_to_geojson(f"{tmp_path}/test.geojson") + runner.search_results_to_geojson(f"{tmp_path}/test.geojson") diff --git a/tests/test_text_spotting/test_dptext_runner.py b/tests/test_text_spotting/test_dptext_runner.py index 71327ab5..ed1a231d 100644 --- a/tests/test_text_spotting/test_dptext_runner.py +++ b/tests/test_text_spotting/test_dptext_runner.py @@ -305,10 +305,10 @@ def test_dptext_run_on_image(init_runner, mock_response): assert isinstance(out["instances"], Instances) -def test_dptext_save_to_geojson(runner_run_all, tmp_path, mock_response): +def test_dptext_to_geojson(runner_run_all, tmp_path, mock_response): runner = runner_run_all _ = runner.convert_to_coords() - runner.save_to_geojson(f"{tmp_path}/text.geojson") + runner.to_geojson(f"{tmp_path}/text.geojson") assert os.path.exists(f"{tmp_path}/text.geojson") gdf = gpd.read_file(f"{tmp_path}/text.geojson") assert isinstance(gdf, gpd.GeoDataFrame) @@ -317,10 +317,10 @@ def test_dptext_save_to_geojson(runner_run_all, tmp_path, mock_response): ) -def test_dptext_save_to_geojson_centroid(runner_run_all, tmp_path, mock_response): +def test_dptext_to_geojson_centroid(runner_run_all, tmp_path, mock_response): runner = runner_run_all _ = runner.convert_to_coords() - runner.save_to_geojson(f"{tmp_path}/text_centroid.geojson", centroid=True) + runner.to_geojson(f"{tmp_path}/text_centroid.geojson", centroid=True) assert os.path.exists(f"{tmp_path}/text_centroid.geojson") gdf_centroid = gpd.read_file(f"{tmp_path}/text_centroid.geojson") assert isinstance(gdf_centroid, gpd.GeoDataFrame) @@ -340,7 +340,7 @@ def test_dptext_save_to_geojson_centroid(runner_run_all, tmp_path, mock_response def test_dptext_load_geo_predictions(runner_run_all, tmp_path): runner = runner_run_all _ = runner.convert_to_coords() - 
runner.save_to_geojson(f"{tmp_path}/text.geojson") + runner.to_geojson(f"{tmp_path}/text.geojson") runner.geo_predictions = {} runner.load_geo_predictions(f"{tmp_path}/text.geojson") assert len(runner.geo_predictions) @@ -355,54 +355,54 @@ def test_dptext_load_geo_predictions_errors(runner_run_all, tmp_path): runner.load_geo_predictions("fakefile.csv") -def test_dptext_save_to_csv_polygon(runner_run_all, tmp_path, mock_response): +def test_dptext_to_csv_polygon(runner_run_all, tmp_path, mock_response): runner = runner_run_all # patch - runner.save_to_csv(tmp_path) + runner.to_csv(tmp_path) assert os.path.exists(f"{tmp_path}/patch_predictions.csv") # parent _ = runner.convert_to_parent_pixel_bounds() - runner.save_to_csv(tmp_path) + runner.to_csv(tmp_path) assert os.path.exists(f"{tmp_path}/patch_predictions.csv") assert os.path.exists(f"{tmp_path}/parent_predictions.csv") # geo _ = runner.convert_to_coords() - runner.save_to_csv(tmp_path) + runner.to_csv(tmp_path) assert os.path.exists(f"{tmp_path}/patch_predictions.csv") assert os.path.exists(f"{tmp_path}/parent_predictions.csv") assert os.path.exists(f"{tmp_path}/geo_predictions.csv") -def test_dptext_save_to_csv_centroid(runner_run_all, tmp_path, mock_response): +def test_dptext_to_csv_centroid(runner_run_all, tmp_path, mock_response): runner = runner_run_all # patch - runner.save_to_csv(tmp_path, centroid=True) + runner.to_csv(tmp_path, centroid=True) assert os.path.exists(f"{tmp_path}/patch_predictions.csv") # parent _ = runner.convert_to_parent_pixel_bounds() - runner.save_to_csv(tmp_path, centroid=True) + runner.to_csv(tmp_path, centroid=True) assert os.path.exists(f"{tmp_path}/patch_predictions.csv") assert os.path.exists(f"{tmp_path}/parent_predictions.csv") # geo _ = runner.convert_to_coords() - runner.save_to_csv(tmp_path, centroid=True) + runner.to_csv(tmp_path, centroid=True) assert os.path.exists(f"{tmp_path}/patch_predictions.csv") assert os.path.exists(f"{tmp_path}/parent_predictions.csv") assert 
os.path.exists(f"{tmp_path}/geo_predictions.csv") -def test_dptext_save_to_csv_errors(runner_run_all, tmp_path, mock_response): +def test_dptext_to_csv_errors(runner_run_all, tmp_path, mock_response): runner = runner_run_all runner.patch_predictions = {} with pytest.raises(ValueError, match="No patch predictions found"): - runner.save_to_csv(tmp_path) + runner.to_csv(tmp_path) def test_dptext_load_patch_predictions(runner_run_all, tmp_path): runner = runner_run_all _ = runner.convert_to_coords() assert len(runner.geo_predictions) # this will be empty after reloading - runner.save_to_csv(tmp_path) + runner.to_csv(tmp_path) runner.load_patch_predictions(f"{tmp_path}/patch_predictions.csv") assert len(runner.patch_predictions) assert len(runner.geo_predictions) == 0 @@ -442,7 +442,7 @@ def test_dptext_load_patch_predictions_centroid(runner_run_all, tmp_path): runner = runner_run_all _ = runner.convert_to_coords() assert len(runner.geo_predictions) - runner.save_to_csv(tmp_path, centroid=True) + runner.to_csv(tmp_path, centroid=True) runner.load_patch_predictions(f"{tmp_path}/patch_predictions.csv") assert len(runner.patch_predictions) assert len(runner.geo_predictions) == 0 diff --git a/tests/test_text_spotting/test_maptext_runner.py b/tests/test_text_spotting/test_maptext_runner.py index 0a4ff5bc..2f21a162 100644 --- a/tests/test_text_spotting/test_maptext_runner.py +++ b/tests/test_text_spotting/test_maptext_runner.py @@ -306,10 +306,10 @@ def test_maptext_run_on_image(init_runner, mock_response): assert isinstance(out["instances"], Instances) -def test_maptext_save_to_geojson(runner_run_all, tmp_path, mock_response): +def test_maptext_to_geojson(runner_run_all, tmp_path, mock_response): runner = runner_run_all _ = runner.convert_to_coords() - runner.save_to_geojson(f"{tmp_path}/text.geojson") + runner.to_geojson(f"{tmp_path}/text.geojson") assert os.path.exists(f"{tmp_path}/text.geojson") gdf = gpd.read_file(f"{tmp_path}/text.geojson") assert isinstance(gdf, 
gpd.GeoDataFrame) @@ -341,12 +341,12 @@ def test_maptext_search_preds_errors(runner_run_all, mock_response): runner.search_preds("maps", ignore_case=True) -def test_maptext_save_search_results(runner_run_all, tmp_path, mock_response): +def test_maptext_search_results(runner_run_all, tmp_path, mock_response): runner = runner_run_all _ = runner.convert_to_parent_pixel_bounds() out = runner.search_preds("map", ignore_case=True) assert isinstance(out, dict) - runner.save_search_results_to_geojson(f"{tmp_path}/search_results.geojson") + runner.search_results_to_geojson(f"{tmp_path}/search_results.geojson") assert os.path.exists(f"{tmp_path}/search_results.geojson") gdf = gpd.read_file(f"{tmp_path}/search_results.geojson") assert isinstance(gdf, gpd.GeoDataFrame) @@ -356,7 +356,7 @@ def test_maptext_save_search_results(runner_run_all, tmp_path, mock_response): assert "mapreader_text.png" in gdf["image_id"].values -def test_maptext_save_search_results_errors(runner_run_all, tmp_path, mock_response): +def test_maptext_search_results_errors(runner_run_all, tmp_path, mock_response): runner = runner_run_all with pytest.raises(ValueError, match="No results to save"): - runner.save_search_results_to_geojson(f"{tmp_path}/test.geojson") + runner.search_results_to_geojson(f"{tmp_path}/test.geojson") From 484b28333c7b499ffc894a95b5eb6e501c55e588 Mon Sep 17 00:00:00 2001 From: Rosie Wood Date: Thu, 19 Dec 2024 11:54:07 +0000 Subject: [PATCH 15/15] fix typo --- mapreader/spot_text/runner_base.py | 112 +++++++++--------- .../test_deepsolo_runner.py | 22 ++-- .../test_text_spotting/test_maptext_runner.py | 16 +-- 3 files changed, 76 insertions(+), 74 deletions(-) diff --git a/mapreader/spot_text/runner_base.py b/mapreader/spot_text/runner_base.py index 8207812f..3221a3df 100644 --- a/mapreader/spot_text/runner_base.py +++ b/mapreader/spot_text/runner_base.py @@ -1065,63 +1065,61 @@ def explore_search_results( style_kwds=style_kwargs, ) + def save_search_results_to_geojson( + self, + 
path_save: str | pathlib.Path, + centroid: bool = False, + ) -> None: + """Convert the search results to georeferenced search results and save them to a GeoJSON file. + + Parameters + ---------- + path_save : str | pathlib.Path + The path to save the GeoJSON file. + centroid : bool, optional + Whether to save the centroid of the polygons as the geometry column, by default False. + Note: The original polygon will stil be saved as a separate column. -def save_search_results_to_geojson( - self, - path_save: str | pathlib.Path, - centroid: bool = False, -) -> None: - """Convert the search results to georeferenced search results and save them to a GeoJSON file. - - Parameters - ---------- - path_save : str | pathlib.Path - The path to save the GeoJSON file. - centroid : bool, optional - Whether to save the centroid of the polygons as the geometry column, by default False. - Note: The original polygon will stil be saved as a separate column. - - Raises - ------ - ValueError - If no search results are found. - """ - print( - "[WARNING] This method is deprecated and will soon be removed. Use `search_results_to_geojson` instead." - ) - self.search_results_to_geojson(path_save, centroid) - - -def search_results_to_geojson( - self, - path_save: str | pathlib.Path, - centroid: bool = False, -) -> None: - """Convert the search results to georeferenced search results and save them to a GeoJSON file. - - Parameters - ---------- - path_save : str | pathlib.Path - The path to save the GeoJSON file. - centroid : bool, optional - Whether to save the centroid of the polygons as the geometry column, by default False. - Note: The original polygon will stil be saved as a separate column. - - Raises - ------ - ValueError - If no search results are found. 
- """ - if self.search_results == {}: - raise ValueError("[ERROR] No results to save!") - - geo_search_results = self._get_geo_search_results() - geo_df = self._dict_to_dataframe(geo_search_results) - - if centroid: - geo_df["polygon"] = geo_df["geometry"].to_wkt() - geo_df["geometry"] = ( - geo_df["geometry"].to_crs("27700").centroid.to_crs(geo_df.crs) + Raises + ------ + ValueError + If no search results are found. + """ + print( + "[WARNING] This method is deprecated and will soon be removed. Use `search_results_to_geojson` instead." ) + self.search_results_to_geojson(path_save, centroid) + + def search_results_to_geojson( + self, + path_save: str | pathlib.Path, + centroid: bool = False, + ) -> None: + """Convert the search results to georeferenced search results and save them to a GeoJSON file. - geo_df.to_file(path_save, driver="GeoJSON", engine="pyogrio") + Parameters + ---------- + path_save : str | pathlib.Path + The path to save the GeoJSON file. + centroid : bool, optional + Whether to save the centroid of the polygons as the geometry column, by default False. + Note: The original polygon will stil be saved as a separate column. + + Raises + ------ + ValueError + If no search results are found. 
+ """ + if self.search_results == {}: + raise ValueError("[ERROR] No results to save!") + + geo_search_results = self._get_geo_search_results() + geo_df = self._dict_to_dataframe(geo_search_results) + + if centroid: + geo_df["polygon"] = geo_df["geometry"].to_wkt() + geo_df["geometry"] = ( + geo_df["geometry"].to_crs("27700").centroid.to_crs(geo_df.crs) + ) + + geo_df.to_file(path_save, driver="GeoJSON", engine="pyogrio") diff --git a/tests/test_text_spotting/test_deepsolo_runner.py b/tests/test_text_spotting/test_deepsolo_runner.py index 01a24c8a..da0bd882 100644 --- a/tests/test_text_spotting/test_deepsolo_runner.py +++ b/tests/test_text_spotting/test_deepsolo_runner.py @@ -44,7 +44,7 @@ def init_dataframes(sample_dir, tmp_path): """ maps = MapImages(f"{sample_dir}/mapreader_text.png") maps.add_metadata(f"{sample_dir}/mapreader_text_metadata.csv") - maps.patchify_all(patch_size=800, path_=tmp_path) + maps.patchify_all(patch_size=800, path_save=tmp_path) maps.check_georeferencing() assert maps.georeferenced parent_df, patch_df = maps.convert_images() @@ -279,7 +279,7 @@ def test_deepsolo_convert_to_parent_coords(runner_run_all, mock_response): def test_deepsolo_deduplicate(sample_dir, tmp_path, mock_response): maps = MapImages(f"{sample_dir}/mapreader_text.png") maps.add_metadata(f"{sample_dir}/mapreader_text_metadata.csv") - maps.patchify_all(patch_size=800, path_=tmp_path, overlap=0.5) + maps.patchify_all(patch_size=800, path_save=tmp_path, overlap=0.5) maps.check_georeferencing() parent_df, patch_df = maps.convert_images() runner = DeepSoloRunner( @@ -475,33 +475,35 @@ def test_deepsolo_load_patch_predictions_errors(runner_run_all, tmp_path): runner.load_patch_predictions("fake_file.geojson") -def test_deepsolo_search_preds(runner_run_all, mock_response): +def test_deepsolo_search_predictions(runner_run_all, mock_response): runner = runner_run_all _ = runner.convert_to_parent_pixel_bounds() - out = runner.search_preds("map", ignore_case=True) + out = 
runner.search_predictions("map", ignore_case=True) assert isinstance(out, dict) assert "mapreader_text.png" in out.keys() # test dataframe - out = runner.search_preds("map", ignore_case=True, return_dataframe=True) + out = runner.search_predictions("map", ignore_case=True, return_dataframe=True) assert isinstance(out, pd.DataFrame) assert set(out.columns) == set( ["image_id", "patch_id", "pixel_geometry", "text", "score"] ) assert "mapreader_text.png" in out["image_id"].values - out = runner.search_preds("somethingelse", ignore_case=True, return_dataframe=True) + out = runner.search_predictions( + "somethingelse", ignore_case=True, return_dataframe=True + ) assert len(out) == 0 -def test_deepsolo_search_preds_errors(runner_run_all, mock_response): +def test_deepsolo_search_predictions_errors(runner_run_all, mock_response): runner = runner_run_all with pytest.raises(ValueError, match="No parent predictions found"): - runner.search_preds("maps", ignore_case=True) + runner.search_predictions("maps", ignore_case=True) def test_deepsolo_search_results(runner_run_all, tmp_path, mock_response): runner = runner_run_all _ = runner.convert_to_parent_pixel_bounds() - out = runner.search_preds("map", ignore_case=True) + out = runner.search_predictions("map", ignore_case=True) assert isinstance(out, dict) runner.search_results_to_geojson(f"{tmp_path}/search_results.geojson") assert os.path.exists(f"{tmp_path}/search_results.geojson") @@ -516,7 +518,7 @@ def test_deepsolo_search_results(runner_run_all, tmp_path, mock_response): def test_deepsolo_search_results_centroid(runner_run_all, tmp_path, mock_response): runner = runner_run_all _ = runner.convert_to_parent_pixel_bounds() - out = runner.search_preds("map", ignore_case=True) + out = runner.search_predictions("map", ignore_case=True) assert isinstance(out, dict) runner.search_results_to_geojson( f"{tmp_path}/search_results_centroid.geojson", centroid=True diff --git a/tests/test_text_spotting/test_maptext_runner.py 
b/tests/test_text_spotting/test_maptext_runner.py index 2f21a162..cbb9d8dc 100644 --- a/tests/test_text_spotting/test_maptext_runner.py +++ b/tests/test_text_spotting/test_maptext_runner.py @@ -318,33 +318,35 @@ def test_maptext_to_geojson(runner_run_all, tmp_path, mock_response): ) -def test_maptext_search_preds(runner_run_all, mock_response): +def test_maptext_search_predictions(runner_run_all, mock_response): runner = runner_run_all _ = runner.convert_to_parent_pixel_bounds() - out = runner.search_preds("map", ignore_case=True) + out = runner.search_predictions("map", ignore_case=True) assert isinstance(out, dict) assert "mapreader_text.png" in out.keys() # test dataframe - out = runner.search_preds("map", ignore_case=True, return_dataframe=True) + out = runner.search_predictions("map", ignore_case=True, return_dataframe=True) assert isinstance(out, pd.DataFrame) assert set(out.columns) == set( ["image_id", "patch_id", "pixel_geometry", "text", "score"] ) assert "mapreader_text.png" in out["image_id"].values - out = runner.search_preds("somethingelse", ignore_case=True, return_dataframe=True) + out = runner.search_predictions( + "somethingelse", ignore_case=True, return_dataframe=True + ) assert len(out) == 0 -def test_maptext_search_preds_errors(runner_run_all, mock_response): +def test_maptext_search_predictions_errors(runner_run_all, mock_response): runner = runner_run_all with pytest.raises(ValueError, match="No parent predictions found"): - runner.search_preds("maps", ignore_case=True) + runner.search_predictions("maps", ignore_case=True) def test_maptext_search_results(runner_run_all, tmp_path, mock_response): runner = runner_run_all _ = runner.convert_to_parent_pixel_bounds() - out = runner.search_preds("map", ignore_case=True) + out = runner.search_predictions("map", ignore_case=True) assert isinstance(out, dict) runner.search_results_to_geojson(f"{tmp_path}/search_results.geojson") assert os.path.exists(f"{tmp_path}/search_results.geojson")