Bug workaround (parquet serialization with non str values in Documents column)
rte-france · Feb 14, 2025 · 363f44a · 363f44a
1 parent f3eb5b2
commit 363f44a
Showing 1 changed file with 7 additions and 0 deletions.
diff --git a/bertrend_apps/prospective_demo/process_new_data.py b/bertrend_apps/prospective_demo/process_new_data.py
@@ -158,6 +158,13 @@ def train_new_model(
                     lambda x: list(set(x))
                 )  # Removes duplicates within each list
 
+                # FIXME: for some unknown reasons, a few elements in the Documents column are not a str but a
+                #  timestamp (the identifier of current model); this generates errors when trying to serialize the
+                #  df to parquet. The code snippet below is a workaround to avoid this issue.
+                df["Documents"] = df["Documents"].apply(
+                    lambda l: [x if isinstance(x, str) else "" for x in l]
+                )
+
                 output_path = interpretation_path / f"{df_name}.parquet"
                 df.to_parquet(output_path)
                 logger.success(f"{df_name} saved to: {output_path}")