From 572a82aad657ca7e890fca7a82b048f59e894b2b Mon Sep 17 00:00:00 2001 From: btribonde <49305499+btribonde@users.noreply.github.com> Date: Fri, 20 Dec 2019 17:00:51 +0100 Subject: [PATCH] Df compatibility fixes (#12) * Allow multi-index columns * Optionally add index to tableau export * Rename duplicated columns to make them unique * Remove accented characters * Replace forbidden characters in Tableau column name --- jupytab/util.py | 58 +++++++++++++++++++++++++++++++++++++--- tests/test_dataframe.py | 59 +++++++++++++++++++++++++++++++++++++++++ tests/test_util.py | 9 +++++++ 3 files changed, 122 insertions(+), 4 deletions(-) create mode 100644 tests/test_dataframe.py diff --git a/jupytab/util.py b/jupytab/util.py index e564f89..5edb980 100644 --- a/jupytab/util.py +++ b/jupytab/util.py @@ -2,6 +2,11 @@ # SPDX-License-Identifier: MIT import json +import re +import unicodedata +from collections import Counter + +import pandas as pd class BaseTable: @@ -59,7 +64,7 @@ class DataFrameTable(BaseTable): This class represents a jupytab-ready table that exposes a Pandas DataFrame. """ - def __init__(self, alias, dataframe=None, refresh_method=None): + def __init__(self, alias, dataframe=None, refresh_method=None, include_index=False): """ alias -- Descriptive name of the table, that will be displayed in Tableau. @@ -70,11 +75,15 @@ def __init__(self, alias, dataframe=None, refresh_method=None): Tableau needs to access the data (for instance when the DataSource is refreshed). It takes no argument and must return a DataFrame with the same column layout (schema) as the original DataFrame (if any). + + include_index -- Add Index as column(s) in the output data to Tableau. 
""" BaseTable.__init__(self, alias=alias) self._dataframe = dataframe self._refresh_method = refresh_method + self._include_index = include_index + self._index_separator = '_' self.types_mapping = { 'object': 'string', @@ -84,16 +93,41 @@ def __init__(self, alias, dataframe=None, refresh_method=None): 'bool': 'bool' } + @staticmethod + def clean_column_name(col): + """Strip accents and replace each run of characters outside [A-Za-z0-9_] with '_'""" + + # Split accents from base letters (NFD), then drop everything that is not ASCII + cleaned_col = unicodedata.normalize('NFD', str(col)) \ + .encode('ascii', 'ignore') \ + .decode("utf-8") + # Replace each run of characters not accepted by Tableau WDC with a single underscore + cleaned_col = re.sub(r'[^A-Za-z0-9_]+', '_', cleaned_col) + return cleaned_col + + @staticmethod + def replace_duplicated_column_name(cols): + """Make names unique: the n-th occurrence of a duplicated name gets the suffix '_n'""" + cols_count_dict = dict(Counter(cols)) + # Keep only the names that occur more than once + cols_count_dict = {key: value for (key, value) in cols_count_dict.items() if value > 1} + unique_cols = list() + for col in reversed(cols): + idx = cols_count_dict.get(col, 0) + unique_cols.insert(0, col if idx == 0 else col + '_' + str(idx)) + cols_count_dict[col] = idx - 1 + return unique_cols + def get_schema(self, key): self.refresh(only_if_undefined=True) columns = [ { - 'id': key, + 'id': '.'.join(filter(None, key)) if isinstance(key, tuple) else key, 'dataType': self.types_mapping[str(value)] if str(value) in self.types_mapping else 'string' } - for key, value in self._dataframe.dtypes.items() + for key, value in (self._prepare_dataframe()).dtypes.items() ] return { @@ -102,6 +136,21 @@ def get_schema(self, key): 'columns': columns } + def _prepare_dataframe(self): + # Drop the index (replaced by a default one) unless include_index is set + prep_df = self._dataframe \ + if self._include_index \ + else self._dataframe.reset_index(drop=True) + # Flatten multi-index column labels into single strings joined by the index separator + if isinstance(prep_df.columns, pd.MultiIndex): + prep_df.columns = [self._index_separator.join(map(str, col)).strip() + for col in prep_df.columns.values] + + prep_df.columns 
= [DataFrameTable.clean_column_name(col) for col in prep_df.columns] + prep_df.columns = DataFrameTable.replace_duplicated_column_name(prep_df.columns) + + return prep_df + def refresh(self, only_if_undefined=False): # If DataFrame exists and it is not requested to update it then we do not need to refresh. # Otherwise if a refresh method has been set it is required to update the DataFrame. @@ -109,7 +158,8 @@ def refresh(self, only_if_undefined=False): self._dataframe = self._refresh_method() def to_json(self): - return self._dataframe.to_json(orient='records', date_format="iso", date_unit="s") + return self._prepare_dataframe() \ + .to_json(orient='records', date_format="iso", date_unit="s") class Tables: diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py new file mode 100644 index 0000000..974f5eb --- /dev/null +++ b/tests/test_dataframe.py @@ -0,0 +1,59 @@ +# Copyright (c) 2019 Capital Fund Management +# SPDX-License-Identifier: MIT + +import numpy as np +import pandas as pd + +from jupytab import Tables, DataFrameTable + + +def test_data_schema(): + arrays = [ + ['A', 'A', + 'a', 'a', + 0, 0, + 'a$_!#àz', 'a$_!#àz' + ], + ['A', 'A', + 0, 1, + 'z$_"_àéça"', 'z_èà[|]a', + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'abcdefghijklmnopqrstuvwxyz0123456789' + ] + ] + tuples = list(zip(*arrays)) + index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) + complex_df = pd.DataFrame(np.random.randn(len(index), len(index)), index=index, columns=index) + + tables = Tables() + tables['complex_df_no_index_{}[]#!'] = \ + DataFrameTable('A multi-index Dataframe ({}[]#!)', + dataframe=complex_df) + tables['complex_df_with_index_{}[]#!'] = \ + DataFrameTable('A multi-index Dataframe ({}[]#!)', + dataframe=complex_df, + include_index=True) + + schema = tables.schema() + + assert schema[0]['id'] == 'complex_df_no_index_{}[]#!' + assert schema[0]['alias'] == 'A multi-index Dataframe ({}[]#!)' + assert schema[1]['id'] == 'complex_df_with_index_{}[]#!' 
+ assert schema[1]['alias'] == 'A multi-index Dataframe ({}[]#!)' + + raw_output = '[{"id": "complex_df_no_index_{}[]#!", "alias": "A multi-index Dataframe ({}[]#!' \ + ')", "columns": [{"id": "A_A_1", "dataType": "float"}, {"id": "A_A_2", "dataType' \ + '": "float"}, {"id": "a_0", "dataType": "float"}, {"id": "a_1", "dataType": "flo' \ + 'at"}, {"id": "0_z____aeca_", "dataType": "float"}, {"id": "0_z_ea_a", "dataType' \ + '": "float"}, {"id": "a___az_ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", "dataType": ' \ + '"float"}, {"id": "a___az_abcdefghijklmnopqrstuvwxyz0123456789", "dataType": "fl' \ + 'oat"}]}, {"id": "complex_df_with_index_{}[]#!", "alias": "A multi-index Datafra' \ + 'me ({}[]#!)", "columns": [{"id": "A_A_1", "dataType": "float"}, {"id": "A_A_2",' \ + ' "dataType": "float"}, {"id": "a_0", "dataType": "float"}, {"id": "a_1", "dataT' \ + 'ype": "float"}, {"id": "0_z____aeca_", "dataType": "float"}, {"id": "0_z_ea_a",' \ + ' "dataType": "float"}, {"id": "a___az_ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", "d' \ + 'ataType": "float"}, {"id": "a___az_abcdefghijklmnopqrstuvwxyz0123456789", "data' \ + 'Type": "float"}]}]' + + raw_schema = tables.render_schema(do_print=False) + + assert raw_output == raw_schema diff --git a/tests/test_util.py b/tests/test_util.py index c1acf8c..50bdc7f 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -41,3 +41,12 @@ def test_data_schema(): }]}]' assert raw_output == tables.render_schema(do_print=False) + + +def test_clean_column_name(): + assert DataFrameTable.clean_column_name(['abéçpo$ù"', 0, 'AaZz_#"\\']) == "_abecpo_u_0_AaZz__" + + +def test_replace_duplicated_column_name(): + assert DataFrameTable.replace_duplicated_column_name(['A', 'A', 'a', 'z', 'a', 'Y']) \ + == ['A_1', 'A_2', 'a_1', 'z', 'a_2', 'Y']