Df compatibility fixes (#12)
* Allow multi-index columns
* Optionally add index to tableau export
* Rename duplicated columns to make them unique
* Remove accented characters
* Replace forbidden characters in Tableau column name
btribonde authored Dec 20, 2019
1 parent 8f7102e commit 572a82a
Showing 3 changed files with 122 additions and 4 deletions.
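A minimal usage sketch of the new include_index option (the DataFrame and table name below are invented for illustration; the Tables and DataFrameTable calls mirror the tests added in this commit):

import pandas as pd
from jupytab import Tables, DataFrameTable

# Hypothetical multi-index frame with accented and duplicated column labels,
# the kind of layout this commit makes exportable.
columns = pd.MultiIndex.from_tuples([('prix', 'été'), ('prix', 'hiver'), ('prix', 'été')])
df = pd.DataFrame([[1.0, 2.0, 3.0]], columns=columns)

tables = Tables()
# include_index is the new keyword: keep the DataFrame index in the data sent to Tableau.
tables['sales'] = DataFrameTable('Sales by season', dataframe=df, include_index=True)

# Column ids should come out flattened, transliterated and deduplicated,
# e.g. prix_ete_1, prix_hiver, prix_ete_2.
print(tables.render_schema(do_print=False))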
58 changes: 54 additions & 4 deletions jupytab/util.py
@@ -2,6 +2,11 @@
# SPDX-License-Identifier: MIT

import json
import re
import unicodedata
from collections import Counter

import pandas as pd


class BaseTable:
@@ -59,7 +64,7 @@ class DataFrameTable(BaseTable):
This class represents a jupytab-ready table that exposes a Pandas DataFrame.
"""

def __init__(self, alias, dataframe=None, refresh_method=None):
def __init__(self, alias, dataframe=None, refresh_method=None, include_index=False):
"""
alias -- Descriptive name of the table, that will be displayed in Tableau.
@@ -70,11 +75,15 @@ def __init__(self, alias, dataframe=None, refresh_method=None):
Tableau needs to access the data (for instance when the DataSource is refreshed).
It takes no argument and must return a DataFrame with the same column layout
(schema) as the original DataFrame (if any).
include_index -- Add Index as column(s) in the output data to Tableau.
"""
BaseTable.__init__(self, alias=alias)

self._dataframe = dataframe
self._refresh_method = refresh_method
self._include_index = include_index
self._index_separator = '_'

self.types_mapping = {
'object': 'string',
@@ -84,16 +93,41 @@ def __init__(self, alias, dataframe=None, refresh_method=None):
'bool': 'bool'
}

@staticmethod
def clean_column_name(col):
"""Remove all forbidden characters from column names"""

# Try to preserve accented characters
cleaned_col = unicodedata.normalize('NFD', str(col)) \
.encode('ascii', 'ignore') \
.decode("utf-8")
# Remove all non matching chars for Tableau WDC
cleaned_col = re.sub(r'[^A-Za-z0-9_]+', '_', cleaned_col)
return cleaned_col

@staticmethod
def replace_duplicated_column_name(cols):
"""Replace duplicated columns names"""
cols_count_dict = dict(Counter(cols))
# Filter unique items
cols_count_dict = {key: value for (key, value) in cols_count_dict.items() if value > 1}
unique_cols = list()
for col in reversed(cols):
idx = cols_count_dict.get(col, 0)
unique_cols.insert(0, col if idx == 0 else col + '_' + str(idx))
cols_count_dict[col] = idx - 1
return unique_cols

def get_schema(self, key):
self.refresh(only_if_undefined=True)

columns = [
{
'id': key,
'id': '.'.join(filter(None, key)) if isinstance(key, tuple) else key,
'dataType':
self.types_mapping[str(value)] if str(value) in self.types_mapping else 'string'
}
for key, value in self._dataframe.dtypes.items()
for key, value in (self._prepare_dataframe()).dtypes.items()
]

return {
@@ -102,14 +136,30 @@ def get_schema(self, key):
'columns': columns
}

def _prepare_dataframe(self):
# Remove index if it is not required
prep_df = self._dataframe \
if self._include_index \
else self._dataframe.reset_index(drop=True)
# Flatten multi-index
if isinstance(prep_df.columns, pd.MultiIndex):
prep_df.columns = [self._index_separator.join(map(str, col)).strip()
for col in prep_df.columns.values]

prep_df.columns = [DataFrameTable.clean_column_name(col) for col in prep_df.columns]
prep_df.columns = DataFrameTable.replace_duplicated_column_name(prep_df.columns)

return prep_df

def refresh(self, only_if_undefined=False):
# If DataFrame exists and it is not requested to update it then we do not need to refresh.
# Otherwise if a refresh method has been set it is required to update the DataFrame.
if (not only_if_undefined or self._dataframe is None) and self._refresh_method is not None:
self._dataframe = self._refresh_method()

def to_json(self):
return self._dataframe.to_json(orient='records', date_format="iso", date_unit="s")
return self._prepare_dataframe() \
.to_json(orient='records', date_format="iso", date_unit="s")


class Tables:
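As an aside, a short sketch of how the two new helpers combine (the column names are invented; the results follow from the cleaning regex and the deduplication counter above). Cleaning can itself create duplicates, which is why _prepare_dataframe deduplicates after cleaning:

from jupytab import DataFrameTable

# Accents are transliterated and runs of forbidden characters collapse to '_'.
DataFrameTable.clean_column_name('prix été')   # -> 'prix_ete'
DataFrameTable.clean_column_name('prix-êté')   # -> 'prix_ete', colliding with the name above

# Deduplication then suffixes each occurrence with its position.
cleaned = [DataFrameTable.clean_column_name(c) for c in ['prix été', 'prix-êté']]
DataFrameTable.replace_duplicated_column_name(cleaned)  # -> ['prix_ete_1', 'prix_ete_2']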
59 changes: 59 additions & 0 deletions tests/test_dataframe.py
@@ -0,0 +1,59 @@
# Copyright (c) 2019 Capital Fund Management
# SPDX-License-Identifier: MIT

import numpy as np
import pandas as pd

from jupytab import Tables, DataFrameTable


def test_data_schema():
arrays = [
['A', 'A',
'a', 'a',
0, 0,
'a$_!#àz', 'a$_!#àz'
],
['A', 'A',
0, 1,
'z$_"_àéça"', 'z_èà[|]a',
'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'abcdefghijklmnopqrstuvwxyz0123456789'
]
]
tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
complex_df = pd.DataFrame(np.random.randn(len(index), len(index)), index=index, columns=index)

tables = Tables()
tables['complex_df_no_index_{}[]#!'] = \
DataFrameTable('A multi-index Dataframe ({}[]#!)',
dataframe=complex_df)
tables['complex_df_with_index_{}[]#!'] = \
DataFrameTable('A multi-index Dataframe ({}[]#!)',
dataframe=complex_df,
include_index=True)

schema = tables.schema()

assert schema[0]['id'] == 'complex_df_no_index_{}[]#!'
assert schema[0]['alias'] == 'A multi-index Dataframe ({}[]#!)'
assert schema[1]['id'] == 'complex_df_with_index_{}[]#!'
assert schema[1]['alias'] == 'A multi-index Dataframe ({}[]#!)'

raw_output = '[{"id": "complex_df_no_index_{}[]#!", "alias": "A multi-index Dataframe ({}[]#!' \
')", "columns": [{"id": "A_A_1", "dataType": "float"}, {"id": "A_A_2", "dataType' \
'": "float"}, {"id": "a_0", "dataType": "float"}, {"id": "a_1", "dataType": "flo' \
'at"}, {"id": "0_z____aeca_", "dataType": "float"}, {"id": "0_z_ea_a", "dataType' \
'": "float"}, {"id": "a___az_ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", "dataType": ' \
'"float"}, {"id": "a___az_abcdefghijklmnopqrstuvwxyz0123456789", "dataType": "fl' \
'oat"}]}, {"id": "complex_df_with_index_{}[]#!", "alias": "A multi-index Datafra' \
'me ({}[]#!)", "columns": [{"id": "A_A_1", "dataType": "float"}, {"id": "A_A_2",' \
' "dataType": "float"}, {"id": "a_0", "dataType": "float"}, {"id": "a_1", "dataT' \
'ype": "float"}, {"id": "0_z____aeca_", "dataType": "float"}, {"id": "0_z_ea_a",' \
' "dataType": "float"}, {"id": "a___az_ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", "d' \
'ataType": "float"}, {"id": "a___az_abcdefghijklmnopqrstuvwxyz0123456789", "data' \
'Type": "float"}]}]'

raw_schema = tables.render_schema(do_print=False)

assert raw_output == raw_schema
9 changes: 9 additions & 0 deletions tests/test_util.py
@@ -41,3 +41,12 @@ def test_data_schema():
}]}]'

assert raw_output == tables.render_schema(do_print=False)


def test_clean_column_name():
assert DataFrameTable.clean_column_name(['abéçpo$ù"', 0, 'AaZz_#"\\']) == "_abecpo_u_0_AaZz__"


def test_replace_duplicated_column_name():
assert DataFrameTable.replace_duplicated_column_name(['A', 'A', 'a', 'z', 'a', 'Y']) \
== ['A_1', 'A_2', 'a_1', 'z', 'a_2', 'Y']
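
To complement the schema-focused tests above, a hedged sketch of what the data payload looks like once column cleaning is applied (the frame and the expected JSON string are illustrative, not taken from the test suite):

import pandas as pd
from jupytab import DataFrameTable

# Hypothetical frame whose column name needs cleaning.
df = pd.DataFrame({'prix été': [1.5, 2.0]})
table = DataFrameTable('Prices', dataframe=df)

# Both get_schema and to_json go through _prepare_dataframe, so the cleaned
# name prix_ete should appear in the schema and in the records payload.
print(table.get_schema('prices'))
print(table.to_json())  # e.g. '[{"prix_ete":1.5},{"prix_ete":2.0}]'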
