Df compatibility fixes (#12)
* Allow multi-index columns
* Optionally add index to tableau export
* Rename duplicated columns to make them unique
* Remove accented characters
* Replace forbidden characters in Tableau column name
btribonde authored Dec 20, 2019
1 parent 8f7102e commit 572a82a
Showing 3 changed files with 122 additions and 4 deletions.
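A minimal usage sketch of the new include_index option (the DataFrame and table name below are invented for illustration; the Tables and DataFrameTable calls mirror the tests added in this commit):

import pandas as pd
from jupytab import Tables, DataFrameTable

# Hypothetical multi-index frame with accented and duplicated column labels,
# the kind of layout this commit makes exportable.
columns = pd.MultiIndex.from_tuples([('prix', 'été'), ('prix', 'hiver'), ('prix', 'été')])
df = pd.DataFrame([[1.0, 2.0, 3.0]], columns=columns)

tables = Tables()
# include_index is the new keyword: keep the DataFrame index in the data sent to Tableau.
tables['sales'] = DataFrameTable('Sales by season', dataframe=df, include_index=True)

# Column ids should come out flattened, transliterated and deduplicated,
# e.g. prix_ete_1, prix_hiver, prix_ete_2.
print(tables.render_schema(do_print=False))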
58 changes: 54 additions & 4 deletions jupytab/util.py
@@ -2,6 +2,11 @@
# SPDX-License-Identifier: MIT

import json
import re
import unicodedata
from collections import Counter

import pandas as pd


class BaseTable:
@@ -59,7 +64,7 @@ class DataFrameTable(BaseTable):
This class represents a jupytab-ready table that exposes a Pandas DataFrame.
"""

def __init__(self, alias, dataframe=None, refresh_method=None):
def __init__(self, alias, dataframe=None, refresh_method=None, include_index=False):
"""
alias -- Descriptive name of the table, that will be displayed in Tableau.
@@ -70,11 +75,15 @@ def __init__(self, alias, dataframe=None, refresh_method=None):
Tableau needs to access the data (for instance when the DataSource is refreshed).
It takes no argument and must return a DataFrame with the same column layout
(schema) as the original DataFrame (if any).
include_index -- Add Index as column(s) in the output data to Tableau.
"""
BaseTable.__init__(self, alias=alias)

self._dataframe = dataframe
self._refresh_method = refresh_method
self._include_index = include_index
self._index_separator = '_'

self.types_mapping = {
'object': 'string',
@@ -84,16 +93,41 @@ def __init__(self, alias, dataframe=None, refresh_method=None):
'bool': 'bool'
}

@staticmethod
def clean_column_name(col):
"""Remove all forbidden characters from column names"""

# Try to preserve accented characters
cleaned_col = unicodedata.normalize('NFD', str(col)) \
.encode('ascii', 'ignore') \
.decode("utf-8")
# Remove all non matching chars for Tableau WDC
cleaned_col = re.sub(r'[^A-Za-z0-9_]+', '_', cleaned_col)
return cleaned_col

@staticmethod
def replace_duplicated_column_name(cols):
"""Replace duplicated columns names"""
cols_count_dict = dict(Counter(cols))
# Filter unique items
cols_count_dict = {key: value for (key, value) in cols_count_dict.items() if value > 1}
unique_cols = list()
for col in reversed(cols):
idx = cols_count_dict.get(col, 0)
unique_cols.insert(0, col if idx == 0 else col + '_' + str(idx))
cols_count_dict[col] = idx - 1
return unique_cols

def get_schema(self, key):
self.refresh(only_if_undefined=True)

columns = [
{
'id': key,
'id': '.'.join(filter(None, key)) if isinstance(key, tuple) else key,
'dataType':
self.types_mapping[str(value)] if str(value) in self.types_mapping else 'string'
}
for key, value in self._dataframe.dtypes.items()
for key, value in (self._prepare_dataframe()).dtypes.items()
]

return {
@@ -102,14 +136,30 @@ def get_schema(self, key):
'columns': columns
}

def _prepare_dataframe(self):
# Remove index if it is not required
prep_df = self._dataframe \
if self._include_index \
else self._dataframe.reset_index(drop=True)
# Flatten multi-index
if isinstance(prep_df.columns, pd.MultiIndex):
prep_df.columns = [self._index_separator.join(map(str, col)).strip()
for col in prep_df.columns.values]

prep_df.columns = [DataFrameTable.clean_column_name(col) for col in prep_df.columns]
prep_df.columns = DataFrameTable.replace_duplicated_column_name(prep_df.columns)

return prep_df

def refresh(self, only_if_undefined=False):
# If DataFrame exists and it is not requested to update it then we do not need to refresh.
# Otherwise if a refresh method has been set it is required to update the DataFrame.
if (not only_if_undefined or self._dataframe is None) and self._refresh_method is not None:
self._dataframe = self._refresh_method()

def to_json(self):
return self._dataframe.to_json(orient='records', date_format="iso", date_unit="s")
return self._prepare_dataframe() \
.to_json(orient='records', date_format="iso", date_unit="s")


class Tables:
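As an aside, a short sketch of how the two new helpers combine (the column names are invented; the results follow from the cleaning regex and the deduplication counter above). Cleaning can itself create duplicates, which is why _prepare_dataframe deduplicates after cleaning:

from jupytab import DataFrameTable

# Accents are transliterated and runs of forbidden characters collapse to '_'.
DataFrameTable.clean_column_name('prix été')   # -> 'prix_ete'
DataFrameTable.clean_column_name('prix-êté')   # -> 'prix_ete', colliding with the name above

# Deduplication then suffixes each occurrence with its position.
cleaned = [DataFrameTable.clean_column_name(c) for c in ['prix été', 'prix-êté']]
DataFrameTable.replace_duplicated_column_name(cleaned)  # -> ['prix_ete_1', 'prix_ete_2']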
59 changes: 59 additions & 0 deletions tests/test_dataframe.py
@@ -0,0 +1,59 @@
# Copyright (c) 2019 Capital Fund Management
# SPDX-License-Identifier: MIT

import numpy as np
import pandas as pd

from jupytab import Tables, DataFrameTable


def test_data_schema():
arrays = [
['A', 'A',
'a', 'a',
0, 0,
'a$_!#àz', 'a$_!#àz'
],
['A', 'A',
0, 1,
'z$_"_àéça"', 'z_èà[|]a',
'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'abcdefghijklmnopqrstuvwxyz0123456789'
]
]
tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
complex_df = pd.DataFrame(np.random.randn(len(index), len(index)), index=index, columns=index)

tables = Tables()
tables['complex_df_no_index_{}[]#!'] = \
DataFrameTable('A multi-index Dataframe ({}[]#!)',
dataframe=complex_df)
tables['complex_df_with_index_{}[]#!'] = \
DataFrameTable('A multi-index Dataframe ({}[]#!)',
dataframe=complex_df,
include_index=True)

schema = tables.schema()

assert schema[0]['id'] == 'complex_df_no_index_{}[]#!'
assert schema[0]['alias'] == 'A multi-index Dataframe ({}[]#!)'
assert schema[1]['id'] == 'complex_df_with_index_{}[]#!'
assert schema[1]['alias'] == 'A multi-index Dataframe ({}[]#!)'

raw_output = '[{"id": "complex_df_no_index_{}[]#!", "alias": "A multi-index Dataframe ({}[]#!' \
')", "columns": [{"id": "A_A_1", "dataType": "float"}, {"id": "A_A_2", "dataType' \
'": "float"}, {"id": "a_0", "dataType": "float"}, {"id": "a_1", "dataType": "flo' \
'at"}, {"id": "0_z____aeca_", "dataType": "float"}, {"id": "0_z_ea_a", "dataType' \
'": "float"}, {"id": "a___az_ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", "dataType": ' \
'"float"}, {"id": "a___az_abcdefghijklmnopqrstuvwxyz0123456789", "dataType": "fl' \
'oat"}]}, {"id": "complex_df_with_index_{}[]#!", "alias": "A multi-index Datafra' \
'me ({}[]#!)", "columns": [{"id": "A_A_1", "dataType": "float"}, {"id": "A_A_2",' \
' "dataType": "float"}, {"id": "a_0", "dataType": "float"}, {"id": "a_1", "dataT' \
'ype": "float"}, {"id": "0_z____aeca_", "dataType": "float"}, {"id": "0_z_ea_a",' \
' "dataType": "float"}, {"id": "a___az_ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", "d' \
'ataType": "float"}, {"id": "a___az_abcdefghijklmnopqrstuvwxyz0123456789", "data' \
'Type": "float"}]}]'

raw_schema = tables.render_schema(do_print=False)

assert raw_output == raw_schema
9 changes: 9 additions & 0 deletions tests/test_util.py
@@ -41,3 +41,12 @@ def test_data_schema():
}]}]'

assert raw_output == tables.render_schema(do_print=False)


def test_clean_column_name():
assert DataFrameTable.clean_column_name(['abéçpo$ù"', 0, 'AaZz_#"\\']) == "_abecpo_u_0_AaZz__"


def test_replace_duplicated_column_name():
assert DataFrameTable.replace_duplicated_column_name(['A', 'A', 'a', 'z', 'a', 'Y']) \
== ['A_1', 'A_2', 'a_1', 'z', 'a_2', 'Y']
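
To complement the schema-focused tests above, a hedged sketch of what the data payload looks like once column cleaning is applied (the frame and the expected JSON string are illustrative, not taken from the test suite):

import pandas as pd
from jupytab import DataFrameTable

# Hypothetical frame whose column name needs cleaning.
df = pd.DataFrame({'prix été': [1.5, 2.0]})
table = DataFrameTable('Prices', dataframe=df)

# Both get_schema and to_json go through _prepare_dataframe, so the cleaned
# name prix_ete should appear in the schema and in the records payload.
print(table.get_schema('prices'))
print(table.to_json())  # e.g. '[{"prix_ete":1.5},{"prix_ete":2.0}]'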
