Skip to content

Commit

Permalink
Update test_perf.ipynb
Browse files Browse the repository at this point in the history
  • Loading branch information
loco-philippe committed Jun 25, 2024
1 parent 514f982 commit b7e6ddb
Showing 1 changed file with 67 additions and 59 deletions.
126 changes: 67 additions & 59 deletions tests/test_perf.ipynb
Original file line number Diff line number Diff line change
@@ -1,61 +1,14 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 59,
"id": "7469f343",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import string\n",
"import json\n",
"import xarray as xr\n",
"import pandas as pd\n",
"import numpy as np\n",
"from ntv_numpy import Xdataset\n",
"import ntv_pandas as npd\n",
"import cbor2\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "717e496b-19c1-472c-ad67-6c9ba1b1a014",
"metadata": {},
"outputs": [],
"source": [
"def file_sizes(xnd, forma=None):\n",
"    '''Return the serialized size of a dataset in each compared format.\n",
"\n",
"    Parameters\n",
"    - xnd: dataset object exposing to_dataframe() and to_json()\n",
"      (presumably an ntv_numpy Xdataset, to be confirmed against the callers)\n",
"    - forma: dict forwarded as the format argument of xnd.to_json();\n",
"      None (default) is replaced by a new empty dict\n",
"\n",
"    Returns a dict {format label: len() of the serialized output}.\n",
"    The pandas sizes are measured on a DataFrame whose rows are shuffled by\n",
"    an unseeded sample(frac=1), so the row order differs between runs.\n",
"    '''\n",
"    # a fresh dict per call: a mutable default would be shared between calls\n",
"    forma = {} if forma is None else forma\n",
"    df = xnd.to_dataframe(json_name=False, info=False).reset_index().sample(frac=1)\n",
"    jsn = xnd.to_json(notype='all', header=False, encoded=False, format=forma)\n",
"    return {'pd.to_json(values)': len(df.to_json(orient='values')),\n",
"            'pd.to_csv': len(df.to_csv()),\n",
"            'pd.to_parquet': len(df.to_parquet(engine='pyarrow')),\n",
"            'xnd.to_json': len(json.dumps(jsn)),\n",
"            'xnd.to_json(cbor)': len(cbor2.dumps(jsn))}\n",
"\n",
"def sizes_plot(sizes, titles, fig_title):\n",
"    '''Plot one bar chart per size dictionary, side by side.\n",
"\n",
"    Parameters\n",
"    - sizes: iterable of dicts {format label: size}\n",
"    - titles: iterable of subplot titles, paired with sizes by position\n",
"    - fig_title: title of the whole figure\n",
"\n",
"    In each subplot the smallest bar is red, the others blue, and every bar\n",
"    is labelled with its size as a percentage of the second entry of its dict.\n",
"    '''\n",
"    # zip() stops at the shorter input; materialize it to count the subplots\n",
"    pairs = list(zip(sizes, titles))\n",
"    nb_plots = max(len(pairs), 1)\n",
"    # one subplot per dataset instead of a fixed 4 (same figure for 4 datasets);\n",
"    # squeeze=False keeps axs two-dimensional even for a single subplot\n",
"    fig, axs = plt.subplots(1, nb_plots, figsize=(6 * nb_plots, 6), squeeze=False)\n",
"    fig.suptitle(fig_title)\n",
"    for ax, (size, title) in zip(axs[0], pairs):\n",
"        values = list(size.values())\n",
"        smallest = min(values)\n",
"        # the second entry of the dict is the 100 % reference\n",
"        reference = values[1]\n",
"        bar_colors = ['tab:red' if val == smallest else 'tab:blue' for val in values]\n",
"        percent = [str(round(val / reference * 100, 1)) + ' %' for val in values]\n",
"        bar_plt = ax.bar(list(size.keys()), values, color=bar_colors)\n",
"        ax.set_title(title)\n",
"        ax.bar_label(bar_plt, percent, label_type='center')\n",
"        ax.tick_params(axis='x', rotation=55)\n",
"    plt.show()"
]
},
{
"cell_type": "markdown",
"id": "2d95acb2-7c8b-432e-ac23-e3d6ec04c8fe",
"id": "09e9962f-04a5-423b-b10d-ce431fccf085",
"metadata": {},
"source": [
"## "
"# Table representation\n",
"\n",
"This notebook compares the size of table representations for the most common table structures.\n",
"The representations used are CSV, JSON and Parquet."
]
},
{
Expand All @@ -71,19 +24,24 @@
"id": "a9247d13-a1b0-4b33-af74-eab3c48511ac",
"metadata": {},
"source": [
"Two basic patterns are present in Datasets:\n",
"Tables contain two kinds of columns:\n",
"\n",
"- variables (values) : columns with unstructured data\n",
"- indexes (coordinates) : columns with data used to select or filter variables data\n",
"\n",
"Two basic patterns are present in table indexes:\n",
"\n",
"- Tree pattern: A tree is represented in tabular form by a list of paths between each leaf and the root node. The columns then represent the levels of the tree.\n",
"- Multidimensional pattern: A matrix (or multidimensional data) is represented in tabular form by a column holding the values of the matrix and additional columns holding the coordinates of each value.\n",
"\n",
"Table 1 presents an example of a binary tree.\n",
"\n",
"| Root | level 1 | level 2 |\n",
"|------|---------|---------|\n",
"| A | B | D |\n",
"| A | B | E |\n",
"| A | C | F |\n",
"| A | C | G |\n",
"| Root | level 1 | level 2 | value |\n",
"|------|---------|---------|-------|\n",
"| A | B | D |1 |\n",
"| A | B | E |2 |\n",
"| A | C | F |3 |\n",
"| A | C | G |4 |\n",
"\n",
"*Table 1: Tree pattern*\n",
"\n",
Expand All @@ -101,6 +59,25 @@
"Taking these structures into account leads to significant duplication of data. In the general case, Datasets mix these different structures."
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "7469f343",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import string\n",
"import json\n",
"import xarray as xr\n",
"import pandas as pd\n",
"import numpy as np\n",
"from ntv_numpy import Xdataset\n",
"import ntv_pandas as npd\n",
"import cbor2\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"id": "4f003ae2-04f5-4f51-83a8-106189bf8fdb",
Expand Down Expand Up @@ -292,6 +269,37 @@
"Xdataset.from_xarray(xdss[2]).to_dataframe(ntv_type=False, info=False).reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "717e496b-19c1-472c-ad67-6c9ba1b1a014",
"metadata": {},
"outputs": [],
"source": [
"def file_sizes(xnd, forma=None):\n",
"    '''Return the serialized size of a dataset in each compared format.\n",
"\n",
"    Parameters\n",
"    - xnd: dataset object exposing to_dataframe() and to_json()\n",
"      (presumably an ntv_numpy Xdataset, to be confirmed against the callers)\n",
"    - forma: dict forwarded as the format argument of xnd.to_json();\n",
"      None (default) is replaced by a new empty dict\n",
"\n",
"    Returns a dict {format label: len() of the serialized output}.\n",
"    The pandas sizes are measured on a DataFrame whose rows are shuffled by\n",
"    an unseeded sample(frac=1), so the row order differs between runs.\n",
"    '''\n",
"    # a fresh dict per call: a mutable default would be shared between calls\n",
"    forma = {} if forma is None else forma\n",
"    df = xnd.to_dataframe(json_name=False, info=False).reset_index().sample(frac=1)\n",
"    jsn = xnd.to_json(notype='all', header=False, encoded=False, format=forma)\n",
"    return {'pd.to_json(values)': len(df.to_json(orient='values')),\n",
"            'pd.to_csv': len(df.to_csv()),\n",
"            'pd.to_parquet': len(df.to_parquet(engine='pyarrow')),\n",
"            'xnd.to_json': len(json.dumps(jsn)),\n",
"            'xnd.to_json(cbor)': len(cbor2.dumps(jsn))}\n",
"\n",
"def sizes_plot(sizes, titles, fig_title):\n",
"    '''Plot one bar chart per size dictionary, side by side.\n",
"\n",
"    Parameters\n",
"    - sizes: iterable of dicts {format label: size}\n",
"    - titles: iterable of subplot titles, paired with sizes by position\n",
"    - fig_title: title of the whole figure\n",
"\n",
"    In each subplot the smallest bar is red, the others blue, and every bar\n",
"    is labelled with its size as a percentage of the second entry of its dict.\n",
"    '''\n",
"    # zip() stops at the shorter input; materialize it to count the subplots\n",
"    pairs = list(zip(sizes, titles))\n",
"    nb_plots = max(len(pairs), 1)\n",
"    # one subplot per dataset instead of a fixed 4 (same figure for 4 datasets);\n",
"    # squeeze=False keeps axs two-dimensional even for a single subplot\n",
"    fig, axs = plt.subplots(1, nb_plots, figsize=(6 * nb_plots, 6), squeeze=False)\n",
"    fig.suptitle(fig_title)\n",
"    for ax, (size, title) in zip(axs[0], pairs):\n",
"        values = list(size.values())\n",
"        smallest = min(values)\n",
"        # the second entry of the dict is the 100 % reference\n",
"        reference = values[1]\n",
"        bar_colors = ['tab:red' if val == smallest else 'tab:blue' for val in values]\n",
"        percent = [str(round(val / reference * 100, 1)) + ' %' for val in values]\n",
"        bar_plt = ax.bar(list(size.keys()), values, color=bar_colors)\n",
"        ax.set_title(title)\n",
"        ax.bar_label(bar_plt, percent, label_type='center')\n",
"        ax.tick_params(axis='x', rotation=55)\n",
"    plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 92,
Expand Down

0 comments on commit b7e6ddb

Please sign in to comment.