Update test_perf.ipynb

loco-philippe · Jun 25, 2024 · b7e6ddb · b7e6ddb
1 parent 514f982
commit b7e6ddb
Showing 1 changed file with 67 additions and 59 deletions.
diff --git a/tests/test_perf.ipynb b/tests/test_perf.ipynb
@@ -1,61 +1,14 @@
 {
  "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 59,
-   "id": "7469f343",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import random\n",
-    "import string\n",
-    "import json\n",
-    "import xarray as xr\n",
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "from ntv_numpy import Xdataset\n",
-    "import ntv_pandas as npd\n",
-    "import cbor2\n",
-    "import matplotlib.pyplot as plt"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 68,
-   "id": "717e496b-19c1-472c-ad67-6c9ba1b1a014",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def file_sizes(xnd, forma={}):\n",
-    "    '''calculate the size of each format'''\n",
-    "    df = xnd.to_dataframe(json_name=False, info=False).reset_index().sample(frac=1)\n",
-    "    jsn = xnd.to_json(notype='all', header=False, encoded=False, format=forma)\n",
-    "    return {'pd.to_json(values)': len(df.to_json(orient='values')),\n",
-    "            'pd.to_csv': len(df.to_csv()),\n",
-    "            'pd.to_parquet': len(df.to_parquet(engine='pyarrow')),\n",
-    "            'xnd.to_json': len(json.dumps(jsn)),\n",
-    "            'xnd.to_json(cbor)': len(cbor2.dumps(jsn))}\n",
-    "\n",
-    "def sizes_plot(sizes, titles, fig_title):\n",
-    "    '''plot the size of some format''' \n",
-    "    fig, axs = plt.subplots(1, 4, figsize=(24, 6))\n",
-    "    fig.suptitle(fig_title)\n",
-    "    for idx, (size, title) in enumerate(zip(sizes, titles)):\n",
-    "        bar_colors = ['tab:red' if val == min(size.values()) else 'tab:blue' for val in size.values()]\n",
-    "        percent = [str(round(val / list(size.values())[1] * 100, 1)) + ' %' for val in size.values()]\n",
-    "        bar_plt = axs[idx].bar(size.keys(), size.values(), color=bar_colors)\n",
-    "        axs[idx].set_title(title)\n",
-    "        axs[idx].bar_label(bar_plt, percent, label_type='center')\n",
-    "        axs[idx].tick_params(axis='x', rotation=55)\n",
-    "    plt.show()"
-   ]
-  },
   {
    "cell_type": "markdown",
-   "id": "2d95acb2-7c8b-432e-ac23-e3d6ec04c8fe",
+   "id": "09e9962f-04a5-423b-b10d-ce431fccf085",
    "metadata": {},
    "source": [
-    "## "
+    "# Table representation\n",
+    "\n",
+    "This notebook compares the size of table representations for the most common table structures.\n",
+    "The representations used are CSV, JSON and Parquet."
    ]
   },
   {
@@ -71,19 +24,24 @@
    "id": "a9247d13-a1b0-4b33-af74-eab3c48511ac",
    "metadata": {},
    "source": [
-    "Two basic patterns are present in Datasets:\n",
+    "Tables contain two kinds of columns:\n",
+    "\n",
+    "- variables (values) : columns with unstructured data\n",
+    "- indexes (coordinates): columns with data used to select or filter variable data\n",
+    "\n",
+    "Two basic patterns are present in table indexes:\n",
     "\n",
     "- Tree pattern: A tree is represented in tabular form by a list of paths between each leaf and the node. The columns then represent the levels of the tree.\n",
     "- Multidimensional pattern: A matrix (or multidimensional data) is represented in tabular form by a column of the values of the matrix and additional columns represent the coordinates of each of the values.\n",
     "\n",
     "Table 1 presents an example of binary tree.\n",
     "\n",
-    "| Root | level 1 | level 2 |\n",
-    "|------|---------|---------|\n",
-    "| A    | B       | D       |\n",
-    "| A    | B       | E       |\n",
-    "| A    | C       | F       |\n",
-    "| A    | C       | G       |\n",
+    "| Root | level 1 | level 2 | value |\n",
+    "|------|---------|---------|-------|\n",
+    "| A    | B       | D       |1      |\n",
+    "| A    | B       | E       |2      |\n",
+    "| A    | C       | F       |3      |\n",
+    "| A    | C       | G       |4      |\n",
     "\n",
     "*Table 1: Tree pattern*\n",
     "\n",
@@ -101,6 +59,25 @@
     "Taking these structures into account leads to significant duplication of data. In the general case, Datasets mix these different structures."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "id": "7469f343",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import random\n",
+    "import string\n",
+    "import json\n",
+    "import xarray as xr\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from ntv_numpy import Xdataset\n",
+    "import ntv_pandas as npd\n",
+    "import cbor2\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "4f003ae2-04f5-4f51-83a8-106189bf8fdb",
@@ -292,6 +269,37 @@
     "Xdataset.from_xarray(xdss[2]).to_dataframe(ntv_type=False, info=False).reset_index()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "id": "717e496b-19c1-472c-ad67-6c9ba1b1a014",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def file_sizes(xnd, forma={}):\n",
+    "    '''calculate the size of each format'''\n",
+    "    df = xnd.to_dataframe(json_name=False, info=False).reset_index().sample(frac=1)\n",
+    "    jsn = xnd.to_json(notype='all', header=False, encoded=False, format=forma)\n",
+    "    return {'pd.to_json(values)': len(df.to_json(orient='values')),\n",
+    "            'pd.to_csv': len(df.to_csv()),\n",
+    "            'pd.to_parquet': len(df.to_parquet(engine='pyarrow')),\n",
+    "            'xnd.to_json': len(json.dumps(jsn)),\n",
+    "            'xnd.to_json(cbor)': len(cbor2.dumps(jsn))}\n",
+    "\n",
+    "def sizes_plot(sizes, titles, fig_title):\n",
+    "    '''plot the size of some format''' \n",
+    "    fig, axs = plt.subplots(1, 4, figsize=(24, 6))\n",
+    "    fig.suptitle(fig_title)\n",
+    "    for idx, (size, title) in enumerate(zip(sizes, titles)):\n",
+    "        bar_colors = ['tab:red' if val == min(size.values()) else 'tab:blue' for val in size.values()]\n",
+    "        percent = [str(round(val / list(size.values())[1] * 100, 1)) + ' %' for val in size.values()]\n",
+    "        bar_plt = axs[idx].bar(size.keys(), size.values(), color=bar_colors)\n",
+    "        axs[idx].set_title(title)\n",
+    "        axs[idx].bar_label(bar_plt, percent, label_type='center')\n",
+    "        axs[idx].tick_params(axis='x', rotation=55)\n",
+    "    plt.show()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 92,