diff --git a/example/example_analysis.ipynb b/example/example_analysis.ipynb
index 1a9e76c..29ad395 100644
--- a/example/example_analysis.ipynb
+++ b/example/example_analysis.ipynb
@@ -72,28 +72,33 @@
 }
 ],
 "source": [
- "# Json data model \n",
- "df_country = { \n",
- " 'country and region:$erDiagram' : { \n",
- " 'entity': {\n",
- " 'COUNTRY': [ \n",
- " ['string', 'country', 'PK' ], \n",
- " ['string', 'code', 'unique'] \n",
- " ], \n",
- " 'REGION': [ \n",
- " ['string', 'region', 'PK'],\n",
- " ['number', 'population'] \n",
- " ]\n",
+ "# Json data model\n",
+ "df_country = {\n",
+ " \"country and region:$erDiagram\": {\n",
+ " \"entity\": {\n",
+ " \"COUNTRY\": [[\"string\", \"country\", \"PK\"], [\"string\", \"code\", \"unique\"]],\n",
+ " \"REGION\": [[\"string\", \"region\", \"PK\"], [\"number\", \"population\"]],\n",
 " },\n",
- " 'relationship': [ \n",
- " [ 'REGION', 'exactly one', 'identifying', 'one or more', 'COUNTRY', 'brings_together']\n",
+ " \"relationship\": [\n",
+ " [\n",
+ " \"REGION\",\n",
+ " \"exactly one\",\n",
+ " \"identifying\",\n",
+ " \"one or more\",\n",
+ " \"COUNTRY\",\n",
+ " \"brings_together\",\n",
+ " ]\n",
 " ],\n",
- "\n",
- " } }\n",
+ " }\n",
+ "}\n",
 "\n",
 "# It is converted in Mermaid structure and then displayed\n",
 "diag = MermaidConnec.diagram(df_country)\n",
- "display(Image(url=\"https://mermaid.ink/img/\" + b64encode(diag.encode(\"ascii\")).decode(\"ascii\")))"
+ "display(\n",
+ " Image(\n",
+ " url=\"https://mermaid.ink/img/\" + b64encode(diag.encode(\"ascii\")).decode(\"ascii\")\n",
+ " )\n",
+ ")"
 ]
 },
 {
@@ -176,8 +181,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "import pandas as pd\n",
- "import ntv_pandas as npd"
+ "import pandas as pd"
 ]
 },
 {
@@ -187,10 +191,12 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "example1 = {'country' : ['France', 'Spain', 'Estonia', 'Nigeria'],\n",
- " 'region': ['European Union', 'European Union', 'European Union', 'Africa'],\n",
- " 'code': ['FR', 'ES', 'ES', 'NI'],\n",
- " 'population': [449, 48, 449, 1460]}\n",
+ "example1 = {\n",
+ " \"country\": [\"France\", \"Spain\", \"Estonia\", \"Nigeria\"],\n",
+ " \"region\": [\"European Union\", \"European Union\", \"European Union\", \"Africa\"],\n",
+ " \"code\": [\"FR\", \"ES\", \"ES\", \"NI\"],\n",
+ " \"population\": [449, 48, 449, 1460],\n",
+ "}\n",
 "pd_ex1 = pd.DataFrame(example1)"
 ]
 },
 {
@@ -222,11 +228,19 @@
 ],
 "source": [
 "ana1 = pd_ex1.npd.analysis()\n",
- "print(\"country - code (must be coupled): \", ana1.get_relation('country', 'code').typecoupl)\n",
- "print(\"region - population (must be derived True): \", ana1.get_relation('region', 'population').typecoupl, \n",
- " ana1.get_relation('region', 'population').parent_child)\n",
- "print(\"country - region (must be derived True): \", ana1.get_relation('country', 'region').typecoupl,\n",
- " ana1.get_relation('country', 'region').parent_child)"
+ "print(\n",
+ " \"country - code (must be coupled): \", ana1.get_relation(\"country\", \"code\").typecoupl\n",
+ ")\n",
+ "print(\n",
+ " \"region - population (must be derived True): \",\n",
+ " ana1.get_relation(\"region\", \"population\").typecoupl,\n",
+ " ana1.get_relation(\"region\", \"population\").parent_child,\n",
+ ")\n",
+ "print(\n",
+ " \"country - region (must be derived True): \",\n",
+ " ana1.get_relation(\"country\", \"region\").typecoupl,\n",
+ " ana1.get_relation(\"country\", \"region\").parent_child,\n",
+ ")"
 ]
 },
 {
@@ -253,8 +267,8 @@
 }
 ],
 "source": [
- "print(pd_ex1.npd.check_relation('country', 'code', 'coupled', value=True))\n",
- "print(pd_ex1.npd.check_relation('region', 'population', 'derived', value=True))"
+ "print(pd_ex1.npd.check_relation(\"country\", \"code\", \"coupled\", value=True))\n",
+ "print(pd_ex1.npd.check_relation(\"region\", \"population\", \"derived\", value=True))"
 ]
 },
 {
@@ -294,10 +308,12 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "example2 = {'country' : ['France', 'Spain', 'Estonia', 'Nigeria'],\n",
- " 'region': ['European Union', 'European Union', 'European Union', 'Africa'],\n",
- " 'code': ['FR', 'ES', 'EE', 'NI'],\n",
- " 'population': [449, 449, 449, 1460]}\n",
+ "example2 = {\n",
+ " \"country\": [\"France\", \"Spain\", \"Estonia\", \"Nigeria\"],\n",
+ " \"region\": [\"European Union\", \"European Union\", \"European Union\", \"Africa\"],\n",
+ " \"code\": [\"FR\", \"ES\", \"EE\", \"NI\"],\n",
+ " \"population\": [449, 449, 449, 1460],\n",
+ "}\n",
 "pd_ex2 = pd.DataFrame(example2)"
 ]
 },
@@ -321,11 +337,19 @@
 ],
 "source": [
 "ana2 = pd_ex2.npd.analysis()\n",
- "print(\"country - code (must be coupled): \", ana2.get_relation('country', 'code').typecoupl)\n",
- "print(\"region - population (must be derived True): \", ana2.get_relation('region', 'population').typecoupl, \n",
- " ana2.get_relation('region', 'population').parent_child)\n",
- "print(\"country - region (must be derived True): \", ana2.get_relation('country', 'region').typecoupl,\n",
- " ana2.get_relation('country', 'region').parent_child)"
+ "print(\n",
+ " \"country - code (must be coupled): \", ana2.get_relation(\"country\", \"code\").typecoupl\n",
+ ")\n",
+ "print(\n",
+ " \"region - population (must be derived True): \",\n",
+ " ana2.get_relation(\"region\", \"population\").typecoupl,\n",
+ " ana2.get_relation(\"region\", \"population\").parent_child,\n",
+ ")\n",
+ "print(\n",
+ " \"country - region (must be derived True): \",\n",
+ " ana2.get_relation(\"country\", \"region\").typecoupl,\n",
+ " ana2.get_relation(\"country\", \"region\").parent_child,\n",
+ ")"
 ]
 },
 {
diff --git a/example/example_json_pandas.ipynb b/example/example_json_pandas.ipynb
index 6d846d7..baa41dd 100644
--- a/example/example_json_pandas.ipynb
+++ b/example/example_json_pandas.ipynb
@@ -36,11 +36,9 @@
 "outputs": [],
 "source": [
 "import math\n",
- "from pprint import pprint\n",
 "from io import StringIO\n",
 "\n",
 "import pandas as pd\n",
- "from shapely.geometry import Point\n",
 "from datetime import date, datetime, time"
 ]
 },
@@ -119,7 +117,7 @@
 }
 ],
 "source": [
- "df = pd.DataFrame(pd.Series([10,20], name='test int32', dtype='Int32'))\n",
+ "df = pd.DataFrame(pd.Series([10, 20], name=\"test int32\", dtype=\"Int32\"))\n",
 "\n",
 "# dtype is not included in usual json interface\n",
 "df.to_json()"
 ]
 },
@@ -153,9 +151,9 @@
 ],
 "source": [
 "# 'int32' is lost in json-table interface\n",
- "df2 = pd.read_json(StringIO(df.to_json(orient='table')), orient='table')\n",
+ "df2 = pd.read_json(StringIO(df.to_json(orient=\"table\")), orient=\"table\")\n",
 "print(df2.dtypes)\n",
- "print('\\nis Json translation reversible ? ', df.equals(df2))"
+ "print(\"\\nis Json translation reversible ? \", df.equals(df2))"
 ]
 },
 {
@@ -187,11 +185,11 @@
 }
 ],
 "source": [
- "df = pd.DataFrame(pd.Series([10,20], name='test float64', dtype='float64'))\n",
- "print(df.dtypes, '\\n')\n",
- "df2 = pd.read_json(StringIO(df.to_json(orient='records')), orient='records')\n",
+ "df = pd.DataFrame(pd.Series([10, 20], name=\"test float64\", dtype=\"float64\"))\n",
+ "print(df.dtypes, \"\\n\")\n",
+ "df2 = pd.read_json(StringIO(df.to_json(orient=\"records\")), orient=\"records\")\n",
 "print(df2.dtypes)\n",
- "print('\\nis Json translation reversible ? ', df.equals(df2))"
+ "print(\"\\nis Json translation reversible ? \", df.equals(df2))"
 ]
 },
 {
@@ -215,11 +213,11 @@
 }
 ],
 "source": [
- "sr = pd.Series([math.nan,math.nan], name='nan')\n",
- "print(sr.dtype, '\\n')\n",
- "sr2 = pd.read_json(StringIO(sr.to_json()), typ='series')\n",
+ "sr = pd.Series([math.nan, math.nan], name=\"nan\")\n",
+ "print(sr.dtype, \"\\n\")\n",
+ "sr2 = pd.read_json(StringIO(sr.to_json()), typ=\"series\")\n",
 "print(sr2)\n",
- "print('\\nis Json translation reversible ? ', sr.equals(sr2))"
+ "print(\"\\nis Json translation reversible ? \", sr.equals(sr2))"
 ]
 },
 {
@@ -253,15 +251,15 @@
 }
 ],
 "source": [
- "dfd = pd.DataFrame({'test dates': [date(2021, 3, 1), date(2021, 3, 3)]})\n",
+ "dfd = pd.DataFrame({\"test dates\": [date(2021, 3, 1), date(2021, 3, 3)]})\n",
 "\n",
- "print(dfd.to_json(default_handler=date.isoformat), '\\n')\n",
- "print(dfd.to_json(orient='table'), '\\n')\n",
+ "print(dfd.to_json(default_handler=date.isoformat), \"\\n\")\n",
+ "print(dfd.to_json(orient=\"table\"), \"\\n\")\n",
 "\n",
- "dfd2 = pd.read_json(StringIO(dfd.to_json(orient='table')), orient='table')\n",
+ "dfd2 = pd.read_json(StringIO(dfd.to_json(orient=\"table\")), orient=\"table\")\n",
 "print(dfd2)\n",
 "\n",
- "print('\\nis Json translation reversible ? ', dfd.equals(dfd2))"
+ "print(\"\\nis Json translation reversible ? \", dfd.equals(dfd2))"
 ]
 },
 {
@@ -291,13 +289,13 @@
 }
 ],
 "source": [
- "dfd = pd.DataFrame({'test tuple': [(2021, 3, 1), (2021, 3, 3)]})\n",
- "print(dfd, '\\n')\n",
- "print(dfd.to_json(), '\\n')\n",
- "print(dfd.to_json(orient='table'), '\\n')\n",
- "dfd2 = pd.read_json(StringIO(dfd.to_json(orient='table')), orient='table')\n",
+ "dfd = pd.DataFrame({\"test tuple\": [(2021, 3, 1), (2021, 3, 3)]})\n",
+ "print(dfd, \"\\n\")\n",
+ "print(dfd.to_json(), \"\\n\")\n",
+ "print(dfd.to_json(orient=\"table\"), \"\\n\")\n",
+ "dfd2 = pd.read_json(StringIO(dfd.to_json(orient=\"table\")), orient=\"table\")\n",
 "print(dfd2)\n",
- "print('\\nis Json translation reversible ? ', dfd.equals(dfd2))"
+ "print(\"\\nis Json translation reversible ? \", dfd.equals(dfd2))"
 ]
 },
 {
@@ -323,8 +321,10 @@
 }
 ],
 "source": [
- "df = pd.DataFrame(pd.Series([10,20], name='test float', dtype='float32'), dtype='category')\n",
- "print(df.to_json(orient='table'))"
+ "df = pd.DataFrame(\n",
+ " pd.Series([10, 20], name=\"test float\", dtype=\"float32\"), dtype=\"category\"\n",
+ ")\n",
+ "print(df.to_json(orient=\"table\"))"
 ]
 },
 {
@@ -358,11 +358,15 @@
 }
 ],
 "source": [
- "df = pd.DataFrame({'test dates' : [date(2021, 10, 2), date(2021, 10, 4)],\n",
- " 'test times' : [time(10, 10, 2), time(11, 10, 4)]})\n",
- "print('CSV data :\\n', df.to_csv())\n",
- "print('JSON data :\\n', df.to_json())\n",
- "print('JSON data :\\n', df.to_json(date_format='iso'))"
+ "df = pd.DataFrame(\n",
+ " {\n",
+ " \"test dates\": [date(2021, 10, 2), date(2021, 10, 4)],\n",
+ " \"test times\": [time(10, 10, 2), time(11, 10, 4)],\n",
+ " }\n",
+ ")\n",
+ "print(\"CSV data :\\n\", df.to_csv())\n",
+ "print(\"JSON data :\\n\", df.to_json())\n",
+ "print(\"JSON data :\\n\", df.to_json(date_format=\"iso\"))"
 ]
 },
 {
@@ -404,21 +408,30 @@
 "source": [
 "import ntv_pandas as npd\n",
 "\n",
- "tab_data = {'dates': ['1964-01-01', '1985-02-05', '2022-01-21', '1964-01-01', '1985-02-05', '2022-01-21'], \n",
- " 'value': [10, 10, 20, 20, 30, 30],\n",
- " 'names': ['john', 'eric', 'judith', 'mila', 'hector', 'maria'],\n",
- " 'unique': [True, True, True, True, True, True] }\n",
+ "tab_data = {\n",
+ " \"dates\": [\n",
+ " \"1964-01-01\",\n",
+ " \"1985-02-05\",\n",
+ " \"2022-01-21\",\n",
+ " \"1964-01-01\",\n",
+ " \"1985-02-05\",\n",
+ " \"2022-01-21\",\n",
+ " ],\n",
+ " \"value\": [10, 10, 20, 20, 30, 30],\n",
+ " \"names\": [\"john\", \"eric\", \"judith\", \"mila\", \"hector\", \"maria\"],\n",
+ " \"unique\": [True, True, True, True, True, True],\n",
+ "}\n",
 "\n",
- "df = pd.DataFrame(tab_data, dtype='category')\n",
- "print(df, '\\n')\n",
+ "df = pd.DataFrame(tab_data, dtype=\"category\")\n",
+ "print(df, \"\\n\")\n",
 "\n",
 "# length with compact interface : 240\n",
 "print(npd.to_json(df, text=True))\n",
- "print(len(npd.to_json(df, text=True)), '\\n')\n",
+ "print(len(npd.to_json(df, text=True)), \"\\n\")\n",
 "\n",
 "# length with actual interface : 946\n",
- "print(df.to_json(orient='table'))\n",
- "print(len(df.to_json(orient='table')), '\\n')"
+ "print(df.to_json(orient=\"table\"))\n",
+ "print(len(df.to_json(orient=\"table\")), \"\\n\")"
 ]
 },
 {
@@ -580,33 +593,38 @@
 "source": [
 "import json\n",
 "\n",
- "data = [[{'test': [1,2,3]}, {'dtype': 'int32'}],\n",
- " [{'test': [1,2,3]}, {'dtype': 'int'}],\n",
- " [{'test': [1,2,3]}, {'dtype': 'int64'}],\n",
- " [{'test': [1,2,3]}, {'dtype': 'Int64'}],\n",
- " [{'test': [1,2,3]}, {'dtype': 'UInt64'}],\n",
- " [{'test': [1,2,3]}, {'dtype': 'float'}],\n",
- " [{'test': [1,2,3]}, {'dtype': 'Float32'}],\n",
- " [{'test': [1,2,3]}, {'dtype': 'float32'}],\n",
- " [{'test': [1,2,3]}, {'dtype': 'float64'}],\n",
- " [{'test': [1,2,3]}, {'dtype': 'Sparse[float64]'}],\n",
- " # [{'test': [pd.Interval(1,2), pd.Interval(2,3), pd.Interval(3,4)]}], # read_json ko\n",
- " [{'test': ['2020-01-01']}, {'dtype': 'datetime64[ns]'}],\n",
- " [{'test': ['2020-01-01']}, {'dtype': 'datetime64[ns, UTC]'}],\n",
- " [{'test': ['2020-01-01']}, {'dtype': 'category'}],\n",
- " [{'test': [datetime(2020, 1, 1)]}, {'dtype': 'category'}], \n",
- " [{'test': [True, False]}, {'dtype': 'boolean'}],\n",
- " [{'test': [True, False]}, {'dtype': 'bool'}],\n",
- " # [{'test': ['1 days', '2 days']}, {'dtype': 'timedelta64[ns]'}], # read_json not yet implemented\n",
- " # [{'test': ['2020-01-01', '2020-02-01', '2020-03-01']}, {'dtype': 'period[M]'}], # read_json not available\n",
- " [{'test': [True, 1, 'er', datetime(2020, 1, 1)]}, {'dtype': 'object'}],\n",
- " ]\n",
- "print('reverse (True, False), json table-schema object, reverse dtype:\\n')\n",
+ "data = [\n",
+ " [{\"test\": [1, 2, 3]}, {\"dtype\": \"int32\"}],\n",
+ " [{\"test\": [1, 2, 3]}, {\"dtype\": \"int\"}],\n",
+ " [{\"test\": [1, 2, 3]}, {\"dtype\": \"int64\"}],\n",
+ " [{\"test\": [1, 2, 3]}, {\"dtype\": \"Int64\"}],\n",
+ " [{\"test\": [1, 2, 3]}, {\"dtype\": \"UInt64\"}],\n",
+ " [{\"test\": [1, 2, 3]}, {\"dtype\": \"float\"}],\n",
+ " [{\"test\": [1, 2, 3]}, {\"dtype\": \"Float32\"}],\n",
+ " [{\"test\": [1, 2, 3]}, {\"dtype\": \"float32\"}],\n",
+ " [{\"test\": [1, 2, 3]}, {\"dtype\": \"float64\"}],\n",
+ " [{\"test\": [1, 2, 3]}, {\"dtype\": \"Sparse[float64]\"}],\n",
+ " # [{'test': [pd.Interval(1,2), pd.Interval(2,3), pd.Interval(3,4)]}], # read_json ko\n",
+ " [{\"test\": [\"2020-01-01\"]}, {\"dtype\": \"datetime64[ns]\"}],\n",
+ " [{\"test\": [\"2020-01-01\"]}, {\"dtype\": \"datetime64[ns, UTC]\"}],\n",
+ " [{\"test\": [\"2020-01-01\"]}, {\"dtype\": \"category\"}],\n",
+ " [{\"test\": [datetime(2020, 1, 1)]}, {\"dtype\": \"category\"}],\n",
+ " [{\"test\": [True, False]}, {\"dtype\": \"boolean\"}],\n",
+ " [{\"test\": [True, False]}, {\"dtype\": \"bool\"}],\n",
+ " # [{'test': ['1 days', '2 days']}, {'dtype': 'timedelta64[ns]'}], # read_json not yet implemented\n",
+ " # [{'test': ['2020-01-01', '2020-02-01', '2020-03-01']}, {'dtype': 'period[M]'}], # read_json not available\n",
+ " [{\"test\": [True, 1, \"er\", datetime(2020, 1, 1)]}, {\"dtype\": \"object\"}],\n",
+ "]\n",
+ "print(\"reverse (True, False), json table-schema object, reverse dtype:\\n\")\n",
 "for df_data in data:\n",
- " df = pd.DataFrame(df_data[0], **df_data[1]) if len(df_data) == 2 else pd.DataFrame(df_data[0])\n",
- " js = df.to_json(orient='table')\n",
- " df2 = pd.read_json(StringIO(js), orient='table')\n",
- " print(df.equals(df2), json.loads(js)['schema']['fields'][1], df2.dtypes.iloc[0])"
+ " df = (\n",
+ " pd.DataFrame(df_data[0], **df_data[1])\n",
+ " if len(df_data) == 2\n",
+ " else pd.DataFrame(df_data[0])\n",
+ " )\n",
+ " js = df.to_json(orient=\"table\")\n",
+ " df2 = pd.read_json(StringIO(js), orient=\"table\")\n",
+ " print(df.equals(df2), json.loads(js)[\"schema\"][\"fields\"][1], df2.dtypes.iloc[0])"
 ]
 }
 ],
diff --git a/example/example_multidimensional.ipynb b/example/example_multidimensional.ipynb
index b4b0177..d1b52af 100644
--- a/example/example_multidimensional.ipynb
+++ b/example/example_multidimensional.ipynb
@@ -36,19 +36,48 @@
 "outputs": [],
 "source": [
 "import pandas as pd\n",
- "import ntv_pandas as npd # activate pandas npd accessor\n",
+ "import ntv_pandas as npd  # activate pandas npd accessor\n",
 "\n",
- "fruits = {'plants': ['fruit', 'fruit', 'fruit', 'fruit', 'vegetable', 'vegetable', 'vegetable', 'vegetable'],\n",
- " 'plts': ['fr', 'fr', 'fr', 'fr', 've', 've', 've', 've'], \n",
- " 'quantity': ['1 kg', '10 kg', '1 kg', '10 kg', '1 kg', '10 kg', '1 kg', '10 kg'],\n",
- " 'product': ['apple', 'apple', 'orange', 'orange', 'peppers', 'peppers', 'carrot', 'carrot'],\n",
- " 'price': [1, 10, 2, 20, 1.5, 15, 1.5, 20],\n",
- " 'price level': ['low', 'low', 'high', 'high', 'low', 'low', 'high', 'high'],\n",
- " 'group': ['fruit 1', 'fruit 10', 'fruit 1', 'veget', 'veget', 'veget', 'veget', 'veget'],\n",
- " 'id': [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008],\n",
- " 'supplier': [\"sup1\", \"sup1\", \"sup1\", \"sup2\", \"sup2\", \"sup2\", \"sup2\", \"sup1\"],\n",
- " 'location': [\"fr\", \"gb\", \"es\", \"ch\", \"gb\", \"fr\", \"es\", \"ch\"],\n",
- " 'valid': [\"ok\", \"ok\", \"ok\", \"ok\", \"ok\", \"ok\", \"ok\", \"ok\"]}\n",
+ "fruits = {\n",
+ " \"plants\": [\n",
+ " \"fruit\",\n",
+ " \"fruit\",\n",
+ " \"fruit\",\n",
+ " \"fruit\",\n",
+ " \"vegetable\",\n",
+ " \"vegetable\",\n",
+ " \"vegetable\",\n",
+ " \"vegetable\",\n",
+ " ],\n",
+ " \"plts\": [\"fr\", \"fr\", \"fr\", \"fr\", \"ve\", \"ve\", \"ve\", \"ve\"],\n",
+ " \"quantity\": [\"1 kg\", \"10 kg\", \"1 kg\", \"10 kg\", \"1 kg\", \"10 kg\", \"1 kg\", \"10 kg\"],\n",
+ " \"product\": [\n",
+ " \"apple\",\n",
+ " \"apple\",\n",
+ " \"orange\",\n",
+ " \"orange\",\n",
+ " \"peppers\",\n",
+ " \"peppers\",\n",
+ " \"carrot\",\n",
+ " \"carrot\",\n",
+ " ],\n",
+ " \"price\": [1, 10, 2, 20, 1.5, 15, 1.5, 20],\n",
+ " \"price level\": [\"low\", \"low\", \"high\", \"high\", \"low\", \"low\", \"high\", \"high\"],\n",
+ " \"group\": [\n",
+ " \"fruit 1\",\n",
+ " \"fruit 10\",\n",
+ " \"fruit 1\",\n",
+ " \"veget\",\n",
+ " \"veget\",\n",
+ " \"veget\",\n",
+ " \"veget\",\n",
+ " \"veget\",\n",
+ " ],\n",
+ " \"id\": [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008],\n",
+ " \"supplier\": [\"sup1\", \"sup1\", \"sup1\", \"sup2\", \"sup2\", \"sup2\", \"sup2\", \"sup1\"],\n",
+ " \"location\": [\"fr\", \"gb\", \"es\", \"ch\", \"gb\", \"fr\", \"es\", \"ch\"],\n",
+ " \"valid\": [\"ok\", \"ok\", \"ok\", \"ok\", \"ok\", \"ok\", \"ok\", \"ok\"],\n",
+ "}\n",
 "df_fruits = pd.DataFrame(fruits)"
 ]
 },
@@ -169,7 +198,7 @@
 }
 ],
 "source": [
- "ana_fruits.field_partition() # first partition"
+ "ana_fruits.field_partition()  # first partition"
 ]
 },
 {
@@ -200,7 +229,7 @@
 }
 ],
 "source": [
- "ana_fruits.relation_partition() # first partition"
+ "ana_fruits.relation_partition()  # first partition"
 ]
 },
 {
@@ -227,7 +256,7 @@
 }
 ],
 "source": [
- "ana_fruits.field_partition(partition=['product', 'quantity'])"
+ "ana_fruits.field_partition(partition=[\"product\", \"quantity\"])"
 ]
 },
 {
@@ -258,7 +287,7 @@
 }
 ],
 "source": [
- "ana_fruits.relation_partition(partition=['product', 'quantity'])"
+ "ana_fruits.relation_partition(partition=[\"product\", \"quantity\"])"
 ]
 },
 {
@@ -296,7 +325,15 @@
 "source": [
 "from base64 import b64encode\n",
 "from IPython.display import Image, display\n",
- "display(Image(url=\"https://mermaid.ink/img/\" + b64encode(open('fruits.mmd', 'r', encoding=\"utf-8\").read().encode(\"ascii\")).decode(\"ascii\")))"
+ "\n",
+ "display(\n",
+ " Image(\n",
+ " url=\"https://mermaid.ink/img/\"\n",
+ " + b64encode(\n",
+ " open(\"fruits.mmd\", \"r\", encoding=\"utf-8\").read().encode(\"ascii\")\n",
+ " ).decode(\"ascii\")\n",
+ " )\n",
+ ")"
 ]
 },
 {
@@ -740,7 +777,11 @@
 }
 ],
 "source": [
- "kwargs = {'dims':['plants', 'quantity', 'price level'], 'info': False, 'ntv_type': False}\n",
+ "kwargs = {\n",
+ " \"dims\": [\"plants\", \"quantity\", \"price level\"],\n",
+ " \"info\": False,\n",
+ " \"ntv_type\": False,\n",
+ "}\n",
 "\n",
 "xd_fruits_1 = df_fruits.npd.to_xarray(**kwargs)\n",
 "xd_fruits_1"
 ]
 },
@@ -752,9 +793,7 @@
 "id": "46ae0c11-2612-42e4-b6ab-135647762cf9",
 "metadata": {},
 "outputs": [],
- "source": [
- "import ntv_numpy # activate xarray nxr accessor"
- ]
+ "source": []
 },
 {
 "cell_type": "code",
@@ -774,9 +813,15 @@
 }
 ],
 "source": [
- "df_fruits_xd = xd_fruits_1.nxr.to_dataframe(ntv_type=False) # identical as: df_fruits_xd = npd.from_xarray(ntv_type=False)\n",
+ "df_fruits_xd = xd_fruits_1.nxr.to_dataframe(\n",
+ " ntv_type=False\n",
+ ") # identical as: df_fruits_xd = npd.from_xarray(ntv_type=False)\n",
 "\n",
- "df_fruits_xd_sort = df_fruits_xd.reset_index()[list(df_fruits.columns)].sort_values(list(df_fruits.columns)).reset_index(drop=True)\n",
+ "df_fruits_xd_sort = (\n",
+ " df_fruits_xd.reset_index()[list(df_fruits.columns)]\n",
+ " .sort_values(list(df_fruits.columns))\n",
+ " .reset_index(drop=True)\n",
+ ")\n",
 "df_fruits_sort = df_fruits.sort_values(list(df_fruits.columns)).reset_index(drop=True)\n",
 "\n",
 "df_fruits_xd_sort.equals(df_fruits_sort)"
 ]
 },
@@ -867,7 +912,11 @@
 "source": [
 "df_fruits_sc = npd.from_scipp(sc_fruits_1, ntv_type=False)\n",
 "\n",
- "df_fruits_sc_sort = df_fruits_sc.reset_index()[list(df_fruits.columns)].sort_values(list(df_fruits.columns)).reset_index(drop=True)\n",
+ "df_fruits_sc_sort = (\n",
+ " df_fruits_sc.reset_index()[list(df_fruits.columns)]\n",
+ " .sort_values(list(df_fruits.columns))\n",
+ " .reset_index(drop=True)\n",
+ ")\n",
 "df_fruits_sort = df_fruits.sort_values(list(df_fruits.columns)).reset_index(drop=True)\n",
 "\n",
 "df_fruits_sc_sort.equals(df_fruits_sort)"
 ]
 },
@@ -1308,7 +1357,7 @@
 }
 ],
 "source": [
- "kwargs = {'dims':['product', 'quantity'], 'info': False, 'ntv_type': False}\n",
+ "kwargs = {\"dims\": [\"product\", \"quantity\"], \"info\": False, \"ntv_type\": False}\n",
 "\n",
 "xd_fruits_2 = df_fruits.npd.to_xarray(**kwargs)\n",
 "xd_fruits_2"
 ]
 },
@@ -1332,9 +1381,15 @@
 }
 ],
 "source": [
- "df_fruits_xd = xd_fruits_2.nxr.to_dataframe(ntv_type=False) # or npd.from_xarray(xd_fruits_2, ntv_type=False)\n",
+ "df_fruits_xd = xd_fruits_2.nxr.to_dataframe(\n",
+ " ntv_type=False\n",
+ ") # or npd.from_xarray(xd_fruits_2, ntv_type=False)\n",
 "\n",
- "df_fruits_xd_sort = df_fruits_xd.reset_index()[list(df_fruits.columns)].sort_values(list(df_fruits.columns)).reset_index(drop=True)\n",
+ "df_fruits_xd_sort = (\n",
+ " df_fruits_xd.reset_index()[list(df_fruits.columns)]\n",
+ " .sort_values(list(df_fruits.columns))\n",
+ " .reset_index(drop=True)\n",
+ ")\n",
 "df_fruits_sort = df_fruits.sort_values(list(df_fruits.columns)).reset_index(drop=True)\n",
 "\n",
 "df_fruits_xd_sort.equals(df_fruits_sort)"
 ]
 },
@@ -1423,7 +1478,11 @@
 "source": [
 "df_fruits_sc = npd.from_scipp(sc_fruits_2, ntv_type=False)\n",
 "\n",
- "df_fruits_sc_sort = df_fruits_sc.reset_index()[list(df_fruits.columns)].sort_values(list(df_fruits.columns)).reset_index(drop=True)\n",
+ "df_fruits_sc_sort = (\n",
+ " df_fruits_sc.reset_index()[list(df_fruits.columns)]\n",
+ " .sort_values(list(df_fruits.columns))\n",
+ " .reset_index(drop=True)\n",
+ ")\n",
 "df_fruits_sort = df_fruits.sort_values(list(df_fruits.columns)).reset_index(drop=True)\n",
 "\n",
 "df_fruits_sc_sort.equals(df_fruits_sort)"
 ]
 },
@@ -1466,9 +1525,9 @@
 }
 ],
 "source": [
- "print(ana_fruits.get_relation('plants', 'plts').typecoupl)\n",
- "print(ana_fruits.get_relation('plants', 'product').typecoupl)\n",
- "print(ana_fruits.get_relation('quantity', 'product').typecoupl)\n"
+ "print(ana_fruits.get_relation(\"plants\", \"plts\").typecoupl)\n",
+ "print(ana_fruits.get_relation(\"plants\", \"product\").typecoupl)\n",
+ "print(ana_fruits.get_relation(\"quantity\", \"product\").typecoupl)"
 ]
 },
 {
@@ -1499,11 +1558,11 @@
 }
 ],
 "source": [
- "print('minimum distance: ', ana_fruits.get_relation('plants', 'plts').distance)\n",
- "print('maximum distance: ', ana_fruits.get_relation('id', 'valid').distance)\n",
- "print('intermediate distance: ', ana_fruits.get_relation('plants', 'product').distance)\n",
- "# The 'plants' - 'product' relationship will be 'coupled' if we change, for example, \n",
- "#'fruit-orange' in 'citrus-orange' and 'carrot-vegetable' in 'carrot-root vegetable' (2 changes) "
+ "print(\"minimum distance: \", ana_fruits.get_relation(\"plants\", \"plts\").distance)\n",
+ "print(\"maximum distance: \", ana_fruits.get_relation(\"id\", \"valid\").distance)\n",
+ "print(\"intermediate distance: \", ana_fruits.get_relation(\"plants\", \"product\").distance)\n",
+ "# The 'plants' - 'product' relationship will be 'coupled' if we change, for example,\n",
+ "#'fruit-orange' in 'citrus-orange' and 'carrot-vegetable' in 'carrot-root vegetable' (2 changes)"
 ]
 },
 {
@@ -1538,7 +1597,12 @@
 ],
 "source": [
 "# list of categories for each Field\n",
- "print({field.idfield: category for field, category in zip(ana_fruits.fields, ana_fruits.category)})"
+ "print(\n",
+ " {\n",
+ " field.idfield: category\n",
+ " for field, category in zip(ana_fruits.fields, ana_fruits.category)\n",
+ " }\n",
+ ")"
 ]
 },
 {
diff --git a/example/example_ntv_pandas.ipynb b/example/example_ntv_pandas.ipynb
index 99eed8a..1052360 100644
--- a/example/example_ntv_pandas.ipynb
+++ b/example/example_ntv_pandas.ipynb
@@ -52,7 +52,7 @@
 "\n",
 "import pandas as pd\n",
 "import ntv_pandas as npd\n",
- "from shapely.geometry import Point, Polygon\n",
+ "from shapely.geometry import Point\n",
 "from json_ntv import Ntv\n",
 "from datetime import date, datetime, time"
 ]
 },
@@ -201,16 +201,31 @@
 }
 ],
 "source": [
- "tab_data = {'index': [100, 200, 300, 400, 500, 600],\n",
- " 'dates::date': pd.Series([date(1964,1,1), date(1985,2,5), date(2022,1,21), date(1964,1,1), \n",
- " date(1985,2,5), date(2022,1,21)], dtype='category'), \n",
- " 'valid': [True, False, True, True, False, True],\n",
- " 'value32': pd.Series([12, 12, 22, 22, 32, 32], dtype='int32'),\n",
- " '::month': [1, 2, 1, 1, 2, 1],\n",
- " 'coord::point': pd.Series([Point(1,2), Point(3,4), Point(5,6), Point(7,8), Point(3,4), Point(5,6)]),\n",
- " 'names': pd.Series(['john', 'eric', 'judith', 'mila', 'hector', 'maria'], dtype='string'),\n",
- " 'unique::year': 2021 }\n",
- "df = pd.DataFrame(tab_data).set_index('index')\n",
+ "tab_data = {\n",
+ " \"index\": [100, 200, 300, 400, 500, 600],\n",
+ " \"dates::date\": pd.Series(\n",
+ " [\n",
+ " date(1964, 1, 1),\n",
+ " date(1985, 2, 5),\n",
+ " date(2022, 1, 21),\n",
+ " date(1964, 1, 1),\n",
+ " date(1985, 2, 5),\n",
+ " date(2022, 1, 21),\n",
+ " ],\n",
+ " dtype=\"category\",\n",
+ " ),\n",
+ " \"valid\": [True, False, True, True, False, True],\n",
+ " \"value32\": pd.Series([12, 12, 22, 22, 32, 32], dtype=\"int32\"),\n",
+ " \"::month\": [1, 2, 1, 1, 2, 1],\n",
+ " \"coord::point\": pd.Series(\n",
+ " [Point(1, 2), Point(3, 4), Point(5, 6), Point(7, 8), Point(3, 4), Point(5, 6)]\n",
+ " ),\n",
+ " \"names\": pd.Series(\n",
+ " [\"john\", \"eric\", \"judith\", \"mila\", \"hector\", \"maria\"], dtype=\"string\"\n",
+ " ),\n",
+ " \"unique::year\": 2021,\n",
+ "}\n",
+ "df = pd.DataFrame(tab_data).set_index(\"index\")\n",
 "df"
 ]
 },
@@ -381,7 +396,7 @@
 ],
 "source": [
 "df_from_json = npd.read_json(df_to_json)\n",
- "print('df created from JSON-NTV is equal to initial df ? ', df_from_json.equals(df))\n",
+ "print(\"df created from JSON-NTV is equal to initial df ? \", df_from_json.equals(df))\n",
 "df_from_json"
 ]
 },
@@ -427,13 +442,16 @@
 }
 ],
 "source": [
- "field_data = {'value': [1, 2, 3]}\n",
- "sr = npd.read_json({':field': field_data})\n",
+ "field_data = {\"value\": [1, 2, 3]}\n",
+ "sr = npd.read_json({\":field\": field_data})\n",
 "# pandas dtype conform to Ntv type\n",
- "print('pandas object :\\n' + str(sr))\n",
- "print('\\nJson representation : \\n ', sr.npd.to_json())\n",
- "print('\\nis Json translation reversible ? ', sr.equals(npd.read_json(sr.npd.to_json())))\n",
- "print('\\nis pandas translation reversible ? ', json.dumps(sr.npd.to_json()) == json.dumps({':field': field_data}))"
+ "print(\"pandas object :\\n\" + str(sr))\n",
+ "print(\"\\nJson representation : \\n \", sr.npd.to_json())\n",
+ "print(\"\\nis Json translation reversible ? \", sr.equals(npd.read_json(sr.npd.to_json())))\n",
+ "print(\n",
+ " \"\\nis pandas translation reversible ? \",\n",
+ " json.dumps(sr.npd.to_json()) == json.dumps({\":field\": field_data}),\n",
+ ")"
 ]
 },
 {
@@ -468,12 +486,12 @@
 }
 ],
 "source": [
- "field_data = {'dates::datetime': ['1964-01-01', '1985-02-05', '2022-01-21']}\n",
- "sr = npd.read_json({':field': field_data})\n",
+ "field_data = {\"dates::datetime\": [\"1964-01-01\", \"1985-02-05\", \"2022-01-21\"]}\n",
+ "sr = npd.read_json({\":field\": field_data})\n",
 "# pandas dtype conform to Ntv type\n",
- "print('pandas object :\\n' + str(sr))\n",
- "print('\\nJson representation : \\n ', sr.npd.to_json())\n",
- "print('\\nis Json translation reversible ? ', sr.equals(npd.read_json(sr.npd.to_json())))"
+ "print(\"pandas object :\\n\" + str(sr))\n",
+ "print(\"\\nJson representation : \\n \", sr.npd.to_json())\n",
+ "print(\"\\nis Json translation reversible ? \", sr.equals(npd.read_json(sr.npd.to_json())))"
 ]
 },
 {
@@ -510,13 +528,16 @@
 }
 ],
 "source": [
- "field_data = {'dates::date': ['1964-01-01', '1985-02-05', '2022-01-21']}\n",
- "sr = npd.read_json({':field': field_data})\n",
+ "field_data = {\"dates::date\": [\"1964-01-01\", \"1985-02-05\", \"2022-01-21\"]}\n",
+ "sr = npd.read_json({\":field\": field_data})\n",
 "# pandas dtype conform to Ntv type\n",
- "print('pandas object :\\n' + str(sr))\n",
- "print('\\nJson representation : \\n ', sr.npd.to_json())\n",
- "print('\\nis Json translation reversible ? ', sr.equals(npd.read_json(sr.npd.to_json())))\n",
- "print('\\nis pandas translation reversible ? ', json.dumps(sr.npd.to_json()) == json.dumps({':field': field_data}))"
+ "print(\"pandas object :\\n\" + str(sr))\n",
+ "print(\"\\nJson representation : \\n \", sr.npd.to_json())\n",
+ "print(\"\\nis Json translation reversible ? \", sr.equals(npd.read_json(sr.npd.to_json())))\n",
+ "print(\n",
+ " \"\\nis pandas translation reversible ? \",\n",
+ " json.dumps(sr.npd.to_json()) == json.dumps({\":field\": field_data}),\n",
+ ")"
 ]
 },
 {
@@ -543,12 +564,12 @@
 }
 ],
 "source": [
- "field_data = {'coord::point': [[1,2], [3,4], [5,6]]}\n",
- "sr = npd.read_json({':field': field_data})\n",
+ "field_data = {\"coord::point\": [[1, 2], [3, 4], [5, 6]]}\n",
+ "sr = npd.read_json({\":field\": field_data})\n",
 "# pandas dtype conform to Ntv type\n",
- "print('pandas object :\\n' + str(sr))\n",
- "print('\\nJson representation : \\n ', sr.npd.to_json())\n",
- "print('\\nis Json translation reversible ? ', sr.equals(npd.read_json(sr.npd.to_json())))"
+ "print(\"pandas object :\\n\" + str(sr))\n",
+ "print(\"\\nJson representation : \\n \", sr.npd.to_json())\n",
+ "print(\"\\nis Json translation reversible ? \", sr.equals(npd.read_json(sr.npd.to_json())))"
 ]
 },
 {
@@ -589,12 +610,15 @@
 ],
 "source": [
 "field_data = {\"integer\": [[1, 2], [0, 1, 1, 0]]}\n",
- "sr = npd.read_json({':field': field_data})\n",
+ "sr = npd.read_json({\":field\": field_data})\n",
 "# pandas dtype conform to Ntv type\n",
- "print('pandas object :\\n' + str(sr))\n",
- "print('\\nJson representation : \\n ', sr.npd.to_json())\n",
- "print('\\nis Json translation reversible ? ', sr.equals(npd.read_json(sr.npd.to_json())))\n",
- "print('\\nis pandas translation reversible ? ', json.dumps(sr.npd.to_json()) == json.dumps({':field': field_data}))"
+ "print(\"pandas object :\\n\" + str(sr))\n",
+ "print(\"\\nJson representation : \\n \", sr.npd.to_json())\n",
+ "print(\"\\nis Json translation reversible ? \", sr.equals(npd.read_json(sr.npd.to_json())))\n",
+ "print(\n",
+ " \"\\nis pandas translation reversible ? \",\n",
+ " json.dumps(sr.npd.to_json()) == json.dumps({\":field\": field_data}),\n",
+ ")"
 ]
 },
 {
@@ -623,12 +647,14 @@
 }
 ],
 "source": [
- "field_data = {'dates': [{'::date': ['1964-01-01', '1985-02-05', '2022-01-21']}, [0, 1, 0, 2]]}\n",
- "sr = npd.read_json({':field': field_data})\n",
+ "field_data = {\n",
+ " \"dates\": [{\"::date\": [\"1964-01-01\", \"1985-02-05\", \"2022-01-21\"]}, [0, 1, 0, 2]]\n",
+ "}\n",
+ "sr = npd.read_json({\":field\": field_data})\n",
 "# pandas dtype conform to Ntv type\n",
- "print('pandas object :\\n' + str(sr))\n",
- "print('\\nJson representation : \\n ', sr.npd.to_json())\n",
- "print('\\nis Json translation reversible ? ', sr.equals(npd.read_json(sr.npd.to_json())))"
+ "print(\"pandas object :\\n\" + str(sr))\n",
+ "print(\"\\nJson representation : \\n \", sr.npd.to_json())\n",
+ "print(\"\\nis Json translation reversible ? \", sr.equals(npd.read_json(sr.npd.to_json())))"
 ]
 },
 {
@@ -659,13 +685,16 @@
 }
 ],
 "source": [
- "field_data = {'test_array': [{'::array': [[1,2], [3,4], [5,6]]}, [0, 1, 0, 2]]}\n",
- "sr = npd.read_json({':field': field_data})\n",
+ "field_data = {\"test_array\": [{\"::array\": [[1, 2], [3, 4], [5, 6]]}, [0, 1, 0, 2]]}\n",
+ "sr = npd.read_json({\":field\": field_data})\n",
 "# pandas dtype conform to Ntv type\n",
- "print('pandas object :\\n' + str(sr))\n",
- "print('\\nJson representation : \\n ', sr.npd.to_json())\n",
- "print('\\nis Json translation reversible ? ', sr.equals(npd.read_json(sr.npd.to_json())))\n",
- "print('\\nis pandas translation reversible ? ', json.dumps(sr.npd.to_json()) == json.dumps({':field': field_data}))"
+ "print(\"pandas object :\\n\" + str(sr))\n",
+ "print(\"\\nJson representation : \\n \", sr.npd.to_json())\n",
+ "print(\"\\nis Json translation reversible ? \", sr.equals(npd.read_json(sr.npd.to_json())))\n",
+ "print(\n",
+ " \"\\nis pandas translation reversible ? \",\n",
+ " json.dumps(sr.npd.to_json()) == json.dumps({\":field\": field_data}),\n",
+ ")"
 ]
 },
 {
@@ -716,10 +745,10 @@
 "source": [
 "df = pd.DataFrame({\"A\": list(\"abca\"), \"B\": list(\"bccd\")})\n",
 "\n",
- "print('pandas dtype :\\n' + str(df.dtypes))\n",
- "print('\\npandas object :\\n' + str(df))\n",
- "print('\\nJson representation : \\n ', df.npd.to_json())\n",
- "print('\\nis Json translation reversible ? ', df.equals(npd.read_json(df.npd.to_json())))"
+ "print(\"pandas dtype :\\n\" + str(df.dtypes))\n",
+ "print(\"\\npandas object :\\n\" + str(df))\n",
+ "print(\"\\nJson representation : \\n \", df.npd.to_json())\n",
+ "print(\"\\nis Json translation reversible ? \", df.equals(npd.read_json(df.npd.to_json())))"
 ]
 },
 {
@@ -778,20 +807,29 @@
 }
 ],
 "source": [
- "tab_data = {'index': [100, 200, 300, 400, 500, 600],\n",
- " 'dates::date': ['1964-01-01', '1985-02-05', '2022-01-21', '1964-01-01', '1985-02-05', '2022-01-21'], \n",
- " 'value': [10, 10, 20, 20, 30, 30],\n",
- " 'value32::int32': [12, 12, 22, 22, 32, 32],\n",
- " 'res': [10, 20, 30, 10, 20, 30],\n",
- " 'coord::point': [[1,2], [3,4], [5,6], [7,8], [3,4], [5,6]],\n",
- " 'names::string': ['john', 'eric', 'judith', 'mila', 'hector', 'maria'],\n",
- " 'unique': True }\n",
- "df = npd.read_json({':tab': tab_data})\n",
- "print('pandas dtype :\\n' + str(df.dtypes))\n",
- "print('\\npandas object :\\n' + str(df))\n",
- "print('\\nJson representation :')\n",
+ "tab_data = {\n",
+ " \"index\": [100, 200, 300, 400, 500, 600],\n",
+ " \"dates::date\": [\n",
+ " \"1964-01-01\",\n",
+ " \"1985-02-05\",\n",
+ " \"2022-01-21\",\n",
+ " \"1964-01-01\",\n",
+ " \"1985-02-05\",\n",
+ " \"2022-01-21\",\n",
+ " ],\n",
+ " \"value\": [10, 10, 20, 20, 30, 30],\n",
+ " \"value32::int32\": [12, 12, 22, 22, 32, 32],\n",
+ " \"res\": [10, 20, 30, 10, 20, 30],\n",
+ " \"coord::point\": [[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]],\n",
+ " \"names::string\": [\"john\", \"eric\", \"judith\", \"mila\", \"hector\", \"maria\"],\n",
+ " \"unique\": True,\n",
+ "}\n",
+ "df = npd.read_json({\":tab\": tab_data})\n",
+ "print(\"pandas dtype :\\n\" + str(df.dtypes))\n",
+ "print(\"\\npandas object :\\n\" + str(df))\n",
+ "print(\"\\nJson representation :\")\n",
 "pprint(df.npd.to_json(), width=140)\n",
- "print('\\nis Json translation reversible ? ', df.equals(npd.read_json(df.npd.to_json())))"
+ "print(\"\\nis Json translation reversible ? \", df.equals(npd.read_json(df.npd.to_json())))"
 ]
 },
 {
@@ -817,17 +855,33 @@
 }
 ],
 "source": [
- "tab_data = {'index': [100, 200, 300, 400, 500, 600],\n",
- " 'dates::date': ['1964-01-01', '1985-02-05', '2022-01-21', '1964-01-01', '1985-02-05', '2022-01-21'], \n",
- " 'value': [10, 10, 20, 20, {'valid?': 30}, 30],\n",
- " 'value32::int32': [12, 12, 22, 22, 32, 32],\n",
- " 'res': {'res1': 10, 'res2': 20, 'res3': 30, 'res4': 10, 'res5': 20, 'res6': 30},\n",
- " 'coord::point': [[1,2], [3,4], [5,6], [7,8], {'same as 2nd point': [3,4]}, [5,6]],\n",
- " 'names::string': ['john', 'eric', 'judith', 'mila', 'hector', 'maria'],\n",
- " 'unique': True }\n",
+ "tab_data = {\n",
+ " \"index\": [100, 200, 300, 400, 500, 600],\n",
+ " \"dates::date\": [\n",
+ " \"1964-01-01\",\n",
+ " \"1985-02-05\",\n",
+ " \"2022-01-21\",\n",
+ " \"1964-01-01\",\n",
+ " \"1985-02-05\",\n",
+ " \"2022-01-21\",\n",
+ " ],\n",
+ " \"value\": [10, 10, 20, 20, {\"valid?\": 30}, 30],\n",
+ " \"value32::int32\": [12, 12, 22, 22, 32, 32],\n",
+ " \"res\": {\"res1\": 10, \"res2\": 20, \"res3\": 30, \"res4\": 10, \"res5\": 20, \"res6\": 30},\n",
+ " \"coord::point\": [\n",
+ " [1, 2],\n",
+ " [3, 4],\n",
+ " [5, 6],\n",
+ " [7, 8],\n",
+ " {\"same as 2nd point\": [3, 4]},\n",
+ " [5, 6],\n",
+ " ],\n",
+ " \"names::string\": [\"john\", \"eric\", \"judith\", \"mila\", \"hector\", \"maria\"],\n",
+ " \"unique\": True,\n",
+ "}\n",
 "\n",
- "df2 = npd.read_json({':tab': tab_data}, annotated=True)\n",
- "print('is DataFrame identical ? ', df.equals(df2))"
+ "df2 = npd.read_json({\":tab\": tab_data}, annotated=True)\n",
+ "print(\"is DataFrame identical ? \", df.equals(df2))"
 ]
 },
 {
@@ -870,10 +924,10 @@
 "source": [
 "df = pd.DataFrame({\"A\": list(\"abca\"), \"B\": list(\"bccd\")}, dtype=\"category\")\n",
 "\n",
- "print('pandas dtype :\\n' + str(df.dtypes))\n",
- "print('\\npandas object :\\n' + str(df))\n",
- "print('\\nJson representation : \\n ', df.npd.to_json())\n",
- "print('\\nis Json translation reversible ? ', df.equals(npd.read_json(df.npd.to_json())))"
+ "print(\"pandas dtype :\\n\" + str(df.dtypes))\n",
+ "print(\"\\npandas object :\\n\" + str(df))\n",
+ "print(\"\\nJson representation : \\n \", df.npd.to_json())\n",
+ "print(\"\\nis Json translation reversible ? \", df.equals(npd.read_json(df.npd.to_json())))"
 ]
 },
 {
@@ -920,21 +974,33 @@
 }
 ],
 "source": [
- "tab_data = {'index': [100, 200, 300, 400, 500, 600],\n",
- " 'dates': [{'::date': ['1964-01-01', '1985-02-05', '2022-01-21']}, [0, 1, 2, 0, 1, 2]],\n",
- " 'value': [[10, 20, {'valid?': 30}], [0, 0, 1, 1, 2, 2]],\n",
- " 'value32::int32': [12, 12, 22, 22, 32, 32],\n",
- " 'res': {'res1': 10, 'res2': 20, 'res3': 30, 'res4': 10, 'res5': 20, 'res6': 30},\n",
- " 'coord::point': [[1,2], [3,4], [5,6], [7,8], {'same as 2nd point': [3,4]}, [5,6]],\n",
- " 'names::string': ['john', 'eric', 'judith', 'mila', 'hector', 'maria'],\n",
- " 'unique:boolean': True }\n",
+ "tab_data = {\n",
+ " \"index\": [100, 200, 300, 400, 500, 600],\n",
+ " \"dates\": [\n",
+ " {\"::date\": [\"1964-01-01\", \"1985-02-05\", \"2022-01-21\"]},\n",
+ " [0, 1, 2, 0, 1, 2],\n",
+ " ],\n",
+ " \"value\": [[10, 20, {\"valid?\": 30}], [0, 0, 1, 1, 2, 2]],\n",
+ " \"value32::int32\": [12, 12, 22, 22, 32, 32],\n",
+ " \"res\": {\"res1\": 10, \"res2\": 20, \"res3\": 30, \"res4\": 10, \"res5\": 20, \"res6\": 30},\n",
+ " \"coord::point\": [\n",
+ " [1, 2],\n",
+ " [3, 4],\n",
+ " [5, 6],\n",
+ " [7, 8],\n",
+ " {\"same as 2nd point\": [3, 4]},\n",
+ " [5, 6],\n",
+ " ],\n",
+ " \"names::string\": [\"john\", \"eric\", \"judith\", \"mila\", \"hector\", \"maria\"],\n",
+ " \"unique:boolean\": True,\n",
+ "}\n",
 "\n",
- "df = npd.read_json({':tab': tab_data}, annotated=True)\n",
- "print('pandas dtype :\\n' + str(df.dtypes))\n",
- "print('\\npandas object :\\n' + str(df))\n",
- "print('\\nJson representation :')\n",
+ "df = npd.read_json({\":tab\": tab_data}, annotated=True)\n",
+ "print(\"pandas dtype :\\n\" + str(df.dtypes))\n",
+ "print(\"\\npandas object :\\n\" + str(df))\n",
+ "print(\"\\nJson representation :\")\n",
 "pprint(df.npd.to_json(), width=140)\n",
- "print('\\nis Json translation reversible ? ', df.equals(npd.read_json(df.npd.to_json())))"
+ "print(\"\\nis Json translation reversible ? \", df.equals(npd.read_json(df.npd.to_json())))"
 ]
 },
 {
@@ -979,22 +1045,43 @@
 }
 ],
 "source": [
- "index = pd.Series([100, 200, 300, 400, 500, 600])\n",
- "dates = pd.Series(name='dates::date', data=[date(1964, 1, 1), date(1985, 2, 5), date(2022, 1, 21), date(1964, 1, 1),\n",
- " date(1985, 2, 5), date(2022, 1, 21)], dtype='object').astype('category')\n",
- "value = pd.Series(name='value', data=[10,10,20,20,30,30], dtype='Int64').astype('category') #alias mandatory \n",
- "value32 = pd.Series(name='value32', data=[12, 12, 22, 22, 32, 32], dtype='int32')\n",
- "coord = pd.Series(name='coord::point', data=[Point(1,2), Point(3,4), Point(5,6), Point(7,8), Point(3,4), Point(5,6)])\n",
- "names = pd.Series(name='names', data=['john', 'eric', 'judith', 'mila', 'hector', 'maria'], dtype='string')\n",
- "unique = pd.Series(name='unique', data=[True, True, True, True, True, True])\n",
+ "index = pd.Series([100, 200, 300, 400, 500, 600])\n",
+ "dates = pd.Series(\n",
+ " name=\"dates::date\",\n",
+ " data=[\n",
+ " date(1964, 1, 1),\n",
+ " date(1985, 2, 5),\n",
+ " date(2022, 1, 21),\n",
+ " date(1964, 1, 1),\n",
+ " date(1985, 2, 5),\n",
+ " date(2022, 1, 21),\n",
+ " ],\n",
+ " dtype=\"object\",\n",
+ ").astype(\"category\")\n",
+ "value = pd.Series(name=\"value\", data=[10, 10, 20, 20, 30, 30], dtype=\"Int64\").astype(\n",
+ " \"category\"\n",
+ ") # alias mandatory\n",
+ "value32 = pd.Series(name=\"value32\", data=[12, 12, 22, 22, 32, 32], dtype=\"int32\")\n",
+ "coord = pd.Series(\n",
+ " name=\"coord::point\",\n",
+ " data=[Point(1, 2), Point(3, 4), Point(5, 6), Point(7, 8), Point(3, 4), Point(5, 6)],\n",
+ ")\n",
+ "names = pd.Series(\n",
+ " name=\"names\",\n",
+ " data=[\"john\", \"eric\", \"judith\", \"mila\", \"hector\", \"maria\"],\n",
+ " dtype=\"string\",\n",
+ ")\n",
+ "unique = pd.Series(name=\"unique\", data=[True, True, True, True, True, True])\n",
 "\n",
- "df = pd.DataFrame({ser.name: ser for ser in [index, dates, value, value32, coord, names, unique]}).set_index(None)\n",
+ "df = pd.DataFrame(\n",
+ " {ser.name: ser for ser in [index, dates, value, value32, coord, names, unique]}\n",
+ ").set_index(None)\n",
 "\n",
- "print('pandas dtype :\\n' + str(df.dtypes))\n",
- "print('\\npandas object :\\n' + str(df))\n",
- "print('\\nJson representation :')\n",
+ "print(\"pandas dtype :\\n\" + str(df.dtypes))\n",
+ "print(\"\\npandas object :\\n\" + str(df))\n",
+ "print(\"\\nJson representation :\")\n",
 "pprint(df.npd.to_json(), width=140)\n",
- "print('\\nis Json translation reversible ? ', df.equals(npd.read_json(df.npd.to_json())))"
+ "print(\"\\nis Json translation reversible ? \", df.equals(npd.read_json(df.npd.to_json())))"
 ]
 },
 {
@@ -1119,13 +1206,33 @@
 }
 ],
 "source": [
- "data = {\"quantity\": [\"1 kg\", \"1 kg\", \"1 kg\", \"1 kg\", \"10 kg\", \"10 kg\", \"10 kg\", \"10 kg\"],\n",
- " \"product\": [\"banana\", \"orange\", \"apple\", \"peppers\", \"banana\", \"orange\", \"apple\", \"peppers\"], \n",
- " \"plants\": [\"fruit\", \"fruit\", \"fruit\", \"vegetable\", \"fruit\", \"fruit\", \"fruit\", \"vegetable\"], \n",
- " \"price\": [0.5, 2, 1, 1.5, 5, 20, 10, 15]}\n",
+ "data = {\n",
+ " \"quantity\": [\"1 kg\", \"1 kg\", \"1 kg\", \"1 kg\", \"10 kg\", \"10 kg\", \"10 kg\", \"10 kg\"],\n",
+ " \"product\": [\n",
+ " \"banana\",\n",
+ " \"orange\",\n",
+ " \"apple\",\n",
+ " \"peppers\",\n",
+ " \"banana\",\n",
+ " \"orange\",\n",
+ " \"apple\",\n",
+ " \"peppers\",\n",
+ " ],\n",
+ " \"plants\": [\n",
+ " \"fruit\",\n",
+ " \"fruit\",\n",
+ " \"fruit\",\n",
+ " \"vegetable\",\n",
+ " \"fruit\",\n",
+ " \"fruit\",\n",
+ " \"fruit\",\n",
+ " \"vegetable\",\n",
+ " ],\n",
+ " \"price\": [0.5, 2, 1, 1.5, 5, 20, 10, 15],\n",
+ "}\n",
 "\n",
- "df = pd.DataFrame(data)\n",
- "df2 = pd.DataFrame(data, dtype='category').sort_values(by=['quantity', 'product'])\n",
+ "df = pd.DataFrame(data)\n",
+ "df2 = pd.DataFrame(data, dtype=\"category\").sort_values(by=[\"quantity\", \"product\"])\n",
 "df2"
 ]
 },
@@ -1551,20 +1658,24 @@
 }
 ],
 "source": [
- "json_df = Ntv.obj(df).to_obj()[':tab']\n",
+ "json_df = Ntv.obj(df).to_obj()[\":tab\"]\n",
 "print('json_df is the JSON-TAB format with \"full\" mode\\n')\n",
 "pprint(json_df, width=200)\n",
 "\n",
- "json_xar = Ntv.obj(df2).to_obj()[':tab']\n",
+ "json_xar = Ntv.obj(df2).to_obj()[\":tab\"]\n",
 "print('\\njson_xa is the JSON-TAB format with \"optimize\" mode\\n')\n",
 "pprint(json_xar, width=200)\n",
 "\n",
- "df_from_xar = Ntv.obj({':tab': json_xar}).to_obj(format='obj').sort_index()\n",
- "print('\\nDataFrame from the two JSON-TAB format are identical ? ', df.astype('object').equals(df_from_xar.astype('object')))\n",
+ "df_from_xar = Ntv.obj({\":tab\": json_xar}).to_obj(format=\"obj\").sort_index()\n",
+ "print(\n",
+ " \"\\nDataFrame from the two JSON-TAB format are identical ? \",\n",
+ " df.astype(\"object\").equals(df_from_xar.astype(\"object\")),\n",
+ ")\n",
 "\n",
 "print('\\nThe \"optimize\" JSON-TAB format is the image of the DataArray Xarray')\n",
 "from tab_dataset import Sdataset\n",
- "Sdataset.ntv(json_df).setcanonorder().to_xarray(varname='price')"
+ "\n",
+ "Sdataset.ntv(json_df).setcanonorder().to_xarray(varname=\"price\")"
 ]
 },
 {
@@ -1622,57 +1733,59 @@
 "source": [
 "# json interface ok\n",
 "srs = [\n",
- " # without ntv_type, without dtype\n",
- " pd.Series([{'a': 2, 'e':4}, {'a': 3, 'e':5}, {'a': 4, 'e':6}]), \n",
- " pd.Series([[1,2], [3,4], [5,6]]), \n",
- " pd.Series([[1,2], [3,4], {'a': 3, 'e':5}]), \n",
- " pd.Series([True, False, True]),\n",
- " pd.Series(['az', 'er', 'cd']),\n",
- " pd.Series(['az', 'az', 'az']),\n",
- " pd.Series([1,2,3]),\n",
- " pd.Series([1.1,2,3]),\n",
- " \n",
- " # without ntv_type, with dtype\n",
- " pd.Series([10,20,30], dtype='Int64'),\n",
- " pd.Series([True, False, True], dtype='boolean'),\n",
- " pd.Series([1.1, 2, 3], dtype='float64'), \n",
- "\n",
- " # with ntv_type only in json data (not numbers)\n",
- " pd.Series([pd.NaT, pd.NaT, pd.NaT]),\n",
- " pd.Series([datetime(2022, 1, 1), datetime(2022, 1, 2)], dtype='datetime64[ns]'),\n",
- " pd.Series(pd.to_timedelta(['1D', '2D'])),\n",
- " pd.Series(['az', 'er', 'cd'], dtype='string'), \n",
- "\n",
- " # with ntv_type only in json data (numbers)\n",
- " pd.Series([1,2,3], dtype='Int32'), \n",
- " pd.Series([1,2,3], dtype='UInt64'),\n",
- " pd.Series([1,2,3], dtype='float32'),\n",
- "\n",
- " # with ntv_type in Series name and in json data (numbers)\n",
- " pd.Series([1,2,3], name='::int64'),\n",
- " pd.Series([1,2,3], dtype='Float64', name='::float64'), # force dtype dans la conversion json\n",
- "\n",
- " # with ntv_type in Series name and in json data (not numbers)\n",
- " pd.Series([[1,2], [3,4], [5,6]], name='::array'), \n",
- " pd.Series([{'a': 2, 'e':4}, {'a': 3, 'e':5}, {'a': 4, 'e':6}], name='::object'), \n",
- " pd.Series([None, None, None], name='::null'), \n",
- " pd.Series([\"geo:13.412 ,103.866\", \"mailto:John.Doe@example.com\"], name='::uri', dtype='string'),\n",
- " pd.Series([\"///path/to/file\", \"//host.example.com/path/to/file\"], name='::file', dtype='string'),\n",
- "\n",
- " # with ntv_type converted in object dtype (not in datetime)\n",
- " pd.Series([date(2022, 1, 1), date(2022, 1, 2)], name='::date'),\n",
- " pd.Series([time(10, 21, 1), time(8, 1, 2)], name='::time'),\n",
- "\n",
- " # with ntv_type unknown in pandas and with pandas conversion \n",
- " pd.Series([1,2,3], dtype='int64', name='::day'),\n",
- " pd.Series([2001,2002,2003], dtype='int64', name='::year'),\n",
- " pd.Series([21,10,55], name='::minute'),\n",
- "\n",
- " # with ntv_type unknown in pandas and NTV conversion\n",
- " pd.Series([Point(1, 0), Point(1, 1), Point(1, 2)], name='::point'),\n",
+ " # without ntv_type, without dtype\n",
+ " pd.Series([{\"a\": 2, \"e\": 4}, {\"a\": 3, \"e\": 5}, {\"a\": 4, \"e\": 6}]),\n",
+ " pd.Series([[1, 2], [3, 4], [5, 6]]),\n",
+ " pd.Series([[1, 2], [3, 4], {\"a\": 3, \"e\": 5}]),\n",
+ " pd.Series([True, False, True]),\n",
+ " pd.Series([\"az\", \"er\", \"cd\"]),\n",
+ " pd.Series([\"az\", \"az\", \"az\"]),\n",
+ " pd.Series([1, 2, 3]),\n",
+ " pd.Series([1.1, 2, 3]),\n",
+ " # without ntv_type, with dtype\n",
+ " pd.Series([10, 20, 30], dtype=\"Int64\"),\n",
+ " pd.Series([True, False, True], dtype=\"boolean\"),\n",
+ " pd.Series([1.1, 2, 3], dtype=\"float64\"),\n",
+ " # with ntv_type only in json data (not numbers)\n",
+ " pd.Series([pd.NaT, pd.NaT, pd.NaT]),\n",
+ " pd.Series([datetime(2022, 1, 1), datetime(2022, 1, 2)], dtype=\"datetime64[ns]\"),\n",
+ " pd.Series(pd.to_timedelta([\"1D\", \"2D\"])),\n",
+ " pd.Series([\"az\", \"er\", \"cd\"], dtype=\"string\"),\n",
+ " # with ntv_type only in json data (numbers)\n",
+ " pd.Series([1, 2, 3], dtype=\"Int32\"),\n",
+ " pd.Series([1, 2, 3], dtype=\"UInt64\"),\n",
+ " pd.Series([1, 2, 3], dtype=\"float32\"),\n",
+ " # with ntv_type in Series name and in json data (numbers)\n",
+ " pd.Series([1, 2, 3], name=\"::int64\"),\n",
+ " pd.Series(\n",
+ " [1, 2, 3], dtype=\"Float64\", name=\"::float64\"\n",
+ " ), # force dtype dans la conversion json\n",
+ " # with ntv_type in Series name and in json data (not numbers)\n",
+ " pd.Series([[1, 2], [3, 4], [5, 6]], name=\"::array\"),\n",
+ " pd.Series([{\"a\": 2, \"e\": 4}, {\"a\": 3, \"e\": 5}, {\"a\": 4, \"e\": 6}], name=\"::object\"),\n",
+ " pd.Series([None, None, None], name=\"::null\"),\n",
+ " pd.Series(\n",
+ " [\"geo:13.412 ,103.866\", \"mailto:John.Doe@example.com\"],\n",
+ " name=\"::uri\",\n",
+ " dtype=\"string\",\n",
+ " ),\n",
+ " pd.Series(\n",
+ " [\"///path/to/file\", \"//host.example.com/path/to/file\"],\n",
+ " name=\"::file\",\n",
+ " dtype=\"string\",\n",
+ " ),\n",
+ " # with ntv_type converted in object dtype (not in datetime)\n",
+ " pd.Series([date(2022, 1, 1), date(2022, 1, 2)], name=\"::date\"),\n",
+ " pd.Series([time(10, 21, 1), time(8, 1, 2)], name=\"::time\"),\n",
+ " # with ntv_type unknown in pandas and with pandas conversion\n",
+ " pd.Series([1, 2, 3], dtype=\"int64\", name=\"::day\"),\n",
+ " pd.Series([2001, 2002, 2003], dtype=\"int64\", name=\"::year\"),\n",
+ " pd.Series([21, 10, 55], name=\"::minute\"),\n",
+ " # with ntv_type unknown in pandas and NTV conversion\n",
+ " pd.Series([Point(1, 0), Point(1, 1), Point(1, 2)], name=\"::point\"),\n",
 "]\n",
 "for sr in srs:\n",
- " print(sr.npd.equals(npd.read_json(sr.npd.to_json())), sr.npd.to_json()) "
+ " print(sr.npd.equals(npd.read_json(sr.npd.to_json())), sr.npd.to_json())"
 ]
 },
 {
@@ -1706,24 +1819,25 @@
 ],
 "source": [
 "# json interface ok\n",
- "for a in [{'test::int32': [1,2,3]},\n",
- " {'test': [1,2,3]},\n",
- " [1.0, 2.1, 3.0],\n",
- " ['er', 'et', 'ez'],\n",
- " [True, False, True],\n",
- " {'::boolean': [True, False, True]},\n",
- " {'::string': ['er', 'et', 'ez']},\n",
- " {'test::float32': [1.0, 2.5, 3.0]},\n",
- " {'::int64': [1,2,3]},\n",
- " {'::datetime': [\"2021-12-31T23:00:00.000\",\"2022-01-01T23:00:00.000\"] },\n",
- " {'::date': [\"2021-12-31\", \"2022-01-01\"] },\n",
- " {'::time': [\"23:00:00\", \"23:01:00\"] },\n",
- " {'::object': [{'a': 3, 'e':5}, {'a': 4, 'e':6}]},\n",
- " {'::array': [[1,2], [3,4], [5,6]]},\n",
- " True,\n",
- " {':boolean': True}\n",
- " ]:\n",
- " field = {':field': a}\n",
+ "for a in [\n",
+ " {\"test::int32\": [1, 2, 3]},\n",
+ " {\"test\": [1, 2, 3]},\n",
+ " [1.0, 2.1, 3.0],\n",
+ " [\"er\", \"et\", \"ez\"],\n",
+ " [True, False, True],\n",
+ " {\"::boolean\": [True, False, True]},\n",
+ " {\"::string\": [\"er\", \"et\", \"ez\"]},\n",
+ " {\"test::float32\": [1.0, 2.5, 3.0]},\n",
+ " {\"::int64\": [1, 2, 3]},\n",
+ " {\"::datetime\": [\"2021-12-31T23:00:00.000\", \"2022-01-01T23:00:00.000\"]},\n",
+ " {\"::date\": [\"2021-12-31\", \"2022-01-01\"]},\n",
+ " {\"::time\": [\"23:00:00\", \"23:01:00\"]},\n",
+ " {\"::object\": [{\"a\": 3, \"e\": 5}, {\"a\": 4, \"e\": 6}]},\n",
+ " {\"::array\": [[1, 2], [3, 4], [5, 6]]},\n",
+ " True,\n",
+ " {\":boolean\": True},\n",
+ "]:\n",
+ " field = {\":field\": a}\n",
 " print(npd.read_json(field).npd.to_json() == field, field)"
 ]
 },
@@ -1757,22 +1871,32 @@
 ],
 "source": [
 "# json interface ok (categorical data)\n",
- "for a in [{'test': [{'::int32': [1, 2, 3]}, [0,1,2,0,1]]},\n",
- " {'test': [[1, 2, 3], [0,1,2,0,1]]},\n",
- " [[1.0, 2.1, 3.0], [0,1,2,0,1]],\n",
- " [['er', 'et', 'ez'], [0,1,2,0,1]],\n",
- " [[True, False], [0,1,0,1,0]],\n",
- " [{'::string': ['er', 'et', 'ez']}, [0,1,2,0,1]],\n",
- " {'test':[{'::float32': [1.0, 2.5, 3.0]}, [0,1,2,0,1]]},\n",
- " [{'::int64': [1, 2, 3]}, [0,1,2,0,1]],\n",
- " [{'::datetime': [\"2021-12-31T23:00:00.000\", \"2022-01-01T23:00:00.000\"] }, [0,1,0,1,0]],\n",
- " [{'::date': [\"2021-12-31\", \"2022-01-01\"] }, [0,1,0,1,0]],\n",
- " [{'::time': [\"23:00:00\", \"23:01:00\"] }, [0,1,0,1,0]],\n",
- " {'test_date': [{'::datetime': [\"2021-12-31T23:00:00.000\", \"2022-01-01T23:00:00.000\"] }, [0,1,0,1,0]]},\n",
- " [{'::boolean': [True, False]}, [0,1,0,1,0]],\n",
- " [[True], [2]], # periodic Series\n",
- " {'quantity': [['1 kg', '10 kg'], [4]]}]: # periodic Series\n",
- " field = {':field': a}\n",
+ "for a in [\n",
+ " {\"test\": [{\"::int32\": [1, 2, 3]}, [0, 1, 2, 0, 1]]},\n",
+ " {\"test\": [[1, 2, 3], [0, 1, 2, 0, 1]]},\n",
+ " [[1.0, 2.1, 3.0], [0, 1, 2, 0, 1]],\n",
+ " [[\"er\", \"et\", \"ez\"], [0, 1, 2, 0, 1]],\n",
+ " [[True, False], [0, 1, 0, 1, 0]],\n",
+ " [{\"::string\": [\"er\", \"et\", \"ez\"]}, [0, 1, 2, 0, 1]],\n",
+ " {\"test\": [{\"::float32\": [1.0, 2.5, 3.0]}, [0, 1, 2, 0, 1]]},\n",
+ " [{\"::int64\": [1, 2, 3]}, [0, 1, 2, 0, 1]],\n",
+ " [\n",
+ " {\"::datetime\": [\"2021-12-31T23:00:00.000\", \"2022-01-01T23:00:00.000\"]},\n",
+ " [0, 1, 0, 1, 0],\n",
+ " ],\n",
+ " [{\"::date\": [\"2021-12-31\", \"2022-01-01\"]}, [0, 1, 0, 1, 0]],\n",
+ " [{\"::time\": [\"23:00:00\", \"23:01:00\"]}, [0, 1, 0, 1, 0]],\n",
+ " {\n",
+ " \"test_date\": [\n",
+ " {\"::datetime\": [\"2021-12-31T23:00:00.000\", \"2022-01-01T23:00:00.000\"]},\n",
+ " [0, 1, 0, 1, 0],\n",
+ " ]\n",
+ " },\n",
+ " [{\"::boolean\": [True, False]}, [0, 1, 0, 1, 0]],\n",
+ " [[True], [2]], # periodic Series\n",
+ " {\"quantity\": [[\"1 kg\", \"10 kg\"], [4]]},\n",
+ "]: # periodic Series\n",
+ " field = {\":field\": a}\n",
 " print(npd.read_json(field).npd.to_json() == field, field)"
 ]
 },
@@ -1795,22 +1919,26 @@
 ],
 "source": [
 "# json interface ko\n",
- "srs = [# without ntv_type\n",
- " pd.Series([math.nan, math.nan]), # bug pandas conversion json : datetime NaT\n",
- " \n",
- " # without ntv_type, with dtype\n",
- " pd.Series([math.nan, math.nan], dtype='float64'), # bug pandas conversion json : datetime NaT\n",
- " \n",
- " # with ntv_type in Series name and in json data\n",
- " pd.Series([1,2,3], dtype='UInt64', name='::uint64'), # name inutile\n",
- " \n",
- " # with ntv_type unknown in pandas\n",
- " pd.Series([datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)], dtype='datetime64[ns, UTC]'), #à traiter\n",
+ "srs = [ # without ntv_type\n",
+ " pd.Series([math.nan, math.nan]), # bug pandas conversion json : datetime NaT\n",
+ " # without ntv_type, with dtype\n",
+ " pd.Series(\n",
+ " [math.nan, math.nan], dtype=\"float64\"\n",
+ " ), # bug pandas conversion json : datetime NaT\n",
+ " # with ntv_type in Series name and in json data\n",
+ " pd.Series([1, 2, 3], dtype=\"UInt64\", name=\"::uint64\"), # name inutile\n",
+ " # with ntv_type unknown in pandas\n",
+ " pd.Series(\n",
+ " [datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)],\n",
+ " dtype=\"datetime64[ns, UTC]\",\n",
+ " ), # à traiter\n",
 "]\n",
 "for sr in srs:\n",
- " print(sr.npd.equals(npd.read_json(sr.npd.to_json())), \n",
- " npd.read_json(sr.npd.to_json()).name == sr.name, \n",
- " sr.npd.to_json(text=True)) "
+ " print(\n",
+ " sr.npd.equals(npd.read_json(sr.npd.to_json())),\n",
+ " npd.read_json(sr.npd.to_json()).name == sr.name,\n",
+ " sr.npd.to_json(text=True),\n",
+ " )"
 ]
 },
 {
@@ -1829,8 +1957,10 @@
 ],
 "source": [
 "# json interface ko (categorical data)\n",
- "for a in [{'test_array': [{'::array': [[1,2], [3,4], [5,6], [7,8]]}, [0, 1, 0, 2, 3]]}]: # list -> tuple to be hashable\n",
- " field = {':field': a}\n",
+ "for a in [\n",
+ " {\"test_array\": [{\"::array\": [[1, 2], [3, 4], [5, 6], [7, 8]]}, [0, 1, 0, 2, 3]]}\n",
+ "]: # list -> tuple to be hashable\n",
+ " field = {\":field\": a}\n",
 " print(npd.read_json(field).npd.to_json() == field, field)"
 ]
 }
diff --git a/example/example_size.ipynb b/example/example_size.ipynb
index a5cd777..a363438 100644
--- a/example/example_size.ipynb
+++ b/example/example_size.ipynb
@@ -76,7 +76,6 @@
 "import pandas as pd\n",
 "import numpy as np\n",
 "from ntv_numpy import Xdataset\n",
- "import ntv_pandas as npd\n",
 "import cbor2\n",
 "import matplotlib.pyplot as plt"
 ]
 },
@@ -130,10 +129,14 @@
 "outputs": [],
 "source": [
 "def coords_int(val, leng):\n",
- " return np.arange(val, val+leng)\n",
+ " return np.arange(val, val + leng)\n",
+ "\n",
 "\n",
 "def coords_str(leng):\n",
- " return np.array([''.join(random.choices(string.ascii_letters, k=12)) for _ in range(leng)])\n",
+ " return np.array(\n",
+ " [\"\".join(random.choices(string.ascii_letters, k=12)) for _ in range(leng)]\n",
+ " )\n",
+ "\n",
 "\n",
 "def variable(nb, dim):\n",
 " return np.arange(nb**dim).reshape([nb] * dim)"
 ]
 },
@@ -251,12 +254,14 @@
 }
 ],
 "source": [
- "dimensions = 3 \n",
+ "dimensions = 3\n",
 "nb1 = 2\n",
 "\n",
- "fields = ['dim_' + str(i) for i in range(dimensions)]\n",
- "xds = xr.Dataset({\"var\": (fields, variable(nb1, dimensions))}, \n",
- " coords={field: coords_int(val*nb1, nb1) for val, field in enumerate(fields)})\n",
+ "fields = [\"dim_\" + str(i) for i in range(dimensions)]\n",
+ "xds = xr.Dataset(\n",
+ " {\"var\": (fields, variable(nb1, dimensions))},\n",
+ " coords={field: coords_int(val * nb1, nb1) for val, field in enumerate(fields)},\n",
+ ")\n",
 "\n",
 "df = Xdataset.from_xarray(xds).to_dataframe(ntv_type=False, info=False).reset_index()\n",
 "df"
 ]
 },
@@ -280,7 +285,7 @@
 }
 ],
 "source": [
- "df.to_json(orient='values')"
+ "df.to_json(orient=\"values\")"
 ]
 },
 {
@@ -306,8 +311,8 @@
 }
 ],
 "source": [
- "Xdataset.from_xarray(xds).to_json(notype='all', header=False, encoded=False)\n",
- "Xdataset.from_dataframe(df).to_json(notype='all', header=False, encoded=False)"
+ "Xdataset.from_xarray(xds).to_json(notype=\"all\", header=False, encoded=False)\n",
+ "Xdataset.from_dataframe(df).to_json(notype=\"all\", header=False, encoded=False)"
 ]
 },
 {
@@ -503,19 +508,29 @@
 }
 ],
 "source": [
- "dimensions = 5 \n",
- "fields = ['dim_' + str(i) for i in range(dimensions)]\n",
+ "dimensions = 5\n",
+ "fields = [\"dim_\" + str(i) for i in range(dimensions)]\n",
 "\n",
 "nb1 = 4\n",
 "nb2 = 10\n",
- "xdss = [xr.Dataset({\"var\": (fields, variable(nb1, dimensions))}, \n",
- " coords={field: coords_int(val*nb1, nb1) for val, field in enumerate(fields)}),\n",
- " xr.Dataset({\"var\": (fields, variable(nb1, dimensions))}, \n",
- " coords={field: coords_str(nb1) for val, field in enumerate(fields)}),\n",
- " xr.Dataset({\"var\": (fields, variable(nb2, dimensions))}, \n",
- " coords={field: coords_int(val*nb2, nb2) for val, field in enumerate(fields)}),\n",
- " xr.Dataset({\"var\": (fields, variable(nb2, dimensions))}, \n",
- " coords={field: coords_str(nb2) for val, field in enumerate(fields)})] #,\n",
+ "xdss = [\n",
+ " xr.Dataset(\n",
+ " {\"var\": (fields, variable(nb1, dimensions))},\n",
+ " coords={field: coords_int(val * nb1, nb1) for val, field in enumerate(fields)},\n",
+ " ),\n",
+ " xr.Dataset(\n",
+ " {\"var\": (fields, variable(nb1, dimensions))},\n",
+ " coords={field: coords_str(nb1) for val, field in enumerate(fields)},\n",
+ " ),\n",
+ " xr.Dataset(\n",
+ " {\"var\": (fields, variable(nb2, dimensions))},\n",
+ " coords={field: coords_int(val * nb2, nb2) for val, field in enumerate(fields)},\n",
+ " ),\n",
+ " xr.Dataset(\n",
+ " {\"var\": (fields, variable(nb2, dimensions))},\n",
+ " coords={field: coords_str(nb2) for val, field in enumerate(fields)},\n",
+ " ),\n",
+ "] # ,\n",
 "\n",
 "Xdataset.from_xarray(xdss[2]).to_dataframe(ntv_type=False, info=False).reset_index()"
 ]
 },
@@ -528,27 +543,38 @@
 "outputs": [],
 "source": [
 "def file_sizes(xnd, forma={}):\n",
- " '''calculate the size of each format'''\n",
+ " \"\"\"calculate the size of each format\"\"\"\n",
 " df = xnd.to_dataframe(json_name=False, info=False).reset_index().sample(frac=1)\n",
- " jsn = xnd.to_json(notype='all', header=False, encoded=False, format=forma)\n",
- " return {'JSON-pandas': len(df.to_json(orient='values')),\n",
- " 'CSV': len(df.to_csv()),\n",
- " 'PARQUET-pandas': len(df.to_parquet(engine='pyarrow')),\n",
- " 'JSON-NTV': len(json.dumps(jsn)),\n",
- " 'CBOR-NTV': len(cbor2.dumps(jsn))}\n",
+ " jsn = xnd.to_json(notype=\"all\", header=False, encoded=False, format=forma)\n",
+ " return {\n",
+ " \"JSON-pandas\": len(df.to_json(orient=\"values\")),\n",
+ " \"CSV\": len(df.to_csv()),\n",
+ " \"PARQUET-pandas\": len(df.to_parquet(engine=\"pyarrow\")),\n",
+ " \"JSON-NTV\": len(json.dumps(jsn)),\n",
+ " \"CBOR-NTV\": len(cbor2.dumps(jsn)),\n",
+ " }\n",
+ "\n",
 "\n",
 "def sizes_plot(sizes, titles, fig_title):\n",
- " '''plot the size of some format''' \n",
+ " \"\"\"plot the size of some format\"\"\"\n",
 " fig, axs = plt.subplots(2, 2, figsize=(11, 6), sharex=\"col\")\n",
 " fig.suptitle(fig_title)\n",
 " for idx, (size, title) in enumerate(zip(sizes, titles)):\n",
- " bar_colors = ['tab:red' if val == min(size.values()) else 'tab:blue' for val in size.values()]\n",
- " percent = [str(round(val / list(size.values())[1] * 100, 1)) + ' %' for val in size.values()]\n",
- " bar_plt = axs[idx//2][idx%2].bar(size.keys(), size.values(), color=bar_colors)\n",
- " axs[idx//2][idx%2].set_title(title)\n",
- " axs[idx//2][idx%2].bar_label(bar_plt, percent, label_type='center')\n",
- " if idx//2:\n",
- " axs[idx//2][idx%2].tick_params(axis='x', rotation=70)\n",
+ " bar_colors = [\n",
+ " \"tab:red\" if val == min(size.values()) else \"tab:blue\"\n",
+ " for val in size.values()\n",
+ " ]\n",
+ " percent = [\n",
+ " str(round(val / list(size.values())[1] * 100, 1)) + \" %\"\n",
+ " for val in size.values()\n",
+ " ]\n",
+ " bar_plt = axs[idx // 2][idx % 2].bar(\n",
+ " size.keys(), size.values(), color=bar_colors\n",
+ " )\n",
+ " axs[idx // 2][idx % 2].set_title(title)\n",
+ " axs[idx // 2][idx % 2].bar_label(bar_plt, percent, label_type=\"center\")\n",
+ " if idx // 2:\n",
+ " axs[idx // 2][idx % 2].tick_params(axis=\"x\", rotation=70)\n",
 " plt.show()"
 ]
 },
@@ -572,8 +598,13 @@
 "source": [
 "xnds = [Xdataset.from_xarray(xds) for xds in xdss]\n",
 "sizes = [file_sizes(xnd) for xnd in xnds]\n",
- "titles = ['integer (6 100 values)', 'string (6 100 values)', 'integer (600 000 values)', 'string (600 000 values)']\n",
- "sizes_plot(sizes, titles, 'multidimensional DataFrame size (bytes)')"
+ "titles = [\n",
+ " \"integer (6 100 values)\",\n",
+ " \"string (6 100 values)\",\n",
+ " \"integer (600 000 values)\",\n",
+ " \"string (600 000 values)\",\n",
+ "]\n",
+ "sizes_plot(sizes, titles, \"multidimensional DataFrame size (bytes)\")"
 ]
 },
 {
@@ -611,8 +642,8 @@
 " arr = arr // 2\n",
 " data_int.append(arr)\n",
 " data_str.append(np.frompyfunc(str, 1, 1)(arr))\n",
- " field_names = ['idx_' + str(i) for i in range(len(data_int))]\n",
- " return [data_int, data_str, field_names] "
+ " field_names = [\"idx_\" + str(i) for i in range(len(data_int))]\n",
+ " return [data_int, data_str, field_names]"
 ]
 },
 {
@@ -810,7 +841,9 @@
 "source": [
 "size1 = 4\n",
 "\n",
- "df = pd.DataFrame({field: data for field, data in zip (data_int_str(size1)[2], data_int_str(size1)[0])})\n",
+ "df = pd.DataFrame(\n",
+ " {field: data for field, data in zip(data_int_str(size1)[2], data_int_str(size1)[0])}\n",
+ ")\n",
 "df"
 ]
 },
@@ -832,7 +865,7 @@
 }
 ],
 "source": [
- "df.to_json(orient='values')"
+ "df.to_json(orient=\"values\")"
 ]
 },
 {
@@ -861,8 +894,10 @@
 }
 ],
 "source": [
- "forma = {name:'complete' for name in df.columns[1:]}\n",
- "Xdataset.from_dataframe(df).to_json(notype='all', header=False, encoded=False, format=forma)\n"
+ "forma = {name: \"complete\" for name in df.columns[1:]}\n",
+ "Xdataset.from_dataframe(df).to_json(\n",
+ " notype=\"all\", header=False, encoded=False, format=forma\n",
+ ")"
 ]
 },
 {
@@ -911,7 +946,7 @@
 "size1 = 9\n",
 "size2 = 14\n",
 "data_set = data_int_str(size1)[:2] + data_int_str(size2)[:2]\n",
- "names_set= [data_int_str(size1)[2]] * 2 + [data_int_str(size2)[2]] * 2"
+ "names_set = [data_int_str(size1)[2]] * 2 + [data_int_str(size2)[2]] * 2"
 ]
 },
 {
@@ -1127,7 +1162,10 @@
 }
 ],
 "source": [
- "df_list = [pd.DataFrame({field: data for field, data in zip (field_names, data_list)}) for field_names, data_list in zip(names_set, data_set)]\n",
+ "df_list = [\n",
+ " pd.DataFrame({field: data for field, data in zip(field_names, data_list)})\n",
+ " for field_names, data_list in zip(names_set, data_set)\n",
+ "]\n",
 "\n",
 "df_list[0]"
 ]
 },
@@ -1160,10 +1198,15 @@
 }
 ],
 "source": [
- "forma = [{name:'complete' for name in xnd.names[1:]} for xnd in xnds]\n",
+ "forma = [{name: \"complete\" for name in xnd.names[1:]} for xnd in xnds]\n",
 "sizes = [file_sizes(xnd, form) for xnd, form in zip(xnds, forma)]\n",
- "titles = ['integer (5 100 values)', 'string (5 100 values)', 'integer (524 300 values)', 'string (524 300 values)']\n",
- "sizes_plot(sizes, titles, 'tree DataFrame size (bytes)')"
+ "titles = [\n",
+ " \"integer (5 100 values)\",\n",
+ " \"string (5 100 values)\",\n",
+ " \"integer (524 300 values)\",\n",
+ " \"string (524 300 values)\",\n",
+ "]\n",
+ "sizes_plot(sizes, titles, \"tree DataFrame size (bytes)\")"
 ]
 }
 ],
diff --git a/example/example_table_pandas.ipynb b/example/example_table_pandas.ipynb
index 352f7f0..fd17f94 100644
--- a/example/example_table_pandas.ipynb
+++ b/example/example_table_pandas.ipynb
@@ -41,15 +41,12 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "import math\n",
- "import json\n",
 "from pprint import pprint\n",
 "\n",
 "import pandas as pd\n",
 "import ntv_pandas as npd\n",
 "from shapely.geometry import Point, Polygon, LineString\n",
- "from json_ntv import Ntv\n",
- "from datetime import date, datetime, time"
+ "from datetime import date, time"
 ]
 },
 {
@@ -136,12 +133,22 @@
 }
 ],
 "source": [
 "df = 
pd.DataFrame({\n", - " 'end february::date': [date(2023,2,28), date(2024,2,29), date(2025,2,28)],\n", - " 'coordinates::point': [Point([2.3, 48.9]), Point([5.4, 43.3]), Point([4.9, 45.8])],\n", - " 'contact::email': ['john.doe@table.com', 'lisa.minelli@schema.com', 'walter.white@breaking.com']\n", - " }).astype({'contact::email': 'string'})\n", - "df\n" + "df = pd.DataFrame(\n", + " {\n", + " \"end february::date\": [date(2023, 2, 28), date(2024, 2, 29), date(2025, 2, 28)],\n", + " \"coordinates::point\": [\n", + " Point([2.3, 48.9]),\n", + " Point([5.4, 43.3]),\n", + " Point([4.9, 45.8]),\n", + " ],\n", + " \"contact::email\": [\n", + " \"john.doe@table.com\",\n", + " \"lisa.minelli@schema.com\",\n", + " \"walter.white@breaking.com\",\n", + " ],\n", + " }\n", + ").astype({\"contact::email\": \"string\"})\n", + "df" ] }, { @@ -264,7 +271,7 @@ ], "source": [ "df_from_table = npd.read_json(df_to_table)\n", - "print('df created from TableSchema is equal to initial df ? ', df_from_table.equals(df))\n", + "print(\"df created from TableSchema is equal to initial df ? \", df_from_table.equals(df))\n", "df_from_table" ] }, @@ -315,14 +322,14 @@ } ], "source": [ - "sr = pd.Series([1, 2, 3], name='value')\n", - "print('pandas object :\\n' + str(sr))\n", + "sr = pd.Series([1, 2, 3], name=\"value\")\n", + "print(\"pandas object :\\n\" + str(sr))\n", "\n", "json_table = sr.npd.to_json(table=True)\n", - "print('\\nJson Table representation : ')\n", + "print(\"\\nJson Table representation : \")\n", "pprint(json_table, width=100, sort_dicts=False)\n", "\n", - "print('\\nIs Json Table translation reversible ? ', sr.equals(npd.read_json(json_table)))" + "print(\"\\nIs Json Table translation reversible ? \", sr.equals(npd.read_json(json_table)))" ] }, { @@ -346,18 +353,25 @@ } ], "source": [ - "list_sr = [pd.Series([1, 2, 3], name='value'),\n", - " pd.Series([1.1, 2, 3], name='value'),\n", - " pd.Series([True, False, True], name='value'),\n", - " # additional types\n", - " pd.Series([1, 2, 3], name='value', dtype='int32'),\n", - " pd.Series([1, 2, 3], name='value', dtype='uint64'),\n", - " pd.Series([1.6, 2, 3], name='value', dtype='float32')]\n", + "list_sr = [\n", + " pd.Series([1, 2, 3], name=\"value\"),\n", + " pd.Series([1.1, 2, 3], name=\"value\"),\n", + " pd.Series([True, False, True], name=\"value\"),\n", + " # additional types\n", + " pd.Series([1, 2, 3], name=\"value\", dtype=\"int32\"),\n", + " pd.Series([1, 2, 3], name=\"value\", dtype=\"uint64\"),\n", + " pd.Series([1.6, 2, 3], name=\"value\", dtype=\"float32\"),\n", + "]\n", "\n", - "print('reversibility, schema field : ')\n", + "print(\"reversibility, schema field : \")\n", "for sr in list_sr:\n", " json_table = sr.npd.to_json(table=True)\n", - " print(' ', sr.equals(npd.read_json(json_table)), ', ', json_table['schema']['fields'][1])" + " print(\n", + " \" \",\n", + " sr.equals(npd.read_json(json_table)),\n", + " \", \",\n", + " json_table[\"schema\"][\"fields\"][1],\n", + " )" ] }, { @@ -395,20 +409,41 @@ } ], "source": [ - "list_sr = [pd.Series([[1, 2], ['val1', 'val2']], name='value::array'),\n", - " pd.Series([[1, 2], 3, 'test', {'val1': 5, 'val2': 6}], name='value'),\n", - " pd.Series(['az', 'er', 'cd'], name='value', dtype='string'),\n", - " pd.Series([\"geo:13.412 ,103.866\", \"mailto:John.Doe@example.com\"], name='value::uri', dtype='string'),\n", - " pd.Series([\"philippe@loco-labs.io\", \"John.Doe@example.com\"], name='value::email', dtype='string'),\n", - " # additional types\n", - " pd.Series([{'val1': 5, 'val2': 6}, {'val1': 5.1, 
'val2': 6.1}], name='value::object'),\n", - " pd.Series([\"///path/to/file\", \"//host.example.com/path/to/file\"], name='value::file', dtype='string'),\n", - " pd.Series([None, None, None], name='value::null')]\n", + "list_sr = [\n", + " pd.Series([[1, 2], [\"val1\", \"val2\"]], name=\"value::array\"),\n", + " pd.Series([[1, 2], 3, \"test\", {\"val1\": 5, \"val2\": 6}], name=\"value\"),\n", + " pd.Series([\"az\", \"er\", \"cd\"], name=\"value\", dtype=\"string\"),\n", + " pd.Series(\n", + " [\"geo:13.412 ,103.866\", \"mailto:John.Doe@example.com\"],\n", + " name=\"value::uri\",\n", + " dtype=\"string\",\n", + " ),\n", + " pd.Series(\n", + " [\"philippe@loco-labs.io\", \"John.Doe@example.com\"],\n", + " name=\"value::email\",\n", + " dtype=\"string\",\n", + " ),\n", + " # additional types\n", + " pd.Series(\n", + " [{\"val1\": 5, \"val2\": 6}, {\"val1\": 5.1, \"val2\": 6.1}], name=\"value::object\"\n", + " ),\n", + " pd.Series(\n", + " [\"///path/to/file\", \"//host.example.com/path/to/file\"],\n", + " name=\"value::file\",\n", + " dtype=\"string\",\n", + " ),\n", + " pd.Series([None, None, None], name=\"value::null\"),\n", + "]\n", "\n", - "print('reversibility, schema field : ')\n", + "print(\"reversibility, schema field : \")\n", "for sr in list_sr:\n", " json_table = sr.npd.to_json(table=True)\n", - " print(' ', sr.equals(npd.read_json(json_table)), ', ', json_table['schema']['fields'][1])" + " print(\n", + " \" \",\n", + " sr.equals(npd.read_json(json_table)),\n", + " \", \",\n", + " json_table[\"schema\"][\"fields\"][1],\n", + " )" ] }, { @@ -450,25 +485,33 @@ } ], "source": [ - "list_sr = [pd.Series(['2022-01-01', '2021-01-01'], dtype='datetime64[ns]', name='value'),\n", - " pd.Series([date(2022,1,1), date(2021,1,1), date(2023,1,1)], name='value::date'),\n", - " pd.Series([time(10,20,50), time(9,20,50), time(8,20,50)], name='value::time'),\n", - " pd.Series([1, 2, 3], name='value::month'),\n", - " pd.Series([2021, 2022, 2023], name='value::year'),\n", - " # additional types\n", - " pd.Series([1, 2, 3], name='value::day'),\n", - " pd.Series([1, 2, 3], name='value::wday'),\n", - " pd.Series([1, 2, 3], name='value::yday'),\n", - " pd.Series([1, 2, 3], name='value::week'),\n", - " pd.Series([1, 2, 3], name='value::hour'),\n", - " pd.Series([1, 2, 3], name='value::minute'),\n", - " pd.Series([1, 2, 3], name='value::second')\n", - " ]\n", + "list_sr = [\n", + " pd.Series([\"2022-01-01\", \"2021-01-01\"], dtype=\"datetime64[ns]\", name=\"value\"),\n", + " pd.Series(\n", + " [date(2022, 1, 1), date(2021, 1, 1), date(2023, 1, 1)], name=\"value::date\"\n", + " ),\n", + " pd.Series([time(10, 20, 50), time(9, 20, 50), time(8, 20, 50)], name=\"value::time\"),\n", + " pd.Series([1, 2, 3], name=\"value::month\"),\n", + " pd.Series([2021, 2022, 2023], name=\"value::year\"),\n", + " # additional types\n", + " pd.Series([1, 2, 3], name=\"value::day\"),\n", + " pd.Series([1, 2, 3], name=\"value::wday\"),\n", + " pd.Series([1, 2, 3], name=\"value::yday\"),\n", + " pd.Series([1, 2, 3], name=\"value::week\"),\n", + " pd.Series([1, 2, 3], name=\"value::hour\"),\n", + " pd.Series([1, 2, 3], name=\"value::minute\"),\n", + " pd.Series([1, 2, 3], name=\"value::second\"),\n", + "]\n", "\n", - "print('reversibility, schema field : ')\n", + "print(\"reversibility, schema field : \")\n", "for sr in list_sr:\n", " json_table = sr.npd.to_json(table=True)\n", - " print(' ', sr.equals(npd.read_json(json_table)), ', ', json_table['schema']['fields'][1])" + " print(\n", + " \" \",\n", + " 
sr.equals(npd.read_json(json_table)),\n", + " \", \",\n", + " json_table[\"schema\"][\"fields\"][1],\n", + " )" ] }, { @@ -501,18 +544,36 @@ } ], "source": [ - "list_sr = [pd.Series(pd.Series([Point(1, 0), Point(1, 1), Point(1, 2)], name='value::point')),\n", - " pd.Series([Point(1, 0), Polygon([[1.0, 2.0], [1.0, 3.0], [2.0, 4.0]])], name='value::geojson'),\n", - " # additional types\n", - " pd.Series([Point(1, 0), Polygon([[1.0, 2.0], [1.0, 3.0], [2.0, 4.0]])], name='value::geometry'),\n", - " pd.Series([Polygon([[1, 2], [1, 3], [2, 4]]), Polygon([[1, 2], [1, 3], [2, 5]])], name='value::polygon'),\n", - " pd.Series([LineString([[1, 2], [1, 3], [2, 4]]), LineString([[1, 2], [1, 3], [2, 5]])], name='value::line')\n", - " ]\n", + "list_sr = [\n", + " pd.Series(pd.Series([Point(1, 0), Point(1, 1), Point(1, 2)], name=\"value::point\")),\n", + " pd.Series(\n", + " [Point(1, 0), Polygon([[1.0, 2.0], [1.0, 3.0], [2.0, 4.0]])],\n", + " name=\"value::geojson\",\n", + " ),\n", + " # additional types\n", + " pd.Series(\n", + " [Point(1, 0), Polygon([[1.0, 2.0], [1.0, 3.0], [2.0, 4.0]])],\n", + " name=\"value::geometry\",\n", + " ),\n", + " pd.Series(\n", + " [Polygon([[1, 2], [1, 3], [2, 4]]), Polygon([[1, 2], [1, 3], [2, 5]])],\n", + " name=\"value::polygon\",\n", + " ),\n", + " pd.Series(\n", + " [LineString([[1, 2], [1, 3], [2, 4]]), LineString([[1, 2], [1, 3], [2, 5]])],\n", + " name=\"value::line\",\n", + " ),\n", + "]\n", "\n", - "print('reversibility, schema field : ')\n", + "print(\"reversibility, schema field : \")\n", "for sr in list_sr:\n", " json_table = sr.npd.to_json(table=True)\n", - " print(' ', sr.equals(npd.read_json(json_table)), ', ', json_table['schema']['fields'][1])" + " print(\n", + " \" \",\n", + " sr.equals(npd.read_json(json_table)),\n", + " \", \",\n", + " json_table[\"schema\"][\"fields\"][1],\n", + " )" ] }, { @@ -625,35 +686,62 @@ } ], "source": [ - "df = pd.DataFrame({\n", + "df = pd.DataFrame(\n", + " {\n", " # numerical\n", - " 'float': [1.1, 2, 3],\n", - " 'boolean': [True, False, False],\n", - " 'int32': pd.Series([1, 2, 3], dtype='int32'),\n", + " \"float\": [1.1, 2, 3],\n", + " \"boolean\": [True, False, False],\n", + " \"int32\": pd.Series([1, 2, 3], dtype=\"int32\"),\n", " # json\n", - " 'ex1::array': [[1, 2], ['val1', 'val2'], [1, {'val3': 3}]],\n", - " 'json': [[1, 2], 'test', {'val1': 5, 'val2': 6}],\n", - " 'string': pd.Series(['az', 'er', 'cd'], dtype='string'),\n", - " 'ex2::uri': pd.Series([\"geo:13.412 ,103.866\", \"mailto:John.Doe@example.com\", \"\"], dtype='string'),\n", - " 'ex3::email': pd.Series([\"philippe@loco-labs.io\", \"John.Doe@example.com\", \"\"], dtype='string'),\n", - " 'ex4::object': [{'val1': 5, 'val2': 6}, {'val1': 5.1, 'val2': 6.1}, {}],\n", + " \"ex1::array\": [[1, 2], [\"val1\", \"val2\"], [1, {\"val3\": 3}]],\n", + " \"json\": [[1, 2], \"test\", {\"val1\": 5, \"val2\": 6}],\n", + " \"string\": pd.Series([\"az\", \"er\", \"cd\"], dtype=\"string\"),\n", + " \"ex2::uri\": pd.Series(\n", + " [\"geo:13.412 ,103.866\", \"mailto:John.Doe@example.com\", \"\"], dtype=\"string\"\n", + " ),\n", + " \"ex3::email\": pd.Series(\n", + " [\"philippe@loco-labs.io\", \"John.Doe@example.com\", \"\"], dtype=\"string\"\n", + " ),\n", + " \"ex4::object\": [{\"val1\": 5, \"val2\": 6}, {\"val1\": 5.1, \"val2\": 6.1}, {}],\n", " # datation\n", - " 'datetime': pd.Series(['2022-01-01', '2021-01-01', '2023-01-01'], dtype='datetime64[ns]'),\n", - " 'ex5::date': [date(2022,1,1), date(2021,1,1), date(2023,1,1)],\n", - " 'ex6::time': [time(10,20,50), 
time(9,20,50), time(8,20,50)],\n", - " 'ex7::month': [1, 2, 3],\n", - " 'ex8::hour': [1, 2, 3],\n", + " \"datetime\": pd.Series(\n", + " [\"2022-01-01\", \"2021-01-01\", \"2023-01-01\"], dtype=\"datetime64[ns]\"\n", + " ),\n", + " \"ex5::date\": [date(2022, 1, 1), date(2021, 1, 1), date(2023, 1, 1)],\n", + " \"ex6::time\": [time(10, 20, 50), time(9, 20, 50), time(8, 20, 50)],\n", + " \"ex7::month\": [1, 2, 3],\n", + " \"ex8::hour\": [1, 2, 3],\n", " # location\n", - " 'ex9::point': [Point(1, 0), Point(1, 1), Point(1, 2)],\n", - " 'ex10::geojson': [Point(1, 0), LineString([[1, 2], [1, 3]]), Polygon([[1.0, 2.0], [1.0, 3.0], [2.0, 4.0]])],\n", + " \"ex9::point\": [Point(1, 0), Point(1, 1), Point(1, 2)],\n", + " \"ex10::geojson\": [\n", + " Point(1, 0),\n", + " LineString([[1, 2], [1, 3]]),\n", + " Polygon([[1.0, 2.0], [1.0, 3.0], [2.0, 4.0]]),\n", + " ],\n", " # additional types\n", - " 'ex11::geometry': [Point(1, 0), LineString([[1, 2], [1, 3]]), Polygon([[1.0, 2.0], [1.0, 3.0], [2.0, 4.0]])],\n", - " 'ex12::polygon': [Polygon([[1,2], [1,3], [2,4]]), Polygon([[1,2], [1,3], [2,5]]), Polygon([[1,2], [1,3], [2,6]])],\n", - " 'ex13::line': [LineString([[1, 2], [2, 4]]), LineString([[1, 2], [2, 5]]), LineString([[1, 2], [2, 6]])] \n", - "})\n", - "print('\\nJson Table representation : ')\n", + " \"ex11::geometry\": [\n", + " Point(1, 0),\n", + " LineString([[1, 2], [1, 3]]),\n", + " Polygon([[1.0, 2.0], [1.0, 3.0], [2.0, 4.0]]),\n", + " ],\n", + " \"ex12::polygon\": [\n", + " Polygon([[1, 2], [1, 3], [2, 4]]),\n", + " Polygon([[1, 2], [1, 3], [2, 5]]),\n", + " Polygon([[1, 2], [1, 3], [2, 6]]),\n", + " ],\n", + " \"ex13::line\": [\n", + " LineString([[1, 2], [2, 4]]),\n", + " LineString([[1, 2], [2, 5]]),\n", + " LineString([[1, 2], [2, 6]]),\n", + " ],\n", + " }\n", + ")\n", + "print(\"\\nJson Table representation : \")\n", "pprint(df.npd.to_json(table=True), width=100, sort_dicts=False)\n", - "print('\\nis Json translation reversible ? ', df.equals(npd.read_json(df.npd.to_json(table=True))))" + "print(\n", + " \"\\nis Json translation reversible ? 
\",\n", + " df.equals(npd.read_json(df.npd.to_json(table=True))),\n", + ")" ] }, { @@ -762,15 +850,23 @@ } ], "source": [ - "data = {'index': [100, 200, 300, 400, 500],\n", - " 'dates::date': [date(1964,1,1), date(1985,2,5), date(2022,1,21), date(1964,1,1), date(1985,2,5)],\n", - " 'value': [10, 10, 20, 20, 30],\n", - " 'value32': pd.Series([12, 12, 22, 22, 32], dtype='int32'),\n", - " 'res': [10, 20, 30, 10, 20],\n", - " 'coord::point': [Point(1,2), Point(3,4), Point(5,6), Point(7,8), Point(3,4)],\n", - " 'names': pd.Series(['john', 'eric', 'judith', 'mila', 'hector'], dtype='string'),\n", - " 'unique': True }\n", - "df = pd.DataFrame(data).set_index('index')\n", + "data = {\n", + " \"index\": [100, 200, 300, 400, 500],\n", + " \"dates::date\": [\n", + " date(1964, 1, 1),\n", + " date(1985, 2, 5),\n", + " date(2022, 1, 21),\n", + " date(1964, 1, 1),\n", + " date(1985, 2, 5),\n", + " ],\n", + " \"value\": [10, 10, 20, 20, 30],\n", + " \"value32\": pd.Series([12, 12, 22, 22, 32], dtype=\"int32\"),\n", + " \"res\": [10, 20, 30, 10, 20],\n", + " \"coord::point\": [Point(1, 2), Point(3, 4), Point(5, 6), Point(7, 8), Point(3, 4)],\n", + " \"names\": pd.Series([\"john\", \"eric\", \"judith\", \"mila\", \"hector\"], dtype=\"string\"),\n", + " \"unique\": True,\n", + "}\n", + "df = pd.DataFrame(data).set_index(\"index\")\n", "df.index.name = None\n", "df" ] @@ -837,8 +933,8 @@ ], "source": [ "df_to_table = df.npd.to_json(table=True)\n", - "pprint(df_to_table['data'][0], sort_dicts=False)\n", - "pprint(df_to_table['schema'], sort_dicts=False)\n", + "pprint(df_to_table[\"data\"][0], sort_dicts=False)\n", + "pprint(df_to_table[\"schema\"], sort_dicts=False)\n", "print(npd.read_json(df_to_table).equals(df))" ] } diff --git a/example/example_xarray.ipynb b/example/example_xarray.ipynb index 805fa27..9fd1eee 100644 --- a/example/example_xarray.ipynb +++ b/example/example_xarray.ipynb @@ -62,33 +62,68 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", "from random import random\n", "\n", - "columns = ['camp', 'date', 'hour', 'city', 'coord', 'prop', 'unit', 'value']\n", + "columns = [\"camp\", \"date\", \"hour\", \"city\", \"coord\", \"prop\", \"unit\", \"value\"]\n", "\n", - "camp = 'air quality sensor measurement'\n", - "dates = ['2024/05/24', '2024/05/25', '2024/05/26', '2024/05/27', '2024/05/28', \n", - " '2024/05/29', '2024/05/30', '2024/05/31', '2024/06/01', '2024/06/02',\n", - " '2024/06/03', '2024/06/04', '2024/06/05', '2024/06/06', '2024/06/07',\n", - " '2024/06/08', '2024/06/09', '2024/06/10', '2024/06/11', '2024/06/12']\n", - "cities = ['lille', 'toulouse', 'marseille', 'strasbourg', 'brest']\n", - "coordinates = ['[3.06, 50.63]', '[1.44, 43.6]', '[5.37, 43.3]', '[7.75, 48.57]', '[4.49, 48.39]']\n", - "properties = ['PM10', 'PM2.5', 'NO2', 'O3', 'CO']\n", - "units = ['µg/m3', 'µg/m3', 'µg/m3', 'µg/m3', 'mg/m3']\n", + "camp = \"air quality sensor measurement\"\n", + "dates = [\n", + " \"2024/05/24\",\n", + " \"2024/05/25\",\n", + " \"2024/05/26\",\n", + " \"2024/05/27\",\n", + " \"2024/05/28\",\n", + " \"2024/05/29\",\n", + " \"2024/05/30\",\n", + " \"2024/05/31\",\n", + " \"2024/06/01\",\n", + " \"2024/06/02\",\n", + " \"2024/06/03\",\n", + " \"2024/06/04\",\n", + " \"2024/06/05\",\n", + " \"2024/06/06\",\n", + " \"2024/06/07\",\n", + " \"2024/06/08\",\n", + " \"2024/06/09\",\n", + " \"2024/06/10\",\n", + " \"2024/06/11\",\n", + " \"2024/06/12\",\n", + "]\n", + "cities = [\"lille\", \"toulouse\", \"marseille\", \"strasbourg\", \"brest\"]\n", + "coordinates = [\n", + " \"[3.06, 
50.63]\",\n", + " \"[1.44, 43.6]\",\n", + " \"[5.37, 43.3]\",\n", + " \"[7.75, 48.57]\",\n", + " \"[4.49, 48.39]\",\n", + "]\n", + "properties = [\"PM10\", \"PM2.5\", \"NO2\", \"O3\", \"CO\"]\n", + "units = [\"µg/m3\", \"µg/m3\", \"µg/m3\", \"µg/m3\", \"mg/m3\"]\n", "hours = list(range(24))\n", "\n", + "\n", "def data_acquisition(n_dates, n_round=None, n_hours=24):\n", - " '''return a list of records for a set of dates defined by the n_dates parameter.\n", - " The n_round parameter defines the type of the measurement value: \n", + " \"\"\"return a list of records for a set of dates defined by the n_dates parameter.\n", + " The n_round parameter defines the type of the measurement value:\n", " n_round : None -> integer\n", " n_round : 0 -> float\n", - " n_round : n -> round(float, n)'''\n", - " return [(camp, date, hour, city, coord, prop, unit, random()*10 if n_round == 0 else round(random()*10, n_round))\n", - " for city, coord in zip(cities, coordinates)\n", - " for date in dates[:n_dates]\n", - " for hour in hours[:n_hours]\n", - " for prop, unit in zip(properties, units) ]" + " n_round : n -> round(float, n)\"\"\"\n", + " return [\n", + " (\n", + " camp,\n", + " date,\n", + " hour,\n", + " city,\n", + " coord,\n", + " prop,\n", + " unit,\n", + " random() * 10 if n_round == 0 else round(random() * 10, n_round),\n", + " )\n", + " for city, coord in zip(cities, coordinates)\n", + " for date in dates[:n_dates]\n", + " for hour in hours[:n_hours]\n", + " for prop, unit in zip(properties, units)\n", + " ]" ] }, { @@ -293,7 +328,11 @@ "source": [ "import pandas as pd\n", "\n", - "meas_df_10 = pd.DataFrame(data_acquisition(10), columns=columns).sample(frac=1).reset_index(drop=True)\n", + "meas_df_10 = (\n", + " pd.DataFrame(data_acquisition(10), columns=columns)\n", + " .sample(frac=1)\n", + " .reset_index(drop=True)\n", + ")\n", "meas_df_10" ] }, @@ -761,9 +800,6 @@ } ], "source": [ - "import xarray as xr\n", - "import ntv_pandas\n", - "\n", "meas_xr_10 = meas_df_10.npd.to_xarray()\n", "meas_xr_10" ] @@ -797,7 +833,7 @@ } ], "source": [ - "meas_xr_10.sel(city='lille', prop='NO2', hour=0, date='2024/05/24').values" + "meas_xr_10.sel(city=\"lille\", prop=\"NO2\", hour=0, date=\"2024/05/24\").values" ] }, { @@ -847,8 +883,12 @@ "source": [ "from pprint import pprint\n", "\n", - "notype = [True]*len(meas_df_10.columns)\n", - "meas_df_1 = pd.DataFrame(data_acquisition(1, n_round=2, n_hours=4), columns=columns).sample(frac=1).reset_index(drop=True)\n", + "notype = [True] * len(meas_df_10.columns)\n", + "meas_df_1 = (\n", + " pd.DataFrame(data_acquisition(1, n_round=2, n_hours=4), columns=columns)\n", + " .sample(frac=1)\n", + " .reset_index(drop=True)\n", + ")\n", "meas_xr_1 = meas_df_1.npd.to_xarray()\n", "meas_json = meas_xr_1.nxr.to_json(notype=notype, header=False, encoded=False)\n", "pprint(meas_json, compact=True)" @@ -872,30 +912,47 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", - "import ntv_numpy\n", "import cbor2\n", "\n", - "notype = [True]*len(meas_df_10.columns)\n", + "notype = [True] * len(meas_df_10.columns)\n", + "\n", "\n", "def file_sizes(df):\n", - " '''calculate the size of each format'''\n", - " return {'pd.to_json(values)': len(df.to_json(orient='values')),\n", - " 'pd.to_csv': len(df.to_csv()),\n", - " 'pd.to_parquet': len(df.to_parquet(engine='pyarrow')),\n", - " 'nxr.to_json': len(df.npd.to_xarray().nxr.to_json(notype=notype, header=False, encoded=True)),\n", - " 'nxr.to_json(cbor)': len(cbor2.dumps(df.npd.to_xarray().nxr.to_json(notype=notype, 
header=False, encoded=False)))}\n", + " \"\"\"calculate the size of each format\"\"\"\n", + " return {\n", + " \"pd.to_json(values)\": len(df.to_json(orient=\"values\")),\n", + " \"pd.to_csv\": len(df.to_csv()),\n", + " \"pd.to_parquet\": len(df.to_parquet(engine=\"pyarrow\")),\n", + " \"nxr.to_json\": len(\n", + " df.npd.to_xarray().nxr.to_json(notype=notype, header=False, encoded=True)\n", + " ),\n", + " \"nxr.to_json(cbor)\": len(\n", + " cbor2.dumps(\n", + " df.npd.to_xarray().nxr.to_json(\n", + " notype=notype, header=False, encoded=False\n", + " )\n", + " )\n", + " ),\n", + " }\n", + "\n", "\n", "def sizes_plot(sizes, titles, fig_title):\n", - " '''plot the size of some format''' \n", + " \"\"\"plot the size of some format\"\"\"\n", " fig, axs = plt.subplots(1, 4, figsize=(24, 6))\n", " fig.suptitle(fig_title)\n", " for idx, (size, title) in enumerate(zip(sizes, titles)):\n", - " bar_colors = ['tab:red' if val == min(size.values()) else 'tab:blue' for val in size.values()]\n", - " percent = [str(round(val / list(size.values())[1] * 100, 1)) + ' %' for val in size.values()]\n", + " bar_colors = [\n", + " \"tab:red\" if val == min(size.values()) else \"tab:blue\"\n", + " for val in size.values()\n", + " ]\n", + " percent = [\n", + " str(round(val / list(size.values())[1] * 100, 1)) + \" %\"\n", + " for val in size.values()\n", + " ]\n", " bar_plt = axs[idx].bar(size.keys(), size.values(), color=bar_colors)\n", " axs[idx].set_title(title)\n", - " axs[idx].bar_label(bar_plt, percent, label_type='center')\n", - " axs[idx].tick_params(axis='x', rotation=55)\n", + " axs[idx].bar_label(bar_plt, percent, label_type=\"center\")\n", + " axs[idx].tick_params(axis=\"x\", rotation=55)\n", " plt.show()" ] }, @@ -919,9 +976,16 @@ "source": [ "# plot measurement of one date\n", "options = [(None, \"integer\"), (0, \"float\"), (4, \"round(4)\"), (6, \"round(6)\")]\n", - "sizes = [file_sizes(pd.DataFrame(data_acquisition(1, opt[0]), columns=columns).sample(frac=1).reset_index(drop=True)) for opt in options]\n", + "sizes = [\n", + " file_sizes(\n", + " pd.DataFrame(data_acquisition(1, opt[0]), columns=columns)\n", + " .sample(frac=1)\n", + " .reset_index(drop=True)\n", + " )\n", + " for opt in options\n", + "]\n", "titles = [opt[1] for opt in options]\n", - "sizes_plot(sizes, titles, 'one date')" + "sizes_plot(sizes, titles, \"one date\")" ] }, { @@ -943,9 +1007,16 @@ ], "source": [ "# plot measurement of twenty dates\n", - "sizes = [file_sizes(pd.DataFrame(data_acquisition(20, opt[0]), columns=columns).sample(frac=1).reset_index(drop=True)) for opt in options]\n", + "sizes = [\n", + " file_sizes(\n", + " pd.DataFrame(data_acquisition(20, opt[0]), columns=columns)\n", + " .sample(frac=1)\n", + " .reset_index(drop=True)\n", + " )\n", + " for opt in options\n", + "]\n", "titles = [opt[1] for opt in options]\n", - "sizes_plot(sizes, titles, 'twenty dates')" + "sizes_plot(sizes, titles, \"twenty dates\")" ] } ], diff --git a/example/other examples/example_communes.ipynb b/example/other examples/example_communes.ipynb index 20ca134..5bc662f 100644 --- a/example/other examples/example_communes.ipynb +++ b/example/other examples/example_communes.ipynb @@ -246,8 +246,8 @@ "source": [ "import pandas as pd\n", "\n", - "comm = pd.read_csv('donnees_communes.csv', sep=';')\n", - "comm_min = comm[['REG', 'DEP', 'Région', 'COM']]\n", + "comm = pd.read_csv(\"donnees_communes.csv\", sep=\";\")\n", + "comm_min = comm[[\"REG\", \"DEP\", \"Région\", \"COM\"]]\n", "comm" ] }, @@ -277,8 +277,6 @@ } ], "source": [ - "import 
ntv_pandas\n", - "\n", "analys = comm.npd.analysis()\n", "print(analys.tree())" ] @@ -305,9 +303,9 @@ "from tab_dataset.dataset import Sdataset\n", "\n", "comm_sd = Sdataset(comm_min)\n", - "jsn = comm_sd.to_ntv(modecodec='optimize').to_obj(encoded=False)\n", + "jsn = comm_sd.to_ntv(modecodec=\"optimize\").to_obj(encoded=False)\n", "len(json.dumps(jsn))\n", - "#jsn" + "# jsn" ] }, { @@ -329,6 +327,7 @@ ], "source": [ "import cbor2\n", + "\n", "len(cbor2.dumps(jsn))" ] }, @@ -601,7 +600,7 @@ } ], "source": [ - "comm = pd.read_csv('v_commune_2024.csv', sep=',')\n", + "comm = pd.read_csv(\"v_commune_2024.csv\", sep=\",\")\n", "comm" ] } diff --git a/example/other examples/example_hierarchical.ipynb b/example/other examples/example_hierarchical.ipynb index 42351e2..404870e 100644 --- a/example/other examples/example_hierarchical.ipynb +++ b/example/other examples/example_hierarchical.ipynb @@ -52,14 +52,25 @@ "from datetime import date, timedelta\n", "from random import random\n", "\n", - "columns = ['study', 'n_event', 'date', 'year', 'yearmonth', 'month', 'weekday', 'yearday', 'day']\n", + "columns = [\n", + " \"study\",\n", + " \"n_event\",\n", + " \"date\",\n", + " \"year\",\n", + " \"yearmonth\",\n", + " \"month\",\n", + " \"weekday\",\n", + " \"yearday\",\n", + " \"day\",\n", + "]\n", "\n", - "study = 'event analysis'\n", - "t0 = date.fromisoformat('2000-01-01')\n", + "study = \"event analysis\"\n", + "t0 = date.fromisoformat(\"2000-01-01\")\n", "event0 = 100\n", "\n", + "\n", "def data_acquisition(n_dates):\n", - " '''return a list of records '''\n", + " \"\"\"return a list of records\"\"\"\n", " data = []\n", " for dt in range(n_dates):\n", " dat = t0 + timedelta(dt)\n", @@ -68,10 +79,16 @@ " yearmonth = dat_iso[:7]\n", " month = dat.month\n", " weekday = dat.isoweekday()\n", - " yearday = (dat - date.fromisoformat(str(dat.isoformat()[:4] + '-01-01'))).days\n", + " yearday = (dat - date.fromisoformat(str(dat.isoformat()[:4] + \"-01-01\"))).days\n", " day = dat.day\n", - " n_event = round(event0 + (month-6) * random() + (weekday - 3.5) * random() + 0.1 * yearday * random() + random() * 10) \n", - " #data.append((study, n_event, dat_iso, year, yearmonth, month, weekday, yearday, day))\n", + " n_event = round(\n", + " event0\n", + " + (month - 6) * random()\n", + " + (weekday - 3.5) * random()\n", + " + 0.1 * yearday * random()\n", + " + random() * 10\n", + " )\n", + " # data.append((study, n_event, dat_iso, year, yearmonth, month, weekday, yearday, day))\n", " data.append((study, n_event, dat_iso, year, yearmonth, month))\n", " return data" ] @@ -241,7 +258,11 @@ "source": [ "import pandas as pd\n", "\n", - "time_sr = pd.DataFrame(data_acquisition(10000), columns=columns[:6]).sample(frac=1).reset_index(drop=True)\n", + "time_sr = (\n", + " pd.DataFrame(data_acquisition(10000), columns=columns[:6])\n", + " .sample(frac=1)\n", + " .reset_index(drop=True)\n", + ")\n", "time_sr" ] }, @@ -266,8 +287,6 @@ } ], "source": [ - "import ntv_pandas\n", - "\n", "analys = time_sr.npd.analysis()\n", "print(analys.tree())" ] @@ -290,13 +309,13 @@ } ], "source": [ - "from tab_dataset.dataset import Sdataset, Ndataset\n", + "from tab_dataset.dataset import Sdataset\n", "\n", "time_sd = Sdataset(time_sr)\n", - "jsn = time_sd.to_ntv(modecodec='optimize').to_obj(encoded=True)\n", - "jsn2 = time_sd.to_ntv(modecodec='optimize').to_obj(encoded=False)\n", + "jsn = time_sd.to_ntv(modecodec=\"optimize\").to_obj(encoded=True)\n", + "jsn2 = time_sd.to_ntv(modecodec=\"optimize\").to_obj(encoded=False)\n", 
"len(jsn)\n", - "#jsn" + "# jsn" ] }, { @@ -318,6 +337,7 @@ ], "source": [ "import cbor2\n", + "\n", "len(cbor2.dumps(jsn2))" ] }, diff --git a/example/xarray_pandas_converter.ipynb b/example/xarray_pandas_converter.ipynb index e49466a..9b4a3bc 100644 --- a/example/xarray_pandas_converter.ipynb +++ b/example/xarray_pandas_converter.ipynb @@ -429,11 +429,8 @@ ], "source": [ "import numpy as np\n", - "import pandas as pd\n", "import xarray as xr\n", "\n", - "import ntv_pandas # activate pandas npd accessor\n", - "import ntv_numpy # activate xarray nxr accessor\n", "\n", "ds = xr.Dataset(\n", " {\"foo\": ((\"x\", \"y\"), np.random.randn(2, 3))},\n", @@ -443,7 +440,7 @@ " \"along_x\": (\"x\", np.random.randn(2)),\n", " \"scalar\": 123,\n", " },\n", - " attrs={\"example\": \"Xarray user-guide\"}\n", + " attrs={\"example\": \"Xarray user-guide\"},\n", ")\n", "ds" ] @@ -1102,7 +1099,9 @@ } ], "source": [ - "df_min = ds.nxr.to_dataframe(ntv_type=False, info=False, index=False) # without additional data\n", + "df_min = ds.nxr.to_dataframe(\n", + " ntv_type=False, info=False, index=False\n", + ") # without additional data\n", "df_min" ] },