diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 583b24d1d..2ea9f40cf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,7 +35,7 @@ repos: - "--config=pyproject.toml" - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.1.9 + rev: v0.1.11 hooks: - id: ruff args: [--fix] diff --git a/CHANGELOG.md b/CHANGELOG.md index 717bb4bb8..fb22eded7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ - [BUG] Fix logic for groupby in complete. Index support deprecated. Fix deprecation warning for fillna in `complete` PR #1289 @samukweku - [ENH] `select` function now supports variable arguments - PR #1288 @samukweku - [ENH] `conditional_join` now supports timedelta dtype. - PR #1297 @samukweku +- [ENH] `get_join_indices` function added - returns only join indices between two dataframes. Issue #1310 @samukweku +- [ENH] `explode_index` function added. - Issue #1283 - [ENH] Add `glue` and `axis` parameters to `collapse_levels`. 
- Issue #211 @samukweku ## [v0.26.0] - 2023-09-18 diff --git a/examples/notebooks/conditional_join.ipynb b/examples/notebooks/conditional_join.ipynb deleted file mode 100644 index 5d10c321c..000000000 --- a/examples/notebooks/conditional_join.ipynb +++ /dev/null @@ -1,1470 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Joining on Non-Equi Operators" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import janitor\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "#https://stackoverflow.com/q/61948103/7175713 \n", - "df1 = pd.DataFrame({'id': [1,1,1,2,2,3], \n", - " 'value_1': [2,5,7,1,3,4]})\n", - "\n", - "df2 = pd.DataFrame({'id': [1,1,1,1,2,2,2,3], \n", - " 'value_2A': [0,3,7,12,0,2,3,1], \n", - " 'value_2B': [1,5,9,15,1,4,6,3]})" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idvalue_1
012
115
217
321
423
534
\n", - "
" - ], - "text/plain": [ - " id value_1\n", - "0 1 2\n", - "1 1 5\n", - "2 1 7\n", - "3 2 1\n", - "4 2 3\n", - "5 3 4" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idvalue_2Avalue_2B
0101
1135
2179
311215
4201
5224
6236
7313
\n", - "
" - ], - "text/plain": [ - " id value_2A value_2B\n", - "0 1 0 1\n", - "1 1 3 5\n", - "2 1 7 9\n", - "3 1 12 15\n", - "4 2 0 1\n", - "5 2 2 4\n", - "6 2 3 6\n", - "7 3 1 3" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Join on equi and non-equi operators is possible:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
leftright
idvalue_1idvalue_2Avalue_2B
015135
117179
221201
323224
423236
\n", - "
" - ], - "text/plain": [ - " left right \n", - " id value_1 id value_2A value_2B\n", - "0 1 5 1 3 5\n", - "1 1 7 1 7 9\n", - "2 2 1 2 0 1\n", - "3 2 3 2 2 4\n", - "4 2 3 2 3 6" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1.conditional_join(\n", - " df2,\n", - " ('id', 'id', '=='),\n", - " ('value_1', 'value_2A', '>='),\n", - " ('value_1', 'value_2B', '<='),\n", - " sort_by_appearance = True\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The default join is inner. left and right joins are supported as well:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
leftright
idvalue_1idvalue_2Avalue_2B
012NaNNaNNaN
1151.03.05.0
2171.07.09.0
3212.00.01.0
4232.02.04.0
5232.03.06.0
634NaNNaNNaN
\n", - "
" - ], - "text/plain": [ - " left right \n", - " id value_1 id value_2A value_2B\n", - "0 1 2 NaN NaN NaN\n", - "1 1 5 1.0 3.0 5.0\n", - "2 1 7 1.0 7.0 9.0\n", - "3 2 1 2.0 0.0 1.0\n", - "4 2 3 2.0 2.0 4.0\n", - "5 2 3 2.0 3.0 6.0\n", - "6 3 4 NaN NaN NaN" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1.conditional_join(\n", - " df2,\n", - " ('id', 'id', '=='),\n", - " ('value_1', 'value_2A', '>='),\n", - " ('value_1', 'value_2B', '<='),\n", - " how='left',\n", - " sort_by_appearance = True\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
leftright
idvalue_1idvalue_2Avalue_2B
0NaNNaN101
11.05.0135
21.07.0179
3NaNNaN11215
42.01.0201
52.03.0224
62.03.0236
7NaNNaN313
\n", - "
" - ], - "text/plain": [ - " left right \n", - " id value_1 id value_2A value_2B\n", - "0 NaN NaN 1 0 1\n", - "1 1.0 5.0 1 3 5\n", - "2 1.0 7.0 1 7 9\n", - "3 NaN NaN 1 12 15\n", - "4 2.0 1.0 2 0 1\n", - "5 2.0 3.0 2 2 4\n", - "6 2.0 3.0 2 3 6\n", - "7 NaN NaN 3 1 3" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1.conditional_join(\n", - " df2,\n", - " ('id', 'id', '=='),\n", - " ('value_1', 'value_2A', '>='),\n", - " ('value_1', 'value_2B', '<='),\n", - " how='right',\n", - " sort_by_appearance = True\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Join on just the non-equi joins is also possible:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
leftright
idvalue_1idvalue_2Avalue_2B
012313
115236
223224
334135
434236
\n", - "
" - ], - "text/plain": [ - " left right \n", - " id value_1 id value_2A value_2B\n", - "0 1 2 3 1 3\n", - "1 1 5 2 3 6\n", - "2 2 3 2 2 4\n", - "3 3 4 1 3 5\n", - "4 3 4 2 3 6" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1.conditional_join(\n", - " df2,\n", - " ('value_1', 'value_2A', '>'),\n", - " ('value_1', 'value_2B', '<'),\n", - " how='inner',\n", - " sort_by_appearance = True\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Join on not equal -> !=" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
leftright
idvalue_1idvalue_2Avalue_2B
012201
112224
212236
312313
415201
515224
615236
715313
817201
917224
1017236
1117313
1221313
1323313
1421101
1521135
1621179
172111215
1823101
1923135
2023179
212311215
2234101
2334135
2434179
253411215
2634201
2734224
2834236
\n", - "
" - ], - "text/plain": [ - " left right \n", - " id value_1 id value_2A value_2B\n", - "0 1 2 2 0 1\n", - "1 1 2 2 2 4\n", - "2 1 2 2 3 6\n", - "3 1 2 3 1 3\n", - "4 1 5 2 0 1\n", - "5 1 5 2 2 4\n", - "6 1 5 2 3 6\n", - "7 1 5 3 1 3\n", - "8 1 7 2 0 1\n", - "9 1 7 2 2 4\n", - "10 1 7 2 3 6\n", - "11 1 7 3 1 3\n", - "12 2 1 3 1 3\n", - "13 2 3 3 1 3\n", - "14 2 1 1 0 1\n", - "15 2 1 1 3 5\n", - "16 2 1 1 7 9\n", - "17 2 1 1 12 15\n", - "18 2 3 1 0 1\n", - "19 2 3 1 3 5\n", - "20 2 3 1 7 9\n", - "21 2 3 1 12 15\n", - "22 3 4 1 0 1\n", - "23 3 4 1 3 5\n", - "24 3 4 1 7 9\n", - "25 3 4 1 12 15\n", - "26 3 4 2 0 1\n", - "27 3 4 2 2 4\n", - "28 3 4 2 3 6" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1.conditional_join(\n", - " df2,\n", - " ('id', 'id', \"!=\")\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If the columns from both dataframes have nothing in common, a single indexed column is returned:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
value_1value_2Avalue_2B
0213
1536
2324
3435
4436
\n", - "
" - ], - "text/plain": [ - " value_1 value_2A value_2B\n", - "0 2 1 3\n", - "1 5 3 6\n", - "2 3 2 4\n", - "3 4 3 5\n", - "4 4 3 6" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(df1.select_columns('value_1')\n", - " .conditional_join(\n", - " df2.select_columns('val*'),\n", - " ('value_1', 'value_2A', '>'),\n", - " ('value_1', 'value_2B', '<'),\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Selection of relevant columns within `conditional_join`: " - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
leftright
idid
012
112
212
313
412
512
612
713
812
912
1012
1113
1223
1323
\n", - "
" - ], - "text/plain": [ - " left right\n", - " id id\n", - "0 1 2\n", - "1 1 2\n", - "2 1 2\n", - "3 1 3\n", - "4 1 2\n", - "5 1 2\n", - "6 1 2\n", - "7 1 3\n", - "8 1 2\n", - "9 1 2\n", - "10 1 2\n", - "11 1 3\n", - "12 2 3\n", - "13 2 3" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1.conditional_join(\n", - " df2,\n", - " ('id', 'id', \"<\"),\n", - " df_columns = 'id',\n", - " right_columns = 'id'\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Column renaming is also possible:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
df_idright_id
012
112
212
313
412
512
612
713
812
912
1012
1113
1223
1323
\n", - "
" - ], - "text/plain": [ - " df_id right_id\n", - "0 1 2\n", - "1 1 2\n", - "2 1 2\n", - "3 1 3\n", - "4 1 2\n", - "5 1 2\n", - "6 1 2\n", - "7 1 3\n", - "8 1 2\n", - "9 1 2\n", - "10 1 2\n", - "11 1 3\n", - "12 2 3\n", - "13 2 3" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1.conditional_join(\n", - " df2,\n", - " ('id', 'id', \"<\"),\n", - " df_columns = {'id':'df_id'},\n", - " right_columns = {'id':'right_id'}\n", - " )" - ] - } - ], - "metadata": { - "interpreter": { - "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe" - }, - "kernelspec": { - "display_name": "PyJanitor development", - "language": "python", - "name": "pyjanitor-dev" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.15" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/janitor/functions/__init__.py b/janitor/functions/__init__.py index b2dbe5d26..7897e50e2 100644 --- a/janitor/functions/__init__.py +++ b/janitor/functions/__init__.py @@ -26,7 +26,7 @@ from .collapse_levels import collapse_levels from .complete import complete from .concatenate_columns import concatenate_columns -from .conditional_join import conditional_join +from .conditional_join import conditional_join, get_join_indices from .convert_date import ( convert_excel_date, convert_matlab_date, @@ -41,6 +41,7 @@ from .encode_categorical import encode_categorical from .expand_column import expand_column from .expand_grid import expand_grid +from .explode_index import explode_index from .factorize_columns import factorize_columns from .fill import fill_direction, fill_empty from .filter import filter_column_isin, filter_date, filter_on, filter_string @@ -108,6 +109,7 @@ "encode_categorical", "expand_column", 
"expand_grid", + "explode_index", "factorize_columns", "fill_direction", "fill_empty", @@ -118,6 +120,7 @@ "find_replace", "flag_nulls", "get_dupes", + "get_join_indices", "groupby_agg", "groupby_topk", "impute", diff --git a/janitor/functions/collapse_levels.py b/janitor/functions/collapse_levels.py index 6d20bb8c4..75737036c 100644 --- a/janitor/functions/collapse_levels.py +++ b/janitor/functions/collapse_levels.py @@ -46,7 +46,7 @@ class max_speed type 2 bird 24 parrot 3 mammal 80 Lion 4 mammal 21 Monkey - >>> grouped_df = df.groupby("class").agg(["mean", "median"]) + >>> grouped_df = df.groupby("class")[['max_speed']].agg(["mean", "median"]) >>> grouped_df # doctest: +NORMALIZE_WHITESPACE max_speed mean median diff --git a/janitor/functions/conditional_join.py b/janitor/functions/conditional_join.py index b1ef156a9..6c3dbf182 100644 --- a/janitor/functions/conditional_join.py +++ b/janitor/functions/conditional_join.py @@ -44,7 +44,7 @@ def conditional_join( force: bool = False, ) -> pd.DataFrame: """The conditional_join function operates similarly to `pd.merge`, - but supports efficient joins on inequality operators, + but supports joins on inequality operators, or a combination of equi and non-equi joins. Joins solely on equality are not supported. @@ -157,14 +157,15 @@ def conditional_join( 5 4 5.0 6 4 6.0 - Rename columns, after the join: - >>> df1.conditional_join( - ... df2, - ... ("value_1", "value_2A", ">"), - ... ("value_1", "value_2B", "<"), - ... df_columns={'value_1':'left_column'}, - ... right_columns='value_2B', - ... how='outer' + Rename columns, before the join: + >>> (df1 + ... .rename(columns={'value_1':'left_column'}) + ... .conditional_join( + ... df2, + ... ("left_column", "value_2A", ">"), + ... ("left_column", "value_2B", "<"), + ... right_columns='value_2B', + ... how='outer') ... ) left_column value_2B 0 7.0 NaN @@ -263,11 +264,9 @@ def conditional_join( df_columns: Columns to select from `df` in the final output dataframe. 
Column selection is based on the [`select`][janitor.functions.select.select] syntax. - It is also possible to rename the output columns via a dictionary. right_columns: Columns to select from `right` in the final output dataframe. Column selection is based on the [`select`][janitor.functions.select.select] syntax. - It is also possible to rename the output columns via a dictionary. use_numba: Use numba, if installed, to accelerate the computation. keep: Choose whether to return the first match, last match or all matches. indicator: If `True`, adds a column to the output DataFrame @@ -327,6 +326,7 @@ def _conditional_join_preliminary_checks( use_numba: bool, indicator: Union[bool, str], force: bool, + return_matching_indices: bool = False, ) -> tuple: """ Preliminary checks for conditional_join are conducted here. @@ -380,8 +380,11 @@ def _conditional_join_preliminary_checks( check_column(right, [right_on]) _check_operator(op) - if all( - (op == _JoinOperator.STRICTLY_EQUAL.value for *_, op in conditions) + if ( + all( + (op == _JoinOperator.STRICTLY_EQUAL.value for *_, op in conditions) + ) + and not return_matching_indices ): raise ValueError("Equality only joins are not supported.") @@ -483,6 +486,7 @@ def _conditional_join_compute( use_numba: bool, indicator: Union[bool, str], force: bool, + return_matching_indices=False, ) -> pd.DataFrame: """ This is where the actual computation @@ -513,6 +517,7 @@ def _conditional_join_compute( use_numba, indicator, force, + return_matching_indices, ) eq_check = False @@ -526,11 +531,10 @@ def _conditional_join_compute( eq_check = True elif op in less_than_join_types.union(greater_than_join_types): le_lt_check = True - df.index = range(len(df)) right.index = range(len(right)) - if len(conditions) > 1: + if (len(conditions) > 1) or eq_check: if eq_check: result = _multiple_conditional_join_eq( df, @@ -569,6 +573,9 @@ def _conditional_join_compute( if result is None: result = np.array([], dtype=np.intp), np.array([], 
dtype=np.intp) + if return_matching_indices: + return result + return _create_frame( df, right, @@ -790,12 +797,6 @@ def _multiple_conditional_join_eq( left_on = [*left_on] right_on = [*right_on] - rest = ( - (df[left_on], right[right_on], op) - for left_on, right_on, op in conditions - if op != _JoinOperator.STRICTLY_EQUAL.value - ) - left_index, right_index = _MergeOperation( df, right, @@ -807,8 +808,16 @@ def _multiple_conditional_join_eq( if not left_index.size: return None - indices = _generate_indices(left_index, right_index, rest) + rest = [ + (df[left_on], right[right_on], op) + for left_on, right_on, op in conditions + if op != _JoinOperator.STRICTLY_EQUAL.value + ] + + if not rest: + return _keep_output(keep, left_index, right_index) + indices = _generate_indices(left_index, right_index, rest) if not indices: return None @@ -1082,22 +1091,6 @@ def _range_indices( return left_index, right_index -def _cond_join_select_columns(columns: Any, df: pd.DataFrame): - """ - Select columns in a DataFrame. - Optionally rename the columns while selecting. - Returns a Pandas DataFrame. - """ - - if isinstance(columns, dict): - df = df.select(columns=[*columns]) - df.columns = [columns.get(name, name) for name in df] - else: - df = df.select(columns=columns) - - return df - - def _create_multiindex_column(df: pd.DataFrame, right: pd.DataFrame): """ Create a MultiIndex column for conditional_join. 
@@ -1137,9 +1130,9 @@ def _create_frame( if (df_columns is None) and (right_columns is None): raise ValueError("df_columns and right_columns cannot both be None.") if (df_columns is not None) and (df_columns != slice(None)): - df = _cond_join_select_columns(df_columns, df) + df = df.select(columns=df_columns) if (right_columns is not None) and (right_columns != slice(None)): - right = _cond_join_select_columns(right_columns, right) + right = right.select(columns=right_columns) if df_columns is None: df = pd.DataFrame([]) elif right_columns is None: @@ -1302,3 +1295,50 @@ def _inner( return pd.concat( contents, axis=0, copy=False, sort=False, ignore_index=True ) + + +def get_join_indices( + df: pd.DataFrame, + right: Union[pd.DataFrame, pd.Series], + conditions: list[tuple[str]], + keep: Literal["first", "last", "all"] = "all", + use_numba: bool = False, + force: bool = False, +) -> tuple[np.ndarray, np.ndarray]: + """Convenience function to return the matching indices from an inner join. + + !!! info "New in version 0.27.0" + + Args: + df: A pandas DataFrame. + right: Named Series or DataFrame to join to. + conditions: List of arguments of tuple(s) of the form + `(left_on, right_on, op)`, where `left_on` is the column + label from `df`, `right_on` is the column label from `right`, + while `op` is the operator. + The `col` class is also supported. The operator can be any of + `==`, `!=`, `<=`, `<`, `>=`, `>`. For multiple conditions, + the and(`&`) operator is used to combine the results + of the individual conditions. + use_numba: Use numba, if installed, to accelerate the computation. + keep: Choose whether to return the first match, last match or all matches. + force: If `True`, force the non-equi join conditions + to execute before the equi join. + + Returns: + A tuple of indices for the rows in the dataframes that match. 
+ """ + return _conditional_join_compute( + df=df, + right=right, + conditions=conditions, + how="inner", + sort_by_appearance=False, + df_columns=None, + right_columns=None, + keep=keep, + use_numba=use_numba, + indicator=False, + force=force, + return_matching_indices=True, + ) diff --git a/janitor/functions/explode_index.py b/janitor/functions/explode_index.py new file mode 100644 index 000000000..55922d9aa --- /dev/null +++ b/janitor/functions/explode_index.py @@ -0,0 +1,112 @@ +"""Implementation of the `explode_index` function.""" + +from __future__ import annotations + +import re +from typing import Union + +import pandas as pd +import pandas_flavor as pf + +from janitor.utils import check + + +@pf.register_dataframe_method +def explode_index( + df: pd.DataFrame, + names_sep: Union[str, None] = None, + names_pattern: Union[str, None] = None, + axis: str = "columns", + level_names: list = None, +) -> pd.DataFrame: + """Explode a single index DataFrame into a MultiIndex DataFrame. + + This method does not mutate the original DataFrame. + + Examples: + >>> import pandas as pd + >>> import janitor + >>> df = pd.DataFrame( + ... {'max_speed_mean': [267.3333333333333, 50.5], + ... 'max_speed_median': [389.0, 50.5]}) + >>> df + max_speed_mean max_speed_median + 0 267.333333 389.0 + 1 50.500000 50.5 + >>> df.explode_index(names_sep='_',axis='columns') # doctest: +NORMALIZE_WHITESPACE + max + speed + mean median + 0 267.333333 389.0 + 1 50.500000 50.5 + >>> df.explode_index(names_pattern=r"(.+speed)_(.+)",axis='columns') # doctest: +NORMALIZE_WHITESPACE + max_speed + mean median + 0 267.333333 389.0 + 1 50.500000 50.5 + >>> df.explode_index( + ... names_pattern=r"(?P<measurement>.+speed)_(?P<aggregation>.+)", + ... axis='columns' + ... ) # doctest: +NORMALIZE_WHITESPACE + measurement max_speed + aggregation mean median + 0 267.333333 389.0 + 1 50.500000 50.5 + >>> df.explode_index( + ... names_sep='_', + ... axis='columns', + ... level_names = ['min or max', 'measurement','aggregation'] + ...
) # doctest: +NORMALIZE_WHITESPACE + min or max max + measurement speed + aggregation mean median + 0 267.333333 389.0 + 1 50.500000 50.5 + + Args: + df: A pandas DataFrame. + names_sep: string or compiled regex used to split the column/index into levels. + names_pattern: regex to extract new levels from the column/index. + axis: 'index/columns'. Determines which axis to explode. + level_names: names of the levels in the MultiIndex. + + Returns: + A pandas DataFrame with a MultiIndex. + """ # noqa: E501 + check("axis", axis, [str]) + if axis not in {"index", "columns"}: + raise ValueError("axis should be either index or columns.") + if (names_sep is None) and (names_pattern is None): + raise ValueError( + "Provide argument for either names_sep or names_pattern." + ) + if (names_sep is not None) and (names_pattern is not None): + raise ValueError( + "Provide argument for either names_sep or names_pattern, not both." + ) + if names_sep is not None: + check("names_sep", names_sep, [str]) + if names_pattern is not None: + check("names_pattern", names_pattern, [str]) + if level_names is not None: + check("level_names", level_names, [list]) + + new_index = getattr(df, axis) + if isinstance(new_index, pd.MultiIndex): + return df + # avoid a copy - Index is immutable; a slice is safe to use. 
+ df = df[:] + if names_sep: + new_index = new_index.str.split(names_sep, expand=True) + else: + named_groups = re.compile(names_pattern).groupindex + if named_groups and not level_names: + level_names = list(named_groups) + new_index = new_index.str.extract(names_pattern) + new_index = [arr.array for _, arr in new_index.items()] + new_index = pd.MultiIndex.from_arrays(new_index) + if level_names: + new_index.names = level_names + + setattr(df, axis, new_index) + return df diff --git a/tests/functions/test_conditional_join.py b/tests/functions/test_conditional_join.py index 7189093c6..1d05d6d19 100644 --- a/tests/functions/test_conditional_join.py +++ b/tests/functions/test_conditional_join.py @@ -3,9 +3,9 @@ import pytest from hypothesis import given, settings from pandas import Timedelta -from pandas.testing import assert_frame_equal +from pandas.testing import assert_frame_equal, assert_index_equal -from janitor import col +from janitor import col, get_join_indices from janitor.testing_utils.strategies import ( conditional_df, conditional_right, @@ -194,7 +194,7 @@ def test_check_how_type(dummy, series): def test_check_force_type(dummy, series): """ - Raise TypeError if `how` is not a string. + Raise TypeError if `force` is not boolean. 
""" with pytest.raises(TypeError, match="force should be one of.+"): dummy.conditional_join(series, ("id", "B", "<"), force=1) @@ -3444,6 +3444,116 @@ def test_ge_eq_and_le_datess_numba(df, right): assert_frame_equal(expected, actual) +@settings(deadline=None, max_examples=10) +@given(df=conditional_df(), right=conditional_right()) +@pytest.mark.turtle +def test_ge_eq_and_le_datess_numba_indices(df, right): + """compare join indices for multiple conditions.""" + + expected = ( + df.reset_index() + .dropna(subset=["E"]) + .merge( + right.dropna(subset=["Dates"]), + left_on="E", + right_on="Dates", + how="inner", + sort=False, + ) + .loc[ + lambda df: df.B.gt(df.Floats) + & df.A.lt(df.Integers) + & df.B.ne(df.Numeric), + "index", + ] + ) + expected = pd.Index(expected) + + actual, _ = get_join_indices( + df[["B", "A", "E"]], + right[["Floats", "Integers", "Dates", "Numeric"]], + [ + ("A", "Integers", "<"), + ("E", "Dates", "=="), + ("B", "Floats", ">"), + ("B", "Numeric", "!="), + ], + use_numba=True, + ) + actual = df.index[actual] + assert_index_equal(expected, actual, check_names=False) + + +@settings(deadline=None, max_examples=10) +@given(df=conditional_df(), right=conditional_right()) +@pytest.mark.turtle +def test_eq_indices(df, right): + """compare join indices for multiple conditions.""" + + expected = ( + df.reset_index() + .dropna(subset=["E"]) + .merge( + right.dropna(subset=["Dates"]), + left_on="E", + right_on="Dates", + how="inner", + sort=False, + ) + .loc[:, "index"] + ) + expected = pd.Index(expected) + + actual, _ = get_join_indices( + df.dropna(subset=["E"]), + right.dropna(subset=["Dates"]), + [ + ("E", "Dates", "=="), + ], + ) + actual = df.index[actual] + assert_index_equal(expected, actual, check_names=False) + + +@settings(deadline=None, max_examples=10) +@given(df=conditional_df(), right=conditional_right()) +@pytest.mark.turtle +def test_ge_eq_and_le_datess_indices(df, right): + """compare join indices for multiple conditions.""" + + 
expected = ( + df.reset_index() + .dropna(subset=["E"]) + .merge( + right.dropna(subset=["Dates"]), + left_on="E", + right_on="Dates", + how="inner", + sort=False, + ) + .loc[ + lambda df: df.B.gt(df.Floats) + & df.A.lt(df.Integers) + & df.B.ne(df.Numeric), + "index", + ] + ) + expected = pd.Index(expected) + + actual, _ = get_join_indices( + df[["B", "A", "E"]], + right[["Floats", "Integers", "Dates", "Numeric"]], + [ + ("A", "Integers", "<"), + ("E", "Dates", "=="), + ("B", "Floats", ">"), + ("B", "Numeric", "!="), + ], + ) + actual = df.index[actual] + assert_index_equal(expected, actual, check_names=False) + + @pytest.mark.turtle @settings(deadline=None, max_examples=10) @given(df=conditional_df(), right=conditional_right()) @@ -3585,20 +3695,20 @@ def test_multiple_non_eqi(df, right): ) actual = ( - df.conditional_join( - right, + df.rename(columns={"B": "b"}) + .conditional_join( + right.rename( + columns={ + "Floats": "floats", + } + ), ("A", "Integers", ">="), ("E", "Dates", ">"), - ("B", "Floats", ">"), + ("b", "floats", ">"), how="inner", sort_by_appearance=False, - df_columns={"B": "b", "A": "A", "E": "E"}, - right_columns={ - "Floats": "floats", - "Integers": "Integers", - "Dates": "Dates", - }, ) + .loc[:, ["b", "A", "E", "floats", "Integers", "Dates"]] .sort_values( ["b", "A", "E", "floats", "Integers", "Dates"], ignore_index=True ) @@ -3632,21 +3742,20 @@ def test_multiple_non_eqi_numba(df, right): ) actual = ( - df.conditional_join( - right, + df.rename(columns={"B": "b"}) + .conditional_join( + right.rename( + columns={ + "Floats": "floats", + } + ), ("A", "Integers", ">="), ("E", "Dates", ">"), - ("B", "Floats", ">"), + ("b", "floats", ">"), how="inner", - use_numba=True, sort_by_appearance=False, - df_columns={"B": "b", "A": "A", "E": "E"}, - right_columns={ - "Floats": "floats", - "Integers": "Integers", - "Dates": "Dates", - }, ) + .loc[:, ["b", "A", "E", "floats", "Integers", "Dates"]] .sort_values( ["b", "A", "E", "floats", "Integers", 
"Dates"], ignore_index=True ) diff --git a/tests/functions/test_explode_index.py b/tests/functions/test_explode_index.py new file mode 100644 index 000000000..48bb685dc --- /dev/null +++ b/tests/functions/test_explode_index.py @@ -0,0 +1,104 @@ +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + + +@pytest.fixture +def df_checks(): + """fixture dataframe""" + return pd.DataFrame( + { + "fam_id": [1, 1, 1, 2, 2, 2, 3, 3, 3], + } + ) + + +@pytest.fixture +def df_multi(): + """MultiIndex dataframe fixture.""" + return pd.DataFrame( + { + ("name", "a"): {0: "Wilbur", 1: "Petunia", 2: "Gregory"}, + ("names", "aa"): {0: 67, 1: 80, 2: 64}, + ("more_names", "aaa"): {0: 56, 1: 90, 2: 50}, + } + ) + + +def test_type_axis(df_checks): + """Raise TypeError if wrong type is provided for axis.""" + with pytest.raises(TypeError, match="axis should be one of.+"): + df_checks.explode_index(axis=1) + + +def test_axis_values(df_checks): + """Raise ValueError if wrong value is provided for axis.""" + msg = "axis should be either index or columns." + with pytest.raises(ValueError, match=msg): + df_checks.explode_index(axis="INDEX") + + +def test_names_sep_pattern(df_checks): + """Raise ValueError if both names_sep and names_pattern is provided.""" + msg = "Provide argument for either names_sep or names_pattern, not both." + with pytest.raises(ValueError, match=msg): + df_checks.explode_index( + axis="columns", names_sep="_", names_pattern=r"(.+)_(.+)" + ) + + +def test_names_sep_pattern_both_none(df_checks): + """Raise ValueError if neither names_sep nor names_pattern is provided.""" + msg = "Provide argument for either names_sep or names_pattern." 
+ with pytest.raises(ValueError, match=msg): + df_checks.explode_index( + axis="columns", names_sep=None, names_pattern=None + ) + + +def test_names_sep_typeerror(df_checks): + """Raise TypeError if names_sep is a wrong type.""" + with pytest.raises(TypeError, match="names_sep should be one of.+"): + df_checks.explode_index(axis="columns", names_sep=1) + + +def test_names_pattern_typeerror(df_checks): + """Raise TypeError if names_pattern is a wrong type.""" + with pytest.raises(TypeError, match="names_pattern should be one of.+"): + df_checks.explode_index(names_pattern=1) + + +def test_level_names_typeerror(df_checks): + """Raise TypeError if level_names is a wrong type.""" + with pytest.raises(TypeError, match="level_names should be one of.+"): + df_checks.explode_index(names_sep="_", level_names="new_level") + + +def test_multiindex(df_multi): + """Test output if df.columns is a multiindex""" + actual = df_multi.explode_index(names_sep="_") + assert_frame_equal(df_multi, actual) + + +def test_names_sep(df_checks): + """test output if names_sep""" + actual = df_checks.explode_index(names_sep="_", level_names=["a", "b"]) + expected = pd.DataFrame( + { + ("fam", "id"): [1, 1, 1, 2, 2, 2, 3, 3, 3], + } + ) + expected.columns.names = ["a", "b"] + assert_frame_equal(actual, expected) + + +def test_names_pattern(df_checks): + """test output if names_pattern""" + actual = df_checks.explode_index(names_pattern=r"(?P<a>.+)_(?P<b>.+)") + expected = pd.DataFrame( + { + ("fam", "id"): [1, 1, 1, 2, 2, 2, 3, 3, 3], + } + ) + expected.columns.names = ["a", "b"] + assert_frame_equal(actual, expected)