From 2244bcade548a3ae2b54e4cd8ea648f7320cb105 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Fri, 14 Jun 2024 21:49:10 +1000 Subject: [PATCH 1/3] create polars equivalent of pandas_flavor --- janitor/polars/__init__.py | 15 +- janitor/polars/clean_names.py | 162 ++++ janitor/polars/complete.py | 302 ++++++++ janitor/polars/dataframe.py | 732 ------------------ janitor/polars/expressions.py | 93 --- janitor/polars/lazyframe.py | 428 ---------- janitor/polars/pivot_longer.py | 237 ++++++ janitor/polars/polars_flavor.py | 111 +++ janitor/polars/row_to_names.py | 109 +++ .../functions/test_clean_names_polars.py | 16 +- .../polars/functions/test_complete_polars.py | 18 +- .../functions/test_pivot_longer_polars.py | 58 +- .../functions/test_row_to_names_polars.py | 28 +- 13 files changed, 985 insertions(+), 1324 deletions(-) delete mode 100644 janitor/polars/dataframe.py delete mode 100644 janitor/polars/expressions.py delete mode 100644 janitor/polars/lazyframe.py create mode 100644 janitor/polars/polars_flavor.py diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py index b16be7a7a..1485ad3f2 100644 --- a/janitor/polars/__init__.py +++ b/janitor/polars/__init__.py @@ -1,12 +1,13 @@ -from .dataframe import PolarsDataFrame -from .expressions import PolarsExpr -from .lazyframe import PolarsLazyFrame -from .pivot_longer import pivot_longer_spec +from .clean_names import clean_names, make_clean_names +from .complete import complete +from .pivot_longer import pivot_longer, pivot_longer_spec +from .row_to_names import row_to_names __all__ = [ "pivot_longer_spec", + "pivot_longer", "clean_names", - "PolarsDataFrame", - "PolarsLazyFrame", - "PolarsExpr", + "make_clean_names", + "row_to_names", + "complete", ] diff --git a/janitor/polars/clean_names.py b/janitor/polars/clean_names.py index 5cb28e5f9..7d80054a8 100644 --- a/janitor/polars/clean_names.py +++ b/janitor/polars/clean_names.py @@ -15,6 +15,12 @@ ) from janitor.utils import import_message +from 
.polars_flavor import ( + register_dataframe_method, + register_expr_method, + register_lazyframe_method, +) + try: import polars as pl except ImportError: @@ -26,6 +32,162 @@ ) +@register_lazyframe_method +@register_dataframe_method +def clean_names( + df, + strip_underscores: str | bool = None, + case_type: str = "lower", + remove_special: bool = False, + strip_accents: bool = False, + truncate_limit: int = None, +) -> pl.DataFrame: + """ + Clean the column names in a polars DataFrame. + + `clean_names` can also be applied to a LazyFrame. + + Examples: + >>> import polars as pl + >>> import janitor.polars + >>> df = pl.DataFrame( + ... { + ... "Aloha": range(3), + ... "Bell Chart": range(3), + ... "Animals@#$%^": range(3) + ... } + ... ) + >>> df + shape: (3, 3) + ┌───────┬────────────┬──────────────┐ + │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═══════╪════════════╪══════════════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 2 ┆ 2 │ + └───────┴────────────┴──────────────┘ + >>> df.janitor.clean_names(remove_special=True) + shape: (3, 3) + ┌───────┬────────────┬─────────┐ + │ aloha ┆ bell_chart ┆ animals │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═══════╪════════════╪═════════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 2 ┆ 2 │ + └───────┴────────────┴─────────┘ + + !!! info "New in version 0.28.0" + + Args: + strip_underscores: Removes the outer underscores from all + column names. Default None keeps outer underscores. Values can be + either 'left', 'right' or 'both' or the respective shorthand 'l', + 'r' and True. + case_type: Whether to make the column names lower or uppercase. + Current case may be preserved with 'preserve', + while snake case conversion (from CamelCase or camelCase only) + can be turned on using "snake". + Default 'lower' makes all characters lowercase. + remove_special: Remove special characters from the column names. + Only letters, numbers and underscores are preserved. 
+ strip_accents: Whether or not to remove accents from + the labels. + truncate_limit: Truncates formatted column names to + the specified length. Default None does not truncate. + + Returns: + A polars DataFrame. + """ # noqa: E501 + return df.rename( + lambda col: _clean_column_names( + obj=col, + strip_accents=strip_accents, + strip_underscores=strip_underscores, + case_type=case_type, + remove_special=remove_special, + truncate_limit=truncate_limit, + ) + ) + + +@register_expr_method +def make_clean_names( + expression, + strip_underscores: str | bool = None, + case_type: str = "lower", + remove_special: bool = False, + strip_accents: bool = False, + enforce_string: bool = False, + truncate_limit: int = None, +) -> pl.Expr: + """ + Clean the labels in a polars Expression. + + Examples: + >>> import polars as pl + >>> import janitor.polars + >>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]}) + >>> df + shape: (1, 1) + ┌─────────────┐ + │ raw │ + │ --- │ + │ str │ + ╞═════════════╡ + │ Abçdê fgí j │ + └─────────────┘ + + Clean the column values: + >>> df.with_columns(pl.col("raw").janitor.make_clean_names(strip_accents=True)) + shape: (1, 1) + ┌─────────────┐ + │ raw │ + │ --- │ + │ str │ + ╞═════════════╡ + │ abcde_fgi_j │ + └─────────────┘ + + !!! info "New in version 0.28.0" + + Args: + strip_underscores: Removes the outer underscores + from all labels in the expression. + Default None keeps outer underscores. + Values can be either 'left', 'right' + or 'both' or the respective shorthand 'l', + 'r' and True. + case_type: Whether to make the labels in the expression lower or uppercase. + Current case may be preserved with 'preserve', + while snake case conversion (from CamelCase or camelCase only) + can be turned on using "snake". + Default 'lower' makes all characters lowercase. + remove_special: Remove special characters from the values in the expression. + Only letters, numbers and underscores are preserved. 
+ strip_accents: Whether or not to remove accents from + the expression. + enforce_string: Whether or not to cast the expression to a string type. + truncate_limit: Truncates formatted labels in the expression to + the specified length. Default None does not truncate. + + Returns: + A polars Expression. + """ + return _clean_expr_names( + obj=expression, + strip_accents=strip_accents, + strip_underscores=strip_underscores, + case_type=case_type, + remove_special=remove_special, + enforce_string=enforce_string, + truncate_limit=truncate_limit, + ) + + def _change_case_expr( obj: pl.Expr, case_type: str, diff --git a/janitor/polars/complete.py b/janitor/polars/complete.py index 793c958e7..ea02473ad 100644 --- a/janitor/polars/complete.py +++ b/janitor/polars/complete.py @@ -6,6 +6,8 @@ from janitor.utils import check, import_message +from .polars_flavor import register_dataframe_method, register_lazyframe_method + try: import polars as pl import polars.selectors as cs @@ -19,6 +21,306 @@ ) +@register_lazyframe_method +@register_dataframe_method +def complete( + df, + *columns: ColumnNameOrSelector, + fill_value: dict | Any | pl.Expr = None, + explicit: bool = True, + sort: bool = False, + by: ColumnNameOrSelector = None, +) -> pl.DataFrame: + """ + Turns implicit missing values into explicit missing values + + It is modeled after tidyr's `complete` function. + In a way, it is the inverse of `pl.drop_nulls`, + as it exposes implicitly missing rows. + + If the combination involves multiple columns, pass it as a struct, + with an alias - the name of the struct should not exist in the DataFrame. + + If new values need to be introduced, a polars Expression + with the new values can be passed, as long as the polars Expression + has a name that already exists in the DataFrame. + + It is up to the user to ensure that the polars expression returns + unique values and/or sorted values. 
+ + Note that if the polars expression evaluates to a struct, + then the fields, not the name, should already exist in the DataFrame. + + `complete` can also be applied to a LazyFrame. + + Examples: + >>> import polars as pl + >>> import janitor.polars + >>> df = pl.DataFrame( + ... dict( + ... group=(1, 2, 1, 2), + ... item_id=(1, 2, 2, 3), + ... item_name=("a", "a", "b", "b"), + ... value1=(1, None, 3, 4), + ... value2=range(4, 8), + ... ) + ... ) + >>> df + shape: (4, 5) + ┌───────┬─────────┬───────────┬────────┬────────┐ + │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 │ + ╞═══════╪═════════╪═══════════╪════════╪════════╡ + │ 1 ┆ 1 ┆ a ┆ 1 ┆ 4 │ + │ 2 ┆ 2 ┆ a ┆ null ┆ 5 │ + │ 1 ┆ 2 ┆ b ┆ 3 ┆ 6 │ + │ 2 ┆ 3 ┆ b ┆ 4 ┆ 7 │ + └───────┴─────────┴───────────┴────────┴────────┘ + + Generate all possible combinations of + `group`, `item_id`, and `item_name` + (whether or not they appear in the data) + >>> with pl.Config(tbl_rows=-1): + ... df.complete("group", "item_id", "item_name", sort=True) + shape: (12, 5) + ┌───────┬─────────┬───────────┬────────┬────────┐ + │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 │ + ╞═══════╪═════════╪═══════════╪════════╪════════╡ + │ 1 ┆ 1 ┆ a ┆ 1 ┆ 4 │ + │ 1 ┆ 1 ┆ b ┆ null ┆ null │ + │ 1 ┆ 2 ┆ a ┆ null ┆ null │ + │ 1 ┆ 2 ┆ b ┆ 3 ┆ 6 │ + │ 1 ┆ 3 ┆ a ┆ null ┆ null │ + │ 1 ┆ 3 ┆ b ┆ null ┆ null │ + │ 2 ┆ 1 ┆ a ┆ null ┆ null │ + │ 2 ┆ 1 ┆ b ┆ null ┆ null │ + │ 2 ┆ 2 ┆ a ┆ null ┆ 5 │ + │ 2 ┆ 2 ┆ b ┆ null ┆ null │ + │ 2 ┆ 3 ┆ a ┆ null ┆ null │ + │ 2 ┆ 3 ┆ b ┆ 4 ┆ 7 │ + └───────┴─────────┴───────────┴────────┴────────┘ + + Cross all possible `group` values with the unique pairs of + `(item_id, item_name)` that already exist in the data. + For such situations, where there is a group of columns, + pass it in as a struct: + >>> with pl.Config(tbl_rows=-1): + ... df.complete( + ... "group", + ... 
pl.struct("item_id", "item_name").unique().sort().alias("rar"), + ... sort=True + ... ) + shape: (8, 5) + ┌───────┬─────────┬───────────┬────────┬────────┐ + │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 │ + ╞═══════╪═════════╪═══════════╪════════╪════════╡ + │ 1 ┆ 1 ┆ a ┆ 1 ┆ 4 │ + │ 1 ┆ 2 ┆ a ┆ null ┆ null │ + │ 1 ┆ 2 ┆ b ┆ 3 ┆ 6 │ + │ 1 ┆ 3 ┆ b ┆ null ┆ null │ + │ 2 ┆ 1 ┆ a ┆ null ┆ null │ + │ 2 ┆ 2 ┆ a ┆ null ┆ 5 │ + │ 2 ┆ 2 ┆ b ┆ null ┆ null │ + │ 2 ┆ 3 ┆ b ┆ 4 ┆ 7 │ + └───────┴─────────┴───────────┴────────┴────────┘ + + Fill in nulls: + >>> with pl.Config(tbl_rows=-1): + ... df.complete( + ... "group", + ... pl.struct("item_id", "item_name").unique().sort().alias('rar'), + ... fill_value={"value1": 0, "value2": 99}, + ... explicit=True, + ... sort=True, + ... ) + shape: (8, 5) + ┌───────┬─────────┬───────────┬────────┬────────┐ + │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 │ + ╞═══════╪═════════╪═══════════╪════════╪════════╡ + │ 1 ┆ 1 ┆ a ┆ 1 ┆ 4 │ + │ 1 ┆ 2 ┆ a ┆ 0 ┆ 99 │ + │ 1 ┆ 2 ┆ b ┆ 3 ┆ 6 │ + │ 1 ┆ 3 ┆ b ┆ 0 ┆ 99 │ + │ 2 ┆ 1 ┆ a ┆ 0 ┆ 99 │ + │ 2 ┆ 2 ┆ a ┆ 0 ┆ 5 │ + │ 2 ┆ 2 ┆ b ┆ 0 ┆ 99 │ + │ 2 ┆ 3 ┆ b ┆ 4 ┆ 7 │ + └───────┴─────────┴───────────┴────────┴────────┘ + + Limit the fill to only the newly created + missing values with `explicit = FALSE` + >>> with pl.Config(tbl_rows=-1): + ... df.complete( + ... "group", + ... pl.struct("item_id", "item_name").unique().sort().alias('rar'), + ... fill_value={"value1": 0, "value2": 99}, + ... explicit=False, + ... sort=True, + ... 
) + shape: (8, 5) + ┌───────┬─────────┬───────────┬────────┬────────┐ + │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 │ + ╞═══════╪═════════╪═══════════╪════════╪════════╡ + │ 1 ┆ 1 ┆ a ┆ 1 ┆ 4 │ + │ 1 ┆ 2 ┆ a ┆ 0 ┆ 99 │ + │ 1 ┆ 2 ┆ b ┆ 3 ┆ 6 │ + │ 1 ┆ 3 ┆ b ┆ 0 ┆ 99 │ + │ 2 ┆ 1 ┆ a ┆ 0 ┆ 99 │ + │ 2 ┆ 2 ┆ a ┆ null ┆ 5 │ + │ 2 ┆ 2 ┆ b ┆ 0 ┆ 99 │ + │ 2 ┆ 3 ┆ b ┆ 4 ┆ 7 │ + └───────┴─────────┴───────────┴────────┴────────┘ + + >>> df = pl.DataFrame( + ... { + ... "Year": [1999, 2000, 2004, 1999, 2004], + ... "Taxon": [ + ... "Saccharina", + ... "Saccharina", + ... "Saccharina", + ... "Agarum", + ... "Agarum", + ... ], + ... "Abundance": [4, 5, 2, 1, 8], + ... } + ... ) + >>> df + shape: (5, 3) + ┌──────┬────────────┬───────────┐ + │ Year ┆ Taxon ┆ Abundance │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞══════╪════════════╪═══════════╡ + │ 1999 ┆ Saccharina ┆ 4 │ + │ 2000 ┆ Saccharina ┆ 5 │ + │ 2004 ┆ Saccharina ┆ 2 │ + │ 1999 ┆ Agarum ┆ 1 │ + │ 2004 ┆ Agarum ┆ 8 │ + └──────┴────────────┴───────────┘ + + Expose missing years from 1999 to 2004 - + pass a polars expression with the new dates, + and ensure the expression's name already exists + in the DataFrame: + >>> expression = pl.int_range(1999,2005).alias('Year') + >>> with pl.Config(tbl_rows=-1): + ... df.complete(expression,'Taxon',sort=True) + shape: (12, 3) + ┌──────┬────────────┬───────────┐ + │ Year ┆ Taxon ┆ Abundance │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞══════╪════════════╪═══════════╡ + │ 1999 ┆ Agarum ┆ 1 │ + │ 1999 ┆ Saccharina ┆ 4 │ + │ 2000 ┆ Agarum ┆ null │ + │ 2000 ┆ Saccharina ┆ 5 │ + │ 2001 ┆ Agarum ┆ null │ + │ 2001 ┆ Saccharina ┆ null │ + │ 2002 ┆ Agarum ┆ null │ + │ 2002 ┆ Saccharina ┆ null │ + │ 2003 ┆ Agarum ┆ null │ + │ 2003 ┆ Saccharina ┆ null │ + │ 2004 ┆ Agarum ┆ 8 │ + │ 2004 ┆ Saccharina ┆ 2 │ + └──────┴────────────┴───────────┘ + + Expose missing rows per group: + >>> df = pl.DataFrame( + ... { + ... 
"state": ["CA", "CA", "HI", "HI", "HI", "NY", "NY"], + ... "year": [2010, 2013, 2010, 2012, 2016, 2009, 2013], + ... "value": [1, 3, 1, 2, 3, 2, 5], + ... } + ... ) + >>> df + shape: (7, 3) + ┌───────┬──────┬───────┐ + │ state ┆ year ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════╪══════╪═══════╡ + │ CA ┆ 2010 ┆ 1 │ + │ CA ┆ 2013 ┆ 3 │ + │ HI ┆ 2010 ┆ 1 │ + │ HI ┆ 2012 ┆ 2 │ + │ HI ┆ 2016 ┆ 3 │ + │ NY ┆ 2009 ┆ 2 │ + │ NY ┆ 2013 ┆ 5 │ + └───────┴──────┴───────┘ + >>> low = pl.col('year').min() + >>> high = pl.col('year').max().add(1) + >>> new_year_values=pl.int_range(low,high).alias('year') + >>> with pl.Config(tbl_rows=-1): + ... df.complete(new_year_values,by='state',sort=True) + shape: (16, 3) + ┌───────┬──────┬───────┐ + │ state ┆ year ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═══════╪══════╪═══════╡ + │ CA ┆ 2010 ┆ 1 │ + │ CA ┆ 2011 ┆ null │ + │ CA ┆ 2012 ┆ null │ + │ CA ┆ 2013 ┆ 3 │ + │ HI ┆ 2010 ┆ 1 │ + │ HI ┆ 2011 ┆ null │ + │ HI ┆ 2012 ┆ 2 │ + │ HI ┆ 2013 ┆ null │ + │ HI ┆ 2014 ┆ null │ + │ HI ┆ 2015 ┆ null │ + │ HI ┆ 2016 ┆ 3 │ + │ NY ┆ 2009 ┆ 2 │ + │ NY ┆ 2010 ┆ null │ + │ NY ┆ 2011 ┆ null │ + │ NY ┆ 2012 ┆ null │ + │ NY ┆ 2013 ┆ 5 │ + └───────┴──────┴───────┘ + + + !!! info "New in version 0.28.0" + + Args: + *columns: This refers to the columns to be completed. + It can be a string or a column selector or a polars expression. + A polars expression can be used to introduced new values, + as long as the polars expression has a name that already exists + in the DataFrame. + It is up to the user to ensure that the polars expression returns + unique values. + fill_value: Scalar value or polars expression to use instead of nulls + for missing combinations. A dictionary, mapping columns names + to a scalar value is also accepted. + explicit: Determines if only implicitly missing values + should be filled (`False`), or all nulls existing in the LazyFrame + (`True`). `explicit` is applicable only + if `fill_value` is not `None`. 
+ sort: Sort the DataFrame based on *columns. + by: Column(s) to group by. + The explicit missing rows are returned per group. + + Returns: + A polars DataFrame. + """ # noqa: E501 + return _complete( + df=df, + columns=columns, + fill_value=fill_value, + explicit=explicit, + sort=sort, + by=by, + ) + + def _complete( df: pl.DataFrame | pl.LazyFrame, columns: tuple[ColumnNameOrSelector], diff --git a/janitor/polars/dataframe.py b/janitor/polars/dataframe.py deleted file mode 100644 index 31a55e468..000000000 --- a/janitor/polars/dataframe.py +++ /dev/null @@ -1,732 +0,0 @@ -from __future__ import annotations - -from typing import Any - -from polars.type_aliases import ColumnNameOrSelector - -from janitor.utils import import_message - -from .clean_names import _clean_column_names -from .complete import _complete -from .pivot_longer import _pivot_longer -from .row_to_names import _row_to_names - -try: - import polars as pl -except ImportError: - import_message( - submodule="polars", - package="polars", - conda_channel="conda-forge", - pip_install=True, - ) - - -@pl.api.register_dataframe_namespace("janitor") -class PolarsDataFrame: - def __init__(self, df: pl.DataFrame) -> pl.DataFrame: - self._df = df - - def clean_names( - self, - strip_underscores: str | bool = None, - case_type: str = "lower", - remove_special: bool = False, - strip_accents: bool = False, - truncate_limit: int = None, - ) -> pl.DataFrame: - """ - Clean the column names in a polars DataFrame. - - Examples: - >>> import polars as pl - >>> import janitor.polars - >>> df = pl.DataFrame( - ... { - ... "Aloha": range(3), - ... "Bell Chart": range(3), - ... "Animals@#$%^": range(3) - ... } - ... 
) - >>> df - shape: (3, 3) - ┌───────┬────────────┬──────────────┐ - │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═══════╪════════════╪══════════════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 2 ┆ 2 │ - └───────┴────────────┴──────────────┘ - >>> df.janitor.clean_names(remove_special=True) - shape: (3, 3) - ┌───────┬────────────┬─────────┐ - │ aloha ┆ bell_chart ┆ animals │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═══════╪════════════╪═════════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 2 ┆ 2 │ - └───────┴────────────┴─────────┘ - - !!! info "New in version 0.28.0" - - Args: - strip_underscores: Removes the outer underscores from all - column names. Default None keeps outer underscores. Values can be - either 'left', 'right' or 'both' or the respective shorthand 'l', - 'r' and True. - case_type: Whether to make the column names lower or uppercase. - Current case may be preserved with 'preserve', - while snake case conversion (from CamelCase or camelCase only) - can be turned on using "snake". - Default 'lower' makes all characters lowercase. - remove_special: Remove special characters from the column names. - Only letters, numbers and underscores are preserved. - strip_accents: Whether or not to remove accents from - the labels. - truncate_limit: Truncates formatted column names to - the specified length. Default None does not truncate. - - Returns: - A polars DataFrame. 
- """ # noqa: E501 - return self._df.rename( - lambda col: _clean_column_names( - obj=col, - strip_accents=strip_accents, - strip_underscores=strip_underscores, - case_type=case_type, - remove_special=remove_special, - truncate_limit=truncate_limit, - ) - ) - - def pivot_longer( - self, - index: ColumnNameOrSelector = None, - column_names: ColumnNameOrSelector = None, - names_to: list | tuple | str = "variable", - values_to: str = "value", - names_sep: str = None, - names_pattern: str = None, - names_transform: pl.Expr = None, - ) -> pl.DataFrame: - """ - Unpivots a DataFrame from *wide* to *long* format. - - It is modeled after the `pivot_longer` function in R's tidyr package, - and also takes inspiration from the `melt` function in R's data.table package. - - This function is useful to massage a DataFrame into a format where - one or more columns are considered measured variables, and all other - columns are considered as identifier variables. - - All measured variables are *unpivoted* (and typically duplicated) along the - row axis. - - For more granular control on the unpivoting, have a look at - `pivot_longer_spec`. - - Examples: - >>> import polars as pl - >>> import polars.selectors as cs - >>> import janitor.polars - >>> df = pl.DataFrame( - ... { - ... "Sepal.Length": [5.1, 5.9], - ... "Sepal.Width": [3.5, 3.0], - ... "Petal.Length": [1.4, 5.1], - ... "Petal.Width": [0.2, 1.8], - ... "Species": ["setosa", "virginica"], - ... } - ... 
) - >>> df - shape: (2, 5) - ┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐ - │ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │ - ╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡ - │ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa │ - │ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica │ - └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘ - - Replicate polars' [melt](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.melt.html#polars-dataframe-melt): - >>> df.janitor.pivot_longer(index = 'Species') - shape: (8, 3) - ┌───────────┬──────────────┬───────┐ - │ Species ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ f64 │ - ╞═══════════╪══════════════╪═══════╡ - │ setosa ┆ Sepal.Length ┆ 5.1 │ - │ virginica ┆ Sepal.Length ┆ 5.9 │ - │ setosa ┆ Sepal.Width ┆ 3.5 │ - │ virginica ┆ Sepal.Width ┆ 3.0 │ - │ setosa ┆ Petal.Length ┆ 1.4 │ - │ virginica ┆ Petal.Length ┆ 5.1 │ - │ setosa ┆ Petal.Width ┆ 0.2 │ - │ virginica ┆ Petal.Width ┆ 1.8 │ - └───────────┴──────────────┴───────┘ - - Split the column labels into individual columns: - >>> df.janitor.pivot_longer( - ... index = 'Species', - ... names_to = ('part', 'dimension'), - ... names_sep = '.', - ... ).select('Species','part','dimension','value') - shape: (8, 4) - ┌───────────┬───────┬───────────┬───────┐ - │ Species ┆ part ┆ dimension ┆ value │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ str ┆ f64 │ - ╞═══════════╪═══════╪═══════════╪═══════╡ - │ setosa ┆ Sepal ┆ Length ┆ 5.1 │ - │ virginica ┆ Sepal ┆ Length ┆ 5.9 │ - │ setosa ┆ Sepal ┆ Width ┆ 3.5 │ - │ virginica ┆ Sepal ┆ Width ┆ 3.0 │ - │ setosa ┆ Petal ┆ Length ┆ 1.4 │ - │ virginica ┆ Petal ┆ Length ┆ 5.1 │ - │ setosa ┆ Petal ┆ Width ┆ 0.2 │ - │ virginica ┆ Petal ┆ Width ┆ 1.8 │ - └───────────┴───────┴───────────┴───────┘ - - Retain parts of the column names as headers: - >>> df.janitor.pivot_longer( - ... 
index = 'Species', - ... names_to = ('part', '.value'), - ... names_sep = '.', - ... ).select('Species','part','Length','Width') - shape: (4, 4) - ┌───────────┬───────┬────────┬───────┐ - │ Species ┆ part ┆ Length ┆ Width │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ f64 ┆ f64 │ - ╞═══════════╪═══════╪════════╪═══════╡ - │ setosa ┆ Sepal ┆ 5.1 ┆ 3.5 │ - │ virginica ┆ Sepal ┆ 5.9 ┆ 3.0 │ - │ setosa ┆ Petal ┆ 1.4 ┆ 0.2 │ - │ virginica ┆ Petal ┆ 5.1 ┆ 1.8 │ - └───────────┴───────┴────────┴───────┘ - - Split the column labels based on regex: - >>> df = pl.DataFrame({"id": [1], "new_sp_m5564": [2], "newrel_f65": [3]}) - >>> df - shape: (1, 3) - ┌─────┬──────────────┬────────────┐ - │ id ┆ new_sp_m5564 ┆ newrel_f65 │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪══════════════╪════════════╡ - │ 1 ┆ 2 ┆ 3 │ - └─────┴──────────────┴────────────┘ - >>> df.janitor.pivot_longer( - ... index = 'id', - ... names_to = ('diagnosis', 'gender', 'age'), - ... names_pattern = r"new_?(.+)_(.)([0-9]+)", - ... ).select('id','diagnosis','gender','age','value').sort(by=pl.all()) - shape: (2, 5) - ┌─────┬───────────┬────────┬──────┬───────┐ - │ id ┆ diagnosis ┆ gender ┆ age ┆ value │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str ┆ str ┆ i64 │ - ╞═════╪═══════════╪════════╪══════╪═══════╡ - │ 1 ┆ rel ┆ f ┆ 65 ┆ 3 │ - │ 1 ┆ sp ┆ m ┆ 5564 ┆ 2 │ - └─────┴───────────┴────────┴──────┴───────┘ - - Convert the dtypes of specific columns with `names_transform`: - >>> df.janitor.pivot_longer( - ... index = "id", - ... names_pattern=r"new_?(.+)_(.)([0-9]+)", - ... names_to=("diagnosis", "gender", "age"), - ... names_transform=pl.col('age').cast(pl.Int32), - ... 
).select("id", "diagnosis", "gender", "age", "value").sort(by=pl.all()) - shape: (2, 5) - ┌─────┬───────────┬────────┬──────┬───────┐ - │ id ┆ diagnosis ┆ gender ┆ age ┆ value │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ str ┆ i32 ┆ i64 │ - ╞═════╪═══════════╪════════╪══════╪═══════╡ - │ 1 ┆ rel ┆ f ┆ 65 ┆ 3 │ - │ 1 ┆ sp ┆ m ┆ 5564 ┆ 2 │ - └─────┴───────────┴────────┴──────┴───────┘ - - Use multiple `.value` to reshape the dataframe: - >>> df = pl.DataFrame( - ... [ - ... { - ... "x_1_mean": 10, - ... "x_2_mean": 20, - ... "y_1_mean": 30, - ... "y_2_mean": 40, - ... "unit": 50, - ... } - ... ] - ... ) - >>> df - shape: (1, 5) - ┌──────────┬──────────┬──────────┬──────────┬──────┐ - │ x_1_mean ┆ x_2_mean ┆ y_1_mean ┆ y_2_mean ┆ unit │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - ╞══════════╪══════════╪══════════╪══════════╪══════╡ - │ 10 ┆ 20 ┆ 30 ┆ 40 ┆ 50 │ - └──────────┴──────────┴──────────┴──────────┴──────┘ - >>> df.janitor.pivot_longer( - ... index="unit", - ... names_to=(".value", "time", ".value"), - ... names_pattern=r"(x|y)_([0-9])(_mean)", - ... ).select('unit','time','x_mean','y_mean').sort(by=pl.all()) - shape: (2, 4) - ┌──────┬──────┬────────┬────────┐ - │ unit ┆ time ┆ x_mean ┆ y_mean │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 ┆ i64 │ - ╞══════╪══════╪════════╪════════╡ - │ 50 ┆ 1 ┆ 10 ┆ 30 │ - │ 50 ┆ 2 ┆ 20 ┆ 40 │ - └──────┴──────┴────────┴────────┘ - - !!! info "New in version 0.28.0" - - Args: - index: Column(s) or selector(s) to use as identifier variables. - column_names: Column(s) or selector(s) to unpivot. - names_to: Name of new column as a string that will contain - what were previously the column names in `column_names`. - The default is `variable` if no value is provided. It can - also be a list/tuple of strings that will serve as new column - names, if `name_sep` or `names_pattern` is provided. 
- If `.value` is in `names_to`, new column names will be extracted - from part of the existing column names and overrides `values_to`. - values_to: Name of new column as a string that will contain what - were previously the values of the columns in `column_names`. - names_sep: Determines how the column name is broken up, if - `names_to` contains multiple values. It takes the same - specification as polars' `str.split` method. - names_pattern: Determines how the column name is broken up. - It can be a regular expression containing matching groups. - It takes the same - specification as polars' `str.extract_groups` method. - names_transform: Use this option to change the types of columns that - have been transformed to rows. - This does not applies to the values' columns. - Accepts a polars expression or a list of polars expressions. - Applicable only if one of names_sep - or names_pattern is provided. - - Returns: - A polars DataFrame that has been unpivoted from wide to long - format. - """ # noqa: E501 - return _pivot_longer( - df=self._df, - index=index, - column_names=column_names, - names_pattern=names_pattern, - names_sep=names_sep, - names_to=names_to, - values_to=values_to, - names_transform=names_transform, - ) - - def row_to_names( - self, - row_numbers: int | list = 0, - remove_rows: bool = False, - remove_rows_above: bool = False, - separator: str = "_", - ) -> pl.DataFrame: - """ - Elevates a row, or rows, to be the column names of a DataFrame. - - Examples: - Replace column names with the first row. - - >>> import polars as pl - >>> import janitor.polars - >>> df = pl.DataFrame({ - ... "a": ["nums", '6', '9'], - ... "b": ["chars", "x", "y"], - ... 
}) - >>> df - shape: (3, 2) - ┌──────┬───────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞══════╪═══════╡ - │ nums ┆ chars │ - │ 6 ┆ x │ - │ 9 ┆ y │ - └──────┴───────┘ - >>> df.janitor.row_to_names(0, remove_rows=True) - shape: (2, 2) - ┌──────┬───────┐ - │ nums ┆ chars │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞══════╪═══════╡ - │ 6 ┆ x │ - │ 9 ┆ y │ - └──────┴───────┘ - >>> df.janitor.row_to_names(row_numbers=[0,1], remove_rows=True) - shape: (1, 2) - ┌────────┬─────────┐ - │ nums_6 ┆ chars_x │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞════════╪═════════╡ - │ 9 ┆ y │ - └────────┴─────────┘ - - Remove rows above the elevated row and the elevated row itself. - - >>> df = pl.DataFrame({ - ... "a": ["bla1", "nums", '6', '9'], - ... "b": ["bla2", "chars", "x", "y"], - ... }) - >>> df - shape: (4, 2) - ┌──────┬───────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞══════╪═══════╡ - │ bla1 ┆ bla2 │ - │ nums ┆ chars │ - │ 6 ┆ x │ - │ 9 ┆ y │ - └──────┴───────┘ - >>> df.janitor.row_to_names(1, remove_rows=True, remove_rows_above=True) - shape: (2, 2) - ┌──────┬───────┐ - │ nums ┆ chars │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞══════╪═══════╡ - │ 6 ┆ x │ - │ 9 ┆ y │ - └──────┴───────┘ - - !!! info "New in version 0.28.0" - - Args: - row_numbers: Position of the row(s) containing the variable names. - Note that indexing starts from 0. It can also be a list. - Defaults to 0 (first row). - remove_rows: Whether the row(s) should be removed from the DataFrame. - remove_rows_above: Whether the row(s) above the selected row should - be removed from the DataFrame. - separator: Combines the labels into a single string, - if row_numbers is a list of integers. Default is '_'. - - Returns: - A polars DataFrame. 
- """ # noqa: E501 - return _row_to_names( - self._df, - row_numbers=row_numbers, - remove_rows=remove_rows, - remove_rows_above=remove_rows_above, - separator=separator, - ) - - def complete( - self, - *columns: ColumnNameOrSelector, - fill_value: dict | Any | pl.Expr = None, - explicit: bool = True, - sort: bool = False, - by: ColumnNameOrSelector = None, - ) -> pl.DataFrame: - """ - Turns implicit missing values into explicit missing values - - It is modeled after tidyr's `complete` function. - In a way, it is the inverse of `pl.drop_nulls`, - as it exposes implicitly missing rows. - - If the combination involves multiple columns, pass it as a struct, - with an alias - the name of the struct should not exist in the DataFrame. - - If new values need to be introduced, a polars Expression - with the new values can be passed, as long as the polars Expression - has a name that already exists in the DataFrame. - - It is up to the user to ensure that the polars expression returns - unique values and/or sorted values. - - Note that if the polars expression evaluates to a struct, - then the fields, not the name, should already exist in the DataFrame. - - Examples: - >>> import polars as pl - >>> import janitor.polars - >>> df = pl.DataFrame( - ... dict( - ... group=(1, 2, 1, 2), - ... item_id=(1, 2, 2, 3), - ... item_name=("a", "a", "b", "b"), - ... value1=(1, None, 3, 4), - ... value2=range(4, 8), - ... ) - ... ) - >>> df - shape: (4, 5) - ┌───────┬─────────┬───────────┬────────┬────────┐ - │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 │ - ╞═══════╪═════════╪═══════════╪════════╪════════╡ - │ 1 ┆ 1 ┆ a ┆ 1 ┆ 4 │ - │ 2 ┆ 2 ┆ a ┆ null ┆ 5 │ - │ 1 ┆ 2 ┆ b ┆ 3 ┆ 6 │ - │ 2 ┆ 3 ┆ b ┆ 4 ┆ 7 │ - └───────┴─────────┴───────────┴────────┴────────┘ - - Generate all possible combinations of - `group`, `item_id`, and `item_name` - (whether or not they appear in the data) - >>> with pl.Config(tbl_rows=-1): - ... 
df.janitor.complete("group", "item_id", "item_name", sort=True) - shape: (12, 5) - ┌───────┬─────────┬───────────┬────────┬────────┐ - │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 │ - ╞═══════╪═════════╪═══════════╪════════╪════════╡ - │ 1 ┆ 1 ┆ a ┆ 1 ┆ 4 │ - │ 1 ┆ 1 ┆ b ┆ null ┆ null │ - │ 1 ┆ 2 ┆ a ┆ null ┆ null │ - │ 1 ┆ 2 ┆ b ┆ 3 ┆ 6 │ - │ 1 ┆ 3 ┆ a ┆ null ┆ null │ - │ 1 ┆ 3 ┆ b ┆ null ┆ null │ - │ 2 ┆ 1 ┆ a ┆ null ┆ null │ - │ 2 ┆ 1 ┆ b ┆ null ┆ null │ - │ 2 ┆ 2 ┆ a ┆ null ┆ 5 │ - │ 2 ┆ 2 ┆ b ┆ null ┆ null │ - │ 2 ┆ 3 ┆ a ┆ null ┆ null │ - │ 2 ┆ 3 ┆ b ┆ 4 ┆ 7 │ - └───────┴─────────┴───────────┴────────┴────────┘ - - Cross all possible `group` values with the unique pairs of - `(item_id, item_name)` that already exist in the data. - For such situations, where there is a group of columns, - pass it in as a struct: - >>> with pl.Config(tbl_rows=-1): - ... df.janitor.complete( - ... "group", - ... pl.struct("item_id", "item_name").unique().sort().alias("rar"), - ... sort=True - ... ) - shape: (8, 5) - ┌───────┬─────────┬───────────┬────────┬────────┐ - │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 │ - ╞═══════╪═════════╪═══════════╪════════╪════════╡ - │ 1 ┆ 1 ┆ a ┆ 1 ┆ 4 │ - │ 1 ┆ 2 ┆ a ┆ null ┆ null │ - │ 1 ┆ 2 ┆ b ┆ 3 ┆ 6 │ - │ 1 ┆ 3 ┆ b ┆ null ┆ null │ - │ 2 ┆ 1 ┆ a ┆ null ┆ null │ - │ 2 ┆ 2 ┆ a ┆ null ┆ 5 │ - │ 2 ┆ 2 ┆ b ┆ null ┆ null │ - │ 2 ┆ 3 ┆ b ┆ 4 ┆ 7 │ - └───────┴─────────┴───────────┴────────┴────────┘ - - Fill in nulls: - >>> with pl.Config(tbl_rows=-1): - ... df.janitor.complete( - ... "group", - ... pl.struct("item_id", "item_name").unique().sort().alias('rar'), - ... fill_value={"value1": 0, "value2": 99}, - ... explicit=True, - ... sort=True, - ... 
) - shape: (8, 5) - ┌───────┬─────────┬───────────┬────────┬────────┐ - │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 │ - ╞═══════╪═════════╪═══════════╪════════╪════════╡ - │ 1 ┆ 1 ┆ a ┆ 1 ┆ 4 │ - │ 1 ┆ 2 ┆ a ┆ 0 ┆ 99 │ - │ 1 ┆ 2 ┆ b ┆ 3 ┆ 6 │ - │ 1 ┆ 3 ┆ b ┆ 0 ┆ 99 │ - │ 2 ┆ 1 ┆ a ┆ 0 ┆ 99 │ - │ 2 ┆ 2 ┆ a ┆ 0 ┆ 5 │ - │ 2 ┆ 2 ┆ b ┆ 0 ┆ 99 │ - │ 2 ┆ 3 ┆ b ┆ 4 ┆ 7 │ - └───────┴─────────┴───────────┴────────┴────────┘ - - Limit the fill to only the newly created - missing values with `explicit = FALSE` - >>> with pl.Config(tbl_rows=-1): - ... df.janitor.complete( - ... "group", - ... pl.struct("item_id", "item_name").unique().sort().alias('rar'), - ... fill_value={"value1": 0, "value2": 99}, - ... explicit=False, - ... sort=True, - ... ) - shape: (8, 5) - ┌───────┬─────────┬───────────┬────────┬────────┐ - │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 │ - ╞═══════╪═════════╪═══════════╪════════╪════════╡ - │ 1 ┆ 1 ┆ a ┆ 1 ┆ 4 │ - │ 1 ┆ 2 ┆ a ┆ 0 ┆ 99 │ - │ 1 ┆ 2 ┆ b ┆ 3 ┆ 6 │ - │ 1 ┆ 3 ┆ b ┆ 0 ┆ 99 │ - │ 2 ┆ 1 ┆ a ┆ 0 ┆ 99 │ - │ 2 ┆ 2 ┆ a ┆ null ┆ 5 │ - │ 2 ┆ 2 ┆ b ┆ 0 ┆ 99 │ - │ 2 ┆ 3 ┆ b ┆ 4 ┆ 7 │ - └───────┴─────────┴───────────┴────────┴────────┘ - - >>> df = pl.DataFrame( - ... { - ... "Year": [1999, 2000, 2004, 1999, 2004], - ... "Taxon": [ - ... "Saccharina", - ... "Saccharina", - ... "Saccharina", - ... "Agarum", - ... "Agarum", - ... ], - ... "Abundance": [4, 5, 2, 1, 8], - ... } - ... 
) - >>> df - shape: (5, 3) - ┌──────┬────────────┬───────────┐ - │ Year ┆ Taxon ┆ Abundance │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 │ - ╞══════╪════════════╪═══════════╡ - │ 1999 ┆ Saccharina ┆ 4 │ - │ 2000 ┆ Saccharina ┆ 5 │ - │ 2004 ┆ Saccharina ┆ 2 │ - │ 1999 ┆ Agarum ┆ 1 │ - │ 2004 ┆ Agarum ┆ 8 │ - └──────┴────────────┴───────────┘ - - Expose missing years from 1999 to 2004 - - pass a polars expression with the new dates, - and ensure the expression's name already exists - in the DataFrame: - >>> expression = pl.int_range(1999,2005).alias('Year') - >>> with pl.Config(tbl_rows=-1): - ... df.janitor.complete(expression,'Taxon',sort=True) - shape: (12, 3) - ┌──────┬────────────┬───────────┐ - │ Year ┆ Taxon ┆ Abundance │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ str ┆ i64 │ - ╞══════╪════════════╪═══════════╡ - │ 1999 ┆ Agarum ┆ 1 │ - │ 1999 ┆ Saccharina ┆ 4 │ - │ 2000 ┆ Agarum ┆ null │ - │ 2000 ┆ Saccharina ┆ 5 │ - │ 2001 ┆ Agarum ┆ null │ - │ 2001 ┆ Saccharina ┆ null │ - │ 2002 ┆ Agarum ┆ null │ - │ 2002 ┆ Saccharina ┆ null │ - │ 2003 ┆ Agarum ┆ null │ - │ 2003 ┆ Saccharina ┆ null │ - │ 2004 ┆ Agarum ┆ 8 │ - │ 2004 ┆ Saccharina ┆ 2 │ - └──────┴────────────┴───────────┘ - - Expose missing rows per group: - >>> df = pl.DataFrame( - ... { - ... "state": ["CA", "CA", "HI", "HI", "HI", "NY", "NY"], - ... "year": [2010, 2013, 2010, 2012, 2016, 2009, 2013], - ... "value": [1, 3, 1, 2, 3, 2, 5], - ... } - ... ) - >>> df - shape: (7, 3) - ┌───────┬──────┬───────┐ - │ state ┆ year ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════╪══════╪═══════╡ - │ CA ┆ 2010 ┆ 1 │ - │ CA ┆ 2013 ┆ 3 │ - │ HI ┆ 2010 ┆ 1 │ - │ HI ┆ 2012 ┆ 2 │ - │ HI ┆ 2016 ┆ 3 │ - │ NY ┆ 2009 ┆ 2 │ - │ NY ┆ 2013 ┆ 5 │ - └───────┴──────┴───────┘ - >>> low = pl.col('year').min() - >>> high = pl.col('year').max().add(1) - >>> new_year_values=pl.int_range(low,high).alias('year') - >>> with pl.Config(tbl_rows=-1): - ... 
df.janitor.complete(new_year_values,by='state',sort=True) - shape: (16, 3) - ┌───────┬──────┬───────┐ - │ state ┆ year ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═══════╪══════╪═══════╡ - │ CA ┆ 2010 ┆ 1 │ - │ CA ┆ 2011 ┆ null │ - │ CA ┆ 2012 ┆ null │ - │ CA ┆ 2013 ┆ 3 │ - │ HI ┆ 2010 ┆ 1 │ - │ HI ┆ 2011 ┆ null │ - │ HI ┆ 2012 ┆ 2 │ - │ HI ┆ 2013 ┆ null │ - │ HI ┆ 2014 ┆ null │ - │ HI ┆ 2015 ┆ null │ - │ HI ┆ 2016 ┆ 3 │ - │ NY ┆ 2009 ┆ 2 │ - │ NY ┆ 2010 ┆ null │ - │ NY ┆ 2011 ┆ null │ - │ NY ┆ 2012 ┆ null │ - │ NY ┆ 2013 ┆ 5 │ - └───────┴──────┴───────┘ - - - !!! info "New in version 0.28.0" - - Args: - *columns: This refers to the columns to be completed. - It can be a string or a column selector or a polars expression. - A polars expression can be used to introduced new values, - as long as the polars expression has a name that already exists - in the DataFrame. - It is up to the user to ensure that the polars expression returns - unique values. - fill_value: Scalar value or polars expression to use instead of nulls - for missing combinations. A dictionary, mapping columns names - to a scalar value is also accepted. - explicit: Determines if only implicitly missing values - should be filled (`False`), or all nulls existing in the LazyFrame - (`True`). `explicit` is applicable only - if `fill_value` is not `None`. - sort: Sort the DataFrame based on *columns. - by: Column(s) to group by. - The explicit missing rows are returned per group. - - Returns: - A polars DataFrame. 
- """ # noqa: E501 - return _complete( - df=self._df, - columns=columns, - fill_value=fill_value, - explicit=explicit, - sort=sort, - by=by, - ) diff --git a/janitor/polars/expressions.py b/janitor/polars/expressions.py deleted file mode 100644 index 46f1706e2..000000000 --- a/janitor/polars/expressions.py +++ /dev/null @@ -1,93 +0,0 @@ -from __future__ import annotations - -from janitor.utils import import_message - -from .clean_names import _clean_expr_names - -try: - import polars as pl -except ImportError: - import_message( - submodule="polars", - package="polars", - conda_channel="conda-forge", - pip_install=True, - ) - - -@pl.api.register_expr_namespace("janitor") -class PolarsExpr: - def __init__(self, expr: pl.Expr) -> pl.Expr: - self._expr = expr - - def clean_names( - self, - strip_underscores: str | bool = None, - case_type: str = "lower", - remove_special: bool = False, - strip_accents: bool = False, - enforce_string: bool = False, - truncate_limit: int = None, - ) -> pl.Expr: - """ - Clean the labels in a polars Expression. - - Examples: - >>> import polars as pl - >>> import janitor.polars - >>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]}) - >>> df - shape: (1, 1) - ┌─────────────┐ - │ raw │ - │ --- │ - │ str │ - ╞═════════════╡ - │ Abçdê fgí j │ - └─────────────┘ - - Clean the column values: - >>> df.with_columns(pl.col("raw").janitor.clean_names(strip_accents=True)) - shape: (1, 1) - ┌─────────────┐ - │ raw │ - │ --- │ - │ str │ - ╞═════════════╡ - │ abcde_fgi_j │ - └─────────────┘ - - !!! info "New in version 0.28.0" - - Args: - strip_underscores: Removes the outer underscores - from all labels in the expression. - Default None keeps outer underscores. - Values can be either 'left', 'right' - or 'both' or the respective shorthand 'l', - 'r' and True. - case_type: Whether to make the labels in the expression lower or uppercase. 
- Current case may be preserved with 'preserve', - while snake case conversion (from CamelCase or camelCase only) - can be turned on using "snake". - Default 'lower' makes all characters lowercase. - remove_special: Remove special characters from the values in the expression. - Only letters, numbers and underscores are preserved. - strip_accents: Whether or not to remove accents from - the expression. - enforce_string: Whether or not to cast the expression to a string type. - truncate_limit: Truncates formatted labels in the expression to - the specified length. Default None does not truncate. - - Returns: - A polars Expression. - """ - return _clean_expr_names( - obj=self._expr, - strip_accents=strip_accents, - strip_underscores=strip_underscores, - case_type=case_type, - remove_special=remove_special, - enforce_string=enforce_string, - truncate_limit=truncate_limit, - ) diff --git a/janitor/polars/lazyframe.py b/janitor/polars/lazyframe.py deleted file mode 100644 index f059ab1f5..000000000 --- a/janitor/polars/lazyframe.py +++ /dev/null @@ -1,428 +0,0 @@ -from __future__ import annotations - -from typing import Any - -from polars.type_aliases import ColumnNameOrSelector - -from janitor.utils import import_message - -from .clean_names import _clean_column_names -from .complete import _complete -from .pivot_longer import _pivot_longer -from .row_to_names import _row_to_names - -try: - import polars as pl -except ImportError: - import_message( - submodule="polars", - package="polars", - conda_channel="conda-forge", - pip_install=True, - ) - - -@pl.api.register_lazyframe_namespace("janitor") -class PolarsLazyFrame: - def __init__(self, df: pl.LazyFrame) -> pl.LazyFrame: - self._df = df - - def clean_names( - self, - strip_underscores: str | bool = None, - case_type: str = "lower", - remove_special: bool = False, - strip_accents: bool = False, - truncate_limit: int = None, - ) -> pl.LazyFrame: - """ - Clean the column names in a polars LazyFrame. 
- - Examples: - >>> import polars as pl - >>> import janitor.polars - >>> df = pl.LazyFrame( - ... { - ... "Aloha": range(3), - ... "Bell Chart": range(3), - ... "Animals@#$%^": range(3) - ... } - ... ) - >>> df.collect() - shape: (3, 3) - ┌───────┬────────────┬──────────────┐ - │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═══════╪════════════╪══════════════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 2 ┆ 2 │ - └───────┴────────────┴──────────────┘ - >>> df.janitor.clean_names(remove_special=True).collect() - shape: (3, 3) - ┌───────┬────────────┬─────────┐ - │ aloha ┆ bell_chart ┆ animals │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═══════╪════════════╪═════════╡ - │ 0 ┆ 0 ┆ 0 │ - │ 1 ┆ 1 ┆ 1 │ - │ 2 ┆ 2 ┆ 2 │ - └───────┴────────────┴─────────┘ - - !!! info "New in version 0.28.0" - - Args: - strip_underscores: Removes the outer underscores from all - column names. Default None keeps outer underscores. Values can be - either 'left', 'right' or 'both' or the respective shorthand 'l', - 'r' and True. - case_type: Whether to make the column names lower or uppercase. - Current case may be preserved with 'preserve', - while snake case conversion (from CamelCase or camelCase only) - can be turned on using "snake". - Default 'lower' makes all characters lowercase. - remove_special: Remove special characters from the column names. - Only letters, numbers and underscores are preserved. - strip_accents: Whether or not to remove accents from - the labels. - truncate_limit: Truncates formatted column names to - the specified length. Default None does not truncate. - - Returns: - A polars LazyFrame. 
- """ # noqa: E501 - return self._df.rename( - lambda col: _clean_column_names( - obj=col, - strip_accents=strip_accents, - strip_underscores=strip_underscores, - case_type=case_type, - remove_special=remove_special, - truncate_limit=truncate_limit, - ) - ) - - def pivot_longer( - self, - index: ColumnNameOrSelector = None, - column_names: ColumnNameOrSelector = None, - names_to: list | tuple | str = "variable", - values_to: str = "value", - names_sep: str = None, - names_pattern: str = None, - names_transform: pl.Expr = None, - ) -> pl.LazyFrame: - """ - Unpivots a LazyFrame from *wide* to *long* format. - - It is modeled after the `pivot_longer` function in R's tidyr package, - and also takes inspiration from the `melt` function in R's data.table package. - - This function is useful to massage a LazyFrame into a format where - one or more columns are considered measured variables, and all other - columns are considered as identifier variables. - - All measured variables are *unpivoted* (and typically duplicated) along the - row axis. - - For more granular control on the unpivoting, have a look at - `pivot_longer_spec`. - - Examples: - >>> import polars as pl - >>> import polars.selectors as cs - >>> import janitor.polars - >>> df = pl.LazyFrame( - ... { - ... "Sepal.Length": [5.1, 5.9], - ... "Sepal.Width": [3.5, 3.0], - ... "Petal.Length": [1.4, 5.1], - ... "Petal.Width": [0.2, 1.8], - ... "Species": ["setosa", "virginica"], - ... } - ... 
) - >>> df.collect() - shape: (2, 5) - ┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐ - │ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │ - ╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡ - │ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa │ - │ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica │ - └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘ - - >>> df.janitor.pivot_longer(index = 'Species').collect() - shape: (8, 3) - ┌───────────┬──────────────┬───────┐ - │ Species ┆ variable ┆ value │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ f64 │ - ╞═══════════╪══════════════╪═══════╡ - │ setosa ┆ Sepal.Length ┆ 5.1 │ - │ virginica ┆ Sepal.Length ┆ 5.9 │ - │ setosa ┆ Sepal.Width ┆ 3.5 │ - │ virginica ┆ Sepal.Width ┆ 3.0 │ - │ setosa ┆ Petal.Length ┆ 1.4 │ - │ virginica ┆ Petal.Length ┆ 5.1 │ - │ setosa ┆ Petal.Width ┆ 0.2 │ - │ virginica ┆ Petal.Width ┆ 1.8 │ - └───────────┴──────────────┴───────┘ - - !!! info "New in version 0.28.0" - - Args: - index: Column(s) or selector(s) to use as identifier variables. - column_names: Column(s) or selector(s) to unpivot. - names_to: Name of new column as a string that will contain - what were previously the column names in `column_names`. - The default is `variable` if no value is provided. It can - also be a list/tuple of strings that will serve as new column - names, if `name_sep` or `names_pattern` is provided. - If `.value` is in `names_to`, new column names will be extracted - from part of the existing column names and overrides `values_to`. - values_to: Name of new column as a string that will contain what - were previously the values of the columns in `column_names`. - names_sep: Determines how the column name is broken up, if - `names_to` contains multiple values. It takes the same - specification as polars' `str.split` method. - names_pattern: Determines how the column name is broken up. 
- It can be a regular expression containing matching groups. - It takes the same - specification as polars' `str.extract_groups` method. - names_transform: Use this option to change the types of columns that - have been transformed to rows. - This does not applies to the values' columns. - Accepts a polars expression or a list of polars expressions. - Applicable only if one of names_sep - or names_pattern is provided. - - Returns: - A polars LazyFrame that has been unpivoted from wide to long - format. - """ # noqa: E501 - return _pivot_longer( - df=self._df, - index=index, - column_names=column_names, - names_pattern=names_pattern, - names_sep=names_sep, - names_to=names_to, - values_to=values_to, - names_transform=names_transform, - ) - - def row_to_names( - self, - row_numbers: int | list = 0, - remove_rows: bool = False, - remove_rows_above: bool = False, - separator: str = "_", - ) -> pl.LazyFrame: - """ - Elevates a row, or rows, to be the column names of a DataFrame. - - Examples: - Replace column names with the first row. - - >>> import polars as pl - >>> import janitor.polars - >>> df = pl.LazyFrame({ - ... "a": ["nums", '6', '9'], - ... "b": ["chars", "x", "y"], - ... }) - >>> df.collect() - shape: (3, 2) - ┌──────┬───────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞══════╪═══════╡ - │ nums ┆ chars │ - │ 6 ┆ x │ - │ 9 ┆ y │ - └──────┴───────┘ - >>> df.janitor.row_to_names(0, remove_rows=True).collect() - shape: (2, 2) - ┌──────┬───────┐ - │ nums ┆ chars │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞══════╪═══════╡ - │ 6 ┆ x │ - │ 9 ┆ y │ - └──────┴───────┘ - >>> df.janitor.row_to_names(row_numbers=[0,1], remove_rows=True).collect() - shape: (1, 2) - ┌────────┬─────────┐ - │ nums_6 ┆ chars_x │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞════════╪═════════╡ - │ 9 ┆ y │ - └────────┴─────────┘ - - Remove rows above the elevated row and the elevated row itself. - - >>> df = pl.LazyFrame({ - ... "a": ["bla1", "nums", '6', '9'], - ... "b": ["bla2", "chars", "x", "y"], - ... 
}) - >>> df.collect() - shape: (4, 2) - ┌──────┬───────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞══════╪═══════╡ - │ bla1 ┆ bla2 │ - │ nums ┆ chars │ - │ 6 ┆ x │ - │ 9 ┆ y │ - └──────┴───────┘ - >>> df.janitor.row_to_names(1, remove_rows=True, remove_rows_above=True).collect() - shape: (2, 2) - ┌──────┬───────┐ - │ nums ┆ chars │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞══════╪═══════╡ - │ 6 ┆ x │ - │ 9 ┆ y │ - └──────┴───────┘ - - !!! info "New in version 0.28.0" - - Args: - row_numbers: Position of the row(s) containing the variable names. - Note that indexing starts from 0. It can also be a list. - Defaults to 0 (first row). - remove_rows: Whether the row(s) should be removed from the DataFrame. - remove_rows_above: Whether the row(s) above the selected row should - be removed from the DataFrame. - separator: If `row_numbers` is a list of numbers, this parameter - determines how the labels will be combined into a single string. - - Returns: - A polars LazyFrame. - """ # noqa: E501 - return _row_to_names( - self._df, - row_numbers=row_numbers, - remove_rows=remove_rows, - remove_rows_above=remove_rows_above, - separator=separator, - ) - - def complete( - self, - *columns: ColumnNameOrSelector, - fill_value: dict | Any | pl.Expr = None, - explicit: bool = True, - sort: bool = False, - by: ColumnNameOrSelector = None, - ) -> pl.LazyFrame: - """ - Turns implicit missing values into explicit missing values. - - It is modeled after tidyr's `complete` function. - In a way, it is the inverse of `pl.drop_nulls`, - as it exposes implicitly missing rows. - - If the combination involves multiple columns, pass it as a struct, - with an alias - the name of the struct should not exist in the LazyFrame. - - If new values need to be introduced, a polars Expression - with the new values can be passed, as long as the polars Expression - has a name that already exists in the LazyFrame. - - It is up to the user to ensure that the polars expression returns - unique values. 
- - Note that if the polars expression evaluates to a struct, - then the fields, not the name, should already exist in the LazyFrame. - - Examples: - >>> import polars as pl - >>> import janitor.polars - >>> df = pl.LazyFrame( - ... dict( - ... group=(1, 2, 1, 2), - ... item_id=(1, 2, 2, 3), - ... item_name=("a", "a", "b", "b"), - ... value1=(1, None, 3, 4), - ... value2=range(4, 8), - ... ) - ... ) - >>> df.collect() - shape: (4, 5) - ┌───────┬─────────┬───────────┬────────┬────────┐ - │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 │ - ╞═══════╪═════════╪═══════════╪════════╪════════╡ - │ 1 ┆ 1 ┆ a ┆ 1 ┆ 4 │ - │ 2 ┆ 2 ┆ a ┆ null ┆ 5 │ - │ 1 ┆ 2 ┆ b ┆ 3 ┆ 6 │ - │ 2 ┆ 3 ┆ b ┆ 4 ┆ 7 │ - └───────┴─────────┴───────────┴────────┴────────┘ - - Generate all possible combinations of - `group`, `item_id`, and `item_name` - (whether or not they appear in the data) - >>> with pl.Config(tbl_rows=-1): - ... df.janitor.complete("group", "item_id", "item_name", sort=True).collect() - shape: (12, 5) - ┌───────┬─────────┬───────────┬────────┬────────┐ - │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str ┆ i64 ┆ i64 │ - ╞═══════╪═════════╪═══════════╪════════╪════════╡ - │ 1 ┆ 1 ┆ a ┆ 1 ┆ 4 │ - │ 1 ┆ 1 ┆ b ┆ null ┆ null │ - │ 1 ┆ 2 ┆ a ┆ null ┆ null │ - │ 1 ┆ 2 ┆ b ┆ 3 ┆ 6 │ - │ 1 ┆ 3 ┆ a ┆ null ┆ null │ - │ 1 ┆ 3 ┆ b ┆ null ┆ null │ - │ 2 ┆ 1 ┆ a ┆ null ┆ null │ - │ 2 ┆ 1 ┆ b ┆ null ┆ null │ - │ 2 ┆ 2 ┆ a ┆ null ┆ 5 │ - │ 2 ┆ 2 ┆ b ┆ null ┆ null │ - │ 2 ┆ 3 ┆ a ┆ null ┆ null │ - │ 2 ┆ 3 ┆ b ┆ 4 ┆ 7 │ - └───────┴─────────┴───────────┴────────┴────────┘ - - !!! info "New in version 0.28.0" - - Args: - *columns: This refers to the columns to be completed. - It can be a string or a column selector or a polars expression. - A polars expression can be used to introduced new values, - as long as the polars expression has a name that already exists - in the LazyFrame. 
- It is up to the user to ensure that the polars expression returns - unique values. - fill_value: Scalar value or polars expression to use instead of nulls - for missing combinations. A dictionary, mapping columns names - to a scalar value is also accepted. - explicit: Determines if only implicitly missing values - should be filled (`False`), or all nulls existing in the LazyFrame - (`True`). `explicit` is applicable only - if `fill_value` is not `None`. - sort: Sort the LazyFrame based on *columns. - by: Column(s) to group by. - The explicit missing rows are returned per group. - - Returns: - A polars LazyFrame. - """ # noqa: E501 - return _complete( - df=self._df, - columns=columns, - fill_value=fill_value, - explicit=explicit, - sort=sort, - by=by, - ) diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py index 6e7024cc7..870f457a2 100644 --- a/janitor/polars/pivot_longer.py +++ b/janitor/polars/pivot_longer.py @@ -7,6 +7,8 @@ from janitor.utils import check, import_message +from .polars_flavor import register_dataframe_method, register_lazyframe_method + try: import polars as pl import polars.selectors as cs @@ -151,6 +153,241 @@ def pivot_longer_spec( ) +@register_lazyframe_method +@register_dataframe_method +def pivot_longer( + df, + index: ColumnNameOrSelector = None, + column_names: ColumnNameOrSelector = None, + names_to: list | tuple | str = "variable", + values_to: str = "value", + names_sep: str = None, + names_pattern: str = None, + names_transform: pl.Expr = None, +) -> pl.DataFrame: + """ + Unpivots a DataFrame from *wide* to *long* format. + + It is modeled after the `pivot_longer` function in R's tidyr package, + and also takes inspiration from the `melt` function in R's data.table package. + + This function is useful to massage a DataFrame into a format where + one or more columns are considered measured variables, and all other + columns are considered as identifier variables. 
+ + All measured variables are *unpivoted* (and typically duplicated) along the + row axis. + + For more granular control on the unpivoting, have a look at + `pivot_longer_spec`. + + `pivot_longer` can also be applied to a LazyFrame. + + Examples: + >>> import polars as pl + >>> import polars.selectors as cs + >>> import janitor.polars + >>> df = pl.DataFrame( + ... { + ... "Sepal.Length": [5.1, 5.9], + ... "Sepal.Width": [3.5, 3.0], + ... "Petal.Length": [1.4, 5.1], + ... "Petal.Width": [0.2, 1.8], + ... "Species": ["setosa", "virginica"], + ... } + ... ) + >>> df + shape: (2, 5) + ┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐ + │ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ f64 ┆ f64 ┆ f64 ┆ str │ + ╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡ + │ 5.1 ┆ 3.5 ┆ 1.4 ┆ 0.2 ┆ setosa │ + │ 5.9 ┆ 3.0 ┆ 5.1 ┆ 1.8 ┆ virginica │ + └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘ + + Replicate polars' [melt](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.melt.html#polars-dataframe-melt): + >>> df.pivot_longer(index = 'Species') + shape: (8, 3) + ┌───────────┬──────────────┬───────┐ + │ Species ┆ variable ┆ value │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ f64 │ + ╞═══════════╪══════════════╪═══════╡ + │ setosa ┆ Sepal.Length ┆ 5.1 │ + │ virginica ┆ Sepal.Length ┆ 5.9 │ + │ setosa ┆ Sepal.Width ┆ 3.5 │ + │ virginica ┆ Sepal.Width ┆ 3.0 │ + │ setosa ┆ Petal.Length ┆ 1.4 │ + │ virginica ┆ Petal.Length ┆ 5.1 │ + │ setosa ┆ Petal.Width ┆ 0.2 │ + │ virginica ┆ Petal.Width ┆ 1.8 │ + └───────────┴──────────────┴───────┘ + + Split the column labels into individual columns: + >>> df.pivot_longer( + ... index = 'Species', + ... names_to = ('part', 'dimension'), + ... names_sep = '.', + ... 
).select('Species','part','dimension','value') + shape: (8, 4) + ┌───────────┬───────┬───────────┬───────┐ + │ Species ┆ part ┆ dimension ┆ value │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ f64 │ + ╞═══════════╪═══════╪═══════════╪═══════╡ + │ setosa ┆ Sepal ┆ Length ┆ 5.1 │ + │ virginica ┆ Sepal ┆ Length ┆ 5.9 │ + │ setosa ┆ Sepal ┆ Width ┆ 3.5 │ + │ virginica ┆ Sepal ┆ Width ┆ 3.0 │ + │ setosa ┆ Petal ┆ Length ┆ 1.4 │ + │ virginica ┆ Petal ┆ Length ┆ 5.1 │ + │ setosa ┆ Petal ┆ Width ┆ 0.2 │ + │ virginica ┆ Petal ┆ Width ┆ 1.8 │ + └───────────┴───────┴───────────┴───────┘ + + Retain parts of the column names as headers: + >>> df.pivot_longer( + ... index = 'Species', + ... names_to = ('part', '.value'), + ... names_sep = '.', + ... ).select('Species','part','Length','Width') + shape: (4, 4) + ┌───────────┬───────┬────────┬───────┐ + │ Species ┆ part ┆ Length ┆ Width │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ f64 ┆ f64 │ + ╞═══════════╪═══════╪════════╪═══════╡ + │ setosa ┆ Sepal ┆ 5.1 ┆ 3.5 │ + │ virginica ┆ Sepal ┆ 5.9 ┆ 3.0 │ + │ setosa ┆ Petal ┆ 1.4 ┆ 0.2 │ + │ virginica ┆ Petal ┆ 5.1 ┆ 1.8 │ + └───────────┴───────┴────────┴───────┘ + + Split the column labels based on regex: + >>> df = pl.DataFrame({"id": [1], "new_sp_m5564": [2], "newrel_f65": [3]}) + >>> df + shape: (1, 3) + ┌─────┬──────────────┬────────────┐ + │ id ┆ new_sp_m5564 ┆ newrel_f65 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪══════════════╪════════════╡ + │ 1 ┆ 2 ┆ 3 │ + └─────┴──────────────┴────────────┘ + >>> df.pivot_longer( + ... index = 'id', + ... names_to = ('diagnosis', 'gender', 'age'), + ... names_pattern = r"new_?(.+)_(.)([0-9]+)", + ... 
).select('id','diagnosis','gender','age','value').sort(by=pl.all()) + shape: (2, 5) + ┌─────┬───────────┬────────┬──────┬───────┐ + │ id ┆ diagnosis ┆ gender ┆ age ┆ value │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str ┆ str ┆ i64 │ + ╞═════╪═══════════╪════════╪══════╪═══════╡ + │ 1 ┆ rel ┆ f ┆ 65 ┆ 3 │ + │ 1 ┆ sp ┆ m ┆ 5564 ┆ 2 │ + └─────┴───────────┴────────┴──────┴───────┘ + + Convert the dtypes of specific columns with `names_transform`: + >>> df.pivot_longer( + ... index = "id", + ... names_pattern=r"new_?(.+)_(.)([0-9]+)", + ... names_to=("diagnosis", "gender", "age"), + ... names_transform=pl.col('age').cast(pl.Int32), + ... ).select("id", "diagnosis", "gender", "age", "value").sort(by=pl.all()) + shape: (2, 5) + ┌─────┬───────────┬────────┬──────┬───────┐ + │ id ┆ diagnosis ┆ gender ┆ age ┆ value │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ str ┆ i32 ┆ i64 │ + ╞═════╪═══════════╪════════╪══════╪═══════╡ + │ 1 ┆ rel ┆ f ┆ 65 ┆ 3 │ + │ 1 ┆ sp ┆ m ┆ 5564 ┆ 2 │ + └─────┴───────────┴────────┴──────┴───────┘ + + Use multiple `.value` to reshape the dataframe: + >>> df = pl.DataFrame( + ... [ + ... { + ... "x_1_mean": 10, + ... "x_2_mean": 20, + ... "y_1_mean": 30, + ... "y_2_mean": 40, + ... "unit": 50, + ... } + ... ] + ... ) + >>> df + shape: (1, 5) + ┌──────────┬──────────┬──────────┬──────────┬──────┐ + │ x_1_mean ┆ x_2_mean ┆ y_1_mean ┆ y_2_mean ┆ unit │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞══════════╪══════════╪══════════╪══════════╪══════╡ + │ 10 ┆ 20 ┆ 30 ┆ 40 ┆ 50 │ + └──────────┴──────────┴──────────┴──────────┴──────┘ + >>> df.pivot_longer( + ... index="unit", + ... names_to=(".value", "time", ".value"), + ... names_pattern=r"(x|y)_([0-9])(_mean)", + ... 
).select('unit','time','x_mean','y_mean').sort(by=pl.all())
+    shape: (2, 4)
+    ┌──────┬──────┬────────┬────────┐
+    │ unit ┆ time ┆ x_mean ┆ y_mean │
+    │ ---  ┆ ---  ┆ ---    ┆ ---    │
+    │ i64  ┆ str  ┆ i64    ┆ i64    │
+    ╞══════╪══════╪════════╪════════╡
+    │ 50   ┆ 1    ┆ 10     ┆ 30     │
+    │ 50   ┆ 2    ┆ 20     ┆ 40     │
+    └──────┴──────┴────────┴────────┘
+
+    !!! info "New in version 0.28.0"
+
+    Args:
+        index: Column(s) or selector(s) to use as identifier variables.
+        column_names: Column(s) or selector(s) to unpivot.
+        names_to: Name of new column as a string that will contain
+            what were previously the column names in `column_names`.
+            The default is `variable` if no value is provided. It can
+            also be a list/tuple of strings that will serve as new column
+            names, if `names_sep` or `names_pattern` is provided.
+            If `.value` is in `names_to`, new column names will be extracted
+            from part of the existing column names and overrides `values_to`.
+        values_to: Name of new column as a string that will contain what
+            were previously the values of the columns in `column_names`.
+        names_sep: Determines how the column name is broken up, if
+            `names_to` contains multiple values. It takes the same
+            specification as polars' `str.split` method.
+        names_pattern: Determines how the column name is broken up.
+            It can be a regular expression containing matching groups.
+            It takes the same
+            specification as polars' `str.extract_groups` method.
+        names_transform: Use this option to change the types of columns that
+            have been transformed to rows.
+            This does not apply to the values' columns.
+            Accepts a polars expression or a list of polars expressions.
+            Applicable only if one of names_sep
+            or names_pattern is provided.
+
+    Returns:
+        A polars DataFrame that has been unpivoted from wide to long
+        format.
+ """ # noqa: E501 + return _pivot_longer( + df=df, + index=index, + column_names=column_names, + names_pattern=names_pattern, + names_sep=names_sep, + names_to=names_to, + values_to=values_to, + names_transform=names_transform, + ) + + def _pivot_longer( df: pl.DataFrame | pl.LazyFrame, index: ColumnNameOrSelector, diff --git a/janitor/polars/polars_flavor.py b/janitor/polars/polars_flavor.py new file mode 100644 index 000000000..0df193267 --- /dev/null +++ b/janitor/polars/polars_flavor.py @@ -0,0 +1,111 @@ +"""polars variant of pandas_flavor""" + +from __future__ import annotations + +from functools import wraps +from typing import Callable + +from janitor.utils import import_message + +try: + import polars as pl +except ImportError: + import_message( + submodule="polars", + package="polars", + conda_channel="conda-forge", + pip_install=True, + ) + + +def register_dataframe_method(method: Callable) -> Callable: + """Register a function as a method attached to the Polars DataFrame. + + Example: + >>> @register_dataframe_method + >>> def print_column(df, col): + ... '''Print the dataframe column given''' + ... print(df[col]) + + !!! info "New in version 0.28.0" + + Args: + method: Function to be registered as a method on the DataFrame. + + Returns: + A Callable. + """ + + def inner(*args, **kwargs): + + class AccessorMethod(object): + + def __init__(self, polars_obj): + self._obj = polars_obj + + @wraps(method) + def __call__(self, *args, **kwargs): + return method(self._obj, *args, **kwargs) + + pl.api.register_dataframe_namespace(method.__name__)(AccessorMethod) + return method + + return inner() + + +def register_lazyframe_method(method: Callable) -> Callable: + """Register a function as a method attached to the Polars LazyFrame. + + Example: + >>> @register_lazyframe_method + >>> def print_column(df, col): + ... '''Print the dataframe column given''' + ... print(df[col]) + + !!! 
info "New in version 0.28.0" + + Args: + method: Function to be registered as a method on the LazyFrame. + + Returns: + A Callable. + """ + + def inner(*args, **kwargs): + + class AccessorMethod(object): + + def __init__(self, polars_obj): + self._obj = polars_obj + + @wraps(method) + def __call__(self, *args, **kwargs): + return method(self._obj, *args, **kwargs) + + pl.api.register_lazyframe_namespace(method.__name__)(AccessorMethod) + + return method + + return inner() + + +def register_expr_method(method): + """Register a function as a method attached to a Polars Expression.""" + + def inner(*args, **kwargs): + + class AccessorMethod(object): + __doc__ = method.__doc__ + + def __init__(self, polars_expr): + self._obj = polars_expr + + @wraps(method) + def __call__(self, *args, **kwargs): + return method(self._obj, *args, **kwargs) + + pl.api.register_expr_namespace(method.__name__)(AccessorMethod) + + return method + + return inner() diff --git a/janitor/polars/row_to_names.py b/janitor/polars/row_to_names.py index 7fe1b0b9e..54f016877 100644 --- a/janitor/polars/row_to_names.py +++ b/janitor/polars/row_to_names.py @@ -4,6 +4,8 @@ from janitor.utils import check, import_message +from .polars_flavor import register_dataframe_method, register_lazyframe_method + try: import polars as pl except ImportError: @@ -15,6 +17,113 @@ ) +@register_lazyframe_method +@register_dataframe_method +def row_to_names( + df, + row_numbers: int | list = 0, + remove_rows: bool = False, + remove_rows_above: bool = False, + separator: str = "_", +) -> pl.DataFrame: + """ + Elevates a row, or rows, to be the column names of a DataFrame. + + `row_to_names` can also be applied to a LazyFrame. + + Examples: + Replace column names with the first row. + + >>> import polars as pl + >>> import janitor.polars + >>> df = pl.DataFrame({ + ... "a": ["nums", '6', '9'], + ... "b": ["chars", "x", "y"], + ... 
}) + >>> df + shape: (3, 2) + ┌──────┬───────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪═══════╡ + │ nums ┆ chars │ + │ 6 ┆ x │ + │ 9 ┆ y │ + └──────┴───────┘ + >>> df.row_to_names(0, remove_rows=True) + shape: (2, 2) + ┌──────┬───────┐ + │ nums ┆ chars │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪═══════╡ + │ 6 ┆ x │ + │ 9 ┆ y │ + └──────┴───────┘ + >>> df.row_to_names(row_numbers=[0,1], remove_rows=True) + shape: (1, 2) + ┌────────┬─────────┐ + │ nums_6 ┆ chars_x │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞════════╪═════════╡ + │ 9 ┆ y │ + └────────┴─────────┘ + + Remove rows above the elevated row and the elevated row itself. + + >>> df = pl.DataFrame({ + ... "a": ["bla1", "nums", '6', '9'], + ... "b": ["bla2", "chars", "x", "y"], + ... }) + >>> df + shape: (4, 2) + ┌──────┬───────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪═══════╡ + │ bla1 ┆ bla2 │ + │ nums ┆ chars │ + │ 6 ┆ x │ + │ 9 ┆ y │ + └──────┴───────┘ + >>> df.row_to_names(1, remove_rows=True, remove_rows_above=True) + shape: (2, 2) + ┌──────┬───────┐ + │ nums ┆ chars │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞══════╪═══════╡ + │ 6 ┆ x │ + │ 9 ┆ y │ + └──────┴───────┘ + + !!! info "New in version 0.28.0" + + Args: + row_numbers: Position of the row(s) containing the variable names. + Note that indexing starts from 0. It can also be a list. + Defaults to 0 (first row). + remove_rows: Whether the row(s) should be removed from the DataFrame. + remove_rows_above: Whether the row(s) above the selected row should + be removed from the DataFrame. + separator: Combines the labels into a single string, + if row_numbers is a list of integers. Default is '_'. + + Returns: + A polars DataFrame. 
+ """ # noqa: E501 + return _row_to_names( + df=df, + row_numbers=row_numbers, + remove_rows=remove_rows, + remove_rows_above=remove_rows_above, + separator=separator, + ) + + def _row_to_names( df: pl.DataFrame | pl.LazyFrame, row_numbers: int | list, diff --git a/tests/polars/functions/test_clean_names_polars.py b/tests/polars/functions/test_clean_names_polars.py index 23ce38742..3c2340698 100644 --- a/tests/polars/functions/test_clean_names_polars.py +++ b/tests/polars/functions/test_clean_names_polars.py @@ -8,7 +8,7 @@ def test_clean_names_method_chain(dataframe): """Tests clean_names default args in a method chain.""" df = pl.from_pandas(dataframe) - df = df.janitor.clean_names() + df = df.clean_names() expected_columns = [ "a", "bell_chart", @@ -23,7 +23,7 @@ def test_clean_names_method_chain(dataframe): def test_clean_names_special_characters(dataframe): """Tests clean_names `remove_special` parameter.""" df = pl.from_pandas(dataframe) - df = df.janitor.clean_names(remove_special=True) + df = df.clean_names(remove_special=True) expected_columns = [ "a", "bell_chart", @@ -38,7 +38,7 @@ def test_clean_names_special_characters(dataframe): def test_clean_names_uppercase(dataframe): """Tests clean_names `case_type` parameter = upper.""" df = pl.from_pandas(dataframe) - df = df.janitor.clean_names(remove_special=True, case_type="upper") + df = df.clean_names(remove_special=True, case_type="upper") expected_columns = [ "A", "BELL_CHART", @@ -53,7 +53,7 @@ def test_clean_names_uppercase(dataframe): def test_clean_names_strip_accents(): """Tests clean_names `strip_accents` parameter.""" df = pl.DataFrame({"João": [1, 2], "Лука́ся": [1, 2], "Käfer": [1, 2]}) - df = df.janitor.clean_names(strip_accents=True) + df = df.clean_names(strip_accents=True) expected_columns = ["joao", "лукася", "kafer"] assert df.columns == expected_columns @@ -65,7 +65,7 @@ def test_clean_names_camelcase_to_snake(dataframe): df = ( df.select("a") .rename({"a": "AColumnName"}) - 
.janitor.clean_names(remove_special=True, case_type="snake") + .clean_names(remove_special=True, case_type="snake") ) assert df.columns == ["a_column_name"] @@ -74,7 +74,7 @@ def test_clean_names_camelcase_to_snake(dataframe): def test_clean_names_truncate_limit(dataframe): """Tests clean_names `truncate_limit` parameter.""" df = pl.from_pandas(dataframe) - df = df.janitor.clean_names(truncate_limit=7) + df = df.clean_names(truncate_limit=7) expected_columns = ["a", "bell_ch", "decorat", "animals", "cities"] assert df.columns == expected_columns @@ -88,7 +88,7 @@ def test_charac(): r"Current accountbalance(in % of GDP)": range(5), } ) - df = df.janitor.clean_names(strip_underscores=True, case_type="lower") + df = df.clean_names(strip_underscores=True, case_type="lower") assert "current_accountbalance_in_%_of_gdp" in df.columns @@ -97,6 +97,6 @@ def test_clean_column_values(): """Clean column values""" raw = pl.DataFrame({"raw": ["Abçdê fgí j"]}) outcome = raw.with_columns( - pl.col("raw").janitor.clean_names(strip_accents=True) + pl.col("raw").make_clean_names(strip_accents=True) ) assert list(outcome)[0][0] == "abcde_fgi_j" diff --git a/tests/polars/functions/test_complete_polars.py b/tests/polars/functions/test_complete_polars.py index 4c9bd14dd..4af7e204d 100644 --- a/tests/polars/functions/test_complete_polars.py +++ b/tests/polars/functions/test_complete_polars.py @@ -40,7 +40,7 @@ def taxonomy_df(): def test_column_None(fill_df): """Test output if *columns is empty.""" - assert_frame_equal(fill_df.janitor.complete(), fill_df) + assert_frame_equal(fill_df.complete(), fill_df) def test_empty_groups(fill_df): @@ -49,19 +49,19 @@ def test_empty_groups(fill_df): msg += "should either be a string, a column selector, " msg += "or a polars expression, instead got.+" with pytest.raises(TypeError, match=msg): - fill_df.janitor.complete("group", {}) + fill_df.complete("group", {}) def test_type_sort(fill_df): """Raise TypeError if `sort` is not boolean.""" with 
pytest.raises(TypeError): - fill_df.janitor.complete("group", "item_id", sort=11) + fill_df.complete("group", "item_id", sort=11) def test_type_explicit(fill_df): """Raise TypeError if `explicit` is not boolean.""" with pytest.raises(TypeError): - fill_df.janitor.complete("group", "item_id", explicit=11) + fill_df.complete("group", "item_id", explicit=11) def test_complete_1(fill_df): @@ -69,7 +69,7 @@ def test_complete_1(fill_df): Test output for janitor.complete. """ trimmed = fill_df.lazy().select(~cs.starts_with("value")) - result = trimmed.janitor.complete( + result = trimmed.complete( cs.by_name("group"), pl.struct("item_id", "item_name").alias("rar").unique().sort(), fill_value=0, @@ -115,7 +115,7 @@ def test_groupby_complete(): df = pl.LazyFrame(data) expected = ( - df.janitor.complete("Date", "Site", by="Grid Cell") + df.complete("Date", "Site", by="Grid Cell") .select("Grid Cell", "Site", "Date", "Value") .sort(by=pl.all()) ) @@ -147,7 +147,7 @@ def test_groupby_complete(): # https://tidyr.tidyverse.org/reference/complete.html def test_complete_2(fill_df): """Test output for janitor.complete.""" - result = fill_df.janitor.complete( + result = fill_df.complete( "group", pl.struct("item_id", "item_name").alias("rar").unique().sort(), fill_value={"value1": 0, "value2": 99}, @@ -241,7 +241,7 @@ def test_complete_multiple_groupings(): } ) - result = df3.janitor.complete( + result = df3.complete( pl.struct("meta", "domain1").alias("bar").unique().sort(), pl.struct("project_id", "question_count").alias("foo").unique().sort(), fill_value={"tag_count": 0}, @@ -255,6 +255,6 @@ def test_complete_3(fill_df): Test output for janitor.complete """ assert_frame_equal( - fill_df.janitor.complete("group", sort=True).sort("group"), + fill_df.complete("group", sort=True).sort("group"), fill_df.sort("group"), ) diff --git a/tests/polars/functions/test_pivot_longer_polars.py b/tests/polars/functions/test_pivot_longer_polars.py index 46bc61c12..de43db0d7 100644 --- 
a/tests/polars/functions/test_pivot_longer_polars.py +++ b/tests/polars/functions/test_pivot_longer_polars.py @@ -24,7 +24,7 @@ def test_type_index(df_checks): msg = "The argument passed to the index parameter " msg += "should be a type that is supported in the.+" with pytest.raises(TypeError, match=msg): - df_checks.janitor.pivot_longer(index=2007, names_sep="_") + df_checks.pivot_longer(index=2007, names_sep="_") def test_type_column_names(df_checks): @@ -32,14 +32,14 @@ def test_type_column_names(df_checks): msg = "The argument passed to the column_names parameter " msg += "should be a type that is supported in the.+" with pytest.raises(TypeError, match=msg): - df_checks.janitor.pivot_longer(column_names=2007, names_sep="_") + df_checks.pivot_longer(column_names=2007, names_sep="_") def test_type_names_to(df_checks): """Raise TypeError if wrong type is provided for names_to.""" msg = "names_to should be one of .+" with pytest.raises(TypeError, match=msg): - df_checks.janitor.pivot_longer(names_to=2007, names_sep="_") + df_checks.pivot_longer(names_to=2007, names_sep="_") def test_subtype_names_to(df_checks): @@ -49,15 +49,13 @@ def test_subtype_names_to(df_checks): in names_to. 
""" with pytest.raises(TypeError, match="'1' in names_to.+"): - df_checks.janitor.pivot_longer(names_to=[1], names_sep="_") + df_checks.pivot_longer(names_to=[1], names_sep="_") def test_duplicate_names_to(df_checks): """Raise error if names_to contains duplicates.""" with pytest.raises(ValueError, match="'y' is duplicated in names_to."): - df_checks.janitor.pivot_longer( - names_to=["y", "y"], names_pattern="(.+)(.)" - ) + df_checks.pivot_longer(names_to=["y", "y"], names_pattern="(.+)(.)") def test_both_names_sep_and_pattern(df_checks): @@ -69,7 +67,7 @@ def test_both_names_sep_and_pattern(df_checks): ValueError, match="Only one of names_pattern or names_sep should be provided.", ): - df_checks.janitor.pivot_longer( + df_checks.pivot_longer( names_to=["rar", "bar"], names_sep="-", names_pattern="(.+)(.)" ) @@ -77,28 +75,24 @@ def test_both_names_sep_and_pattern(df_checks): def test_name_pattern_wrong_type(df_checks): """Raise TypeError if the wrong type is provided for names_pattern.""" with pytest.raises(TypeError, match="names_pattern should be one of.+"): - df_checks.janitor.pivot_longer( - names_to=["rar", "bar"], names_pattern=2007 - ) + df_checks.pivot_longer(names_to=["rar", "bar"], names_pattern=2007) def test_name_sep_wrong_type(df_checks): """Raise TypeError if the wrong type is provided for names_sep.""" with pytest.raises(TypeError, match="names_sep should be one of.+"): - df_checks.janitor.pivot_longer( - names_to=[".value", "num"], names_sep=["_"] - ) + df_checks.pivot_longer(names_to=[".value", "num"], names_sep=["_"]) def test_values_to_wrong_type(df_checks): """Raise TypeError if the wrong type is provided for `values_to`.""" with pytest.raises(TypeError, match="values_to should be one of.+"): - df_checks.janitor.pivot_longer(values_to={"salvo"}, names_sep="_") + df_checks.pivot_longer(values_to={"salvo"}, names_sep="_") def test_pivot_index_only(df_checks): """Test output if only index is passed.""" - result = df_checks.janitor.pivot_longer( + 
result = df_checks.pivot_longer( index=["famid", "birth"], names_to="dim", values_to="num", @@ -113,7 +107,7 @@ def test_pivot_index_only(df_checks): def test_pivot_column_only(df_checks): """Test output if only column_names is passed.""" - result = df_checks.janitor.pivot_longer( + result = df_checks.pivot_longer( column_names=["ht1", "ht2"], names_to="dim", values_to="num", @@ -138,7 +132,7 @@ def test_names_to_names_pattern_len(df_checks): msg += "not match the number of fields extracted.+" with pytest.raises(ValueError, match=msg): - df_checks.janitor.pivot_longer( + df_checks.pivot_longer( column_names=cs.starts_with("ht"), names_to=(".value"), names_pattern=r"(\d+)(.)", @@ -155,7 +149,7 @@ def test_names_to_names_pattern_mismatch(df_checks): with pytest.raises(ValueError, match=msg): - df_checks.janitor.pivot_longer( + df_checks.pivot_longer( column_names=cs.starts_with("ht"), names_to=(".value", "age"), names_pattern=r"(\d+)(.)", @@ -167,7 +161,7 @@ def test_names_pat_str(df_checks): Test output when names_pattern is a string, and .value is present. """ - result = df_checks.janitor.pivot_longer( + result = df_checks.pivot_longer( column_names=cs.starts_with("ht"), names_to=(".value", "age"), names_pattern="(.+)(.)", @@ -207,7 +201,7 @@ def test_no_column_names(df_checks): are assigned to the index parameter. 
""" assert_frame_equal( - df_checks.janitor.pivot_longer(index=pl.all()), + df_checks.pivot_longer(index=pl.all()), df_checks, ) @@ -316,7 +310,7 @@ def test_df(): def test_names_pattern_dot_value(test_df): """Test output for names_pattern and .value.""" - result = test_df.janitor.pivot_longer( + result = test_df.pivot_longer( column_names=pl.all(), names_to=["set", ".value"], names_pattern="(.+)_(.+)", @@ -327,7 +321,7 @@ def test_names_pattern_dot_value(test_df): def test_names_sep_dot_value(test_df): """Test output for names_pattern and .value.""" - result = test_df.janitor.pivot_longer( + result = test_df.pivot_longer( column_names=pl.all(), names_to=["set", ".value"], names_sep="_", @@ -353,7 +347,7 @@ def test_not_dot_value_sep(not_dot_value): """Test output when names_sep and no dot_value""" result = ( - not_dot_value.janitor.pivot_longer( + not_dot_value.pivot_longer( "country", names_to=("event", "year"), names_sep="_", @@ -383,7 +377,7 @@ def test_not_dot_value_sep(not_dot_value): def test_not_dot_value_sep2(not_dot_value): """Test output when names_sep and no dot_value""" - result = not_dot_value.janitor.pivot_longer( + result = not_dot_value.pivot_longer( "country", names_to="event", names_sep="/", @@ -401,7 +395,7 @@ def test_not_dot_value_pattern(not_dot_value): """Test output when names_pattern is a string and no dot_value""" result = ( - not_dot_value.janitor.pivot_longer( + not_dot_value.pivot_longer( index="country", names_to=("event", "year"), names_pattern=r"(.+)_(.+)", @@ -445,7 +439,7 @@ def test_multiple_dot_value(): ) result = ( - df.janitor.pivot_longer( + df.pivot_longer( index="unit", names_to=(".value", "time", ".value"), names_pattern=r"(x|y)_([0-9])(_mean|_sd)", @@ -484,7 +478,7 @@ def single_val(): def test_multiple_dot_value2(single_val): """Test output for multiple .value.""" - result = single_val.janitor.pivot_longer( + result = single_val.pivot_longer( index="id", names_to=(".value", ".value"), names_pattern="(.)(.)" ) @@ -508,7 
+502,7 @@ def test_names_pattern_single_column(single_val): Test output if names_to is only '.value'. """ - result = single_val.janitor.pivot_longer( + result = single_val.pivot_longer( "id", names_to=".value", names_pattern="(.)." ) @@ -519,7 +513,7 @@ def test_names_pattern_single_column_not_dot_value(single_val): """ Test output if names_to is not '.value'. """ - result = single_val.janitor.pivot_longer( + result = single_val.pivot_longer( index="id", column_names="x1", names_to="yA", names_pattern="(.+)" ) @@ -534,7 +528,7 @@ def test_names_pattern_single_column_not_dot_value1(single_val): """ Test output if names_to is not '.value'. """ - result = single_val.select("x1").janitor.pivot_longer( + result = single_val.select("x1").pivot_longer( names_to="yA", names_pattern="(.+)" ) @@ -574,7 +568,7 @@ def df_null(): def test_names_pattern_nulls_in_data(df_null): """Test output if nulls are present in data.""" result = ( - df_null.janitor.pivot_longer( + df_null.pivot_longer( index="family", names_to=[".value", "child"], names_pattern=r"(.+)_(.+)", diff --git a/tests/polars/functions/test_row_to_names_polars.py b/tests/polars/functions/test_row_to_names_polars.py index be5e07fdd..47c01fb92 100644 --- a/tests/polars/functions/test_row_to_names_polars.py +++ b/tests/polars/functions/test_row_to_names_polars.py @@ -19,7 +19,7 @@ def test_separator_type(df): Raise if separator is not a string """ with pytest.raises(TypeError, match="separator should be.+"): - df.janitor.row_to_names([1, 2], separator=1) + df.row_to_names([1, 2], separator=1) @pytest.mark.parametrize("df", [df, df.lazy()]) @@ -28,7 +28,7 @@ def test_row_numbers_type(df): Raise if row_numbers is not an int/list """ with pytest.raises(TypeError, match="row_numbers should be.+"): - df.janitor.row_to_names({1, 2}) + df.row_to_names({1, 2}) @pytest.mark.parametrize("df", [df, df.lazy()]) @@ -40,12 +40,12 @@ def test_row_numbers_list_type(df): with pytest.raises( TypeError, match="entry in the row_numbers 
argument should be.+" ): - df.janitor.row_to_names(["1", 2]) + df.row_to_names(["1", 2]) @pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names(df): - df = df.janitor.row_to_names(2) + df = df.row_to_names(2) assert df.columns[0] == "3.2346125" assert df.columns[1] == "3" assert df.columns[2] == "lion" @@ -55,7 +55,7 @@ def test_row_to_names(df): @pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_single_list(df): "Test output if row_numbers is a list, and contains a single item." - df = df.janitor.row_to_names([2]) + df = df.row_to_names([2]) assert df.columns[0] == "3.2346125" assert df.columns[1] == "3" assert df.columns[2] == "lion" @@ -65,7 +65,7 @@ def test_row_to_names_single_list(df): @pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_list(df): "Test output if row_numbers is a list." - df = df.janitor.row_to_names([1, 2]) + df = df.row_to_names([1, 2]) assert df.columns[0] == "2.456234_3.2346125" assert df.columns[1] == "2_3" assert df.columns[2] == "leopard_lion" @@ -74,7 +74,7 @@ def test_row_to_names_list(df): @pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_delete_this_row(df): - df = df.janitor.row_to_names(2, remove_rows=True) + df = df.row_to_names(2, remove_rows=True) if isinstance(df, pl.LazyFrame): df = df.collect() assert df.to_series(0)[0] == 1.234_523_45 @@ -85,7 +85,7 @@ def test_row_to_names_delete_this_row(df): @pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_list_delete_this_row(df): - df = df.janitor.row_to_names([2], remove_rows=True) + df = df.row_to_names([2], remove_rows=True) if isinstance(df, pl.LazyFrame): df = df.collect() assert df.to_series(0)[0] == 1.234_523_45 @@ -96,7 +96,7 @@ def test_row_to_names_list_delete_this_row(df): @pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_delete_above(df): - df = df.janitor.row_to_names(2, remove_rows_above=True) + df = df.row_to_names(2, remove_rows_above=True) if isinstance(df, 
pl.LazyFrame): df = df.collect() assert df.to_series(0)[0] == 3.234_612_5 @@ -108,7 +108,7 @@ def test_row_to_names_delete_above(df): @pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_delete_above_list(df): "Test output if row_numbers is a list" - df = df.janitor.row_to_names([2, 3], remove_rows_above=True) + df = df.row_to_names([2, 3], remove_rows_above=True) if isinstance(df, pl.LazyFrame): df = df.collect() assert df.to_series(0)[0] == 3.234_612_5 @@ -123,9 +123,7 @@ def test_row_to_names_delete_above_delete_rows(df): Test output for remove_rows=True and remove_rows_above=True """ - df = df.janitor.row_to_names( - [2, 3], remove_rows=True, remove_rows_above=True - ) + df = df.row_to_names([2, 3], remove_rows=True, remove_rows_above=True) if isinstance(df, pl.LazyFrame): df = df.collect() assert df.to_series(0)[0] == 2.456234 @@ -140,7 +138,7 @@ def test_row_to_names_delete_above_delete_rows_scalar(df): Test output for remove_rows=True and remove_rows_above=True """ - df = df.janitor.row_to_names(2, remove_rows=True, remove_rows_above=True) + df = df.row_to_names(2, remove_rows=True, remove_rows_above=True) if isinstance(df, pl.LazyFrame): df = df.collect() assert df.to_series(0)[0] == 1.23452345 @@ -157,4 +155,4 @@ def test_row_to_names_delete_above_list_non_consecutive(df): msg += "or the integers in a list are consecutive increasing, " msg += "with a difference of 1." 
with pytest.raises(ValueError, match=msg): - df.janitor.row_to_names([1, 3], remove_rows_above=True) + df.row_to_names([1, 3], remove_rows_above=True) From cce489668547d80538927d70d5fbcd4b40bc3fc8 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Fri, 14 Jun 2024 21:58:03 +1000 Subject: [PATCH 2/3] docs cleanup --- janitor/polars/clean_names.py | 6 +++--- janitor/polars/complete.py | 6 +++--- janitor/polars/pivot_longer.py | 11 ++++++----- janitor/polars/row_to_names.py | 8 ++++---- mkdocs/api/polars.md | 8 ++++---- 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/janitor/polars/clean_names.py b/janitor/polars/clean_names.py index 7d80054a8..e9a54b08a 100644 --- a/janitor/polars/clean_names.py +++ b/janitor/polars/clean_names.py @@ -35,13 +35,13 @@ @register_lazyframe_method @register_dataframe_method def clean_names( - df, + df: pl.DataFrame | pl.LazyFrame, strip_underscores: str | bool = None, case_type: str = "lower", remove_special: bool = False, strip_accents: bool = False, truncate_limit: int = None, -) -> pl.DataFrame: +) -> pl.DataFrame | pl.LazyFrame: """ Clean the column names in a polars DataFrame. @@ -100,7 +100,7 @@ def clean_names( the specified length. Default None does not truncate. Returns: - A polars DataFrame. + A polars DataFrame/LazyFrame. 
""" # noqa: E501 return df.rename( lambda col: _clean_column_names( diff --git a/janitor/polars/complete.py b/janitor/polars/complete.py index ea02473ad..a484a09c4 100644 --- a/janitor/polars/complete.py +++ b/janitor/polars/complete.py @@ -24,13 +24,13 @@ @register_lazyframe_method @register_dataframe_method def complete( - df, + df: pl.DataFrame | pl.LazyFrame, *columns: ColumnNameOrSelector, fill_value: dict | Any | pl.Expr = None, explicit: bool = True, sort: bool = False, by: ColumnNameOrSelector = None, -) -> pl.DataFrame: +) -> pl.DataFrame | pl.LazyFrame: """ Turns implicit missing values into explicit missing values @@ -309,7 +309,7 @@ def complete( The explicit missing rows are returned per group. Returns: - A polars DataFrame. + A polars DataFrame/LazyFrame. """ # noqa: E501 return _complete( df=df, diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py index 870f457a2..ff11fbc44 100644 --- a/janitor/polars/pivot_longer.py +++ b/janitor/polars/pivot_longer.py @@ -27,7 +27,7 @@ def pivot_longer_spec( spec: pl.DataFrame, ) -> pl.DataFrame | pl.LazyFrame: """ - A declarative interface to pivot a DataFrame + A declarative interface to pivot a Polars Frame from wide to long form, where you describe how the data will be unpivoted, using a DataFrame. This gives you, the user, @@ -96,6 +96,7 @@ def pivot_longer_spec( Args: df: The source DataFrame to unpivot. + It can also be a LazyFrame. spec: A specification DataFrame. 
At a minimum, the spec DataFrame must have a `.name` column @@ -156,7 +157,7 @@ def pivot_longer_spec( @register_lazyframe_method @register_dataframe_method def pivot_longer( - df, + df: pl.DataFrame | pl.LazyFrame, index: ColumnNameOrSelector = None, column_names: ColumnNameOrSelector = None, names_to: list | tuple | str = "variable", @@ -164,7 +165,7 @@ def pivot_longer( names_sep: str = None, names_pattern: str = None, names_transform: pl.Expr = None, -) -> pl.DataFrame: +) -> pl.DataFrame | pl.LazyFrame: """ Unpivots a DataFrame from *wide* to *long* format. @@ -373,8 +374,8 @@ def pivot_longer( or names_pattern is provided. Returns: - A polars DataFrame that has been unpivoted from wide to long - format. + A polars DataFrame/LazyFrame that has been unpivoted + from wide to long format. """ # noqa: E501 return _pivot_longer( df=df, diff --git a/janitor/polars/row_to_names.py b/janitor/polars/row_to_names.py index 54f016877..d67f30f6b 100644 --- a/janitor/polars/row_to_names.py +++ b/janitor/polars/row_to_names.py @@ -1,4 +1,4 @@ -"""clean_names implementation for polars.""" +"""row_to_names implementation for polars.""" from __future__ import annotations @@ -20,12 +20,12 @@ @register_lazyframe_method @register_dataframe_method def row_to_names( - df, + df: pl.DataFrame | pl.LazyFrame, row_numbers: int | list = 0, remove_rows: bool = False, remove_rows_above: bool = False, separator: str = "_", -) -> pl.DataFrame: +) -> pl.DataFrame | pl.LazyFrame: """ Elevates a row, or rows, to be the column names of a DataFrame. @@ -113,7 +113,7 @@ def row_to_names( if row_numbers is a list of integers. Default is '_'. Returns: - A polars DataFrame. + A polars DataFrame/LazyFrame. 
""" # noqa: E501 return _row_to_names( df=df, diff --git a/mkdocs/api/polars.md b/mkdocs/api/polars.md index b7cf79b3f..8c586351c 100644 --- a/mkdocs/api/polars.md +++ b/mkdocs/api/polars.md @@ -3,7 +3,7 @@ ::: janitor.polars options: members: - - PolarsExpr - - PolarsDataFrame - - PolarsLazyFrame - - pivot_longer_spec + - clean_names + - complete + - pivot_longer + - row_to_names From 32b41056b070049653b735fff709e2bc1a2c2c2b Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Fri, 14 Jun 2024 22:05:50 +1000 Subject: [PATCH 3/3] fix docs --- janitor/polars/clean_names.py | 4 ++-- janitor/polars/polars_flavor.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/janitor/polars/clean_names.py b/janitor/polars/clean_names.py index e9a54b08a..8c1f4120f 100644 --- a/janitor/polars/clean_names.py +++ b/janitor/polars/clean_names.py @@ -68,7 +68,7 @@ def clean_names( │ 1 ┆ 1 ┆ 1 │ │ 2 ┆ 2 ┆ 2 │ └───────┴────────────┴──────────────┘ - >>> df.janitor.clean_names(remove_special=True) + >>> df.clean_names(remove_special=True) shape: (3, 3) ┌───────┬────────────┬─────────┐ │ aloha ┆ bell_chart ┆ animals │ @@ -142,7 +142,7 @@ def make_clean_names( └─────────────┘ Clean the column values: - >>> df.with_columns(pl.col("raw").janitor.make_clean_names(strip_accents=True)) + >>> df.with_columns(pl.col("raw").make_clean_names(strip_accents=True)) shape: (1, 1) ┌─────────────┐ │ raw │ diff --git a/janitor/polars/polars_flavor.py b/janitor/polars/polars_flavor.py index 0df193267..85149ce29 100644 --- a/janitor/polars/polars_flavor.py +++ b/janitor/polars/polars_flavor.py @@ -22,10 +22,10 @@ def register_dataframe_method(method: Callable) -> Callable: """Register a function as a method attached to the Polars DataFrame. Example: - >>> @register_dataframe_method - >>> def print_column(df, col): - ... '''Print the dataframe column given''' - ... 
print(df[col]) + >>> @register_dataframe_method # doctest: +SKIP + >>> def print_column(df, col): # doctest: +SKIP + ... '''Print the dataframe column given''' # doctest: +SKIP + ... print(df[col]) # doctest: +SKIP !!! info "New in version 0.28.0" @@ -57,10 +57,10 @@ def register_lazyframe_method(method: Callable) -> Callable: """Register a function as a method attached to the Polars LazyFrame. Example: - >>> @register_lazyframe_method - >>> def print_column(df, col): - ... '''Print the dataframe column given''' - ... print(df[col]) + >>> @register_lazyframe_method # doctest: +SKIP + >>> def print_column(df, col): # doctest: +SKIP + ... '''Print the dataframe column given''' # doctest: +SKIP + ... print(df[col]) # doctest: +SKIP !!! info "New in version 0.28.0"