Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

create polars equivalent of pandas_flavor #1374

Merged
merged 7 commits into from
Jun 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions janitor/polars/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from .dataframe import PolarsDataFrame
from .expressions import PolarsExpr
from .lazyframe import PolarsLazyFrame
from .pivot_longer import pivot_longer_spec
from .clean_names import clean_names, make_clean_names
from .complete import complete
from .pivot_longer import pivot_longer, pivot_longer_spec
from .row_to_names import row_to_names

# Names exported by `from janitor.polars import *`.
__all__ = [
    "pivot_longer_spec",
    "pivot_longer",
    "clean_names",
    "PolarsDataFrame",
    "PolarsLazyFrame",
    "PolarsExpr",
    "make_clean_names",
    "row_to_names",
    "complete",
]
162 changes: 162 additions & 0 deletions janitor/polars/clean_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@
)
from janitor.utils import import_message

from .polars_flavor import (
register_dataframe_method,
register_expr_method,
register_lazyframe_method,
)

try:
import polars as pl
except ImportError:
Expand All @@ -26,6 +32,162 @@
)


@register_lazyframe_method
@register_dataframe_method
def clean_names(
    df: pl.DataFrame | pl.LazyFrame,
    strip_underscores: str | bool | None = None,
    case_type: str = "lower",
    remove_special: bool = False,
    strip_accents: bool = False,
    truncate_limit: int | None = None,
) -> pl.DataFrame | pl.LazyFrame:
    """
    Clean the column names in a polars DataFrame.

    `clean_names` can also be applied to a LazyFrame.

    Examples:
        >>> import polars as pl
        >>> import janitor.polars
        >>> df = pl.DataFrame(
        ...     {
        ...         "Aloha": range(3),
        ...         "Bell Chart": range(3),
        ...         "Animals@#$%^": range(3)
        ...     }
        ... )
        >>> df
        shape: (3, 3)
        ┌───────┬────────────┬──────────────┐
        │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │
        │ ---   ┆ ---        ┆ ---          │
        │ i64   ┆ i64        ┆ i64          │
        ╞═══════╪════════════╪══════════════╡
        │ 0     ┆ 0          ┆ 0            │
        │ 1     ┆ 1          ┆ 1            │
        │ 2     ┆ 2          ┆ 2            │
        └───────┴────────────┴──────────────┘
        >>> df.clean_names(remove_special=True)
        shape: (3, 3)
        ┌───────┬────────────┬─────────┐
        │ aloha ┆ bell_chart ┆ animals │
        │ ---   ┆ ---        ┆ ---     │
        │ i64   ┆ i64        ┆ i64     │
        ╞═══════╪════════════╪═════════╡
        │ 0     ┆ 0          ┆ 0       │
        │ 1     ┆ 1          ┆ 1       │
        │ 2     ┆ 2          ┆ 2       │
        └───────┴────────────┴─────────┘

    !!! info "New in version 0.28.0"

    Args:
        df: The polars DataFrame or LazyFrame whose column names are
            cleaned. Supplied implicitly when called as a method,
            e.g. `df.clean_names(...)`.
        strip_underscores: Removes the outer underscores from all
            column names. Default None keeps outer underscores. Values can be
            either 'left', 'right' or 'both' or the respective shorthand 'l',
            'r' and True.
        case_type: Whether to make the column names lower or uppercase.
            Current case may be preserved with 'preserve',
            while snake case conversion (from CamelCase or camelCase only)
            can be turned on using "snake".
            Default 'lower' makes all characters lowercase.
        remove_special: Remove special characters from the column names.
            Only letters, numbers and underscores are preserved.
        strip_accents: Whether or not to remove accents from
            the labels.
        truncate_limit: Truncates formatted column names to
            the specified length. Default None does not truncate.

    Returns:
        A polars DataFrame/LazyFrame.
    """  # noqa: E501
    # `rename` accepts a callable that is applied to each column name;
    # the per-name cleaning itself is delegated to `_clean_column_names`.
    return df.rename(
        lambda col: _clean_column_names(
            obj=col,
            strip_accents=strip_accents,
            strip_underscores=strip_underscores,
            case_type=case_type,
            remove_special=remove_special,
            truncate_limit=truncate_limit,
        )
    )


@register_expr_method
def make_clean_names(
    expression: pl.Expr,
    strip_underscores: str | bool | None = None,
    case_type: str = "lower",
    remove_special: bool = False,
    strip_accents: bool = False,
    enforce_string: bool = False,
    truncate_limit: int | None = None,
) -> pl.Expr:
    """
    Clean the labels in a polars Expression.

    Examples:
        >>> import polars as pl
        >>> import janitor.polars
        >>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]})
        >>> df
        shape: (1, 1)
        ┌─────────────┐
        │ raw         │
        │ ---         │
        │ str         │
        ╞═════════════╡
        │ Abçdê fgí j │
        └─────────────┘

        Clean the column values:
        >>> df.with_columns(pl.col("raw").make_clean_names(strip_accents=True))
        shape: (1, 1)
        ┌─────────────┐
        │ raw         │
        │ ---         │
        │ str         │
        ╞═════════════╡
        │ abcde_fgi_j │
        └─────────────┘

    !!! info "New in version 0.28.0"

    Args:
        expression: The polars expression whose labels are cleaned.
            Supplied implicitly when called as a method,
            e.g. `pl.col("raw").make_clean_names(...)`.
        strip_underscores: Removes the outer underscores
            from all labels in the expression.
            Default None keeps outer underscores.
            Values can be either 'left', 'right'
            or 'both' or the respective shorthand 'l',
            'r' and True.
        case_type: Whether to make the labels in the expression
            lower or uppercase.
            Current case may be preserved with 'preserve',
            while snake case conversion (from CamelCase or camelCase only)
            can be turned on using "snake".
            Default 'lower' makes all characters lowercase.
        remove_special: Remove special characters from the values
            in the expression.
            Only letters, numbers and underscores are preserved.
        strip_accents: Whether or not to remove accents from
            the expression.
        enforce_string: Whether or not to cast the expression
            to a string type.
        truncate_limit: Truncates formatted labels in the expression to
            the specified length. Default None does not truncate.

    Returns:
        A polars Expression.
    """
    # All of the cleaning is delegated to `_clean_expr_names`; this
    # function only exposes it as a registered expression method.
    return _clean_expr_names(
        obj=expression,
        strip_accents=strip_accents,
        strip_underscores=strip_underscores,
        case_type=case_type,
        remove_special=remove_special,
        enforce_string=enforce_string,
        truncate_limit=truncate_limit,
    )


def _change_case_expr(
obj: pl.Expr,
case_type: str,
Expand Down
Loading
Loading