diff --git a/.requirements/docs.in b/.requirements/docs.in index f0d4afc29..b23e373aa 100644 --- a/.requirements/docs.in +++ b/.requirements/docs.in @@ -1,4 +1,5 @@ mkdocs +polars mkdocs-material mkdocstrings>=0.19.0 mkdocstrings-python diff --git a/.requirements/testing.in b/.requirements/testing.in index 57e12c2d3..8179653b8 100644 --- a/.requirements/testing.in +++ b/.requirements/testing.in @@ -4,4 +4,5 @@ pytest>=3.4.2 hypothesis>=4.4.0 interrogate pandas-vet +polars py>=1.10.0 diff --git a/CHANGELOG.md b/CHANGELOG.md index 81fb66223..207b130b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,9 @@ # Changelog ## [Unreleased] -- [ENH] Improved performance for non-equi joins when using numba - @samukweku PR #1341 + +- [ENH] Improved performance for non-equi joins when using numba - @samukweku PR #1341 +- [ENH] Added a `clean_names` method for polars - it can be used to clean the column names, or clean column values. Issue #1343 ## [v0.27.0] - 2024-03-21 diff --git a/environment-dev.yml b/environment-dev.yml index 1f8e48ece..2543e2c76 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -34,6 +34,7 @@ dependencies: - pipreqs - pip-tools - pre-commit + - polars - pyspark>=3.2.0 - pytest - pytest-cov diff --git a/janitor/functions/clean_names.py b/janitor/functions/clean_names.py index 71735a7fc..a38753fa8 100644 --- a/janitor/functions/clean_names.py +++ b/janitor/functions/clean_names.py @@ -1,7 +1,9 @@ -"""Functions for cleaning columns names.""" +"""Functions for cleaning columns/index names and/or column values.""" + +from __future__ import annotations import unicodedata -from typing import Hashable, Optional, Union +from typing import Optional, Union import pandas as pd import pandas_flavor as pf @@ -77,8 +79,9 @@ def clean_names( Column selection is possible using the [`select`][janitor.functions.select.select] syntax. strip_underscores: Removes the outer underscores from all - column names. Default None keeps outer underscores. 
Values can be - either 'left', 'right' or 'both' or the respective shorthand 'l', + column names/values. Default None keeps outer underscores. + Values can be either 'left', 'right' or 'both' + or the respective shorthand 'l', 'r' and True. case_type: Whether to make columns lower or uppercase. Current case may be preserved with 'preserve', @@ -88,15 +91,17 @@ def clean_names( remove_special: Remove special characters from columns. Only letters, numbers and underscores are preserved. strip_accents: Whether or not to remove accents from - columns names. + columns names/values. preserve_original_labels: Preserve original names. This is later retrievable using `df.original_labels`. Applies if `axis` is not None. - enforce_string: Whether or not to convert all column names - to string type. Defaults to True, but can be turned off. + enforce_string: Whether or not to convert all + column names/values to string type. + Defaults to True, but can be turned off. Columns with >1 levels will not be converted by default. - truncate_limit: Truncates formatted column names to - the specified length. Default None does not truncate. + truncate_limit: Truncates formatted column names/values + to the specified length. + Default None does not truncate. Raises: ValueError: If `axis=None` and `column_names=None`. 
@@ -116,7 +121,7 @@ def clean_names( column_names = [column_names] df = df.copy() for column_name in column_names: - df[column_name] = _clean_names_single_object( + df[column_name] = _clean_names( obj=df[column_name], enforce_string=enforce_string, case_type=case_type, @@ -136,7 +141,7 @@ def clean_names( for number in range(target_axis.nlevels) ] target_axis = [ - _clean_names_single_object( + _clean_names( obj=obj, enforce_string=enforce_string, case_type=case_type, @@ -148,7 +153,7 @@ def clean_names( for obj in target_axis ] else: - target_axis = _clean_names_single_object( + target_axis = _clean_names( obj=target_axis, enforce_string=enforce_string, case_type=case_type, @@ -164,100 +169,108 @@ def clean_names( return df -def _clean_names_single_object( +def _clean_names( obj: Union[pd.Index, pd.Series], - enforce_string, - case_type, - remove_special, - strip_accents, - strip_underscores, - truncate_limit, -): + strip_underscores: Optional[Union[str, bool]] = None, + case_type: str = "lower", + remove_special: bool = False, + strip_accents: bool = False, + enforce_string: bool = False, + truncate_limit: int = None, +) -> Union[pd.Index, pd.Series]: """ - Apply _clean_names on a single pandas object. + Generic function to clean labels in a pandas object. 
""" - if enforce_string and not (_is_str_or_cat(obj)): + if enforce_string and not _is_str_or_cat(obj): obj = obj.astype(str) - obj = _change_case(obj, case_type) - obj = _normalize_1(obj) + obj = _change_case(obj=obj, case_type=case_type) + obj = _normalize_1(obj=obj) if remove_special: - obj = obj.map(_remove_special) + obj = obj.str.replace( + pat="[^A-Za-z_\\d]", repl="", regex=True + ).str.strip() if strip_accents: - obj = obj.map(_strip_accents) + obj = _strip_accents(obj=obj) obj = obj.str.replace(pat="_+", repl="_", regex=True) - obj = _strip_underscores_func(obj, strip_underscores=strip_underscores) + obj = _strip_underscores_func( + obj, + strip_underscores=strip_underscores, + ) if truncate_limit: obj = obj.str[:truncate_limit] return obj -def _change_case(col: Union[pd.Index, pd.Series], case_type: str) -> str: - """Change case of labels in pandas object.""" +def _change_case( + obj: Union[pd.Index, pd.Series], + case_type: str, +) -> Union[pd.Index, pd.Series]: + """Change case of labels in obj.""" case_types = {"preserve", "upper", "lower", "snake"} case_type = case_type.lower() if case_type not in case_types: raise JanitorError(f"case_type must be one of: {case_types}") + if case_type == "preserve": - return col + return obj if case_type == "upper": - return col.str.upper() + return obj.str.upper() if case_type == "lower": - return col.str.lower() + return obj.str.lower() # Implementation taken from: https://gist.github.com/jaytaylor/3660565 # by @jtaylor return ( - col.str.replace(pat=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", regex=True) + obj.str.replace(pat=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", regex=True) .str.replace(pat=r"([a-z0-9])([A-Z])", repl=r"\1_\2", regex=True) .str.lower() ) -def _remove_special(label: Hashable) -> str: - """Remove special characters from label.""" - return "".join( - [item for item in str(label) if item.isalnum() or "_" in item] - ) - - -def _normalize_1(col: Union[pd.Index, pd.Series]) -> str: - """Perform normalization of 
labels in pandas object.""" +def _normalize_1( + obj: Union[pd.Index, pd.Series] +) -> Union[pd.Index, pd.Series]: + """Perform normalization of labels in obj.""" FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", ""), (r"[\xa0]", "_")] for search, replace in FIXES: - col = col.str.replace(pat=search, repl=replace, regex=True) - return col + obj = obj.str.replace(pat=search, repl=replace, regex=True) + + return obj -def _strip_accents(label: Hashable) -> str: +def _strip_accents( + obj: Union[pd.Index, pd.Series], +) -> Union[pd.Index, pd.Series]: """Remove accents from a label. Inspired from [StackOverflow][so]. [so]: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-strin """ # noqa: E501 - - return "".join( - [ - letter - for letter in unicodedata.normalize("NFD", str(label)) - if not unicodedata.combining(letter) - ] + return obj.map( + lambda f: "".join( + [ + letter + for letter in unicodedata.normalize("NFD", str(f)) + if not unicodedata.combining(letter) + ] + ) ) def _strip_underscores_func( - col: Union[pd.Index, pd.Series], strip_underscores: Union[str, bool] = None -) -> pd.DataFrame: - """Strip underscores from a pandas object.""" + obj: Union[pd.Index, pd.Series], + strip_underscores: Union[str, bool] = None, +) -> Union[pd.Index, pd.Series]: + """Strip underscores.""" underscore_options = {None, "left", "right", "both", "l", "r", True} if strip_underscores not in underscore_options: raise JanitorError( f"strip_underscores must be one of: {underscore_options}" ) - - if strip_underscores in ["left", "l"]: - return col.str.lstrip("_") - if strip_underscores in ["right", "r"]: - return col.str.rstrip("_") + if strip_underscores in {"left", "l"}: + return obj.str.lstrip("_") + if strip_underscores in {"right", "r"}: + return obj.str.rstrip("_") if strip_underscores in {True, "both"}: - return col.str.strip("_") - return col + return obj.str.strip("_") + return obj diff --git a/janitor/functions/utils.py 
b/janitor/functions/utils.py index 9c9b98691..2c5b2eb1d 100644 --- a/janitor/functions/utils.py +++ b/janitor/functions/utils.py @@ -5,6 +5,7 @@ import fnmatch import inspect import re +import unicodedata import warnings from collections.abc import Callable as dispatch_callable from dataclasses import dataclass @@ -36,7 +37,13 @@ from pandas.core.common import is_bool_indexer from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy -from janitor.utils import _expand_grid, check, check_column, find_stack_level +from janitor.errors import JanitorError +from janitor.utils import ( + _expand_grid, + check, + check_column, + find_stack_level, +) warnings.simplefilter("always", DeprecationWarning) @@ -1137,3 +1144,81 @@ def __eq__(self, other): """ self.join_args = (self.cols, other.cols, "==") return self + + +def _change_case( + obj: str, + case_type: str, +) -> str: + """Change case of obj.""" + case_types = {"preserve", "upper", "lower", "snake"} + case_type = case_type.lower() + if case_type not in case_types: + raise JanitorError(f"type must be one of: {case_types}") + + if case_type == "preserve": + return obj + if case_type == "upper": + return obj.upper() + if case_type == "lower": + return obj.lower() + # Implementation adapted from: https://gist.github.com/jaytaylor/3660565 + # by @jtaylor + obj = re.sub(pattern=r"(.)([A-Z][a-z]+)", repl=r"\1_\2", string=obj) + obj = re.sub(pattern=r"([a-z0-9])([A-Z])", repl=r"\1_\2", string=obj) + return obj.lower() + + +def _normalize_1(obj: str) -> str: + """Perform normalization of obj.""" + FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", ""), (r"[\xa0]", "_")] + for search, replace in FIXES: + obj = re.sub(pattern=search, repl=replace, string=obj) + + return obj + + +def _remove_special( + obj: str, +) -> str: + """Remove special characters from obj.""" + obj = [item for item in obj if item.isalnum() or (item == "_")] + return "".join(obj) + + +def _strip_accents( + obj: str, +) -> str: + """Remove accents from 
obj. + + Inspired from [StackOverflow][so]. + + [so]: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-strin + """ # noqa: E501 + + obj = [ + letter + for letter in unicodedata.normalize("NFD", obj) + if not unicodedata.combining(letter) + ] + return "".join(obj) + + +def _strip_underscores_func( + obj: str, + strip_underscores: Union[str, bool] = None, +) -> str: + """Strip underscores from obj.""" + underscore_options = {None, "left", "right", "both", "l", "r", True} + if strip_underscores not in underscore_options: + raise JanitorError( + f"strip_underscores must be one of: {underscore_options}" + ) + + if strip_underscores in {"left", "l"}: + return obj.lstrip("_") + if strip_underscores in {"right", "r"}: + return obj.rstrip("_") + if strip_underscores in {True, "both"}: + return obj.strip("_") + return obj diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py new file mode 100644 index 000000000..f130df071 --- /dev/null +++ b/janitor/polars/__init__.py @@ -0,0 +1,259 @@ +from __future__ import annotations + +from janitor.utils import import_message + +from .clean_names import _clean_column_names, _clean_expr_names + +try: + import polars as pl +except ImportError: + import_message( + submodule="polars", + package="polars", + conda_channel="conda-forge", + pip_install=True, + ) + + +@pl.api.register_dataframe_namespace("janitor") +class PolarsFrame: + def __init__(self, df: pl.DataFrame) -> pl.DataFrame: + self._df = df + + def clean_names( + self, + strip_underscores: str | bool = None, + case_type: str = "lower", + remove_special: bool = False, + strip_accents: bool = False, + truncate_limit: int = None, + ) -> pl.DataFrame: + """ + Clean the column names in a polars DataFrame. + + Examples: + >>> import polars as pl + >>> import janitor.polars + >>> df = pl.DataFrame( + ... { + ... "Aloha": range(3), + ... "Bell Chart": range(3), + ... "Animals@#$%^": range(3) + ... } + ... 
) + >>> df + shape: (3, 3) + ┌───────┬────────────┬──────────────┐ + │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═══════╪════════════╪══════════════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 2 ┆ 2 │ + └───────┴────────────┴──────────────┘ + >>> df.janitor.clean_names(remove_special=True) + shape: (3, 3) + ┌───────┬────────────┬─────────┐ + │ aloha ┆ bell_chart ┆ animals │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═══════╪════════════╪═════════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 2 ┆ 2 │ + └───────┴────────────┴─────────┘ + + !!! info "New in version 0.28.0" + + Args: + strip_underscores: Removes the outer underscores from all + column names. Default None keeps outer underscores. Values can be + either 'left', 'right' or 'both' or the respective shorthand 'l', + 'r' and True. + case_type: Whether to make the column names lower or uppercase. + Current case may be preserved with 'preserve', + while snake case conversion (from CamelCase or camelCase only) + can be turned on using "snake". + Default 'lower' makes all characters lowercase. + remove_special: Remove special characters from the column names. + Only letters, numbers and underscores are preserved. + strip_accents: Whether or not to remove accents from + the labels. + truncate_limit: Truncates formatted column names to + the specified length. Default None does not truncate. + + Returns: + A polars DataFrame. 
+ """ # noqa: E501 + return self._df.rename( + lambda col: _clean_column_names( + obj=col, + strip_accents=strip_accents, + strip_underscores=strip_underscores, + case_type=case_type, + remove_special=remove_special, + truncate_limit=truncate_limit, + ) + ) + + +@pl.api.register_lazyframe_namespace("janitor") +class PolarsLazyFrame: + def __init__(self, df: pl.LazyFrame) -> pl.LazyFrame: + self._df = df + + def clean_names( + self, + strip_underscores: str | bool = None, + case_type: str = "lower", + remove_special: bool = False, + strip_accents: bool = False, + truncate_limit: int = None, + ) -> pl.LazyFrame: + """ + Clean the column names in a polars LazyFrame. + + Examples: + >>> import polars as pl + >>> import janitor.polars + >>> df = pl.LazyFrame( + ... { + ... "Aloha": range(3), + ... "Bell Chart": range(3), + ... "Animals@#$%^": range(3) + ... } + ... ) + >>> df.collect() + shape: (3, 3) + ┌───────┬────────────┬──────────────┐ + │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═══════╪════════════╪══════════════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 2 ┆ 2 │ + └───────┴────────────┴──────────────┘ + >>> df.janitor.clean_names(remove_special=True).collect() + shape: (3, 3) + ┌───────┬────────────┬─────────┐ + │ aloha ┆ bell_chart ┆ animals │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═══════╪════════════╪═════════╡ + │ 0 ┆ 0 ┆ 0 │ + │ 1 ┆ 1 ┆ 1 │ + │ 2 ┆ 2 ┆ 2 │ + └───────┴────────────┴─────────┘ + + !!! info "New in version 0.28.0" + + Args: + strip_underscores: Removes the outer underscores from all + column names. Default None keeps outer underscores. Values can be + either 'left', 'right' or 'both' or the respective shorthand 'l', + 'r' and True. + case_type: Whether to make the column names lower or uppercase. + Current case may be preserved with 'preserve', + while snake case conversion (from CamelCase or camelCase only) + can be turned on using "snake". + Default 'lower' makes all characters lowercase. 
+ remove_special: Remove special characters from the column names. + Only letters, numbers and underscores are preserved. + strip_accents: Whether or not to remove accents from + the labels. + truncate_limit: Truncates formatted column names to + the specified length. Default None does not truncate. + + Returns: + A polars LazyFrame. + """ # noqa: E501 + return self._df.rename( + lambda col: _clean_column_names( + obj=col, + strip_accents=strip_accents, + strip_underscores=strip_underscores, + case_type=case_type, + remove_special=remove_special, + truncate_limit=truncate_limit, + ) + ) + + +@pl.api.register_expr_namespace("janitor") +class PolarsExpr: + def __init__(self, expr: pl.Expr) -> pl.Expr: + self._expr = expr + + def clean_names( + self, + strip_underscores: str | bool = None, + case_type: str = "lower", + remove_special: bool = False, + strip_accents: bool = False, + enforce_string: bool = False, + truncate_limit: int = None, + ) -> pl.Expr: + """ + Clean the labels in a polars Expression. + + Examples: + >>> import polars as pl + >>> import janitor.polars + >>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]}) + >>> df + shape: (1, 1) + ┌─────────────┐ + │ raw │ + │ --- │ + │ str │ + ╞═════════════╡ + │ Abçdê fgí j │ + └─────────────┘ + + Clean the column values: + >>> df.with_columns(pl.col("raw").janitor.clean_names(strip_accents=True)) + shape: (1, 1) + ┌─────────────┐ + │ raw │ + │ --- │ + │ str │ + ╞═════════════╡ + │ abcde_fgi_j │ + └─────────────┘ + + !!! info "New in version 0.28.0" + + Args: + strip_underscores: Removes the outer underscores + from all labels in the expression. + Default None keeps outer underscores. + Values can be either 'left', 'right' + or 'both' or the respective shorthand 'l', + 'r' and True. + case_type: Whether to make the labels in the expression lower or uppercase. + Current case may be preserved with 'preserve', + while snake case conversion (from CamelCase or camelCase only) + can be turned on using "snake". 
+ Default 'lower' makes all characters lowercase. + remove_special: Remove special characters from the values in the expression. + Only letters, numbers and underscores are preserved. + strip_accents: Whether or not to remove accents from + the expression. + enforce_string: Whether or not to cast the expression to a string type. + truncate_limit: Truncates formatted labels in the expression to + the specified length. Default None does not truncate. + + Returns: + A polars Expression. + """ + return _clean_expr_names( + obj=self._expr, + strip_accents=strip_accents, + strip_underscores=strip_underscores, + case_type=case_type, + remove_special=remove_special, + enforce_string=enforce_string, + truncate_limit=truncate_limit, + ) diff --git a/janitor/polars/clean_names.py b/janitor/polars/clean_names.py new file mode 100644 index 000000000..90e2656e2 --- /dev/null +++ b/janitor/polars/clean_names.py @@ -0,0 +1,169 @@ +"""clean_names implementation for polars.""" + +from __future__ import annotations + +import re +import unicodedata + +from janitor.errors import JanitorError +from janitor.functions.utils import ( + _change_case, + _normalize_1, + _remove_special, + _strip_accents, + _strip_underscores_func, +) +from janitor.utils import import_message + +try: + import polars as pl +except ImportError: + import_message( + submodule="polars", + package="polars", + conda_channel="conda-forge", + pip_install=True, + ) + + +def _change_case_expr( + obj: pl.Expr, + case_type: str, +) -> pl.Expr: + """Change case of labels in obj.""" + case_types = {"preserve", "upper", "lower", "snake"} + case_type = case_type.lower() + if case_type not in case_types: + raise JanitorError(f"type must be one of: {case_types}") + + if case_type == "preserve": + return obj + if case_type == "upper": + return obj.str.to_uppercase() + if case_type == "lower": + return obj.str.to_lowercase() + # Implementation taken from: https://gist.github.com/jaytaylor/3660565 + # by @jtaylor + return ( + 
obj.str.replace_all( + pattern=r"(.)([A-Z][a-z]+)", value=r"${1}_${2}", literal=False + ) + .str.replace_all( + pattern=r"([a-z0-9])([A-Z])", value=r"${1}_${2}", literal=False + ) + .str.to_lowercase() + ) + + +def _normalize_expr(obj: pl.Expr) -> pl.Expr: + """Perform normalization of labels in obj.""" + FIXES = [(r"[ /:,?()\.-]", "_"), (r"['’]", ""), (r"[\xa0]", "_")] + for search, replace in FIXES: + obj = obj.str.replace_all(pattern=search, value=replace, literal=False) + return obj + + +def _remove_special_expr( + obj: pl.Expr, +) -> pl.Expr: + """Remove special characters from the labels in obj.""" + return obj.str.replace_all( + pattern="[^A-Za-z_\\d]", value="", literal=False + ).str.strip_chars() + + +def _strip_accents_expr( + obj: pl.Expr, +) -> pl.Expr: + """Remove accents from the labels in obj. + + Inspired from [StackOverflow][so]. + + [so]: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-strin + """ # noqa: E501 + # TODO: possible implementation in Rust + # or use a pyarrow implementation? 
+ # https://github.com/pola-rs/polars/issues/11455 + return obj.map_elements( + lambda word: [ + letter + for letter in unicodedata.normalize("NFD", word) + if not unicodedata.combining(letter) + ], + return_dtype=pl.List(pl.Utf8), + ).list.join("") + + +def _strip_underscores_func_expr( + obj: pl.Expr, + strip_underscores: str | bool = None, +) -> pl.Expr: + """Strip underscores from obj.""" + underscore_options = {None, "left", "right", "both", "l", "r", True} + if strip_underscores not in underscore_options: + raise JanitorError( + f"strip_underscores must be one of: {underscore_options}" + ) + if strip_underscores in {"left", "l"}: + return obj.str.strip_chars_start("_") + if strip_underscores in {"right", "r"}: + return obj.str.strip_chars_end("_") + if strip_underscores in {True, "both"}: + return obj.str.strip_chars("_") + return obj + + +def _clean_column_names( + obj: str, + strip_underscores: str | bool = None, + case_type: str = "lower", + remove_special: bool = False, + strip_accents: bool = False, + truncate_limit: int = None, +) -> str: + """ + Function to clean the column names of a polars DataFrame. + """ + obj = _change_case(obj=obj, case_type=case_type) + obj = _normalize_1(obj=obj) + if remove_special: + obj = _remove_special(obj=obj) + if strip_accents: + obj = _strip_accents(obj=obj) + obj = re.sub(pattern="_+", repl="_", string=obj) + obj = _strip_underscores_func( + obj, + strip_underscores=strip_underscores, + ) + obj = obj[:truncate_limit] + return obj + + +def _clean_expr_names( + obj: pl.Expr, + strip_underscores: str | bool = None, + case_type: str = "lower", + remove_special: bool = False, + strip_accents: bool = False, + enforce_string: bool = False, + truncate_limit: int = None, +) -> pl.Expr: + """ + Function to clean the labels of a polars Expression. 
+ """ + if enforce_string: + obj = obj.cast(pl.Utf8) + obj = _change_case_expr(obj=obj, case_type=case_type) + obj = _normalize_expr(obj=obj) + if remove_special: + obj = _remove_special_expr(obj=obj) + if strip_accents: + obj = _strip_accents_expr(obj=obj) + obj = obj.str.replace_all(pattern="_+", value="_", literal=False) + obj = _strip_underscores_func_expr( + obj, + strip_underscores=strip_underscores, + ) + if truncate_limit: + obj = obj.str.slice(offset=0, length=truncate_limit) + return obj diff --git a/janitor/spark/functions.py b/janitor/spark/functions.py index a43f7338d..57abd1824 100644 --- a/janitor/spark/functions.py +++ b/janitor/spark/functions.py @@ -4,7 +4,7 @@ from typing import Union from janitor import utils as janitor_utils -from janitor.functions.clean_names import ( +from janitor.functions.utils import ( _change_case, _normalize_1, _remove_special, diff --git a/mkdocs.yml b/mkdocs.yml index 639d71bea..a7545afc5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -45,6 +45,7 @@ nav: - Machine Learning: api/ml.md - Math: api/math.md # - PySpark: api/pyspark.md # will be added back later + - Polars: api/polars.md - Timeseries: api/timeseries.md - XArray: api/xarray.md - Development Guide: devguide.md diff --git a/mkdocs/api/polars.md b/mkdocs/api/polars.md new file mode 100644 index 000000000..f5d5fed35 --- /dev/null +++ b/mkdocs/api/polars.md @@ -0,0 +1,8 @@ +# Polars + +::: janitor.polars + options: + members: + - PolarsExpr + - PolarsFrame + - PolarsLazyFrame diff --git a/pyproject.toml b/pyproject.toml index f6b98f54b..0a697589f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ markers = [ "utils: utility tests", "engineering: tests for engineering", "ml: tests for machine learning", + "polars: tests for polars methods", "spark_functions: tests for pyspark functions", "xarray: tests for xarray functions", "timeseries: tests for timeseries", diff --git a/tests/polars/functions/test_clean_names_polars.py 
b/tests/polars/functions/test_clean_names_polars.py new file mode 100644 index 000000000..23ce38742 --- /dev/null +++ b/tests/polars/functions/test_clean_names_polars.py @@ -0,0 +1,102 @@ +import polars as pl +import pytest + +from janitor import polars # noqa: F401 + + +@pytest.mark.functions +def test_clean_names_method_chain(dataframe): + """Tests clean_names default args in a method chain.""" + df = pl.from_pandas(dataframe) + df = df.janitor.clean_names() + expected_columns = [ + "a", + "bell_chart", + "decorated_elephant", + "animals@#$%^", + "cities", + ] + assert df.columns == expected_columns + + +@pytest.mark.functions +def test_clean_names_special_characters(dataframe): + """Tests clean_names `remove_special` parameter.""" + df = pl.from_pandas(dataframe) + df = df.janitor.clean_names(remove_special=True) + expected_columns = [ + "a", + "bell_chart", + "decorated_elephant", + "animals", + "cities", + ] + assert df.columns == expected_columns + + +@pytest.mark.functions +def test_clean_names_uppercase(dataframe): + """Tests clean_names `case_type` parameter = upper.""" + df = pl.from_pandas(dataframe) + df = df.janitor.clean_names(remove_special=True, case_type="upper") + expected_columns = [ + "A", + "BELL_CHART", + "DECORATED_ELEPHANT", + "ANIMALS", + "CITIES", + ] + assert df.columns == expected_columns + + +@pytest.mark.functions +def test_clean_names_strip_accents(): + """Tests clean_names `strip_accents` parameter.""" + df = pl.DataFrame({"João": [1, 2], "Лука́ся": [1, 2], "Käfer": [1, 2]}) + df = df.janitor.clean_names(strip_accents=True) + expected_columns = ["joao", "лукася", "kafer"] + assert df.columns == expected_columns + + +@pytest.mark.functions +def test_clean_names_camelcase_to_snake(dataframe): + """Tests clean_names `case_type` parameter = snake.""" + df = pl.from_pandas(dataframe) + df = ( + df.select("a") + .rename({"a": "AColumnName"}) + .janitor.clean_names(remove_special=True, case_type="snake") + ) + assert df.columns == 
["a_column_name"] + + +@pytest.mark.functions +def test_clean_names_truncate_limit(dataframe): + """Tests clean_names `truncate_limit` parameter.""" + df = pl.from_pandas(dataframe) + df = df.janitor.clean_names(truncate_limit=7) + expected_columns = ["a", "bell_ch", "decorat", "animals", "cities"] + assert df.columns == expected_columns + + +@pytest.mark.functions +def test_charac(): + """Ensure non standard characters and spaces have been cleaned up.""" + + df = pl.DataFrame( + { + r"Current accountbalance(in % of GDP)": range(5), + } + ) + df = df.janitor.clean_names(strip_underscores=True, case_type="lower") + + assert "current_accountbalance_in_%_of_gdp" in df.columns + + +def test_clean_column_values(): + """Clean column values""" + raw = pl.DataFrame({"raw": ["Abçdê fgí j"]}) + outcome = raw.with_columns( + pl.col("raw").janitor.clean_names(strip_accents=True) + ) + assert list(outcome)[0][0] == "abcde_fgi_j"