From 9e1a4e573d2e28bac6c57e81e460b4cb93217167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:58:34 +0000 Subject: [PATCH 1/2] docs: Make dprint wrap long lines in markdown dprint is now configured to wrap text at roughly 100 characters so that lines do not get too long. --- dprint.json | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dprint.json b/dprint.json index 4d3bb797aa47..38bd6131b936 100644 --- a/dprint.json +++ b/dprint.json @@ -2,6 +2,10 @@ "includes": [ "**/*.{md,toml,json}" ], + "markdown": { + "lineWidth": 100, + "textWrap": "always" + }, "excludes": [ ".venv/", "**/target/", From 7607f8bd58ce40fd30fc714474652642092f0b2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Gir=C3=A3o=20Serr=C3=A3o?= <5621605+rodrigogiraoserrao@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:59:05 +0000 Subject: [PATCH 2/2] Run dprint with new config settings. --- .github/CODE_OF_CONDUCT.md | 66 ++--- CONTRIBUTING.md | 6 +- README.md | 64 +++-- crates/polars-arrow/src/README.md | 23 +- crates/polars-arrow/src/array/README.md | 41 ++- crates/polars-arrow/src/compute/README.md | 20 +- crates/polars-arrow/src/doc/lib.md | 20 +- crates/polars-arrow/src/io/README.md | 20 +- crates/polars-arrow/src/scalar/README.md | 7 +- crates/polars-compute/README.md | 6 +- crates/polars-core/README.md | 6 +- crates/polars-error/README.md | 6 +- crates/polars-expr/README.md | 3 +- crates/polars-io/README.md | 6 +- crates/polars-json/README.md | 6 +- crates/polars-lazy/README.md | 8 +- crates/polars-mem-engine/README.md | 6 +- crates/polars-ops/README.md | 6 +- .../polars-parquet/src/arrow/read/README.md | 35 ++- .../src/arrow/read/deserialize/README.md | 46 +-- crates/polars-pipe/README.md | 6 +- crates/polars-plan/README.md | 6 +- crates/polars-python/README.md | 7 +- crates/polars-row/README.md | 6 +- crates/polars-schema/README.md | 6 +- crates/polars-sql/README.md | 6 +- crates/polars-stream/README.md | 6 +- crates/polars-time/README.md | 6 +- crates/polars-utils/README.md | 6 +- docs/README.md | 18 +- docs/source/api/index.md | 12 +- docs/source/development/contributing/ci.md | 43 ++- docs/source/development/contributing/ide.md | 57 ++-- docs/source/development/contributing/index.md | 267 +++++++++++------- docs/source/development/contributing/test.md | 121 ++++---- docs/source/development/versioning.md | 108 ++++--- docs/source/index.md | 33 ++- docs/source/releases/changelog.md | 3 +- docs/source/releases/upgrade/0.19.md | 45 +-- docs/source/releases/upgrade/0.20.md | 106 ++++--- docs/source/releases/upgrade/1.md | 174 ++++++------ docs/source/releases/upgrade/index.md | 7 +- docs/source/user-guide/concepts/_streaming.md | 18 +- .../concepts/data-types-and-structures.md | 94 +++--- .../concepts/expressions-and-contexts.md | 139 +++++---- docs/source/user-guide/concepts/index.md | 3 +- docs/source/user-guide/concepts/lazy-api.md | 41 ++- docs/source/user-guide/ecosystem.md | 48 +++- .../user-guide/expressions/aggregation.md | 85 +++--- .../expressions/basic-operations.md | 68 +++-- docs/source/user-guide/expressions/casting.md | 68 +++-- .../expressions/categorical-data-and-enums.md | 176 +++++++----- .../expressions/expression-expansion.md | 162 +++++++---- docs/source/user-guide/expressions/folds.md | 33 ++- docs/source/user-guide/expressions/index.md | 7 +- .../expressions/lists-and-arrays.md | 86 +++--- .../user-guide/expressions/missing-data.md | 82 +++--- 
.../user-guide/expressions/numpy-functions.md | 22 +- docs/source/user-guide/expressions/strings.md | 102 ++++--- docs/source/user-guide/expressions/structs.md | 67 +++-- .../user-defined-python-functions.md | 115 +++++--- .../expressions/window-functions.md | 84 +++--- docs/source/user-guide/getting-started.md | 100 ++++--- docs/source/user-guide/gpu-support.md | 74 +++-- docs/source/user-guide/installation.md | 19 +- docs/source/user-guide/io/cloud-storage.md | 24 +- docs/source/user-guide/io/csv.md | 8 +- docs/source/user-guide/io/database.md | 54 +++- docs/source/user-guide/io/excel.md | 36 ++- docs/source/user-guide/io/hugging-face.md | 27 +- docs/source/user-guide/io/index.md | 3 +- docs/source/user-guide/io/json.md | 7 +- docs/source/user-guide/io/multiple.md | 8 +- docs/source/user-guide/io/parquet.md | 19 +- docs/source/user-guide/lazy/execution.md | 18 +- docs/source/user-guide/lazy/gpu.md | 7 +- docs/source/user-guide/lazy/index.md | 4 +- docs/source/user-guide/lazy/optimizations.md | 7 +- docs/source/user-guide/lazy/query-plan.md | 13 +- docs/source/user-guide/lazy/schemas.md | 27 +- docs/source/user-guide/lazy/using.md | 12 +- docs/source/user-guide/migration/pandas.md | 182 ++++++------ docs/source/user-guide/migration/spark.md | 20 +- docs/source/user-guide/misc/arrow.md | 32 ++- docs/source/user-guide/misc/comparison.md | 50 +++- .../source/user-guide/misc/multiprocessing.md | 117 +++++--- docs/source/user-guide/misc/styling.md | 5 +- docs/source/user-guide/misc/visualization.md | 29 +- .../plugins/your-first-polars-plugin.md | 103 ++++--- docs/source/user-guide/sql/create.md | 11 +- docs/source/user-guide/sql/cte.md | 25 +- docs/source/user-guide/sql/intro.md | 44 +-- docs/source/user-guide/sql/select.md | 22 +- docs/source/user-guide/sql/show.md | 14 +- .../transformations/concatenation.md | 37 ++- .../user-guide/transformations/index.md | 3 +- .../user-guide/transformations/joins.md | 137 +++++---- .../user-guide/transformations/pivot.md | 15 +- .../transformations/time-series/filter.md | 19 +- .../transformations/time-series/parsing.md | 25 +- .../transformations/time-series/resampling.md | 11 +- .../transformations/time-series/rolling.md | 52 ++-- .../transformations/time-series/timezones.md | 20 +- 103 files changed, 2680 insertions(+), 1710 deletions(-) diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md index 9d2ce2333742..12a6354d20af 100644 --- a/.github/CODE_OF_CONDUCT.md +++ b/.github/CODE_OF_CONDUCT.md @@ -2,17 +2,15 @@ ## Our Pledge -In the interest of fostering an open and welcoming environment, we as -contributors and maintainers pledge to make participation in our project and -our community a harassment-free experience for everyone, regardless of age, body -size, disability, ethnicity, sex characteristics, gender identity and expression, -level of experience, education, socio-economic status, nationality, personal +In the interest of fostering an open and welcoming environment, we as contributors and maintainers +pledge to make participation in our project and our community a harassment-free experience for +everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity +and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 
## Our Standards -Examples of behavior that contributes to creating a positive environment -include: +Examples of behavior that contributes to creating a positive environment include: - Using welcoming and inclusive language - Being respectful of differing viewpoints and experiences @@ -22,53 +20,47 @@ include: Examples of unacceptable behavior by participants include: -- The use of sexualized language or imagery and unwelcome sexual attention or - advances +- The use of sexualized language or imagery and unwelcome sexual attention or advances - Trolling, insulting/derogatory comments, and personal or political attacks - Public or private harassment -- Publishing others' private information, such as a physical or electronic - address, without explicit permission -- Other conduct which could reasonably be considered inappropriate in a - professional setting +- Publishing others' private information, such as a physical or electronic address, without explicit + permission +- Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities -Project maintainers are responsible for clarifying the standards of acceptable -behavior and are expected to take appropriate and fair corrective action in -response to any instances of unacceptable behavior. +Project maintainers are responsible for clarifying the standards of acceptable behavior and are +expected to take appropriate and fair corrective action in response to any instances of unacceptable +behavior. -Project maintainers have the right and responsibility to remove, edit, or -reject comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct, or to ban temporarily or -permanently any contributor for other behaviors that they deem inappropriate, +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, +code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or +to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope -This Code of Conduct applies within all project spaces, and it also applies when -an individual is representing the project or its community in public spaces. -Examples of representing a project or community include using an official -project e-mail address, posting via an official social media account, or acting -as an appointed representative at an online or offline event. Representation of -a project may be further defined and clarified by project maintainers. +This Code of Conduct applies within all project spaces, and it also applies when an individual is +representing the project or its community in public spaces. Examples of representing a project or +community include using an official project e-mail address, posting via an official social media +account, or acting as an appointed representative at an online or offline event. Representation of a +project may be further defined and clarified by project maintainers. ## Enforcement -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting the project team at ritchie46@gmail.com. All -complaints will be reviewed and investigated and will result in a response that -is deemed necessary and appropriate to the circumstances. The project team is -obligated to maintain confidentiality with regard to the reporter of an incident. 
-Further details of specific enforcement policies may be posted separately. +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting +the project team at ritchie46@gmail.com. All complaints will be reviewed and investigated and will +result in a response that is deemed necessary and appropriate to the circumstances. The project team +is obligated to maintain confidentiality with regard to the reporter of an incident. Further details +of specific enforcement policies may be posted separately. -Project maintainers who do not follow or enforce the Code of Conduct in good -faith may face temporary or permanent repercussions as determined by other -members of the project's leadership. +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face +temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, -available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at +https://www.contributor-covenant.org/version/1/4/code-of-conduct.html [homepage]: https://www.contributor-covenant.org diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ae87cb7e4bb9..175df57e8064 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,7 +1,9 @@ # Contributing to Polars -Thanks for taking the time to contribute! We appreciate all contributions, from reporting bugs to implementing new features. +Thanks for taking the time to contribute! We appreciate all contributions, from reporting bugs to +implementing new features. -Please refer to the [contributing section](https://docs.pola.rs/development/contributing/) of our documentation to get started. +Please refer to the [contributing section](https://docs.pola.rs/development/contributing/) of our +documentation to get started. We look forward to your contributions! diff --git a/README.md b/README.md index 43ac43596813..8ea37538a508 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,8 @@ ## Polars: Blazingly fast DataFrames in Rust, Python, Node.js, R, and SQL Polars is a DataFrame interface on top of an OLAP Query Engine implemented in Rust using -[Apache Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html) as the memory model. +[Apache Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html) as the memory +model. - Lazy | eager execution - Multi-threaded @@ -158,11 +159,13 @@ Refer to the [Polars CLI repository](https://github.com/pola-rs/polars-cli) for ### Blazingly fast -Polars is very fast. In fact, it is one of the best performing solutions available. See the [PDS-H benchmarks](https://www.pola.rs/benchmarks.html) results. +Polars is very fast. In fact, it is one of the best performing solutions available. See the +[PDS-H benchmarks](https://www.pola.rs/benchmarks.html) results. ### Lightweight -Polars is also very lightweight. It comes with zero required dependencies, and this shows in the import times: +Polars is also very lightweight. It comes with zero required dependencies, and this shows in the +import times: - polars: 70ms - numpy: 104ms @@ -170,10 +173,11 @@ Polars is also very lightweight. 
It comes with zero required dependencies, and t ### Handles larger-than-RAM data -If you have data that does not fit into memory, Polars' query engine is able to process your query (or parts of your query) in a streaming fashion. -This drastically reduces memory requirements, so you might be able to process your 250GB dataset on your laptop. -Collect with `collect(streaming=True)` to run the query streaming. -(This might be a little slower, but it is still very fast!) +If you have data that does not fit into memory, Polars' query engine is able to process your query +(or parts of your query) in a streaming fashion. This drastically reduces memory requirements, so +you might be able to process your 250GB dataset on your laptop. Collect with +`collect(streaming=True)` to run the query streaming. (This might be a little slower, but it is +still very fast!) ## Setup @@ -185,7 +189,8 @@ Install the latest Polars version with: pip install polars ``` -We also have a conda package (`conda install -c conda-forge polars`), however pip is the preferred way to install Polars. +We also have a conda package (`conda install -c conda-forge polars`), however pip is the preferred +way to install Polars. Install Polars with all optional dependencies. @@ -199,7 +204,8 @@ You can also install a subset of all optional dependencies. pip install 'polars[numpy,pandas,pyarrow]' ``` -See the [User Guide](https://docs.pola.rs/user-guide/installation/#feature-flags) for more details on optional dependencies +See the [User Guide](https://docs.pola.rs/user-guide/installation/#feature-flags) for more details +on optional dependencies To see the current Polars version and a full list of its optional dependencies, run: @@ -207,12 +213,13 @@ To see the current Polars version and a full list of its optional dependencies, pl.show_versions() ``` -Releases happen quite often (weekly / every few days) at the moment, so updating Polars regularly to get the latest bugfixes / features might not be a bad idea. +Releases happen quite often (weekly / every few days) at the moment, so updating Polars regularly to +get the latest bugfixes / features might not be a bad idea. ### Rust -You can take latest release from `crates.io`, or if you want to use the latest features / performance -improvements point to the `main` branch of this repo. +You can take latest release from `crates.io`, or if you want to use the latest features / +performance improvements point to the `main` branch of this repo. ```toml polars = { git = "https://github.com/pola-rs/polars", rev = "" } @@ -234,36 +241,39 @@ This can be done by going through the following steps in sequence: 2. Install [maturin](https://maturin.rs/): `pip install maturin` 3. 
`cd py-polars` and choose one of the following: - `make build`, slow binary with debug assertions and symbols, fast compile times - - `make build-release`, fast binary without debug assertions, minimal debug symbols, long compile times - - `make build-nodebug-release`, same as build-release but without any debug symbols, slightly faster to compile - - `make build-debug-release`, same as build-release but with full debug symbols, slightly slower to compile + - `make build-release`, fast binary without debug assertions, minimal debug symbols, long compile + times + - `make build-nodebug-release`, same as build-release but without any debug symbols, slightly + faster to compile + - `make build-debug-release`, same as build-release but with full debug symbols, slightly slower + to compile - `make build-dist-release`, fastest binary, extreme compile times By default the binary is compiled with optimizations turned on for a modern CPU. Specify `LTS_CPU=1` with the command if your CPU is older and does not support e.g. AVX2. -Note that the Rust crate implementing the Python bindings is called `py-polars` to distinguish from the wrapped -Rust crate `polars` itself. However, both the Python package and the Python module are named `polars`, so you -can `pip install polars` and `import polars`. +Note that the Rust crate implementing the Python bindings is called `py-polars` to distinguish from +the wrapped Rust crate `polars` itself. However, both the Python package and the Python module are +named `polars`, so you can `pip install polars` and `import polars`. ## Using custom Rust functions in Python -Extending Polars with UDFs compiled in Rust is easy. We expose PyO3 extensions for `DataFrame` and `Series` -data structures. See more in https://github.com/pola-rs/pyo3-polars. +Extending Polars with UDFs compiled in Rust is easy. We expose PyO3 extensions for `DataFrame` and +`Series` data structures. See more in https://github.com/pola-rs/pyo3-polars. ## Going big... -Do you expect more than 2^32 (~4.2 billion) rows? Compile Polars with the `bigidx` feature -flag or, for Python users, install `pip install polars-u64-idx`. +Do you expect more than 2^32 (~4.2 billion) rows? Compile Polars with the `bigidx` feature flag or, +for Python users, install `pip install polars-u64-idx`. -Don't use this unless you hit the row boundary as the default build of Polars is faster and consumes less memory. +Don't use this unless you hit the row boundary as the default build of Polars is faster and consumes +less memory. ## Legacy -Do you want Polars to run on an old CPU (e.g. dating from before 2011), or on an `x86-64` build -of Python on Apple Silicon under Rosetta? Install `pip install polars-lts-cpu`. This version of -Polars is compiled without [AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) target -features. +Do you want Polars to run on an old CPU (e.g. dating from before 2011), or on an `x86-64` build of +Python on Apple Silicon under Rosetta? Install `pip install polars-lts-cpu`. This version of Polars +is compiled without [AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) target features. ## Sponsors diff --git a/crates/polars-arrow/src/README.md b/crates/polars-arrow/src/README.md index d6371ebc8741..cb24938c5f78 100644 --- a/crates/polars-arrow/src/README.md +++ b/crates/polars-arrow/src/README.md @@ -1,23 +1,27 @@ # Crate's design -This document describes the design of this module, and thus the overall crate. 
-Each module MAY have its own design document, that concerns specifics of that module, and if yes, -it MUST be on each module's `README.md`. +This document describes the design of this module, and thus the overall crate. Each module MAY have +its own design document, that concerns specifics of that module, and if yes, it MUST be on each +module's `README.md`. ## Equality -Array equality is not defined in the Arrow specification. This crate follows the intent of the specification, but there is no guarantee that this no verification that this equals e.g. C++'s definition. +Array equality is not defined in the Arrow specification. This crate follows the intent of the +specification, but there is no guarantee that this no verification that this equals e.g. C++'s +definition. -There is a single source of truth about whether two arrays are equal, and that is via their -equality operators, defined on the module [`array/equal`](array/equal/mod.rs). +There is a single source of truth about whether two arrays are equal, and that is via their equality +operators, defined on the module [`array/equal`](array/equal/mod.rs). -Implementation MUST use these operators for asserting equality, so that all testing follows the same definition of array equality. +Implementation MUST use these operators for asserting equality, so that all testing follows the same +definition of array equality. ## Error handling - Errors from an external dependency MUST be encapsulated on `External`. - Errors from IO MUST be encapsulated on `Io`. -- This crate MAY return `NotYetImplemented` when the functionality does not exist, or it MAY panic with `unimplemented!`. +- This crate MAY return `NotYetImplemented` when the functionality does not exist, or it MAY panic + with `unimplemented!`. ## Logical and physical types @@ -29,4 +33,5 @@ There is a strict separation between physical and logical types: ## Source of undefined behavior -There is one, and only one, acceptable source of undefined behavior: FFI. It is impossible to prove that data passed via pointers are safe for consumption (only a promise from the specification). +There is one, and only one, acceptable source of undefined behavior: FFI. It is impossible to prove +that data passed via pointers are safe for consumption (only a promise from the specification). diff --git a/crates/polars-arrow/src/array/README.md b/crates/polars-arrow/src/array/README.md index af21f91e02ef..b3c8424c4c45 100644 --- a/crates/polars-arrow/src/array/README.md +++ b/crates/polars-arrow/src/array/README.md @@ -10,7 +10,8 @@ This document describes the overall design of this module. ## Arrays: -- Every arrow array with a different physical representation MUST be implemented as a struct or generic struct. +- Every arrow array with a different physical representation MUST be implemented as a struct or + generic struct. - An array MAY have its own module. E.g. `primitive/mod.rs` @@ -22,16 +23,19 @@ This document describes the overall design of this module. - Every child array on the struct MUST be `Box`. -- An array MUST implement `try_new(...) -> Self`. This method MUST error iff - the data does not follow the arrow specification, including any sentinel types such as utf8. +- An array MUST implement `try_new(...) -> Self`. This method MUST error iff the data does not + follow the arrow specification, including any sentinel types such as utf8. - An array MAY implement `unsafe try_new_unchecked` that skips validation steps that are `O(N)`. 
-- An array MUST implement either `new_empty()` or `new_empty(DataType)` that returns a zero-len of `Self`. +- An array MUST implement either `new_empty()` or `new_empty(DataType)` that returns a zero-len of + `Self`. -- An array MUST implement either `new_null(length: usize)` or `new_null(DataType, length: usize)` that returns a valid array of length `length` whose all elements are null. +- An array MUST implement either `new_null(length: usize)` or `new_null(DataType, length: usize)` + that returns a valid array of length `length` whose all elements are null. -- An array MAY implement `value(i: usize)` that returns the value at slot `i` ignoring the validity bitmap. +- An array MAY implement `value(i: usize)` that returns the value at slot `i` ignoring the validity + bitmap. - functions to create new arrays from native Rust SHOULD be named as follows: - `from`: from a slice of optional values (e.g. `AsRef<[Option]` for `BooleanArray`) @@ -42,20 +46,26 @@ This document describes the overall design of this module. ### Slot offsets -- An array MUST have a `offset: usize` measuring the number of slots that the array is currently offsetted by if the specification requires. +- An array MUST have a `offset: usize` measuring the number of slots that the array is currently + offsetted by if the specification requires. -- An array MUST implement `fn slice(&self, offset: usize, length: usize) -> Self` that returns an offsetted and/or truncated clone of the array. This function MUST increase the array's offset if it exists. +- An array MUST implement `fn slice(&self, offset: usize, length: usize) -> Self` that returns an + offsetted and/or truncated clone of the array. This function MUST increase the array's offset if + it exists. - Conversely, `offset` MUST only be changed by `slice`. -The rational of the above is that it enable us to be fully interoperable with the offset logic supported by the C data interface, while at the same time easily perform array slices -within Rust's type safety mechanism. +The rational of the above is that it enable us to be fully interoperable with the offset logic +supported by the C data interface, while at the same time easily perform array slices within Rust's +type safety mechanism. ### Mutable Arrays -- An array MAY have a mutable counterpart. E.g. `MutablePrimitiveArray` is the mutable counterpart of `PrimitiveArray`. +- An array MAY have a mutable counterpart. E.g. `MutablePrimitiveArray` is the mutable + counterpart of `PrimitiveArray`. -- Arrays with mutable counterparts MUST have its own module, and have the mutable counterpart declared in `{module}/mutable.rs`. +- Arrays with mutable counterparts MUST have its own module, and have the mutable counterpart + declared in `{module}/mutable.rs`. - The trait `MutableArray` MUST only be implemented by mutable arrays in this module. @@ -67,7 +77,8 @@ within Rust's type safety mechanism. - it must not allocate - it must not cause `O(N)` data transformations - This is achieved by converting mutable versions to immutable counterparts (e.g. `MutableBitmap -> Bitmap`). + This is achieved by converting mutable versions to immutable counterparts (e.g. + `MutableBitmap -> Bitmap`). - The rational is that `MutableArray`s can be used to perform in-place operations under - the arrow spec. + The rational is that `MutableArray`s can be used to perform in-place operations under the arrow + spec. 
diff --git a/crates/polars-arrow/src/compute/README.md b/crates/polars-arrow/src/compute/README.md index 6b5bec7e703e..4662ed0944b8 100644 --- a/crates/polars-arrow/src/compute/README.md +++ b/crates/polars-arrow/src/compute/README.md @@ -2,7 +2,8 @@ This document outlines the design guide lines of this module. -This module is composed by independent operations common in analytics. Below are some design of its principles: +This module is composed by independent operations common in analytics. Below are some design of its +principles: - APIs MUST return an error when either: - The arguments are incorrect @@ -14,19 +15,24 @@ This module is composed by independent operations common in analytics. Below are - kernels MUST NOT take ownership of any of its arguments (i.e. everything must be a reference). -- APIs SHOULD error when an operation on variable sized containers can overflow the maximum size of `usize`. +- APIs SHOULD error when an operation on variable sized containers can overflow the maximum size of + `usize`. -- Kernels SHOULD use the arrays' logical type to decide whether kernels - can be applied on an array. For example, `Date32 + Date32` is meaningless and SHOULD NOT be implemented. +- Kernels SHOULD use the arrays' logical type to decide whether kernels can be applied on an array. + For example, `Date32 + Date32` is meaningless and SHOULD NOT be implemented. -- Kernels SHOULD be implemented via `clone`, `slice` or the `iterator` API provided by `Buffer`, `Bitmap`, `Vec` or `MutableBitmap`. +- Kernels SHOULD be implemented via `clone`, `slice` or the `iterator` API provided by `Buffer`, + `Bitmap`, `Vec` or `MutableBitmap`. - Kernels MUST NOT use any API to read bits other than the ones provided by `Bitmap`. -- Implementations SHOULD aim for auto-vectorization, which is usually accomplished via `from_trusted_len_iter`. +- Implementations SHOULD aim for auto-vectorization, which is usually accomplished via + `from_trusted_len_iter`. - Implementations MUST feature-gate any implementation that requires external dependencies - When a kernel accepts dynamically-typed arrays, it MUST expect them as `&dyn Array`. -- When an API returns `&dyn Array`, it MUST return `Box`. The rational is that a `Box` is mutable, while an `Arc` is not. As such, `Box` offers the most flexible API to consumers and the compiler. Users can cast a `Box` into `Arc` via `.into()`. +- When an API returns `&dyn Array`, it MUST return `Box`. The rational is that a `Box` is + mutable, while an `Arc` is not. As such, `Box` offers the most flexible API to consumers and the + compiler. Users can cast a `Box` into `Arc` via `.into()`. diff --git a/crates/polars-arrow/src/doc/lib.md b/crates/polars-arrow/src/doc/lib.md index dd10d361bd80..6d22c121890d 100644 --- a/crates/polars-arrow/src/doc/lib.md +++ b/crates/polars-arrow/src/doc/lib.md @@ -1,12 +1,11 @@ Welcome to polars_arrow's documentation. Thanks for checking it out! This is a library for efficient in-memory data operations with -[Arrow in-memory format](https://arrow.apache.org/docs/format/Columnar.html). -It is a re-write from the bottom up of the official `arrow` crate with soundness -and type safety in mind. +[Arrow in-memory format](https://arrow.apache.org/docs/format/Columnar.html). It is a re-write from +the bottom up of the official `arrow` crate with soundness and type safety in mind. -Check out [the guide](https://jorgecarleitao.github.io/polars_arrow/main/guide/) for an introduction. 
-Below is an example of some of the things you can do with it: +Check out [the guide](https://jorgecarleitao.github.io/polars_arrow/main/guide/) for an +introduction. Below is an example of some of the things you can do with it: ```rust use std::sync::Arc; @@ -68,15 +67,14 @@ fn main() -> Result<()> { ## Cargo features -This crate has a significant number of cargo features to reduce compilation -time and number of dependencies. The feature `"full"` activates most -functionality, such as: +This crate has a significant number of cargo features to reduce compilation time and number of +dependencies. The feature `"full"` activates most functionality, such as: - `io_ipc`: to interact with the Arrow IPC format - `io_ipc_compression`: to read and write compressed Arrow IPC (v2) - `io_flight` to read and write to Arrow's Flight protocol - `compute` to operate on arrays (addition, sum, sort, etc.) -The feature `simd` (not part of `full`) produces more explicit SIMD instructions -via [`std::simd`](https://doc.rust-lang.org/nightly/std/simd/index.html), but requires the -nightly channel. +The feature `simd` (not part of `full`) produces more explicit SIMD instructions via +[`std::simd`](https://doc.rust-lang.org/nightly/std/simd/index.html), but requires the nightly +channel. diff --git a/crates/polars-arrow/src/io/README.md b/crates/polars-arrow/src/io/README.md index a3c7599b8bdf..52e236220f13 100644 --- a/crates/polars-arrow/src/io/README.md +++ b/crates/polars-arrow/src/io/README.md @@ -5,20 +5,24 @@ This document describes the overall design of this module. ## Rules: - Each directory in this module corresponds to a specific format such as `csv` and `json`. -- directories that depend on external dependencies MUST be feature gated, with a feature named with a prefix `io_`. +- directories that depend on external dependencies MUST be feature gated, with a feature named with + a prefix `io_`. - modules MUST re-export any API of external dependencies they require as part of their public API. E.g. - - if a module as an API `write(writer: &mut csv:Writer, ...)`, it MUST contain `pub use csv::Writer;`. + - if a module as an API `write(writer: &mut csv:Writer, ...)`, it MUST contain + `pub use csv::Writer;`. The rational is that adding this crate to `cargo.toml` must be sufficient to use it. -- Each directory SHOULD contain two directories, `read` and `write`, corresponding - to functionality about reading from the format and writing to the format respectively. +- Each directory SHOULD contain two directories, `read` and `write`, corresponding to functionality + about reading from the format and writing to the format respectively. - The base module SHOULD contain `use pub read;` and `use pub write;`. - Implementations SHOULD separate reading of "data" from reading of "metadata". Examples: - schema read or inference SHOULD be a separate function - functions that read "data" SHOULD consume a schema typically pre-read. -- Implementations SHOULD separate IO-bounded operations from CPU-bounded operations. - I.e. implementations SHOULD: - - contain functions that consume a `Read` implementor and output a "raw" struct, i.e. a struct that is e.g. compressed and serialized +- Implementations SHOULD separate IO-bounded operations from CPU-bounded operations. I.e. + implementations SHOULD: + - contain functions that consume a `Read` implementor and output a "raw" struct, i.e. a struct + that is e.g. compressed and serialized - contain functions that consume a "raw" struct and convert it into Arrow. 
- - offer each of these functions as independent public APIs, so that consumers can decide how to balance CPU-bounds and IO-bounds. + - offer each of these functions as independent public APIs, so that consumers can decide how to + balance CPU-bounds and IO-bounds. diff --git a/crates/polars-arrow/src/scalar/README.md b/crates/polars-arrow/src/scalar/README.md index ea6c3791d6be..9ff93cd86df7 100644 --- a/crates/polars-arrow/src/scalar/README.md +++ b/crates/polars-arrow/src/scalar/README.md @@ -6,7 +6,8 @@ Design choices: There are three reasons: -- a scalar should have a small memory footprint, which an enum would not ensure given the different physical types available. +- a scalar should have a small memory footprint, which an enum would not ensure given the different + physical types available. - forward-compatibility: a new entry on an `enum` is backward-incompatible - do not expose implementation details to users (reduce the surface of the public API) @@ -14,8 +15,8 @@ There are three reasons: This is to be aligned with the general notion of arrow's `Array`. -This API is a companion to the `Array`, and follows the same design as `Array`. -Specifically, a `Scalar` is a trait object that can be downcasted to concrete implementations. +This API is a companion to the `Array`, and follows the same design as `Array`. Specifically, a +`Scalar` is a trait object that can be downcasted to concrete implementations. Like `Array`, `Scalar` implements diff --git a/crates/polars-compute/README.md b/crates/polars-compute/README.md index e521dcf4e7fe..8b5cc0e0be09 100644 --- a/crates/polars-compute/README.md +++ b/crates/polars-compute/README.md @@ -1,5 +1,7 @@ # polars-compute -`polars-compute` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, supplying private compute kernels. +`polars-compute` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) +library, supplying private compute kernels. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/crates/polars-core/README.md b/crates/polars-core/README.md index 684c5f33832c..81aabbc979e1 100644 --- a/crates/polars-core/README.md +++ b/crates/polars-core/README.md @@ -1,5 +1,7 @@ # polars-core -`polars-core` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, providing its core functionalities. +`polars-core` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, +providing its core functionalities. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/crates/polars-error/README.md b/crates/polars-error/README.md index 9aaac05c5795..a8af201817db 100644 --- a/crates/polars-error/README.md +++ b/crates/polars-error/README.md @@ -1,5 +1,7 @@ # polars-error -`polars-error` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, defining its error types. 
+`polars-error` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) +library, defining its error types. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/crates/polars-expr/README.md b/crates/polars-expr/README.md index 30bada91a12b..73c6bc1c7c7a 100644 --- a/crates/polars-expr/README.md +++ b/crates/polars-expr/README.md @@ -4,4 +4,5 @@ Physical expression implementations. `polars-expr` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/crates/polars-io/README.md b/crates/polars-io/README.md index 963a1a86e25d..8258a7ef8601 100644 --- a/crates/polars-io/README.md +++ b/crates/polars-io/README.md @@ -1,5 +1,7 @@ # polars-io -`polars-io` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, that provides IO functionality for the Polars dataframe library. +`polars-io` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, +that provides IO functionality for the Polars dataframe library. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/crates/polars-json/README.md b/crates/polars-json/README.md index d0fa8cab5a8b..f056af4c72b8 100644 --- a/crates/polars-json/README.md +++ b/crates/polars-json/README.md @@ -1,5 +1,7 @@ # polars-json -`polars-json` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, provides functionalities to handle JSON objects. +`polars-json` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, +provides functionalities to handle JSON objects. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/crates/polars-lazy/README.md b/crates/polars-lazy/README.md index 81f5cb01d220..b12552cb4a5a 100644 --- a/crates/polars-lazy/README.md +++ b/crates/polars-lazy/README.md @@ -1,5 +1,9 @@ # polars-lazy -`polars-lazy` serves as the lazy query engine for the [Polars](https://crates.io/crates/polars) DataFrame library. It allows you to perform operations on DataFrames in a lazy manner, only executing them when necessary. This can lead to significant performance improvements for large datasets. +`polars-lazy` serves as the lazy query engine for the [Polars](https://crates.io/crates/polars) +DataFrame library. 
It allows you to perform operations on DataFrames in a lazy manner, only +executing them when necessary. This can lead to significant performance improvements for large +datasets. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/crates/polars-mem-engine/README.md b/crates/polars-mem-engine/README.md index 4bf8e86f8508..a07c36a16dfd 100644 --- a/crates/polars-mem-engine/README.md +++ b/crates/polars-mem-engine/README.md @@ -1,5 +1,7 @@ # polars-mem-engine -`polars-mem-engine` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library. +`polars-mem-engine` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) +library. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/crates/polars-ops/README.md b/crates/polars-ops/README.md index 9c575ee43613..6801a4d5736e 100644 --- a/crates/polars-ops/README.md +++ b/crates/polars-ops/README.md @@ -1,5 +1,7 @@ # polars-ops -`polars-ops` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, providing extended operations on Polars data structures. +`polars-ops` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, +providing extended operations on Polars data structures. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/crates/polars-parquet/src/arrow/read/README.md b/crates/polars-parquet/src/arrow/read/README.md index c36aaafaf79a..0d44ffca233a 100644 --- a/crates/polars-parquet/src/arrow/read/README.md +++ b/crates/polars-parquet/src/arrow/read/README.md @@ -2,35 +2,34 @@ ### LSB equivalence between definition levels and bitmaps -When the maximum repetition level is 0 and the maximum definition level is 1, -the RLE-encoded definition levels correspond exactly to Arrow's bitmap and can be -memcopied without further transformations. +When the maximum repetition level is 0 and the maximum definition level is 1, the RLE-encoded +definition levels correspond exactly to Arrow's bitmap and can be memcopied without further +transformations. ## Nested parquet groups are deserialized recursively -Reading a parquet nested field is done by reading each primitive -column sequentially, and build the nested struct recursively. +Reading a parquet nested field is done by reading each primitive column sequentially, and build the +nested struct recursively. -Rows of nested parquet groups are encoded in the repetition and definition levels. -In arrow, they correspond to: +Rows of nested parquet groups are encoded in the repetition and definition levels. 
In arrow, they +correspond to: - list's offsets and validity - struct's validity The implementation in this module leverages this observation: -Nested parquet fields are initially recursed over to gather -whether the type is a Struct or List, and whether it is required or optional, which we store -in `nested_info: Vec>`. `Nested` is a trait object that receives definition -and repetition levels depending on the type and nullability of the nested item. -We process the definition and repetition levels into `nested_info`. +Nested parquet fields are initially recursed over to gather whether the type is a Struct or List, +and whether it is required or optional, which we store in `nested_info: Vec>`. +`Nested` is a trait object that receives definition and repetition levels depending on the type and +nullability of the nested item. We process the definition and repetition levels into `nested_info`. -When we finish a field, we recursively pop from `nested_info` as we build -the `StructArray` or `ListArray`. +When we finish a field, we recursively pop from `nested_info` as we build the `StructArray` or +`ListArray`. With this approach, the only difference vs flat is: -1. we do not leverage the bitmap optimization, and instead need to deserialize the repetition - and definition levels to `i32`. -2. we deserialize definition levels twice, once to extend the values/nullability and - one to extend `nested_info`. +1. we do not leverage the bitmap optimization, and instead need to deserialize the repetition and + definition levels to `i32`. +2. we deserialize definition levels twice, once to extend the values/nullability and one to extend + `nested_info`. diff --git a/crates/polars-parquet/src/arrow/read/deserialize/README.md b/crates/polars-parquet/src/arrow/read/deserialize/README.md index 5b985bac8e9b..aab9c280112d 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/README.md +++ b/crates/polars-parquet/src/arrow/read/deserialize/README.md @@ -2,8 +2,8 @@ ## Non-nested types -Let's start with the design used for non-nested arrays. The (private) entry point of this -module for non-nested arrays is `simple::page_iter_to_arrays`. +Let's start with the design used for non-nested arrays. The (private) entry point of this module for +non-nested arrays is `simple::page_iter_to_arrays`. This function expects @@ -14,35 +14,37 @@ This function expects and returns an iterator of `Array`, `ArrayIter`. -This design is shared among _all_ `(parquet, arrow)` implemented tuples. Their main -difference is how they are deserialized, which depends on the source and target types. +This design is shared among _all_ `(parquet, arrow)` implemented tuples. Their main difference is +how they are deserialized, which depends on the source and target types. When the array iterator is pulled the first time, the following happens: - a page from `Pages` is pulled - a `PageState<'a>` is built from the page - the `PageState` is consumed into a mutable array: - - if `chunk_size` is larger than the number of rows in the page, the mutable array state is preserved and a new page is pulled and the process repeated until we fill a chunk. - - if `chunk_size` is smaller than the number of rows in the page, the mutable array state - is returned and the remaining of the page is consumed into multiple mutable arrays of length `chunk_size` into a FIFO queue. + - if `chunk_size` is larger than the number of rows in the page, the mutable array state is + preserved and a new page is pulled and the process repeated until we fill a chunk. 
+ - if `chunk_size` is smaller than the number of rows in the page, the mutable array state is + returned and the remaining of the page is consumed into multiple mutable arrays of length + `chunk_size` into a FIFO queue. Subsequent pulls of arrays will first try to pull from the FIFO queue. Once the queue is empty, the a new page is pulled. ### `PageState` -As mentioned above, the iterator leverages the idea that we attach a state to a page. Recall -that a page is essentially `[header][data]`. The `data` part contains encoded -`[rep levels][def levels][non-null values]`. Some pages have an associated dictionary page, -in which case the `non-null values` represent the indices. +As mentioned above, the iterator leverages the idea that we attach a state to a page. Recall that a +page is essentially `[header][data]`. The `data` part contains encoded +`[rep levels][def levels][non-null values]`. Some pages have an associated dictionary page, in which +case the `non-null values` represent the indices. Irrespectively of the physical type, the main idea is to split the page in two iterators: - An iterator over `def levels` - An iterator over `non-null values` -and progress the iterators as needed. In particular, for non-nested types, `def levels` is -a bitmap with the same representation as Arrow, in which case the validity is extended directly. +and progress the iterators as needed. In particular, for non-nested types, `def levels` is a bitmap +with the same representation as Arrow, in which case the validity is extended directly. The `non-null values` are "expanded" by filling null values with the default value of each physical type. @@ -52,13 +54,13 @@ type. For nested type with N+1 levels (1 is the primitive), we need to build the nest information of each N levels + the non-nested Arrow array. -This is done by first transversing the parquet types and using it to initialize, per chunk, the N levels. +This is done by first transversing the parquet types and using it to initialize, per chunk, the N +levels. -The per-chunk execution is then similar but `chunk_size` only drives the number of retrieved -rows from the outermost parquet group (the field). Each of these pulls knows how many items need -to be pulled from the inner groups, all the way to the primitive type. This works because -in parquet a row cannot be split between two pages and thus each page is guaranteed -to contain a full row. +The per-chunk execution is then similar but `chunk_size` only drives the number of retrieved rows +from the outermost parquet group (the field). Each of these pulls knows how many items need to be +pulled from the inner groups, all the way to the primitive type. This works because in parquet a row +cannot be split between two pages and thus each page is guaranteed to contain a full row. The `PageState` of nested types is composed by 4 iterators: @@ -66,6 +68,6 @@ The `PageState` of nested types is composed by 4 iterators: - An iterator over `def levels` - An iterator over `non-null values` -The idea is that an iterator of `rep, def` contain all the information to decode the -nesting structure of an arrow array. The other two iterators are equivalent to the non-nested -types with the exception that `def levels` are no equivalent to arrow bitmaps. +The idea is that an iterator of `rep, def` contain all the information to decode the nesting +structure of an arrow array. The other two iterators are equivalent to the non-nested types with the +exception that `def levels` are no equivalent to arrow bitmaps. 
diff --git a/crates/polars-pipe/README.md b/crates/polars-pipe/README.md index 1186ce9a898a..ee3ee757f80d 100644 --- a/crates/polars-pipe/README.md +++ b/crates/polars-pipe/README.md @@ -1,5 +1,7 @@ # polars-pipe -`polars-pipe` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, introducing OOC (out of core) algorithms to polars physical plans. +`polars-pipe` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, +introducing OOC (out of core) algorithms to polars physical plans. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/crates/polars-plan/README.md b/crates/polars-plan/README.md index 59fce1861941..3a9589b18463 100644 --- a/crates/polars-plan/README.md +++ b/crates/polars-plan/README.md @@ -1,5 +1,7 @@ # polars-plan -`polars-plan` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, that provides source code responsible for Polars logical planning. +`polars-plan` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, +that provides source code responsible for Polars logical planning. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/crates/polars-python/README.md b/crates/polars-python/README.md index 3a68700e34fc..2244b02c543e 100644 --- a/crates/polars-python/README.md +++ b/crates/polars-python/README.md @@ -1,6 +1,7 @@ # polars-python -`polars-python` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library. -It enables running Polars workloads in Python. +`polars-python` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) +library. It enables running Polars workloads in Python. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/crates/polars-row/README.md b/crates/polars-row/README.md index 7e8f59f67620..aa5aa22e67ad 100644 --- a/crates/polars-row/README.md +++ b/crates/polars-row/README.md @@ -1,5 +1,7 @@ # polars-row -`polars-row` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, that provides row encodings for the Polars DataFrame Library. +`polars-row` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, +that provides row encodings for the Polars DataFrame Library. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. 
diff --git a/crates/polars-schema/README.md b/crates/polars-schema/README.md index 6d68ee41675a..1c983a8c81ce 100644 --- a/crates/polars-schema/README.md +++ b/crates/polars-schema/README.md @@ -1,5 +1,7 @@ # polars-schema -`polars-schema` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, supplying private schema utility functions. +`polars-schema` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) +library, supplying private schema utility functions. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/crates/polars-sql/README.md b/crates/polars-sql/README.md index 869d2e75da88..0cfcdba54569 100644 --- a/crates/polars-sql/README.md +++ b/crates/polars-sql/README.md @@ -1,6 +1,7 @@ # polars-sql -`polars-sql` is a sub-crate of the [Polars](https://crates.io/crates/polars) library, offering a SQL transpiler. It allows for SQL query conversion to Polars logical plans. +`polars-sql` is a sub-crate of the [Polars](https://crates.io/crates/polars) library, offering a SQL +transpiler. It allows for SQL query conversion to Polars logical plans. ## Usage @@ -17,4 +18,5 @@ You can then import the crate in your Rust code using: use polars_sql::*; ``` -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/crates/polars-stream/README.md b/crates/polars-stream/README.md index c16aedf1901e..c72da5661012 100644 --- a/crates/polars-stream/README.md +++ b/crates/polars-stream/README.md @@ -1,5 +1,7 @@ # polars-stream -`polars-stream` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, containing a streaming execution engine. +`polars-stream` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) +library, containing a streaming execution engine. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/crates/polars-time/README.md b/crates/polars-time/README.md index d43adb2abb36..6227756e86a2 100644 --- a/crates/polars-time/README.md +++ b/crates/polars-time/README.md @@ -1,5 +1,7 @@ # polars-time -`polars-time` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, focusing on time-related utilities. +`polars-time` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, +focusing on time-related utilities. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. 
diff --git a/crates/polars-utils/README.md b/crates/polars-utils/README.md index 2f200a67f1a6..71bcb687b80c 100644 --- a/crates/polars-utils/README.md +++ b/crates/polars-utils/README.md @@ -1,5 +1,7 @@ # polars-utils -`polars-utils` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) library, supplying private utility functions. +`polars-utils` is an **internal sub-crate** of the [Polars](https://crates.io/crates/polars) +library, supplying private utility functions. -**Important Note**: This crate is **not intended for external usage**. Please refer to the main [Polars crate](https://crates.io/crates/polars) for intended usage. +**Important Note**: This crate is **not intended for external usage**. Please refer to the main +[Polars crate](https://crates.io/crates/polars) for intended usage. diff --git a/docs/README.md b/docs/README.md index 60b690c1c100..f0a24a43924d 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,9 +1,11 @@ -The documentation is split across two subfolders, `source` and `assets`. -The folder `source` contains the static source files that make up the user guide, which are mostly markdown files and the snippets of code. -The folder `assets` contains (dynamically generated) assets used by those files, including data files for the snippets and images with plots or diagrams. +The documentation is split across two subfolders, `source` and `assets`. The folder `source` +contains the static source files that make up the user guide, which are mostly markdown files and +the snippets of code. The folder `assets` contains (dynamically generated) assets used by those +files, including data files for the snippets and images with plots or diagrams. -Do _not_ merge the two folders together. -In [PR #18773](https://github.com/pola-rs/polars/pull/18773) we introduced this split to fix the MkDocs server live reloading. -If everything is in one folder `docs`, the MkDocs server will watch the folder `docs`. -When you make one change the MkDocs server live reloads and rebuilds the docs. -This triggers scripts that build asset files, which change the folder `docs`, leading to an infinite reloading loop. +Do _not_ merge the two folders together. In +[PR #18773](https://github.com/pola-rs/polars/pull/18773) we introduced this split to fix the MkDocs +server live reloading. If everything is in one folder `docs`, the MkDocs server will watch the +folder `docs`. When you make one change the MkDocs server live reloads and rebuilds the docs. This +triggers scripts that build asset files, which change the folder `docs`, leading to an infinite +reloading loop. diff --git a/docs/source/api/index.md b/docs/source/api/index.md index 8e5b13707a71..d0b8e51dbd71 100644 --- a/docs/source/api/index.md +++ b/docs/source/api/index.md @@ -5,15 +5,15 @@ hide: # API reference -The API reference contains detailed descriptions of all public functions and objects. -It's the best place to look if you need information on a specific function. +The API reference contains detailed descriptions of all public functions and objects. It's the best +place to look if you need information on a specific function. ## Python -The Python API reference is built using Sphinx. -It's available in [our docs](https://docs.pola.rs/api/python/stable/reference/index.html). +The Python API reference is built using Sphinx. It's available in +[our docs](https://docs.pola.rs/api/python/stable/reference/index.html). ## Rust -The Rust API reference is built using Cargo. 
-It's available on [docs.rs](https://docs.rs/polars/latest/polars/). +The Rust API reference is built using Cargo. It's available on +[docs.rs](https://docs.rs/polars/latest/polars/). diff --git a/docs/source/development/contributing/ci.md b/docs/source/development/contributing/ci.md index bd771f79a20c..8488417826a1 100644 --- a/docs/source/development/contributing/ci.md +++ b/docs/source/development/contributing/ci.md @@ -1,6 +1,7 @@ # Continuous integration -Polars uses GitHub Actions as its continuous integration (CI) tool. The setup is reasonably complex, as far as CI setups go. This page explains some of the design choices. +Polars uses GitHub Actions as its continuous integration (CI) tool. The setup is reasonably complex, +as far as CI setups go. This page explains some of the design choices. ## Goal @@ -12,35 +13,51 @@ Overall, the CI suite aims to achieve the following: - Enforce that code is properly documented. - Allow maintainers to easily publish new releases. -We rely on a wide range of tools to achieve this for both the Rust and the Python code base, and thus a lot of checks are triggered on each pull request. +We rely on a wide range of tools to achieve this for both the Rust and the Python code base, and +thus a lot of checks are triggered on each pull request. -It's entirely possible that you submit a relatively trivial fix that subsequently fails a bunch of checks. Do not despair - check the logs to see what went wrong and try to fix it. You can run the failing command locally to verify that everything works correctly. If you can't figure it out, ask a maintainer for help! +It's entirely possible that you submit a relatively trivial fix that subsequently fails a bunch of +checks. Do not despair - check the logs to see what went wrong and try to fix it. You can run the +failing command locally to verify that everything works correctly. If you can't figure it out, ask a +maintainer for help! ## Design The CI setup is designed with the following requirements in mind: -- Get feedback on each step individually. We want to avoid our test job being cancelled because a linting check failed, only to find out later that we also have a failing test. -- Get feedback on each check as quickly as possible. We want to be able to iterate quickly if it turns out our code does not pass some of the checks. -- Only run checks when they need to be run. A change to the Rust code does not warrant a linting check of the Python code, for example. +- Get feedback on each step individually. We want to avoid our test job being cancelled because a + linting check failed, only to find out later that we also have a failing test. +- Get feedback on each check as quickly as possible. We want to be able to iterate quickly if it + turns out our code does not pass some of the checks. +- Only run checks when they need to be run. A change to the Rust code does not warrant a linting + check of the Python code, for example. This results in a modular setup with many separate workflows and jobs that rely heavily on caching. ### Modular setup -The repository consists of two main parts: the Rust code base and the Python code base. Both code bases are interdependent: Rust code is tested through Python tests, and the Python code relies on the Rust implementation for most functionality. +The repository consists of two main parts: the Rust code base and the Python code base. 
Both code +bases are interdependent: Rust code is tested through Python tests, and the Python code relies on +the Rust implementation for most functionality. -To make sure CI jobs are only run when they need to be run, each workflow is triggered only when relevant files are modified. +To make sure CI jobs are only run when they need to be run, each workflow is triggered only when +relevant files are modified. ### Caching -The main challenge is that the Rust code base for Polars is quite large, and consequently, compiling the project from scratch is slow. This is addressed by caching the Rust build artifacts. +The main challenge is that the Rust code base for Polars is quite large, and consequently, compiling +the project from scratch is slow. This is addressed by caching the Rust build artifacts. -However, since GitHub Actions does not allow sharing caches between feature branches, we need to run the workflows on the main branch as well - at least the part that builds the Rust cache. This leads to many workflows that trigger both on pull request AND on push to the main branch, with individual steps of jobs enabled or disabled based on the branch it runs on. +However, since GitHub Actions does not allow sharing caches between feature branches, we need to run +the workflows on the main branch as well - at least the part that builds the Rust cache. This leads +to many workflows that trigger both on pull request AND on push to the main branch, with individual +steps of jobs enabled or disabled based on the branch it runs on. -Care must also be taken not to exceed the maximum cache space of 10Gb allotted to open source GitHub repositories. Hence we do not do any caching on feature branches - we always use the cache available from the main branch. This also avoids any extra time that would be required to store the cache. +Care must also be taken not to exceed the maximum cache space of 10Gb allotted to open source GitHub +repositories. Hence we do not do any caching on feature branches - we always use the cache available +from the main branch. This also avoids any extra time that would be required to store the cache. ## Releases -The release jobs for Rust and Python are triggered manually. -Refer to the [contributing guide](./index.md#release-flow) for the full release process. +The release jobs for Rust and Python are triggered manually. Refer to the +[contributing guide](./index.md#release-flow) for the full release process. diff --git a/docs/source/development/contributing/ide.md b/docs/source/development/contributing/ide.md index 31811e3f12ae..4e500001e54d 100644 --- a/docs/source/development/contributing/ide.md +++ b/docs/source/development/contributing/ide.md @@ -1,7 +1,7 @@ # IDE configuration -Using an integrated development environments (IDE) and configuring it properly will help you work on Polars more effectively. -This page contains some recommendations for configuring popular IDEs. +Using an integrated development environments (IDE) and configuring it properly will help you work on +Polars more effectively. This page contains some recommendations for configuring popular IDEs. ## Visual Studio Code @@ -13,9 +13,12 @@ The extensions below are recommended. #### rust-analyzer -If you work on the Rust code at all, you will need the [rust-analyzer](https://marketplace.visualstudio.com/items?itemName=rust-lang.rust-analyzer) extension. This extension provides code completion for the Rust code. 
+If you work on the Rust code at all, you will need the +[rust-analyzer](https://marketplace.visualstudio.com/items?itemName=rust-lang.rust-analyzer) +extension. This extension provides code completion for the Rust code. -For it to work well for the Polars code base, add the following settings to your `.vscode/settings.json`: +For it to work well for the Polars code base, add the following settings to your +`.vscode/settings.json`: ```json { @@ -26,10 +29,10 @@ For it to work well for the Polars code base, add the following settings to your #### Ruff -The [Ruff](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) extension will help you conform to the formatting requirements of the Python code. -We use both the Ruff linter and formatter. -It is recommended to configure the extension to use the Ruff installed in your environment. -This will make it use the correct Ruff version and configuration. +The [Ruff](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) extension will +help you conform to the formatting requirements of the Python code. We use both the Ruff linter and +formatter. It is recommended to configure the extension to use the Ruff installed in your +environment. This will make it use the correct Ruff version and configuration. ```json { @@ -39,20 +42,22 @@ This will make it use the correct Ruff version and configuration. #### CodeLLDB -The [CodeLLDB](https://marketplace.visualstudio.com/items?itemName=vadimcn.vscode-lldb) extension is useful for debugging Rust code. -You can also debug Rust code called from Python (see section below). +The [CodeLLDB](https://marketplace.visualstudio.com/items?itemName=vadimcn.vscode-lldb) extension is +useful for debugging Rust code. You can also debug Rust code called from Python (see section below). ### Debugging -Due to the way that Python and Rust interoperate, debugging the Rust side of development from Python calls can be difficult. -This guide shows how to set up a debugging environment that makes debugging Rust code called from a Python script painless. +Due to the way that Python and Rust interoperate, debugging the Rust side of development from Python +calls can be difficult. This guide shows how to set up a debugging environment that makes debugging +Rust code called from a Python script painless. #### Preparation -Start by installing the CodeLLDB extension (see above). -Then add the following two configurations to your `launch.json` file. -This file is usually found in the `.vscode` folder of your project root. -See the [official VSCode documentation](https://code.visualstudio.com/docs/editor/debugging#_launch-configurations) for more information about the `launch.json` file. +Start by installing the CodeLLDB extension (see above). Then add the following two configurations to +your `launch.json` file. This file is usually found in the `.vscode` folder of your project root. +See the +[official VSCode documentation](https://code.visualstudio.com/docs/editor/debugging#_launch-configurations) +for more information about the `launch.json` file.
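To make the setup above concrete: the launch configuration runs a target Python script and a second, Rust debugger attaches to the same process (see the Details section further down). A minimal sketch of such a target script — the file name and its contents are hypothetical; any Python script that calls into Polars will do — could look like this:

```python
# debug_example.py -- a hypothetical target script for the "Debug Rust/Python" configuration.
import polars as pl

df = pl.DataFrame({"group": ["a", "b", "a"], "value": [1, 2, 3]})

# A breakpoint on the next line is hit by the Python debugger; breakpoints set in the
# Rust group-by/aggregation code are hit by the Rust debugger while this line executes.
out = df.group_by("group").agg(pl.col("value").sum())
print(out)
```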
launch.json @@ -110,18 +115,22 @@ See the [official VSCode documentation](https://code.visualstudio.com/docs/edito 2. Set breakpoints in any `.rs` or `.py` file. -3. In the `Run and Debug` panel on the left, select `Debug Rust/Python` from the drop-down menu on top and click the `Start Debugging` button. +3. In the `Run and Debug` panel on the left, select `Debug Rust/Python` from the drop-down menu on + top and click the `Start Debugging` button. -At this point, your debugger should stop on breakpoints in any `.rs` file located within the codebase. +At this point, your debugger should stop on breakpoints in any `.rs` file located within the +codebase. #### Details -The debugging feature runs via the specially-designed VSCode launch configuration shown above. -The initial Python debugger is launched using a special launch script located at `py-polars/debug/launch.py` and passes the name of the script to be debugged (the target script) as an input argument. -The launch script determines the process ID, writes this value into the `launch.json` configuration file, compiles the target script and runs it in the current environment. -At this point, a second (Rust) debugger is attached to the Python debugger. -The result is two simultaneous debuggers operating on the same running instance. -Breakpoints in the Python code will stop on the Python debugger and breakpoints in the Rust code will stop on the Rust debugger. +The debugging feature runs via the specially-designed VSCode launch configuration shown above. The +initial Python debugger is launched using a special launch script located at +`py-polars/debug/launch.py` and passes the name of the script to be debugged (the target script) as +an input argument. The launch script determines the process ID, writes this value into the +`launch.json` configuration file, compiles the target script and runs it in the current environment. +At this point, a second (Rust) debugger is attached to the Python debugger. The result is two +simultaneous debuggers operating on the same running instance. Breakpoints in the Python code will +stop on the Python debugger and breakpoints in the Rust code will stop on the Rust debugger. ## JetBrains (PyCharm, RustRover, CLion) diff --git a/docs/source/development/contributing/index.md b/docs/source/development/contributing/index.md index c3175df9f5b2..763c59cdd39c 100644 --- a/docs/source/development/contributing/index.md +++ b/docs/source/development/contributing/index.md @@ -4,46 +4,61 @@ render_macros: false # Overview -Thanks for taking the time to contribute! We appreciate all contributions, from reporting bugs to implementing new features. -If you're unclear on how to proceed after reading this guide, please contact us on [Discord](https://discord.gg/4UfP5cfBE7). +Thanks for taking the time to contribute! We appreciate all contributions, from reporting bugs to +implementing new features. If you're unclear on how to proceed after reading this guide, please +contact us on [Discord](https://discord.gg/4UfP5cfBE7). ## Reporting bugs -We use [GitHub issues](https://github.com/pola-rs/polars/issues) to track bugs and suggested enhancements. -You can report a bug by opening a [new issue](https://github.com/pola-rs/polars/issues/new/choose). -Use the appropriate issue type for the language you are using ([Rust](https://github.com/pola-rs/polars/issues/new?labels=bug&template=bug_report_rust.yml) / [Python](https://github.com/pola-rs/polars/issues/new?labels=bug&template=bug_report_python.yml)). 
+We use [GitHub issues](https://github.com/pola-rs/polars/issues) to track bugs and suggested +enhancements. You can report a bug by opening a +[new issue](https://github.com/pola-rs/polars/issues/new/choose). Use the appropriate issue type for +the language you are using +([Rust](https://github.com/pola-rs/polars/issues/new?labels=bug&template=bug_report_rust.yml) / +[Python](https://github.com/pola-rs/polars/issues/new?labels=bug&template=bug_report_python.yml)). -Before creating a bug report, please check that your bug has not already been reported, and that your bug exists on the latest version of Polars. -If you find a closed issue that seems to report the same bug you're experiencing, open a new issue and include a link to the original issue in your issue description. +Before creating a bug report, please check that your bug has not already been reported, and that +your bug exists on the latest version of Polars. If you find a closed issue that seems to report the +same bug you're experiencing, open a new issue and include a link to the original issue in your +issue description. -Please include as many details as possible in your bug report. The information helps the maintainers resolve the issue faster. +Please include as many details as possible in your bug report. The information helps the maintainers +resolve the issue faster. ## Suggesting enhancements -We use [GitHub issues](https://github.com/pola-rs/polars/issues) to track bugs and suggested enhancements. -You can suggest an enhancement by opening a [new feature request](https://github.com/pola-rs/polars/issues/new?labels=enhancement&template=feature_request.yml). +We use [GitHub issues](https://github.com/pola-rs/polars/issues) to track bugs and suggested +enhancements. You can suggest an enhancement by opening a +[new feature request](https://github.com/pola-rs/polars/issues/new?labels=enhancement&template=feature_request.yml). Before creating an enhancement suggestion, please check that a similar issue does not already exist. -Please describe the behavior you want and why, and provide examples of how Polars would be used if your feature were added. +Please describe the behavior you want and why, and provide examples of how Polars would be used if +your feature were added. ## Contributing to the codebase ### Picking an issue -Pick an issue by going through the [issue tracker](https://github.com/pola-rs/polars/issues) and finding an issue you would like to work on. -Feel free to pick any issue with an [accepted](https://github.com/pola-rs/polars/issues?q=is%3Aopen+is%3Aissue+label%3Aaccepted) label that is not already assigned. -We use the [help wanted](https://github.com/pola-rs/polars/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22) label to indicate issues that are high on our wishlist. +Pick an issue by going through the [issue tracker](https://github.com/pola-rs/polars/issues) and +finding an issue you would like to work on. Feel free to pick any issue with an +[accepted](https://github.com/pola-rs/polars/issues?q=is%3Aopen+is%3Aissue+label%3Aaccepted) label +that is not already assigned. We use the +[help wanted](https://github.com/pola-rs/polars/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22) +label to indicate issues that are high on our wishlist. -If you are a first time contributor, you might want to look for issues labeled [good first issue](https://github.com/pola-rs/polars/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22). 
-The Polars code base is quite complex, so starting with a small issue will help you find your way around!
+If you are a first time contributor, you might want to look for issues labeled
+[good first issue](https://github.com/pola-rs/polars/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22).
+The Polars code base is quite complex, so starting with a small issue will help you find your way
+around!
-If you would like to take on an issue, please comment on the issue to let others know.
-You may use the issue to discuss possible solutions.
+If you would like to take on an issue, please comment on the issue to let others know. You may use
+the issue to discuss possible solutions.
### Setting up your local environment
-The Polars development flow relies on both Rust and Python, which means setting up your local development environment is not trivial.
-If you run into problems, please contact us on [Discord](https://discord.gg/4UfP5cfBE7).
+The Polars development flow relies on both Rust and Python, which means setting up your local
+development environment is not trivial. If you run into problems, please contact us on
+[Discord](https://discord.gg/4UfP5cfBE7).
!!! note
@@ -53,15 +68,18 @@ If you run into problems, please contact us on [Discord](https://discord.gg/4UfP
#### Configuring Git
-For contributing to Polars you need a free [GitHub account](https://github.com) and have [git](https://git-scm.com) installed on your machine.
-Start by [forking](https://docs.github.com/en/get-started/quickstart/fork-a-repo) the Polars repository, then clone your forked repository using `git`:
+For contributing to Polars you need a free [GitHub account](https://github.com) and have
+[git](https://git-scm.com) installed on your machine. Start by
+[forking](https://docs.github.com/en/get-started/quickstart/fork-a-repo) the Polars repository, then
+clone your forked repository using `git`:
```bash
git clone https://github.com/<username>/polars.git
cd polars
```
-Optionally set the `upstream` remote to be able to sync your fork with the Polars repository in the future:
+Optionally set the `upstream` remote to be able to sync your fork with the Polars repository in the
+future:
```bash
git remote add upstream https://github.com/pola-rs/polars.git
@@ -70,25 +88,28 @@ git fetch upstream
#### Installing dependencies
-In order to work on Polars effectively, you will need [Rust](https://www.rust-lang.org/), [Python](https://www.python.org/), and [dprint](https://dprint.dev/).
+In order to work on Polars effectively, you will need [Rust](https://www.rust-lang.org/),
+[Python](https://www.python.org/), and [dprint](https://dprint.dev/).
-First, install Rust using [rustup](https://www.rust-lang.org/tools/install).
-After the initial installation, you will also need to install the nightly toolchain:
+First, install Rust using [rustup](https://www.rust-lang.org/tools/install). After the initial
+installation, you will also need to install the nightly toolchain:
```bash
rustup toolchain install nightly --component miri
```
-Next, install Python, for example using [pyenv](https://github.com/pyenv/pyenv#installation).
-We recommend using the latest Python version (`3.12`).
-Make sure you deactivate any active virtual environments (command: `deactivate`) or conda environments (command: `conda deactivate`), as the steps below will create a new [virtual environment](https://docs.python.org/3/tutorial/venv.html) for Polars.
-You will need Python even if you intend to work on the Rust code only, as we rely on the Python tests to verify all functionality. +Next, install Python, for example using [pyenv](https://github.com/pyenv/pyenv#installation). We +recommend using the latest Python version (`3.12`). Make sure you deactivate any active virtual +environments (command: `deactivate`) or conda environments (command: `conda deactivate`), as the +steps below will create a new [virtual environment](https://docs.python.org/3/tutorial/venv.html) +for Polars. You will need Python even if you intend to work on the Rust code only, as we rely on the +Python tests to verify all functionality. -Finally, install [dprint](https://dprint.dev/install/). -This is not strictly required, but it is recommended as we use it to autoformat certain file types. +Finally, install [dprint](https://dprint.dev/install/). This is not strictly required, but it is +recommended as we use it to autoformat certain file types. -You can now check that everything works correctly by going into the `py-polars` directory and running the test suite -(warning: this may be slow the first time you run it): +You can now check that everything works correctly by going into the `py-polars` directory and +running the test suite (warning: this may be slow the first time you run it): ```bash cd py-polars @@ -102,8 +123,10 @@ make test This will do a number of things: - Use Python to create a virtual environment in the `.venv` folder. -- Use [pip](https://pip.pypa.io/) and [uv](https://github.com/astral-sh/uv) to install all Python dependencies for development, linting, and building documentation. -- Use Rust to compile and install Polars in your virtual environment. _At least 8GB of RAM is recommended for this step to run smoothly._ +- Use [pip](https://pip.pypa.io/) and [uv](https://github.com/astral-sh/uv) to install all Python + dependencies for development, linting, and building documentation. +- Use Rust to compile and install Polars in your virtual environment. _At least 8GB of RAM is + recommended for this step to run smoothly._ - Use [pytest](https://docs.pytest.org/) to run the Python unittests in your virtual environment !!! note @@ -118,8 +141,8 @@ Check if linting also works correctly by running: make pre-commit ``` -Note that we do not actually use the [pre-commit](https://pre-commit.com/) tool. -We use the Makefile to conveniently run the following formatting and linting tools: +Note that we do not actually use the [pre-commit](https://pre-commit.com/) tool. We use the Makefile +to conveniently run the following formatting and linting tools: - [ruff](https://github.com/charliermarsh/ruff) - [mypy](http://mypy-lang.org/) @@ -131,8 +154,9 @@ If this all runs correctly, you're ready to start contributing to the Polars cod #### Updating the development environment -Dependencies are updated regularly - at least once per month. -If you do not keep your environment up-to-date, you may notice tests or CI checks failing, or you may not be able to build Polars at all. +Dependencies are updated regularly - at least once per month. If you do not keep your environment +up-to-date, you may notice tests or CI checks failing, or you may not be able to build Polars at +all. To update your environment, first make sure your fork is in sync with the Polars repository: @@ -149,8 +173,8 @@ Update all Python dependencies to their latest versions by running: make requirements ``` -If the Rust toolchain version has been updated, you should update your Rust toolchain. 
-Follow it up by running `cargo clean` to make sure your Cargo folder does not grow too large: +If the Rust toolchain version has been updated, you should update your Rust toolchain. Follow it up +by running `cargo clean` to make sure your Cargo folder does not grow too large: ```bash rustup update @@ -161,14 +185,14 @@ cargo clean Create a new git branch from the `main` branch in your local repository, and start coding! -The Rust code is located in the `crates` directory, while the Python codebase is located in the `py-polars` directory. -Both directories contain a `Makefile` with helpful commands. Most notably: +The Rust code is located in the `crates` directory, while the Python codebase is located in the +`py-polars` directory. Both directories contain a `Makefile` with helpful commands. Most notably: - `make test` to run the test suite (see the [test suite docs](./test.md) for more info) - `make pre-commit` to run autoformatting and linting -Note that your work cannot be merged if these checks fail! -Run `make help` to get a list of other helpful commands. +Note that your work cannot be merged if these checks fail! Run `make help` to get a list of other +helpful commands. Two other things to keep in mind: @@ -177,8 +201,9 @@ Two other things to keep in mind: ### Pull requests -When you have resolved your issue, [open a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork) in the Polars repository. -Please adhere to the following guidelines: +When you have resolved your issue, +[open a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork) +in the Polars repository. Please adhere to the following guidelines: - Title: @@ -199,26 +224,32 @@ Please adhere to the following guidelines: - Make sure all [GitHub Actions checks](./ci.md) pass. -After you have opened your pull request, a maintainer will review it and possibly leave some comments. -Once all issues are resolved, the maintainer will merge your pull request, and your work will be part of the next Polars release! +After you have opened your pull request, a maintainer will review it and possibly leave some +comments. Once all issues are resolved, the maintainer will merge your pull request, and your work +will be part of the next Polars release! -Keep in mind that your work does not have to be perfect right away! -If you are stuck or unsure about your solution, feel free to open a draft pull request and ask for help. +Keep in mind that your work does not have to be perfect right away! If you are stuck or unsure about +your solution, feel free to open a draft pull request and ask for help. ## Contributing to documentation -The most important components of Polars documentation are the [user guide](https://docs.pola.rs/user-guide/), the [API references](https://docs.pola.rs/api/), and the database of questions on [StackOverflow](https://stackoverflow.com/). +The most important components of Polars documentation are the +[user guide](https://docs.pola.rs/user-guide/), the [API references](https://docs.pola.rs/api/), and +the database of questions on [StackOverflow](https://stackoverflow.com/). ### User guide -The user guide is maintained in the `docs/source/user-guide` folder. Before creating a PR first raise an issue to discuss what you feel is missing or could be improved. 
+The user guide is maintained in the `docs/source/user-guide` folder. Before creating a PR first +raise an issue to discuss what you feel is missing or could be improved. #### Building and serving the user guide -The user guide is built using [MkDocs](https://www.mkdocs.org/). You install the dependencies for building the user guide by running `make build` in the root of the repo. -Additionally, you need to make sure the [graphviz](https://graphviz.org/) `dot` binary is on your path. +The user guide is built using [MkDocs](https://www.mkdocs.org/). You install the dependencies for +building the user guide by running `make build` in the root of the repo. Additionally, you need to +make sure the [graphviz](https://graphviz.org/) `dot` binary is on your path. -Activate the virtual environment and run `mkdocs serve` to build and serve the user guide, so you can view it locally and see updates as you make changes. +Activate the virtual environment and run `mkdocs serve` to build and serve the user guide, so you +can view it locally and see updates as you make changes. #### Creating a new user guide page @@ -226,7 +257,8 @@ Each user guide page is based on a `.md` markdown file. This file must be listed #### Adding a shell code block -To add a code block with code to be run in a shell with tabs for Python and Rust, use the following format: +To add a code block with code to be run in a shell with tabs for Python and Rust, use the following +format: ```` === ":fontawesome-brands-python: Python" @@ -244,17 +276,25 @@ To add a code block with code to be run in a shell with tabs for Python and Rust #### Adding a code block -The snippets for Python and Rust code blocks are in the `docs/source/src/python/` and `docs/source/src/rust/` directories, respectively. To add a code snippet with Python or Rust code to a `.md` page, use the following format: +The snippets for Python and Rust code blocks are in the `docs/source/src/python/` and +`docs/source/src/rust/` directories, respectively. To add a code snippet with Python or Rust code to +a `.md` page, use the following format: ``` {{code_block('user-guide/io/cloud-storage','read_parquet',['read_parquet','read_csv'])}} ``` -- The first argument is a path to either or both files called `docs/source/src/python/user-guide/io/cloud-storage.py` and `docs/source/src/rust/user-guide/io/cloud-storage.rs`. -- The second argument is the name given at the start and end of each snippet in the `.py` or `.rs` file -- The third argument is a list of links to functions in the API docs. For each element of the list there must be a corresponding entry in `docs/source/_build/API_REFERENCE_LINKS.yml` +- The first argument is a path to either or both files called + `docs/source/src/python/user-guide/io/cloud-storage.py` and + `docs/source/src/rust/user-guide/io/cloud-storage.rs`. +- The second argument is the name given at the start and end of each snippet in the `.py` or `.rs` + file +- The third argument is a list of links to functions in the API docs. For each element of the list + there must be a corresponding entry in `docs/source/_build/API_REFERENCE_LINKS.yml` -If the corresponding `.py` and `.rs` snippet files both exist then each snippet named in the second argument to `code_block` above must exist or the build will fail. An empty snippet should be added to the `.py` or `.rs` file if the snippet is not needed. 
+If the corresponding `.py` and `.rs` snippet files both exist then each snippet named in the second
+argument to `code_block` above must exist or the build will fail. An empty snippet should be added
+to the `.py` or `.rs` file if the snippet is not needed.
Each snippet is formatted as follows:
@@ -266,10 +306,12 @@ df = pl.read_parquet("file.parquet")
# --8<-- [end:read_parquet]
```
-The snippet is delimited by `--8<-- [start:<snippet-name>]` and `--8<-- [end:<snippet-name>]`. The snippet name must match the name given in the second argument to `code_block` above.
+The snippet is delimited by `--8<-- [start:<snippet-name>]` and `--8<-- [end:<snippet-name>]`. The
+snippet name must match the name given in the second argument to `code_block` above.
-In some cases, you may need to add links to different functions for the Python and Rust APIs.
-When that is the case, you can use the two extra optional arguments that `code_block` accepts, that can be used to pass Python-only and Rust-only links:
+In some cases, you may need to add links to different functions for the Python and Rust APIs. When
+that is the case, you can use the two extra optional arguments that `code_block` accepts, that can
+be used to pass Python-only and Rust-only links:
```
{{code_block('path', 'snippet_name', ['common_api_links'], ['python_only_links'], ['rust_only_links'])}}
@@ -277,82 +319,111 @@ When that is the case, you can use the two extra optional arguments that `code_b
#### Linting
-Before committing, install `dprint` (see above) and run `dprint fmt` from the `docs` directory to lint the markdown files.
+Before committing, install `dprint` (see above) and run `dprint fmt` from the `docs` directory to
+lint the markdown files.
### API reference
-Polars has separate API references for [Rust](https://docs.pola.rs/api/rust/dev/polars/) and [Python](https://docs.pola.rs/api/python/dev/reference/index.html).
-These are generated directly from the codebase, so in order to contribute, you will have to follow the steps outlined in [this section](#contributing-to-the-codebase) above.
+Polars has separate API references for [Rust](https://docs.pola.rs/api/rust/dev/polars/) and
+[Python](https://docs.pola.rs/api/python/dev/reference/index.html). These are generated directly
+from the codebase, so in order to contribute, you will have to follow the steps outlined in
+[this section](#contributing-to-the-codebase) above.
#### Rust
-Rust Polars uses `cargo doc` to build its documentation. Contributions to improve or clarify the API reference are welcome.
+Rust Polars uses `cargo doc` to build its documentation. Contributions to improve or clarify the API
+reference are welcome.
#### Python
-For the Python API reference, we always welcome good docstring examples.
-There are still parts of the API that do not have any code examples.
-This is a great way to start contributing to Polars!
+For the Python API reference, we always welcome good docstring examples. There are still parts of
+the API that do not have any code examples. This is a great way to start contributing to Polars!
-Note that we follow the [numpydoc](https://numpydoc.readthedocs.io/en/latest/format.html) convention.
-Docstring examples should also follow the [Black](https://black.readthedocs.io/) codestyle.
-From the `py-polars` directory, run `make fmt` to make sure your additions pass the linter, and run `make doctest` to make sure your docstring examples are valid.
+Note that we follow the [numpydoc](https://numpydoc.readthedocs.io/en/latest/format.html)
+convention.
Docstring examples should also follow the [Black](https://black.readthedocs.io/) +codestyle. From the `py-polars` directory, run `make fmt` to make sure your additions pass the +linter, and run `make doctest` to make sure your docstring examples are valid. -Polars uses Sphinx to build the API reference. -This means docstrings in general should follow the [reST](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html) format. -If you want to build the API reference locally, go to the `py-polars/docs` directory and run `make html`. -The resulting HTML files will be in `py-polars/docs/build/html`. +Polars uses Sphinx to build the API reference. This means docstrings in general should follow the +[reST](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html) format. If you want +to build the API reference locally, go to the `py-polars/docs` directory and run `make html`. The +resulting HTML files will be in `py-polars/docs/build/html`. -New additions to the API should be added manually to the API reference by adding an entry to the correct `.rst` file in the `py-polars/docs/source/reference` directory. +New additions to the API should be added manually to the API reference by adding an entry to the +correct `.rst` file in the `py-polars/docs/source/reference` directory. ### StackOverflow -We use StackOverflow to create a database of high quality questions and answers that is searchable and remains up-to-date. -There is a separate tag for each language: +We use StackOverflow to create a database of high quality questions and answers that is searchable +and remains up-to-date. There is a separate tag for each language: - [Python Polars](https://stackoverflow.com/questions/tagged/python-polars) - [Rust Polars](https://stackoverflow.com/questions/tagged/rust-polars) -Contributions in the form of well-formulated questions or answers are always welcome! -If you add a new question, please notify us by adding a [matching issue](https://github.com/pola-rs/polars/issues/new?&labels=question&template=question.yml) to our GitHub issue tracker. +Contributions in the form of well-formulated questions or answers are always welcome! If you add a +new question, please notify us by adding a +[matching issue](https://github.com/pola-rs/polars/issues/new?&labels=question&template=question.yml) +to our GitHub issue tracker. ## Release flow _This section is intended for Polars maintainers._ -Polars releases Rust crates to [crates.io](https://crates.io/crates/polars) and Python packages to [PyPI](https://pypi.org/project/polars/). +Polars releases Rust crates to [crates.io](https://crates.io/crates/polars) and Python packages to +[PyPI](https://pypi.org/project/polars/). -New releases are marked by an official [GitHub release](https://github.com/pola-rs/polars/releases) and an associated git tag. We utilize [Release Drafter](https://github.com/release-drafter/release-drafter) to automatically draft GitHub releases with release notes. +New releases are marked by an official [GitHub release](https://github.com/pola-rs/polars/releases) +and an associated git tag. We utilize +[Release Drafter](https://github.com/release-drafter/release-drafter) to automatically draft GitHub +releases with release notes. ### Steps -The steps for releasing a new Rust or Python version are similar. The release process is mostly automated through GitHub Actions, but some manual steps are required. Follow the steps below to release a new version. 
+The steps for releasing a new Rust or Python version are similar. The release process is mostly
+automated through GitHub Actions, but some manual steps are required. Follow the steps below to
+release a new version.
Start by bumping the version number in the source code:
-1. Check the [releases page](https://github.com/pola-rs/polars/releases) on GitHub and find the appropriate draft release. Note the version number associated with this release.
-2. Make sure your fork is up-to-date with the latest version of the main Polars repository, and create a new branch.
+1. Check the [releases page](https://github.com/pola-rs/polars/releases) on GitHub and find the
+   appropriate draft release. Note the version number associated with this release.
+2. Make sure your fork is up-to-date with the latest version of the main Polars repository, and
+   create a new branch.
3. Bump the version number.
-- _Rust:_ Update the version number in all `Cargo.toml` files in the `polars` directory and subdirectories. You'll probably want to use some search/replace strategy, as there are quite a few crates that need to be updated.
-- _Python:_ Update the version number in [`py-polars/Cargo.toml`](https://github.com/pola-rs/polars/blob/main/py-polars/Cargo.toml#L3) to match the version of the draft release.
+- _Rust:_ Update the version number in all `Cargo.toml` files in the `polars` directory and
+  subdirectories. You'll probably want to use some search/replace strategy, as there are quite a few
+  crates that need to be updated.
+- _Python:_ Update the version number in
+  [`py-polars/Cargo.toml`](https://github.com/pola-rs/polars/blob/main/py-polars/Cargo.toml#L3) to
+  match the version of the draft release.
4. From the `py-polars` directory, run `make build` to generate a new `Cargo.lock` file.
-5. Create a new commit with all files added. The name of the commit should follow the format `release(<language>): <Language> Polars <version number>`. For example: `release(python): Python Polars 0.16.1`
+5. Create a new commit with all files added. The name of the commit should follow the format
+   `release(<language>): <Language> Polars <version number>`. For example:
+   `release(python): Python Polars 0.16.1`
6. Push your branch and open a new pull request to the `main` branch of the main Polars repository.
7. Wait for the GitHub Actions checks to pass, then squash and merge your pull request.
Directly after merging your pull request, release the new version:
-8. Go to the release workflow ([Python](https://github.com/pola-rs/polars/actions/workflows/release-python.yml)/[Rust](https://github.com/pola-rs/polars/actions/workflows/release-rust.yml)), click _Run workflow_ in the top right, and click the green button. This will trigger the workflow, which will build all release artifacts and publish them.
-9. Wait for the workflow to finish, then check [crates.io](https://crates.io/crates/polars)/[PyPI](https://pypi.org/project/polars/)/[GitHub](https://github.com/pola-rs/polars/releases) to verify that the new Polars release is now available.
+8. Go to the release workflow
+   ([Python](https://github.com/pola-rs/polars/actions/workflows/release-python.yml)/[Rust](https://github.com/pola-rs/polars/actions/workflows/release-rust.yml)),
+   click _Run workflow_ in the top right, and click the green button. This will trigger the
+   workflow, which will build all release artifacts and publish them.
+9.
Wait for the workflow to finish, then check + [crates.io](https://crates.io/crates/polars)/[PyPI](https://pypi.org/project/polars/)/[GitHub](https://github.com/pola-rs/polars/releases) + to verify that the new Polars release is now available. ### Troubleshooting -It may happen that one or multiple release jobs fail. If so, you should first try to simply re-run the failed jobs from the GitHub Actions UI. +It may happen that one or multiple release jobs fail. If so, you should first try to simply re-run +the failed jobs from the GitHub Actions UI. -If that doesn't help, you will have to figure out what's wrong and commit a fix. Once your fix has made it to the `main` branch, simply re-trigger the release workflow. +If that doesn't help, you will have to figure out what's wrong and commit a fix. Once your fix has +made it to the `main` branch, simply re-trigger the release workflow. ## License -Any contributions you make to this project will fall under the [MIT License](https://github.com/pola-rs/polars/blob/main/LICENSE) that covers the Polars project. +Any contributions you make to this project will fall under the +[MIT License](https://github.com/pola-rs/polars/blob/main/LICENSE) that covers the Polars project. diff --git a/docs/source/development/contributing/test.md b/docs/source/development/contributing/test.md index 135012953cb1..b219bcb085fe 100644 --- a/docs/source/development/contributing/test.md +++ b/docs/source/development/contributing/test.md @@ -4,84 +4,97 @@ Additional information on the Rust test suite will be added to this page later. -The `py-polars/tests` folder contains the main Polars test suite. -This page contains some information on the various components of the test suite, as well as guidelines for writing new tests. +The `py-polars/tests` folder contains the main Polars test suite. This page contains some +information on the various components of the test suite, as well as guidelines for writing new +tests. -The test suite contains four main components, each confined to their own folder: unit tests, parametric tests, benchmark tests, and doctests. +The test suite contains four main components, each confined to their own folder: unit tests, +parametric tests, benchmark tests, and doctests. -Note that this test suite is indirectly responsible for testing Rust Polars as well. -The Rust test suite is kept small to reduce compilation times. -A lot of the Rust functionality is tested here instead. +Note that this test suite is indirectly responsible for testing Rust Polars as well. The Rust test +suite is kept small to reduce compilation times. A lot of the Rust functionality is tested here +instead. ## Unit tests -The `unit` folder contains all regular unit tests. -These tests are intended to make sure all Polars functionality works as intended. +The `unit` folder contains all regular unit tests. These tests are intended to make sure all Polars +functionality works as intended. ### Running unit tests -Run unit tests by running `make test` from the `py-polars` folder. -This will compile the Rust bindings and then run the unit tests. +Run unit tests by running `make test` from the `py-polars` folder. This will compile the Rust +bindings and then run the unit tests. -If you're working in the Python code only, you can avoid recompiling every time by simply running `pytest` instead from your virtual environment. +If you're working in the Python code only, you can avoid recompiling every time by simply running +`pytest` instead from your virtual environment. 
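As a concrete sketch of the kind of test that lives in the `unit` folder and runs under `pytest` — the module path, test name and covered behaviour here are hypothetical illustrations, following the guidelines listed further down (parametrization, edge cases such as nulls and empty input):

```python
# Hypothetical module, e.g. py-polars/tests/unit/series/test_sort.py
import pytest

import polars as pl
from polars.testing import assert_series_equal


@pytest.mark.parametrize(
    ("values", "expected"),
    [
        ([3, 1, 2], [1, 2, 3]),
        ([None, 2, 1], [None, 1, 2]),  # nulls sort first by default
        ([], []),
    ],
)
def test_series_sort(values: list, expected: list) -> None:
    s = pl.Series("a", values, dtype=pl.Int64)
    assert_series_equal(s.sort(), pl.Series("a", expected, dtype=pl.Int64))
```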
-By default, "slow" tests and "ci-only" tests are skipped for local test runs. -Such tests are marked using a [custom pytest marker](https://docs.pytest.org/en/latest/example/markers.html). -To run these tests specifically, you can run `pytest -m slow`, `pytest -m ci_only`, `pytest -m slow ci_only` -or run `pytest -m ""` to run _all_ tests, regardless of marker. +By default, "slow" tests and "ci-only" tests are skipped for local test runs. Such tests are marked +using a [custom pytest marker](https://docs.pytest.org/en/latest/example/markers.html). To run these +tests specifically, you can run `pytest -m slow`, `pytest -m ci_only`, `pytest -m slow ci_only` or +run `pytest -m ""` to run _all_ tests, regardless of marker. -Note that the "ci-only" tests may require you to run `make requirements-all` to get additional dependencies -(such as `torch`) that are otherwise not installed as part of the default Polars development environment. +Note that the "ci-only" tests may require you to run `make requirements-all` to get additional +dependencies (such as `torch`) that are otherwise not installed as part of the default Polars +development environment. -Tests can be run in parallel by running `pytest -n auto`. -The parallelization is handled by [`pytest-xdist`](https://pytest-xdist.readthedocs.io/en/latest/). +Tests can be run in parallel by running `pytest -n auto`. The parallelization is handled by +[`pytest-xdist`](https://pytest-xdist.readthedocs.io/en/latest/). ### Writing unit tests -Whenever you add new functionality, you should also add matching unit tests. -Add your tests to appropriate test module in the `unit` folder. -Some guidelines to keep in mind: +Whenever you add new functionality, you should also add matching unit tests. Add your tests to +appropriate test module in the `unit` folder. Some guidelines to keep in mind: - Try to fully cover all possible inputs and edge cases you can think of. -- Utilize pytest tools like [`fixture`](https://docs.pytest.org/en/latest/explanation/fixtures.html) and [`parametrize`](https://docs.pytest.org/en/latest/how-to/parametrize.html) where appropriate. -- Since many tests will require some data to be defined first, it can be efficient to run multiple checks in a single test. This can also be addressed using pytest fixtures. +- Utilize pytest tools like [`fixture`](https://docs.pytest.org/en/latest/explanation/fixtures.html) + and [`parametrize`](https://docs.pytest.org/en/latest/how-to/parametrize.html) where appropriate. +- Since many tests will require some data to be defined first, it can be efficient to run multiple + checks in a single test. This can also be addressed using pytest fixtures. - Unit tests should not depend on external factors, otherwise test parallelization will break. ## Parametric tests -The `parametric` folder contains parametric tests written using the [Hypothesis](https://hypothesis.readthedocs.io/) framework. -These tests are intended to find and test edge cases by generating many random datapoints. +The `parametric` folder contains parametric tests written using the +[Hypothesis](https://hypothesis.readthedocs.io/) framework. These tests are intended to find and +test edge cases by generating many random datapoints. ### Running parametric tests Run parametric tests by running `pytest -m hypothesis`. -Note that parametric tests are excluded by default when running `pytest`. -You must explicitly specify `-m hypothesis` to run them. +Note that parametric tests are excluded by default when running `pytest`. 
You must explicitly +specify `-m hypothesis` to run them. -These tests _will_ be included when calculating test coverage, and will also be run as part of the `make test-all` make command. +These tests _will_ be included when calculating test coverage, and will also be run as part of the +`make test-all` make command. ## Doctests -The `docs` folder contains a script for running [`doctest`](https://docs.python.org/3/library/doctest.html). -This folder does not contain any actual tests - rather, the script checks all docstrings in the Polars package for `Examples` sections, runs the code examples, and verifies the output. +The `docs` folder contains a script for running +[`doctest`](https://docs.python.org/3/library/doctest.html). This folder does not contain any actual +tests - rather, the script checks all docstrings in the Polars package for `Examples` sections, runs +the code examples, and verifies the output. -The aim of running `doctest` is to make sure the `Examples` sections in our docstrings are valid and remain up-to-date with code changes. +The aim of running `doctest` is to make sure the `Examples` sections in our docstrings are valid and +remain up-to-date with code changes. ### Running `doctest` -To run the `doctest` module, run `make doctest` from the `py-polars` folder. -You can also run the script directly from your virtual environment. +To run the `doctest` module, run `make doctest` from the `py-polars` folder. You can also run the +script directly from your virtual environment. -Note that doctests are _not_ run using pytest. While pytest does have the capability to run doc examples, configuration options are too limited for our purposes. +Note that doctests are _not_ run using pytest. While pytest does have the capability to run doc +examples, configuration options are too limited for our purposes. -Doctests will _not_ count towards test coverage. They are not a substitute for unit tests, but rather intended to convey the intended use of the Polars API to the user. +Doctests will _not_ count towards test coverage. They are not a substitute for unit tests, but +rather intended to convey the intended use of the Polars API to the user. ### Writing doc examples -Almost all classes/methods/functions that are part of Polars' public API should include code examples in their docstring. -These examples help users understand basic usage and allow us to illustrate more advanced concepts as well. -Some guidelines for writing a good docstring `Examples` section: +Almost all classes/methods/functions that are part of Polars' public API should include code +examples in their docstring. These examples help users understand basic usage and allow us to +illustrate more advanced concepts as well. Some guidelines for writing a good docstring `Examples` +section: - Start with a minimal example that showcases the default functionality. - Showcase the effect of its parameters. @@ -90,7 +103,10 @@ Some guidelines for writing a good docstring `Examples` section: There are many great docstring examples already, just check other code if you need inspiration! -In addition to the [regular options](https://docs.python.org/3/library/doctest.html#option-flags) available when writing doctests, the script configuration allows for a new `IGNORE_RESULT` directive. Use this directive if you want to ensure the code runs, but the output may be random by design or not interesting to check. 
+In addition to the [regular options](https://docs.python.org/3/library/doctest.html#option-flags) +available when writing doctests, the script configuration allows for a new `IGNORE_RESULT` +directive. Use this directive if you want to ensure the code runs, but the output may be random by +design or not interesting to check. ```python >>> df.sample(n=2) # doctest: +IGNORE_RESULT @@ -98,25 +114,28 @@ In addition to the [regular options](https://docs.python.org/3/library/doctest.h ## Benchmark tests -The `benchmark` folder contains code for running various benchmark tests. -The aim of this part of the test suite is to spot performance regressions in the code, and to verify that Polars functionality works as expected when run on a release build or at a larger scale. +The `benchmark` folder contains code for running various benchmark tests. The aim of this part of +the test suite is to spot performance regressions in the code, and to verify that Polars +functionality works as expected when run on a release build or at a larger scale. -Polars uses [CodSpeed](https://codspeed.io/pola-rs/polars) for tracking the performance of the benchmark tests. +Polars uses [CodSpeed](https://codspeed.io/pola-rs/polars) for tracking the performance of the +benchmark tests. ### Generating data -For most tests, a relatively large dataset must be generated first. -This is done as part of the `pytest` setup process. +For most tests, a relatively large dataset must be generated first. This is done as part of the +`pytest` setup process. -The data generation logic was taken from the [H2O.ai database benchmark](https://github.com/h2oai/db-benchmark), which is the foundation for many of the benchmark tests. +The data generation logic was taken from the +[H2O.ai database benchmark](https://github.com/h2oai/db-benchmark), which is the foundation for many +of the benchmark tests. ### Running the benchmark tests -The benchmark tests can be run using pytest. -Run `pytest -m benchmark --durations 0 -v` to run these tests and report run duration. +The benchmark tests can be run using pytest. Run `pytest -m benchmark --durations 0 -v` to run these +tests and report run duration. -Note that benchmark tests are excluded by default when running `pytest`. -You must explicitly specify `-m benchmark` to run them. -They will also be excluded when calculating test coverage. +Note that benchmark tests are excluded by default when running `pytest`. You must explicitly specify +`-m benchmark` to run them. They will also be excluded when calculating test coverage. These tests _will_ be run as part of the `make test-all` make command. diff --git a/docs/source/development/versioning.md b/docs/source/development/versioning.md index 8b584349d7d3..402c935b28aa 100644 --- a/docs/source/development/versioning.md +++ b/docs/source/development/versioning.md @@ -5,35 +5,39 @@ Polars adheres to the [semantic versioning](https://semver.org/) specification: - Breaking changes lead to a **major** version increase (`1.0.0`, `2.0.0`, ...) -- New features and performance improvements lead to a **minor** version increase (`1.1.0`, `1.2.0`, ...) +- New features and performance improvements lead to a **minor** version increase (`1.1.0`, `1.2.0`, + ...) - Other changes lead to a **patch** version increase (`1.0.1`, `1.0.2`, ...) ## Policy for breaking changes -Polars takes backwards compatibility seriously, but we are not afraid to change things if it leads to a better product. 
+Polars takes backwards compatibility seriously, but we are not afraid to change things if it leads
+to a better product.
 
 ### Philosophy
 
-We don't always get it right on the first try.
-We learn as we go along and get feedback from our users.
-Sometimes, we're a little too eager to get out a new feature and didn't ponder all the possible implications.
+We don't always get it right on the first try. We learn as we go along and get feedback from our
+users. Sometimes, we're a little too eager to get out a new feature and didn't ponder all the
+possible implications.
 
-If this happens, we correct our mistakes and introduce a breaking change.
-Most of the time, this is no big deal.
-Users get a deprecation warning, they do a quick search-and-replace in their code base, and that's that.
+If this happens, we correct our mistakes and introduce a breaking change. Most of the time, this is
+no big deal. Users get a deprecation warning, they do a quick search-and-replace in their code base,
+and that's that.
 
-At times, we run into an issue requires more effort on our user's part to fix.
-A change in the query engine can seriously impact the assumptions in a data pipeline.
-We do not make such changes lightly, but we will make them if we believe it makes Polars better.
+At times, we run into an issue that requires more effort on our users' part to fix. A change in the
+query engine can seriously impact the assumptions in a data pipeline. We do not make such changes
+lightly, but we will make them if we believe it makes Polars better.
 
-Freeing ourselves of past indiscretions is important to keep Polars moving forward.
-We know it takes time and energy for our users to keep up with new releases but, in the end, it benefits everyone for Polars to be the best product possible.
+Freeing ourselves of past indiscretions is important to keep Polars moving forward. We know it takes
+time and energy for our users to keep up with new releases but, in the end, it benefits everyone for
+Polars to be the best product possible.
 
 ### What qualifies as a breaking change
 
 **A breaking change occurs when an existing component of the public API is changed or removed.**
 
-A feature is part of the public API if it is documented in the [API reference](https://docs.pola.rs/api/python/stable/reference/index.html).
+A feature is part of the public API if it is documented in the
+[API reference](https://docs.pola.rs/api/python/stable/reference/index.html).
 
 Examples of breaking changes:
 
@@ -47,32 +51,39 @@ Examples of changes that are _not_ considered breaking:
 - The module path of a public class is changed.
 - An optional parameter is added to an existing method.
 
-Bug fixes are not considered a breaking change, even though it may impact some users' [workflows](https://xkcd.com/1172/).
+Bug fixes are not considered a breaking change, even though they may impact some users'
+[workflows](https://xkcd.com/1172/).
 
 ### Unstable functionality
 
-Some parts of the public API are marked as **unstable**.
-You can recognize this functionality from the warning in the API reference, or from the warning issued when the configuration option `warn_unstable` is active.
-There are a number of reasons functionality may be marked as unstable:
+Some parts of the public API are marked as **unstable**. You can recognize this functionality from
+the warning in the API reference, or from the warning issued when the configuration option
+`warn_unstable` is active.
There are a number of reasons functionality may be marked as unstable: -- We are unsure about the exact API. The name, function signature, or implementation are likely to change in the future. -- The functionality is not tested extensively yet. Bugs may pop up when used in real-world scenarios. -- The functionality does not yet integrate well with the full Polars API. You may find it works in one context but not in another. +- We are unsure about the exact API. The name, function signature, or implementation are likely to + change in the future. +- The functionality is not tested extensively yet. Bugs may pop up when used in real-world + scenarios. +- The functionality does not yet integrate well with the full Polars API. You may find it works in + one context but not in another. -Releasing functionality as unstable allows us to gather important feedback from users that use Polars in real-world scenarios. -This helps us fine-tune things before giving it our final stamp of approval. -Users that are only interested in solid, well-tested functionality can avoid this part of the API. +Releasing functionality as unstable allows us to gather important feedback from users that use +Polars in real-world scenarios. This helps us fine-tune things before giving it our final stamp of +approval. Users that are only interested in solid, well-tested functionality can avoid this part of +the API. -Functionality marked as unstable may change at any point without it being considered a breaking change. +Functionality marked as unstable may change at any point without it being considered a breaking +change. ### Deprecation warnings -If we decide to introduce a breaking change, the existing behavior is deprecated _if possible_. -For example, if we choose to rename a function, the new function is added alongside the old function, and using the old function will result in a deprecation warning. +If we decide to introduce a breaking change, the existing behavior is deprecated _if possible_. For +example, if we choose to rename a function, the new function is added alongside the old function, +and using the old function will result in a deprecation warning. -Not all changes can be deprecated nicely. -A change to the query engine may have effects across a large part of the API. -Such changes will not be warned for, but _will_ be included in the changelog and the migration guide. +Not all changes can be deprecated nicely. A change to the query engine may have effects across a +large part of the API. Such changes will not be warned for, but _will_ be included in the changelog +and the migration guide. !!! warning Rust users only @@ -82,30 +93,33 @@ Such changes will not be warned for, but _will_ be included in the changelog and ### Deprecation period As a rule, deprecated functionality is removed two breaking releases after the deprecation happens. -For example, a function deprecated in version `1.2.3` will be retained in version `2.0.0` and removed in version `3.0.0`. +For example, a function deprecated in version `1.2.3` will be retained in version `2.0.0` and +removed in version `3.0.0`. -An exception to this rule are deprecations introduced with a breaking release. -These will be enforced on the next breaking release. -For example, a function deprecated in version `2.0.0` will be removed in version `3.0.0`. +An exception to this rule are deprecations introduced with a breaking release. These will be +enforced on the next breaking release. 
For example, a function deprecated in version `2.0.0` will be +removed in version `3.0.0`. -This means that if your program does not raise any deprecation warnings, it should be mostly safe to upgrade to the next major version. -As breaking releases happen about once every six months, this allows six to twelve months to adjust to any pending breaking changes. +This means that if your program does not raise any deprecation warnings, it should be mostly safe to +upgrade to the next major version. As breaking releases happen about once every six months, this +allows six to twelve months to adjust to any pending breaking changes. -**In some cases, we may decide to adjust the deprecation period.** -If retaining the deprecated functionality blocks other improvements to Polars, we may shorten the deprecation period to a single breaking release. This will be mentioned in the warning message. -If the deprecation affects many users, we may extend the deprecation period. +**In some cases, we may decide to adjust the deprecation period.** If retaining the deprecated +functionality blocks other improvements to Polars, we may shorten the deprecation period to a single +breaking release. This will be mentioned in the warning message. If the deprecation affects many +users, we may extend the deprecation period. ## Release frequency -Polars does not have a set release schedule. -We issue a new release whenever we feel like we have something new and valuable to offer to our users. -In practice, a new minor version is released about once every one or two weeks. +Polars does not have a set release schedule. We issue a new release whenever we feel like we have +something new and valuable to offer to our users. In practice, a new minor version is released about +once every one or two weeks. ### Breaking releases -Over time, issues pop up that require a breaking change to address. -When enough issues have accumulated, we issue a breaking release. +Over time, issues pop up that require a breaking change to address. When enough issues have +accumulated, we issue a breaking release. -So far, breaking releases have happened about once every three to six months. -The rate and severity of breaking changes will continue to diminish as Polars grows more solid. -From this point on, we expect new major versions to be released about once every six months. +So far, breaking releases have happened about once every three to six months. The rate and severity +of breaking changes will continue to diminish as Polars grows more solid. From this point on, we +expect new major versions to be released about once every six months. diff --git a/docs/source/index.md b/docs/source/index.md index 4fd988e974eb..7e1a102833b3 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -16,17 +16,24 @@ -Polars is a blazingly fast DataFrame library for manipulating structured data. The core is written in Rust, and available for Python, R and NodeJS. +Polars is a blazingly fast DataFrame library for manipulating structured data. The core is written +in Rust, and available for Python, R and NodeJS. ## Key features -- **Fast**: Written from scratch in Rust, designed close to the machine and without external dependencies. +- **Fast**: Written from scratch in Rust, designed close to the machine and without external + dependencies. - **I/O**: First class support for all common data storage layers: local, cloud storage & databases. -- **Intuitive API**: Write your queries the way they were intended. 
Polars, internally, will determine the most efficient way to execute using its query optimizer. -- **Out of Core**: The streaming API allows you to process your results without requiring all your data to be in memory at the same time. -- **Parallel**: Utilises the power of your machine by dividing the workload among the available CPU cores without any additional configuration. -- **Vectorized Query Engine**: Using [Apache Arrow](https://arrow.apache.org/), a columnar data format, to process your queries in a vectorized manner and SIMD to optimize CPU usage. -- **GPU Support**: Optionally run queries on NVIDIA GPUs for maximum performance for in-memory workloads. +- **Intuitive API**: Write your queries the way they were intended. Polars, internally, will + determine the most efficient way to execute using its query optimizer. +- **Out of Core**: The streaming API allows you to process your results without requiring all your + data to be in memory at the same time. +- **Parallel**: Utilises the power of your machine by dividing the workload among the available CPU + cores without any additional configuration. +- **Vectorized Query Engine**: Using [Apache Arrow](https://arrow.apache.org/), a columnar data + format, to process your queries in a vectorized manner and SIMD to optimize CPU usage. +- **GPU Support**: Optionally run queries on NVIDIA GPUs for maximum performance for in-memory + workloads. @@ -45,7 +52,8 @@ The goal of Polars is to provide a lightning fast DataFrame library that: - A consistent and predictable API. - Adheres to a strict schema (data-types should be known before running the query). -Polars is written in Rust which gives it C/C++ performance and allows it to fully control performance-critical parts in a query engine. +Polars is written in Rust which gives it C/C++ performance and allows it to fully control +performance-critical parts in a query engine. ## Example @@ -55,14 +63,17 @@ A more extensive introduction can be found in the [next chapter](user-guide/gett ## Community -Polars has a very active community with frequent releases (approximately weekly). Below are some of the top contributors to the project: +Polars has a very active community with frequent releases (approximately weekly). Below are some of +the top contributors to the project: --8<-- "docs/assets/people.md" ## Contributing -We appreciate all contributions, from reporting bugs to implementing new features. Read our [contributing guide](development/contributing/index.md) to learn more. +We appreciate all contributions, from reporting bugs to implementing new features. Read our +[contributing guide](development/contributing/index.md) to learn more. ## License -This project is licensed under the terms of the [MIT license](https://github.com/pola-rs/polars/blob/main/LICENSE). +This project is licensed under the terms of the +[MIT license](https://github.com/pola-rs/polars/blob/main/LICENSE). diff --git a/docs/source/releases/changelog.md b/docs/source/releases/changelog.md index 6b0ad77bc203..d3afd8af6807 100644 --- a/docs/source/releases/changelog.md +++ b/docs/source/releases/changelog.md @@ -2,4 +2,5 @@ Polars uses GitHub to manage both Python and Rust releases. -Refer to our [GitHub releases page](https://github.com/pola-rs/polars/releases) for the changelog associated with each new release. +Refer to our [GitHub releases page](https://github.com/pola-rs/polars/releases) for the changelog +associated with each new release. 
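As a quick, illustrative aside on the lazy, optimizing query engine highlighted in the feature list
above: a minimal sketch of a lazy query might look like the following (the file name and column
names are hypothetical, not taken from the docs above).

```python
import polars as pl

# Build a lazy query; nothing is executed yet. The query optimizer can, for
# example, push the filter into the scan so only matching rows are read.
lazy_query = (
    pl.scan_csv("iris.csv")  # hypothetical file
    .filter(pl.col("sepal_length") > 5.0)
    .group_by("species")
    .agg(pl.col("sepal_width").mean())
)

# Execution happens only when the result is collected.
df = lazy_query.collect()
```

Calling `lazy_query.explain()` on the same query would print the optimized plan before running it.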
diff --git a/docs/source/releases/upgrade/0.19.md b/docs/source/releases/upgrade/0.19.md index 0f41b9a52ba7..68cf4ed064e1 100644 --- a/docs/source/releases/upgrade/0.19.md +++ b/docs/source/releases/upgrade/0.19.md @@ -4,9 +4,9 @@ ### Aggregation functions no longer support horizontal computation -This impacts aggregation functions like `sum`, `min`, and `max`. -These functions were overloaded to support both vertical and horizontal computation. -Recently, new dedicated functionality for horizontal computation was released, and horizontal computation was deprecated. +This impacts aggregation functions like `sum`, `min`, and `max`. These functions were overloaded to +support both vertical and horizontal computation. Recently, new dedicated functionality for +horizontal computation was released, and horizontal computation was deprecated. Restore the old behavior by using the horizontal variant, e.g. `sum_horizontal`. @@ -47,8 +47,9 @@ shape: (1, 2) `all` will now ignore null values by default, rather than treat them as `False`. -For both `any` and `all`, the `drop_nulls` parameter has been renamed to `ignore_nulls` and is now keyword-only. -Also fixed an issue when setting this parameter to `False` would erroneously result in `None` output in some cases. +For both `any` and `all`, the `drop_nulls` parameter has been renamed to `ignore_nulls` and is now +keyword-only. Also fixed an issue when setting this parameter to `False` would erroneously result in +`None` output in some cases. To restore the old behavior, set `ignore_nulls` to `False` and check for `None` output. @@ -70,9 +71,9 @@ True ### Improved error types for many methods -Improving our error messages is an ongoing effort. -We did a sweep of our Python code base and made many improvements to error messages and error types. -Most notably, many `ValueError`s were changed to `TypeError`s. +Improving our error messages is an ongoing effort. We did a sweep of our Python code base and made +many improvements to error messages and error types. Most notably, many `ValueError`s were changed +to `TypeError`s. If your code relies on handling Polars exceptions, you may have to make some adjustments. @@ -96,9 +97,9 @@ TypeError: Series constructor called with unsupported type 'int' for the `values ### Updates to expression input parsing -Methods like `select` and `with_columns` accept one or more expressions. -But they also accept strings, integers, lists, and other inputs that we try to interpret as expressions. -We updated our internal logic to parse inputs more consistently. +Methods like `select` and `with_columns` accept one or more expressions. But they also accept +strings, integers, lists, and other inputs that we try to interpret as expressions. We updated our +internal logic to parse inputs more consistently. **Example** @@ -134,8 +135,8 @@ shape: (2, 2) ### `shuffle` / `sample` now use an internal Polars seed -If you used the built-in Python `random.seed` function to control the randomness of Polars expressions, this will no longer work. -Instead, use the new `set_random_seed` function. +If you used the built-in Python `random.seed` function to control the randomness of Polars +expressions, this will no longer work. Instead, use the new `set_random_seed` function. **Example** @@ -157,10 +158,12 @@ pl.set_random_seed(1) ## Deprecations -Creating a consistent and intuitive API is hard; finding the right name for each function, method, and parameter might be the hardest part. 
-The new version comes with several naming changes, and you will most likely run into deprecation warnings when upgrading to `0.19`. +Creating a consistent and intuitive API is hard; finding the right name for each function, method, +and parameter might be the hardest part. The new version comes with several naming changes, and you +will most likely run into deprecation warnings when upgrading to `0.19`. -If you want to upgrade without worrying about deprecation warnings right now, you can add the following snippet to your code: +If you want to upgrade without worrying about deprecation warnings right now, you can add the +following snippet to your code: ```python import warnings @@ -170,7 +173,9 @@ warnings.filterwarnings("ignore", category=DeprecationWarning) ### `groupby` renamed to `group_by` -This is not a change we make lightly, as it will impact almost all our users. But "group by" is really two different words, and our naming strategy dictates that these should be separated by an underscore. +This is not a change we make lightly, as it will impact almost all our users. But "group by" is +really two different words, and our naming strategy dictates that these should be separated by an +underscore. Most likely, a simple search and replace will be enough to take care of this update: @@ -179,9 +184,11 @@ Most likely, a simple search and replace will be enough to take care of this upd ### `apply` renamed to `map_*` -`apply` is probably the most misused part of our API. Many Polars users come from pandas, where `apply` has a completely different meaning. +`apply` is probably the most misused part of our API. Many Polars users come from pandas, where +`apply` has a completely different meaning. -We now consolidate all our functionality for user-defined functions under the name `map`. This results in the following renaming: +We now consolidate all our functionality for user-defined functions under the name `map`. This +results in the following renaming: | Before | After | | --------------------------- | -------------- | diff --git a/docs/source/releases/upgrade/0.20.md b/docs/source/releases/upgrade/0.20.md index b981dbb0e06d..f001f4918e45 100644 --- a/docs/source/releases/upgrade/0.20.md +++ b/docs/source/releases/upgrade/0.20.md @@ -4,12 +4,12 @@ ### Change default `join` behavior with regard to null values -Previously, null values in the join key were considered a value like any other value. -This meant that null values in the left frame would be joined with null values in the right frame. -This is expensive and does not match default behavior in SQL. +Previously, null values in the join key were considered a value like any other value. This meant +that null values in the left frame would be joined with null values in the right frame. This is +expensive and does not match default behavior in SQL. -Default behavior has now been changed to ignore null values in the join key. -The previous behavior can be retained by setting `join_nulls=True`. +Default behavior has now been changed to ignore null values in the join key. The previous behavior +can be retained by setting `join_nulls=True`. **Example** @@ -57,12 +57,12 @@ shape: (2, 3) ### Preserve left and right join keys in outer joins Previously, the result of an outer join did not contain the join keys of the left and right frames. -Rather, it contained a coalesced version of the left key and right key. -This loses information and does not conform to default SQL behavior. 
+Rather, it contained a coalesced version of the left key and right key. This loses information and +does not conform to default SQL behavior. -The behavior has been changed to include the original join keys. -Name clashes are solved by appending a suffix (`_right` by default) to the right join key name. -The previous behavior can be retained by setting `how="outer_coalesce"`. +The behavior has been changed to include the original join keys. Name clashes are solved by +appending a suffix (`_right` by default) to the right join key name. The previous behavior can be +retained by setting `how="outer_coalesce"`. **Example** @@ -116,11 +116,14 @@ shape: (4, 3) ### `count` now ignores null values -The `count` method for `Expr` and `Series` now ignores null values. Use `len` to get the count with null values included. +The `count` method for `Expr` and `Series` now ignores null values. Use `len` to get the count with +null values included. -Note that `pl.count()` and `group_by(...).count()` are unchanged. These count the number of rows in the context, so nulls are not applicable in the same way. +Note that `pl.count()` and `group_by(...).count()` are unchanged. These count the number of rows in +the context, so nulls are not applicable in the same way. -This brings behavior more in line with the SQL standard, where `COUNT(col)` ignores null values but `COUNT(*)` counts rows regardless of null values. +This brings behavior more in line with the SQL standard, where `COUNT(col)` ignores null values but +`COUNT(*)` counts rows regardless of null values. **Example** @@ -164,11 +167,11 @@ shape: (1, 1) ### `NaN` values are now considered equal -Floating point `NaN` values were treated as unequal across Polars operations. -This has been corrected to better match user expectation and existing standards. +Floating point `NaN` values were treated as unequal across Polars operations. This has been +corrected to better match user expectation and existing standards. -While this is considered a bug fix, it is included in this guide in order to draw attention to possible impact on user workflows that may contain -`NaN` values. +While this is considered a bug fix, it is included in this guide in order to draw attention to +possible impact on user workflows that may contain `NaN` values. **Example** @@ -201,13 +204,16 @@ Series: '' [bool] ### Assertion utils updates to exact checking and `NaN` equality -The assertion utility functions `assert_frame_equal` and `assert_series_equal` would use the tolerance parameters `atol` and `rtol` to do approximate checking, unless `check_exact` was set to `True`. -This could lead to some surprising behavior, as integers are generally thought of as exact values. -Integer values are now always checked exactly. -To do inexact checking, convert to float first. +The assertion utility functions `assert_frame_equal` and `assert_series_equal` would use the +tolerance parameters `atol` and `rtol` to do approximate checking, unless `check_exact` was set to +`True`. This could lead to some surprising behavior, as integers are generally thought of as exact +values. Integer values are now always checked exactly. To do inexact checking, convert to float +first. -Additionally, the `nans_compare_equal` parameter has been removed and `NaN` values are now always considered equal, which was the previous default behavior. -This parameter had previously been deprecated but has been removed before the end of the standard deprecation period to facilitate the change to `NaN` equality. 
+Additionally, the `nans_compare_equal` parameter has been removed and `NaN` values are now always
+considered equal, which was the previous default behavior. This parameter had previously been
+deprecated but has been removed before the end of the standard deprecation period to facilitate the
+change to `NaN` equality.
 
 **Example**
 
@@ -232,14 +238,20 @@ AssertionError: DataFrames are different (value mismatch for column 'id')
 
 ### Allow all `DataType` objects to be instantiated
 
-Polars data types are subclasses of the `DataType` class.
-We had a 'hack' in place that automatically converted data types instantiated without any arguments to the `class`, rather than actually instantiating it.
-The idea was to allow specifying data types as `Int64` rather than `Int64()`, which is more succinct.
-However, this caused some unexpected behavior when working directly with data type objects, especially as there was a discrepancy with data types like `Datetime` which _will_ be instantiated in many cases.
+Polars data types are subclasses of the `DataType` class. We had a 'hack' in place that
+automatically converted data types instantiated without any arguments to the `class`, rather than
+actually instantiating it. The idea was to allow specifying data types as `Int64` rather than
+`Int64()`, which is more succinct. However, this caused some unexpected behavior when working
+directly with data type objects, especially as there was a discrepancy with data types like
+`Datetime` which _will_ be instantiated in many cases.
 
-Going forward, instantiating a data type will always return an instance of that class. Both classes an instances are handled by Polars, so the previous short syntax is still available. Methods that return data types like `Series.dtype` and `DataFrame.schema` now always return instantiated data types objects.
+Going forward, instantiating a data type will always return an instance of that class. Both classes
+and instances are handled by Polars, so the previous short syntax is still available. Methods that
+return data types like `Series.dtype` and `DataFrame.schema` now always return instantiated data
+type objects.
 
-You may have to update some of your data type checks if you were not already using the equality operator (`==`), as well as update some type hints.
+You may have to update some of your data type checks if you were not already using the equality
+operator (`==`), as well as update some type hints.
 
 **Example**
 
@@ -268,8 +280,8 @@ True
 
 ### Update constructors for `Decimal` and `Array` data types
 
-The data types `Decimal` and `Array` have had their parameters switched around.
-The new constructors should more closely match user expectations.
+The data types `Decimal` and `Array` have had their parameters switched around. The new constructors
+should more closely match user expectations.
 
 **Example**
 
@@ -293,9 +305,10 @@ Decimal(precision=10, scale=5)
 
 ### `DataType.is_nested` changed from a property to a class method
 
-This is a minor change, but a very important one to properly update.
-Failure to update accordingly may result in faulty logic, as Python will evaluate the _method_ to `True`.
-For example, `if dtype.is_nested` will now evaluate to `True` regardless of the data type, because it returns the method, which Python considers truthy.
+This is a minor change, but a very important one to properly update. Failure to update accordingly
+may result in faulty logic, as Python will evaluate the _method_ to `True`.
For example, +`if dtype.is_nested` will now evaluate to `True` regardless of the data type, because it returns the +method, which Python considers truthy. **Example** @@ -315,9 +328,9 @@ True ### Smaller integer data types for datetime components `dt.month`, `dt.week` -Most datetime components such as `month` and `week` would previously return a `UInt32` type. -This has been updated to the smallest appropriate signed integer type. -This should reduce memory consumption. +Most datetime components such as `month` and `week` would previously return a `UInt32` type. This +has been updated to the smallest appropriate signed integer type. This should reduce memory +consumption. | Method | Dtype old | Dtype new | | ----------- | --------- | --------- | @@ -398,8 +411,11 @@ Series: 'a' [null] The new implementation is mostly backwards compatible. Please do note the following: -1. The logic for determining the return data type has changed. You may want to specify `return_dtype` to override the inferred data type, or take advantage of the new function signature (separate `old` and `new` parameters) to influence the return type. -2. The previous workaround for referencing other columns as default by using a struct column no longer works. It now simply works as expected, no workaround needed. +1. The logic for determining the return data type has changed. You may want to specify + `return_dtype` to override the inferred data type, or take advantage of the new function + signature (separate `old` and `new` parameters) to influence the return type. +2. The previous workaround for referencing other columns as default by using a struct column no + longer works. It now simply works as expected, no workaround needed. **Example** @@ -501,9 +517,10 @@ shape: (2, 2) ### Update `read_parquet` to use Object Store rather than fsspec -If you were using `read_parquet`, installing `fsspec` as an optional dependency is no longer required. -The new Object Store implementation was already in use for `scan_parquet`. -It may have slightly different behavior in certain cases, such as how credentials are detected and how downloads are performed. +If you were using `read_parquet`, installing `fsspec` as an optional dependency is no longer +required. The new Object Store implementation was already in use for `scan_parquet`. It may have +slightly different behavior in certain cases, such as how credentials are detected and how downloads +are performed. The resulting `DataFrame` should be identical between versions. @@ -511,8 +528,9 @@ The resulting `DataFrame` should be identical between versions. ### Cumulative functions renamed from `cum*` to `cum_*` -Technically, this deprecation was introduced in version `0.19.14`, but many users will first encounter it when upgrading to `0.20`. -It's a relatively impactful change, which is why we mention it here. +Technically, this deprecation was introduced in version `0.19.14`, but many users will first +encounter it when upgrading to `0.20`. It's a relatively impactful change, which is why we mention +it here. | Old name | New name | | ----------- | ------------ | diff --git a/docs/source/releases/upgrade/1.md b/docs/source/releases/upgrade/1.md index e401d43b72a6..8caaad765f8b 100644 --- a/docs/source/releases/upgrade/1.md +++ b/docs/source/releases/upgrade/1.md @@ -4,10 +4,11 @@ ### Properly apply `strict` parameter in Series constructor -The behavior of the Series constructor has been updated. -Generally, it will be more strict, unless the user passes `strict=False`. 
+The behavior of the Series constructor has been updated. Generally, it will be more strict, unless +the user passes `strict=False`. -Strict construction is more efficient than non-strict construction, so make sure to pass values of the same data type to the constructor for the best performance. +Strict construction is more efficient than non-strict construction, so make sure to pass values of +the same data type to the constructor for the best performance. **Example** @@ -67,11 +68,11 @@ Series: '' [i8] ### Change data orientation inference logic for DataFrame construction -Polars no longer inspects data types to infer the orientation of the data passed to the DataFrame constructor. -Data orientation is inferred based on the data and schema dimensions. +Polars no longer inspects data types to infer the orientation of the data passed to the DataFrame +constructor. Data orientation is inferred based on the data and schema dimensions. -Additionally, a warning is raised whenever row orientation is inferred. -Because of some confusing edge cases, users should pass `orient="row"` to make explicit that their input is row-based. +Additionally, a warning is raised whenever row orientation is inferred. Because of some confusing +edge cases, users should pass `orient="row"` to make explicit that their input is row-based. **Example** @@ -125,8 +126,9 @@ shape: (2, 2) If you work with time zones, please make sure to account for this change. Handling of time zone information in the Series and DataFrame constructors was inconsistent. -Row-wise construction would convert to the given time zone, while column-wise construction would _replace_ the time zone. -The inconsistency has been fixed by always converting to the time zone specified in the data type. +Row-wise construction would convert to the given time zone, while column-wise construction would +_replace_ the time zone. The inconsistency has been fixed by always converting to the time zone +specified in the data type. **Example** @@ -156,8 +158,8 @@ Series: '' [datetime[μs, Europe/Amsterdam]] ### Update some error types to more appropriate variants -We have updated a lot of error types to more accurately represent the problem. -Most commonly, `ComputeError` types were changed to `InvalidOperationError` or `SchemaError`. +We have updated a lot of error types to more accurately represent the problem. Most commonly, +`ComputeError` types were changed to `InvalidOperationError` or `SchemaError`. **Example** @@ -182,10 +184,9 @@ polars.exceptions.InvalidOperationError: conversion from `i64` to `u8` failed in ### Update `read/scan_parquet` to disable Hive partitioning by default for file inputs -Parquet reading functions now also support directory inputs. -Hive partitioning is enabled by default for directories, but is now _disabled_ by default for file inputs. -File inputs include single files, globs, and lists of files. -Explicitly pass `hive_partitioning=True` to restore previous behavior. +Parquet reading functions now also support directory inputs. Hive partitioning is enabled by default +for directories, but is now _disabled_ by default for file inputs. File inputs include single files, +globs, and lists of files. Explicitly pass `hive_partitioning=True` to restore previous behavior. **Example** @@ -233,8 +234,9 @@ shape: (2, 2) `reshape` now returns an Array type instead of a List type. -Users can restore the old functionality by calling `.arr.to_list()` on the output. 
-Note that this is not more expensive than it would be to create a List type directly, because reshaping into an array is basically free. +Users can restore the old functionality by calling `.arr.to_list()` on the output. Note that this is +not more expensive than it would be to create a List type directly, because reshaping into an array +is basically free. **Example** @@ -299,13 +301,16 @@ Series: '' [array[i64, 2]] ### Split `replace` functionality into two separate methods -The API for `replace` has proven to be confusing to many users, particularly with regards to the `default` argument and the resulting data type. +The API for `replace` has proven to be confusing to many users, particularly with regards to the +`default` argument and the resulting data type. -It has been split up into two methods: `replace` and `replace_strict`. -`replace` now always keeps the existing data type _(breaking, see example below)_ and is meant for replacing some values in your existing column. -Its parameters `default` and `return_dtype` have been deprecated. +It has been split up into two methods: `replace` and `replace_strict`. `replace` now always keeps +the existing data type _(breaking, see example below)_ and is meant for replacing some values in +your existing column. Its parameters `default` and `return_dtype` have been deprecated. -The new method `replace_strict` is meant for creating a new column, mapping some or all of the values of the original column, and optionally specifying a default value. If no default is provided, it raises an error if any non-null values are not mapped. +The new method `replace_strict` is meant for creating a new column, mapping some or all of the +values of the original column, and optionally specifying a default value. If no default is provided, +it raises an error if any non-null values are not mapped. **Example** @@ -342,8 +347,8 @@ Series: '' [str] ### Preserve nulls in `ewm_mean`, `ewm_std`, and `ewm_var` -Polars will no longer forward-fill null values in `ewm` methods. -The user can call `.forward_fill()` on the output to achieve the same result. +Polars will no longer forward-fill null values in `ewm` methods. The user can call `.forward_fill()` +on the output to achieve the same result. **Example** @@ -415,7 +420,8 @@ shape: (3, 1) ### Change `str.to_datetime` to default to microsecond precision for format specifiers `"%f"` and `"%.f"` -In `.str.to_datetime`, when specifying `%.f` as the format, the default was to set the resulting datatype to nanosecond precision. This has been changed to microsecond precision. +In `.str.to_datetime`, when specifying `%.f` as the format, the default was to set the resulting +datatype to nanosecond precision. This has been changed to microsecond precision. #### Example @@ -444,8 +450,8 @@ Series: '' [datetime[us]] ### Update resulting column names in `pivot` when pivoting by multiple values -In `DataFrame.pivot`, when specifying multiple `values` columns, the result would redundantly include the `column` column in the column names. -This has been addressed. +In `DataFrame.pivot`, when specifying multiple `values` columns, the result would redundantly +include the `column` column in the column names. This has been addressed. **Example** @@ -497,14 +503,14 @@ After: Note that the function signature has also changed: - `columns` has been renamed to `on`, and is now the first positional argument. -- `index` and `values` are both optional. 
If `index` is not specified, then it - will use all columns not specified in `on` and `values`. If `values` is - not specified, it will use all columns not specified in `on` and `index`. +- `index` and `values` are both optional. If `index` is not specified, then it will use all columns + not specified in `on` and `values`. If `values` is not specified, it will use all columns not + specified in `on` and `index`. ### Support Decimal types by default when converting from Arrow -Update conversion from Arrow to always convert Decimals into Polars Decimals, rather than cast to Float64. -`Config.activate_decimals` has been removed. +Update conversion from Arrow to always convert Decimals into Polars Decimals, rather than cast to +Float64. `Config.activate_decimals` has been removed. **Example** @@ -537,11 +543,11 @@ Series: '' [decimal[3,2]] ### Remove serde functionality from `pl.read_json` and `DataFrame.write_json` -`pl.read_json` no longer supports reading JSON files produced by `DataFrame.serialize`. -Users should use `pl.DataFrame.deserialize` instead. +`pl.read_json` no longer supports reading JSON files produced by `DataFrame.serialize`. Users should +use `pl.DataFrame.deserialize` instead. -`DataFrame.write_json` now only writes row-oriented JSON. The parameters `row_oriented` and `pretty` have been removed. -Users should use `DataFrame.serialize` to serialize a DataFrame. +`DataFrame.write_json` now only writes row-oriented JSON. The parameters `row_oriented` and `pretty` +have been removed. Users should use `DataFrame.serialize` to serialize a DataFrame. **Example - `write_json`** @@ -610,9 +616,9 @@ shape: (2, 2) ### `Series.equals` no longer checks names by default -Previously, `Series.equals` would return `False` if the Series names didn't match. -The method now no longer checks the names by default. -The previous behavior can be retained by setting `check_names=True`. +Previously, `Series.equals` would return `False` if the Series names didn't match. The method now no +longer checks the names by default. The previous behavior can be retained by setting +`check_names=True`. **Example** @@ -680,8 +686,9 @@ shape: (1, 1) ### Rename struct fields of `rle` output -The struct fields of the `rle` method have been renamed from `lengths/values` to `len/value`. -The data type of the `len` field has also been updated to match the index type (was previously `Int32`, now `UInt32`). +The struct fields of the `rle` method have been renamed from `lengths/values` to `len/value`. The +data type of the `len` field has also been updated to match the index type (was previously `Int32`, +now `UInt32`). **Before** @@ -718,12 +725,12 @@ shape: (3, 2) ### Update `set_sorted` to only accept a single column -Calling `set_sorted` indicates that a column is sorted _individually_. -Passing multiple columns indicates that each of those columns are also sorted individually. -However, many users assumed this meant that the columns were sorted as a group, which led to incorrect results. +Calling `set_sorted` indicates that a column is sorted _individually_. Passing multiple columns +indicates that each of those columns are also sorted individually. However, many users assumed this +meant that the columns were sorted as a group, which led to incorrect results. -To help users avoid this pitfall, we removed the possibility to specify multiple columns in `set_sorted`. -To set multiple columns as sorted, simply call `set_sorted` multiple times. 
+To help users avoid this pitfall, we removed the possibility to specify multiple columns in +`set_sorted`. To set multiple columns as sorted, simply call `set_sorted` multiple times. **Example** @@ -751,9 +758,8 @@ Use instead: ### Default to raising on out-of-bounds indices in all `get`/`gather` operations -The default behavior was inconsistent between `get` and `gather` operations in various places. -Now all such operations will raise by default. -Pass `null_on_oob=True` to restore previous behavior. +The default behavior was inconsistent between `get` and `gather` operations in various places. Now +all such operations will raise by default. Pass `null_on_oob=True` to restore previous behavior. **Example** @@ -793,13 +799,14 @@ Series: '' [i64] ### Change default engine for `read_excel` to `"calamine"` -The `calamine` engine (available through the `fastexcel` package) has been added to Polars relatively recently. -It's much faster than the other engines, and was already the default for `xlsb` and `xls` files. -We now made it the default for all Excel files. +The `calamine` engine (available through the `fastexcel` package) has been added to Polars +relatively recently. It's much faster than the other engines, and was already the default for `xlsb` +and `xls` files. We now made it the default for all Excel files. -There may be subtle differences between this engine and the previous default (`xlsx2csv`). -One clear difference is that the `calamine` engine does not support the `engine_options` parameter. -If you cannot get your desired behavior with the `calamine` engine, specify `engine="xlsx2csv"` to restore previous behavior. +There may be subtle differences between this engine and the previous default (`xlsx2csv`). One clear +difference is that the `calamine` engine does not support the `engine_options` parameter. If you +cannot get your desired behavior with the `calamine` engine, specify `engine="xlsx2csv"` to restore +previous behavior. ### Example @@ -826,10 +833,9 @@ Instead, explicitly specify the `xlsx2csv` engine or omit the `engine_options`: ### Remove class variables from some DataTypes -Some DataType classes had class variables. -The `Datetime` class, for example, had `time_unit` and `time_zone` as class variables. -This was unintended: these should have been instance variables. -This has now been corrected. +Some DataType classes had class variables. The `Datetime` class, for example, had `time_unit` and +`time_zone` as class variables. This was unintended: these should have been instance variables. This +has now been corrected. **Example** @@ -859,8 +865,8 @@ True ### Change default `offset` in `group_by_dynamic` from 'negative `every`' to 'zero' -This affects the start of the first window in `group_by_dynamic`. -The new behavior should align more with user expectations. +This affects the start of the first window in `group_by_dynamic`. The new behavior should align more +with user expectations. **Example** @@ -904,9 +910,9 @@ shape: (3, 2) ### Change default serialization format of `LazyFrame/DataFrame/Expr` -The only serialization format available for the `serialize/deserialize` methods on Polars objects was JSON. -We added a more optimized binary format and made this the default. -JSON serialization is still available by passing `format="json"`. +The only serialization format available for the `serialize/deserialize` methods on Polars objects +was JSON. We added a more optimized binary format and made this the default. 
JSON serialization is
+still available by passing `format="json"`.
 
 **Example**
 
@@ -950,9 +956,9 @@ shape: (1, 1)
 
 ### Constrain access to globals from `DataFrame.sql` in favor of `pl.sql`
 
-The `sql` methods on `DataFrame` and `LazyFrame` can no longer access global variables.
-These methods should be used for operating on the frame itself.
-For global access, there is now the top-level `sql` function.
+The `sql` methods on `DataFrame` and `LazyFrame` can no longer access global variables. These
+methods should be used for operating on the frame itself. For global access, there is now the
+top-level `sql` function.
 
 **Example**
 
@@ -1003,13 +1009,13 @@ shape: (4, 2)
 
 ### Remove re-export of type aliases
 
-We have a lot of type aliases defined in the `polars.type_aliases` module.
-Some of these were re-exported at the top-level and in the `polars.datatypes` module.
-These re-exports have been removed.
+We have a lot of type aliases defined in the `polars.type_aliases` module. Some of these were
+re-exported at the top-level and in the `polars.datatypes` module. These re-exports have been
+removed.
 
-We plan on adding a public `polars.typing` module in the future with a number of curated type aliases.
-Until then, please define your own type aliases, or import from our `polars.type_aliases` module.
-Note that the `type_aliases` module is not technically public, so use at your own risk.
+We plan on adding a public `polars.typing` module in the future with a number of curated type
+aliases. Until then, please define your own type aliases, or import from our `polars.type_aliases`
+module. Note that the `type_aliases` module is not technically public, so use at your own risk.
 
 **Example**
 
@@ -1029,9 +1035,9 @@ def foo(dtype: PolarsDataType) -> None: ...
 
 ### Streamline optional dependency definitions in `pyproject.toml`
 
-We revisited to optional dependency definitions and made some minor changes.
-If you were using the extras `fastexcel`, `gevent`, `matplotlib`, or `async`, this is a breaking change.
-Please update your Polars installation to use the new extras.
+We revisited the optional dependency definitions and made some minor changes. If you were using the
+extras `fastexcel`, `gevent`, `matplotlib`, or `async`, this is a breaking change. Please update
+your Polars installation to use the new extras.
 
 **Example**
 
@@ -1051,13 +1057,15 @@ pip install 'polars[calamine,async,graph]'
 
 ### Issue `PerformanceWarning` when LazyFrame properties `schema/dtypes/columns/width` are used
 
-Recent improvements to the correctness of the schema resolving in the lazy engine have had significant performance impact on the cost of resolving the schema.
-It is no longer 'free' - in fact, in complex pipelines with lazy file reading, resolving the schema can be relatively expensive.
+Recent improvements to the correctness of the schema resolving in the lazy engine have had
+significant performance impact on the cost of resolving the schema. It is no longer 'free' - in
+fact, in complex pipelines with lazy file reading, resolving the schema can be relatively expensive.
 
 Because of this, the schema-related properties on LazyFrame were no longer good API design.
-Properties represent information that is already available, and just needs to be retrieved.
-However, for the LazyFrame properties, accessing these may have significant performance cost.
+Properties represent information that is already available, and just needs to be retrieved. However,
+for the LazyFrame properties, accessing these may have significant performance cost.
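A minimal sketch of the pattern this change points to (resolving the schema once via the
`collect_schema` method described just below); the frame contents are made up for illustration:

```python
import polars as pl

lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})  # made-up data

# Accessing lf.schema / lf.dtypes / lf.columns / lf.width now emits a
# PerformanceWarning, because resolving a lazy query's schema is not free.
# Resolve it once and reuse the result instead:
schema = lf.collect_schema()
print(schema.names())  # ['a', 'b']
print(schema["a"])     # Int64
```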
-To solve this, we added the `LazyFrame.collect_schema` method, which retrieves the schema and returns a `Schema` object. -The properties raise a `PerformanceWarning` and tell the user to use `collect_schema` instead. -We chose not to deprecate the properties for now to facilitate writing code that is generic for both DataFrames and LazyFrames. +To solve this, we added the `LazyFrame.collect_schema` method, which retrieves the schema and +returns a `Schema` object. The properties raise a `PerformanceWarning` and tell the user to use +`collect_schema` instead. We chose not to deprecate the properties for now to facilitate writing +code that is generic for both DataFrames and LazyFrames. diff --git a/docs/source/releases/upgrade/index.md b/docs/source/releases/upgrade/index.md index 467a99938c28..256764fb006e 100644 --- a/docs/source/releases/upgrade/index.md +++ b/docs/source/releases/upgrade/index.md @@ -1,9 +1,10 @@ # About -Polars releases an upgrade guide alongside each breaking release. -This guide is intended to help you upgrade from an older Polars version to the new version. +Polars releases an upgrade guide alongside each breaking release. This guide is intended to help you +upgrade from an older Polars version to the new version. -Each guide contains all breaking changes that were not previously deprecated, as well as any significant new deprecations. +Each guide contains all breaking changes that were not previously deprecated, as well as any +significant new deprecations. A full list of all changes is available in the [changelog](../changelog.md). diff --git a/docs/source/user-guide/concepts/_streaming.md b/docs/source/user-guide/concepts/_streaming.md index e4427c10481a..ba51484a2e08 100644 --- a/docs/source/user-guide/concepts/_streaming.md +++ b/docs/source/user-guide/concepts/_streaming.md @@ -2,15 +2,20 @@ -One additional benefit of the lazy API is that it allows queries to be executed in a streaming manner. Instead of processing all the data at once, Polars can execute the query in batches allowing you to process datasets that do not fit in memory. +One additional benefit of the lazy API is that it allows queries to be executed in a streaming +manner. Instead of processing all the data at once, Polars can execute the query in batches allowing +you to process datasets that do not fit in memory. -To tell Polars we want to execute a query in streaming mode we pass the `streaming=True` argument to `collect` +To tell Polars we want to execute a query in streaming mode we pass the `streaming=True` argument to +`collect` {{code_block('user-guide/concepts/streaming','streaming',['collect'])}} ## When is streaming available? -Streaming is still in development. We can ask Polars to execute any lazy query in streaming mode. However, not all lazy operations support streaming. If there is an operation for which streaming is not supported, Polars will run the query in non-streaming mode. +Streaming is still in development. We can ask Polars to execute any lazy query in streaming mode. +However, not all lazy operations support streaming. If there is an operation for which streaming is +not supported, Polars will run the query in non-streaming mode. Streaming is supported for many operations including: @@ -23,11 +28,14 @@ Streaming is supported for many operations including: - `explode`, `unpivot` - `scan_csv`, `scan_parquet`, `scan_ipc` -This list is not exhaustive. Polars is in active development, and more operations can be added without explicit notice. +This list is not exhaustive. 
Polars is in active development, and more operations can be added +without explicit notice. ### Example with supported operations -To determine which parts of your query are streaming, use the `explain` method. Below is an example that demonstrates how to inspect the query plan. More information about the query plan can be found in the chapter on the [Lazy API](https://docs.pola.rs/user-guide/lazy/query-plan/). +To determine which parts of your query are streaming, use the `explain` method. Below is an example +that demonstrates how to inspect the query plan. More information about the query plan can be found +in the chapter on the [Lazy API](https://docs.pola.rs/user-guide/lazy/query-plan/). {{code_block('user-guide/concepts/streaming', 'example',['explain'])}} diff --git a/docs/source/user-guide/concepts/data-types-and-structures.md b/docs/source/user-guide/concepts/data-types-and-structures.md index 896fc84a0ec9..93d29f60f484 100644 --- a/docs/source/user-guide/concepts/data-types-and-structures.md +++ b/docs/source/user-guide/concepts/data-types-and-structures.md @@ -9,17 +9,19 @@ Polars supports a variety of data types that fall broadly under the following ca - Temporal: dates, datetimes, times, and time deltas. - Miscellaneous: strings, binary data, Booleans, categoricals, enums, and objects. -All types support missing values represented by the special value `null`. -This is not to be conflated with the special value `NaN` in floating number data types; see the [section about floating point numbers](#floating-point-numbers) for more information. +All types support missing values represented by the special value `null`. This is not to be +conflated with the special value `NaN` in floating number data types; see the +[section about floating point numbers](#floating-point-numbers) for more information. -You can also find a [full table with all data types supported in the appendix](#appendix-full-data-types-table) with notes on when to use each data type and with links to relevant parts of the documentation. +You can also find a +[full table with all data types supported in the appendix](#appendix-full-data-types-table) with +notes on when to use each data type and with links to relevant parts of the documentation. ## Series -The core base data structures provided by Polars are series and dataframes. -A series is a 1-dimensional homogeneous data structure. -By “homogeneous” we mean that all elements inside a series have the same data type. -The snippet below shows how to create a named series: +The core base data structures provided by Polars are series and dataframes. A series is a +1-dimensional homogeneous data structure. By “homogeneous” we mean that all elements inside a series +have the same data type. The snippet below shows how to create a named series: {{code_block('user-guide/concepts/data-types-and-structures','series',['Series'])}} @@ -27,8 +29,8 @@ The snippet below shows how to create a named series: --8<-- "python/user-guide/concepts/data-types-and-structures.py:series" ``` -When creating a series, Polars will infer the data type from the values you provide. -You can specify a concrete data type to override the inference mechanism: +When creating a series, Polars will infer the data type from the values you provide. 
You can specify
+a concrete data type to override the inference mechanism:
 
 {{code_block('user-guide/concepts/data-types-and-structures','series-dtype',['Series'])}}
 
@@ -38,9 +40,11 @@ You can specify a concrete data type to override the inference mechanism:
 
 ## Dataframe
 
-A dataframe is a 2-dimensional heterogeneous data structure that contains uniquely named series.
-By holding your data in a dataframe you will be able to use the Polars API to write queries that manipulate your data.
-You will be able to do this by using the [contexts and expressions provided by Polars](expressions-and-contexts.md) that we will talk about next.
+A dataframe is a 2-dimensional heterogeneous data structure that contains uniquely named series. By
+holding your data in a dataframe you will be able to use the Polars API to write queries that
+manipulate your data. You will be able to do this by using the
+[contexts and expressions provided by Polars](expressions-and-contexts.md) that we will talk about
+next.
 
 The snippet below shows how to create a dataframe from a dictionary of lists:
 
@@ -52,13 +56,13 @@ The snippet below shows how to create a dataframe from a dictionary of lists:
 
 ### Inspecting a dataframe
 
-In this subsection we will show some useful methods to quickly inspect a dataframe.
-We will use the dataframe we created earlier as a starting point.
+In this subsection we will show some useful methods to quickly inspect a dataframe. We will use the
+dataframe we created earlier as a starting point.
 
 #### Head
 
-The function `head` shows the first rows of a dataframe.
-By default, you get the first 5 rows but you can also specify the number of rows you want:
+The function `head` shows the first rows of a dataframe. By default, you get the first 5 rows but
+you can also specify the number of rows you want:
 
 {{code_block('user-guide/concepts/data-types-and-structures','head',['head'])}}
 
@@ -68,8 +72,9 @@ By default, you get the first 5 rows but you can also specify the number of rows
 
 #### Glimpse
 
-The function `glimpse` is another function that shows the values of the first few rows of a dataframe, but formats the output differently from `head`.
-Here, each line of the output corresponds to a single column, making it easier to take inspect wider dataframes:
+The function `glimpse` is another function that shows the values of the first few rows of a
+dataframe, but formats the output differently from `head`. Here, each line of the output corresponds
+to a single column, making it easier to inspect wider dataframes:
 
 === ":fontawesome-brands-python: Python"
     [:material-api: `glimpse`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.glimpse.html)
 
@@ -88,8 +93,8 @@ Here, each line of the output corresponds to a single column, making it easier t
 
 #### Tail
 
-The function `tail` shows the last rows of a dataframe.
-By default, you get the last 5 rows but you can also specify the number of rows you want, similar to how `head` works:
+The function `tail` shows the last rows of a dataframe. By default, you get the last 5 rows but you
+can also specify the number of rows you want, similar to how `head` works:
 
 {{code_block('user-guide/concepts/data-types-and-structures','tail',['tail'])}}
 
@@ -99,8 +104,9 @@ By default, you get the last 5 rows but you can also specify the number of rows
 
 #### Sample
 
-If you think the first or last rows of your dataframe are not representative of your data, you can use `sample` to get an arbitrary number of randomly selected rows from the DataFrame.
-Note that the rows are not necessarily returned in the same order as they appear in the dataframe: +If you think the first or last rows of your dataframe are not representative of your data, you can +use `sample` to get an arbitrary number of randomly selected rows from the DataFrame. Note that the +rows are not necessarily returned in the same order as they appear in the dataframe: {{code_block('user-guide/concepts/data-types-and-structures','sample',['sample'])}} @@ -120,8 +126,8 @@ You can also use `describe` to compute summary statistics for all columns of you ## Schema -When talking about data (in a dataframe or otherwise) we can refer to its schema. -The schema is a mapping of column or series names to the data types of those same columns or series. +When talking about data (in a dataframe or otherwise) we can refer to its schema. The schema is a +mapping of column or series names to the data types of those same columns or series. You can check the schema of a dataframe with `schema`: @@ -131,10 +137,11 @@ You can check the schema of a dataframe with `schema`: --8<-- "python/user-guide/concepts/data-types-and-structures.py:schema" ``` -Much like with series, Polars will infer the schema of a dataframe when you create it but you can override the inference system if needed. +Much like with series, Polars will infer the schema of a dataframe when you create it but you can +override the inference system if needed. -In Python, you can specify an explicit schema by using a dictionary to map column names to data types. -You can use the value `None` if you do not wish to override inference for a given column: +In Python, you can specify an explicit schema by using a dictionary to map column names to data +types. You can use the value `None` if you do not wish to override inference for a given column: ```python --8<-- "python/user-guide/concepts/data-types-and-structures.py:schema-def" @@ -144,7 +151,9 @@ You can use the value `None` if you do not wish to override inference for a give --8<-- "python/user-guide/concepts/data-types-and-structures.py:schema-def" ``` -If you only need to override the inference of some columns, the parameter `schema_overrides` tends to be more convenient because it lets you omit columns for which you do not want to override the inference: +If you only need to override the inference of some columns, the parameter `schema_overrides` tends +to be more convenient because it lets you omit columns for which you do not want to override the +inference: ```python --8<-- "python/user-guide/concepts/data-types-and-structures.py:schema_overrides" @@ -156,24 +165,29 @@ If you only need to override the inference of some columns, the parameter `schem ## Data types internals -Polars utilizes the [Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html) for its data orientation. -Following this specification allows Polars to transfer data to/from other tools that also use the Arrow specification with little to no overhead. +Polars utilizes the [Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html) for +its data orientation. Following this specification allows Polars to transfer data to/from other +tools that also use the Arrow specification with little to no overhead. -Polars gets most of its performance from its query engine, the optimizations it performs on your query plans, and from the parallelization that it employs when running [your expressions](expressions-and-contexts.md#expressions). 
+Polars gets most of its performance from its query engine, the optimizations it performs on your +query plans, and from the parallelization that it employs when running +[your expressions](expressions-and-contexts.md#expressions). ## Floating point numbers -Polars generally follows the IEEE 754 floating point standard for `Float32` and `Float64`, with some exceptions: +Polars generally follows the IEEE 754 floating point standard for `Float32` and `Float64`, with some +exceptions: - Any `NaN` compares equal to any other `NaN`, and greater than any non-`NaN` value. -- Operations do not guarantee any particular behavior on the sign of zero or `NaN`, - nor on the payload of `NaN` values. This is not just limited to arithmetic operations, - e.g. a sort or group by operation may canonicalize all zeroes to +0 and all `NaN`s - to a positive `NaN` without payload for efficient equality checks. - -Polars always attempts to provide reasonably accurate results for floating point computations but does not provide guarantees -on the error unless mentioned otherwise. Generally speaking 100% accurate results are infeasibly expensive to achieve (requiring -much larger internal representations than 64-bit floats), and thus some error is always to be expected. +- Operations do not guarantee any particular behavior on the sign of zero or `NaN`, nor on the + payload of `NaN` values. This is not just limited to arithmetic operations, e.g. a sort or group + by operation may canonicalize all zeroes to +0 and all `NaN`s to a positive `NaN` without payload + for efficient equality checks. + +Polars always attempts to provide reasonably accurate results for floating point computations but +does not provide guarantees on the error unless mentioned otherwise. Generally speaking 100% +accurate results are infeasibly expensive to achieve (requiring much larger internal representations +than 64-bit floats), and thus some error is always to be expected. ## Appendix: full data types table diff --git a/docs/source/user-guide/concepts/expressions-and-contexts.md b/docs/source/user-guide/concepts/expressions-and-contexts.md index bee5cd130b45..e32d08b8bbe9 100644 --- a/docs/source/user-guide/concepts/expressions-and-contexts.md +++ b/docs/source/user-guide/concepts/expressions-and-contexts.md @@ -1,22 +1,25 @@ # Expressions and contexts -Polars has developed its own Domain Specific Language (DSL) for transforming data. -The language is very easy to use and allows for complex queries that remain human readable. -Expressions and contexts, which will be introduced here, are very important in achieving this readability while also allowing the Polars query engine to optimize your queries to make them run as fast as possible. +Polars has developed its own Domain Specific Language (DSL) for transforming data. The language is +very easy to use and allows for complex queries that remain human readable. Expressions and +contexts, which will be introduced here, are very important in achieving this readability while also +allowing the Polars query engine to optimize your queries to make them run as fast as possible. ## Expressions -In Polars, an _expression_ is a lazy representation of a data transformation. -Expressions are modular and flexible, which means you can use them as building blocks to build more complex expressions. -Here is an example of a Polars expression: +In Polars, an _expression_ is a lazy representation of a data transformation. 
Expressions are +modular and flexible, which means you can use them as building blocks to build more complex +expressions. Here is an example of a Polars expression: ```python --8<-- "python/user-guide/concepts/expressions.py:expression" ``` -As you might be able to guess, this expression takes a column named “weight” and divides its values by the square of the values in a column “height”, computing a person's BMI. +As you might be able to guess, this expression takes a column named “weight” and divides its values +by the square of the values in a column “height”, computing a person's BMI. -The code above expresses an abstract computation that we can save in a variable, manipulate further, or just print: +The code above expresses an abstract computation that we can save in a variable, manipulate further, +or just print: ```python --8<-- "python/user-guide/concepts/expressions.py:print-expr" @@ -27,14 +30,14 @@ The code above expresses an abstract computation that we can save in a variable, --8<-- "python/user-guide/concepts/expressions.py:print-expr" ``` -Because expressions are lazy, no computations have taken place yet. -That's what we need contexts for. +Because expressions are lazy, no computations have taken place yet. That's what we need contexts +for. ## Contexts -Polars expressions need a _context_ in which they are executed to produce a result. -Depending on the context it is used in, the same Polars expression can produce different results. -In this section, we will learn about the four most common contexts that Polars provides[^1]: +Polars expressions need a _context_ in which they are executed to produce a result. Depending on the +context it is used in, the same Polars expression can produce different results. In this section, we +will learn about the four most common contexts that Polars provides[^1]: 1. `select` 2. `with_columns` @@ -51,8 +54,8 @@ We use the dataframe below to show how each of the contexts works. ### `select` -The selection context `select` applies expressions over columns. -The context `select` may produce new columns that are aggregations, combinations of other columns, or literals: +The selection context `select` applies expressions over columns. The context `select` may produce +new columns that are aggregations, combinations of other columns, or literals: {{code_block('user-guide/concepts/expressions','select-1',['select'])}} @@ -60,12 +63,12 @@ The context `select` may produce new columns that are aggregations, combinations --8<-- "python/user-guide/concepts/expressions.py:select-1" ``` -The expressions in a context `select` must produce series that are all the same length or they must produce a scalar. -Scalars will be broadcast to match the length of the remaining series. -Literals, like the number used above, are also broadcast. +The expressions in a context `select` must produce series that are all the same length or they must +produce a scalar. Scalars will be broadcast to match the length of the remaining series. Literals, +like the number used above, are also broadcast. -Note that broadcasting can also occur within expressions. -For instance, consider the expression below: +Note that broadcasting can also occur within expressions. 
For instance, consider the expression +below: {{code_block('user-guide/concepts/expressions','select-2',['select'])}} @@ -73,15 +76,19 @@ For instance, consider the expression below: --8<-- "python/user-guide/concepts/expressions.py:select-2" ``` -Both the subtraction and the division use broadcasting within the expression because the subexpressions that compute the mean and the standard deviation evaluate to single values. +Both the subtraction and the division use broadcasting within the expression because the +subexpressions that compute the mean and the standard deviation evaluate to single values. -The context `select` is very flexible and powerful and allows you to evaluate arbitrary expressions independent of, and in parallel to, each other. -This is also true of the other contexts that we will see next. +The context `select` is very flexible and powerful and allows you to evaluate arbitrary expressions +independent of, and in parallel to, each other. This is also true of the other contexts that we will +see next. ### `with_columns` -The context `with_columns` is very similar to the context `select`. -The main difference between the two is that the context `with_columns` creates a new dataframe that contains the columns from the original dataframe and the new columns according to its input expressions, whereas the context `select` only includes the columns selected by its input expressions: +The context `with_columns` is very similar to the context `select`. The main difference between the +two is that the context `with_columns` creates a new dataframe that contains the columns from the +original dataframe and the new columns according to its input expressions, whereas the context +`select` only includes the columns selected by its input expressions: {{code_block('user-guide/concepts/expressions','with_columns-1',['with_columns'])}} @@ -89,11 +96,15 @@ The main difference between the two is that the context `with_columns` creates a --8<-- "python/user-guide/concepts/expressions.py:with_columns-1" ``` -Because of this difference between `select` and `with_columns`, the expressions used in a context `with_columns` must produce series that have the same length as the original columns in the dataframe, whereas it is enough for the expressions in the context `select` to produce series that have the same length among them. +Because of this difference between `select` and `with_columns`, the expressions used in a context +`with_columns` must produce series that have the same length as the original columns in the +dataframe, whereas it is enough for the expressions in the context `select` to produce series that +have the same length among them. ### `filter` -The context `filter` filters the rows of a dataframe based on one or more expressions that evaluate to the Boolean data type. +The context `filter` filters the rows of a dataframe based on one or more expressions that evaluate +to the Boolean data type. {{code_block('user-guide/concepts/expressions','filter-1',['filter'])}} @@ -103,8 +114,9 @@ The context `filter` filters the rows of a dataframe based on one or more expres ### `group_by` and aggregations -In the context `group_by`, rows are grouped according to the unique values of the grouping expressions. -You can then apply expressions to the resulting groups, which may be of variable lengths. +In the context `group_by`, rows are grouped according to the unique values of the grouping +expressions. 
You can then apply expressions to the resulting groups, which may be of variable +lengths. When using the context `group_by`, you can use an expression to compute the groupings dynamically: @@ -114,11 +126,12 @@ When using the context `group_by`, you can use an expression to compute the grou --8<-- "python/user-guide/concepts/expressions.py:group_by-1" ``` -After using `group_by` we use `agg` to apply aggregating expressions to the groups. -Since in the example above we only specified the name of a column, we get the groups of that column as lists. +After using `group_by` we use `agg` to apply aggregating expressions to the groups. Since in the +example above we only specified the name of a column, we get the groups of that column as lists. -We can specify as many grouping expressions as we'd like and the context `group_by` will group the rows according to the distinct values across the expressions specified. -Here, we group by a combination of decade of birth and whether the person is shorter than 1.7 metres: +We can specify as many grouping expressions as we'd like and the context `group_by` will group the +rows according to the distinct values across the expressions specified. Here, we group by a +combination of decade of birth and whether the person is shorter than 1.7 metres: {{code_block('user-guide/concepts/expressions','group_by-2',['group_by'])}} @@ -126,8 +139,9 @@ Here, we group by a combination of decade of birth and whether the person is sho --8<-- "python/user-guide/concepts/expressions.py:group_by-2" ``` -The resulting dataframe, after applying aggregating expressions, contains one column per each grouping expression on the left and then as many columns as needed to represent the results of the aggregating expressions. -In turn, we can specify as many aggregating expressions as we want: +The resulting dataframe, after applying aggregating expressions, contains one column per each +grouping expression on the left and then as many columns as needed to represent the results of the +aggregating expressions. In turn, we can specify as many aggregating expressions as we want: {{code_block('user-guide/concepts/expressions','group_by-3',['group_by'])}} @@ -139,19 +153,21 @@ See also `group_by_dynamic` and `group_by_rolling` for other grouping contexts. ## Expression expansion -The last example contained two grouping expressions and three aggregating expressions, and yet the resulting dataframe contained six columns instead of five. -If we look closely, the last aggregating expression mentioned two different columns: “weight” and “height”. +The last example contained two grouping expressions and three aggregating expressions, and yet the +resulting dataframe contained six columns instead of five. If we look closely, the last aggregating +expression mentioned two different columns: “weight” and “height”. -Polars expressions support a feature called _expression expansion_. -Expression expansion is like a shorthand notation for when you want to apply the same transformation to multiple columns. -As we have seen, the expression +Polars expressions support a feature called _expression expansion_. Expression expansion is like a +shorthand notation for when you want to apply the same transformation to multiple columns. As we +have seen, the expression ```python pl.col("weight", "height").mean().name.prefix("avg_") ``` -will compute the mean value of the columns “weight” and “height” and will rename them as “avg_weight” and “avg_height”, respectively. 
-In fact, the expression above is equivalent to using the two following expressions: +will compute the mean value of the columns “weight” and “height” and will rename them as +“avg_weight” and “avg_height”, respectively. In fact, the expression above is equivalent to using +the two following expressions: ```python [ @@ -160,8 +176,9 @@ In fact, the expression above is equivalent to using the two following expressio ] ``` -In this case, this expression expands into two independent expressions that Polars can execute in parallel. -In other cases, we may not be able to know in advance how many independent expressions an expression will unfold into. +In this case, this expression expands into two independent expressions that Polars can execute in +parallel. In other cases, we may not be able to know in advance how many independent expressions an +expression will unfold into. Consider this simple but elucidative example: @@ -169,9 +186,9 @@ Consider this simple but elucidative example: (pl.col(pl.Float64) * 1.1).name.suffix("*1.1") ``` -This expression will multiply all columns with data type `Float64` by `1.1`. -The number of columns this applies to depends on the schema of each dataframe. -In the case of the dataframe we have been using, it applies to two columns: +This expression will multiply all columns with data type `Float64` by `1.1`. The number of columns +this applies to depends on the schema of each dataframe. In the case of the dataframe we have been +using, it applies to two columns: {{code_block('user-guide/concepts/expressions','expression-expansion-1',['group_by'])}} @@ -179,7 +196,8 @@ In the case of the dataframe we have been using, it applies to two columns: --8<-- "python/user-guide/concepts/expressions.py:expression-expansion-1" ``` -In the case of the dataframe `df2` below, the same expression expands to 0 columns because no column has the data type `Float64`: +In the case of the dataframe `df2` below, the same expression expands to 0 columns because no column +has the data type `Float64`: {{code_block('user-guide/concepts/expressions','expression-expansion-2',['group_by'])}} @@ -187,18 +205,25 @@ In the case of the dataframe `df2` below, the same expression expands to 0 colum --8<-- "python/user-guide/concepts/expressions.py:expression-expansion-2" ``` -It is equally easy to imagine a scenario where the same expression would expand to dozens of columns. +It is equally easy to imagine a scenario where the same expression would expand to dozens of +columns. -Next, you will learn about [the lazy API and the function `explain`](lazy-api.md#previewing-the-query-plan), which you can use to preview what an expression will expand to given a schema. +Next, you will learn about +[the lazy API and the function `explain`](lazy-api.md#previewing-the-query-plan), which you can use +to preview what an expression will expand to given a schema. ## Conclusion -Because expressions are lazy, when you use an expression inside a context Polars can try to simplify your expression before running the data transformation it expresses. -Separate expressions within a context are embarrassingly parallel and Polars will take advantage of that, while also parallelizing expression execution when using expression expansion. -Further performance gains can be obtained when using [the lazy API of Polars](lazy-api.md), which is introduced next. 
+Because expressions are lazy, when you use an expression inside a context Polars can try to simplify +your expression before running the data transformation it expresses. Separate expressions within a +context are embarrassingly parallel and Polars will take advantage of that, while also parallelizing +expression execution when using expression expansion. Further performance gains can be obtained when +using [the lazy API of Polars](lazy-api.md), which is introduced next. -We have only scratched the surface of the capabilities of expressions. -There are a ton more expressions and they can be combined in a variety of ways. -See the [section on expressions](../expressions/index.md) for a deeper dive on the different types of expressions available. +We have only scratched the surface of the capabilities of expressions. There are a ton more +expressions and they can be combined in a variety of ways. See the +[section on expressions](../expressions/index.md) for a deeper dive on the different types of +expressions available. -[^1]: There are additional List and SQL contexts which are covered later in this guide. But for simplicity, we leave them out of scope for now. +[^1]: There are additional List and SQL contexts which are covered later in this guide. But for +simplicity, we leave them out of scope for now. diff --git a/docs/source/user-guide/concepts/index.md b/docs/source/user-guide/concepts/index.md index c4b28e50721f..b926828716f4 100644 --- a/docs/source/user-guide/concepts/index.md +++ b/docs/source/user-guide/concepts/index.md @@ -1,6 +1,7 @@ # Concepts -This chapter describes the core concepts of the Polars API. Understanding these will help you optimise your queries on a daily basis. We will cover the following topics: +This chapter describes the core concepts of the Polars API. Understanding these will help you +optimise your queries on a daily basis. We will cover the following topics: - [Data types and structures](data-types-and-structures.md) - [Expressions and contexts](expressions-and-contexts.md) diff --git a/docs/source/user-guide/concepts/lazy-api.md b/docs/source/user-guide/concepts/lazy-api.md index 85b985e1a74c..349bd058c2bb 100644 --- a/docs/source/user-guide/concepts/lazy-api.md +++ b/docs/source/user-guide/concepts/lazy-api.md @@ -1,7 +1,9 @@ # Lazy API -Polars supports two modes of operation: lazy and eager. The examples so far have used the eager API, in which the query is executed immediately. -In the lazy API, the query is only evaluated once it is _collected_. Deferring the execution to the last minute can have significant performance advantages and is why the lazy API is preferred in most cases. Let us demonstrate this with an example: +Polars supports two modes of operation: lazy and eager. The examples so far have used the eager API, +in which the query is executed immediately. In the lazy API, the query is only evaluated once it is +_collected_. Deferring the execution to the last minute can have significant performance advantages +and is why the lazy API is preferred in most cases. Let us demonstrate this with an example: {{code_block('user-guide/concepts/lazy-vs-eager','eager',['read_csv'])}} @@ -11,14 +13,22 @@ In this example we use the eager API to: 1. Filter the dataset based on sepal length. 1. Calculate the mean of the sepal width per species. -Every step is executed immediately returning the intermediate results. This can be very wasteful as we might do work or load extra data that is not being used. 
If we instead used the lazy API and waited on execution until all the steps are defined then the query planner could perform various optimizations. In this case: +Every step is executed immediately returning the intermediate results. This can be very wasteful as +we might do work or load extra data that is not being used. If we instead used the lazy API and +waited on execution until all the steps are defined then the query planner could perform various +optimizations. In this case: -- Predicate pushdown: Apply filters as early as possible while reading the dataset, thus only reading rows with sepal length greater than 5. -- Projection pushdown: Select only the columns that are needed while reading the dataset, thus removing the need to load additional columns (e.g., petal length and petal width). +- Predicate pushdown: Apply filters as early as possible while reading the dataset, thus only + reading rows with sepal length greater than 5. +- Projection pushdown: Select only the columns that are needed while reading the dataset, thus + removing the need to load additional columns (e.g., petal length and petal width). {{code_block('user-guide/concepts/lazy-vs-eager','lazy',['scan_csv'])}} -These will significantly lower the load on memory & CPU thus allowing you to fit bigger datasets in memory and process them faster. Once the query is defined you call `collect` to inform Polars that you want to execute it. You can [learn more about the lazy API in its dedicated chapter](../lazy/index.md). +These will significantly lower the load on memory & CPU thus allowing you to fit bigger datasets in +memory and process them faster. Once the query is defined you call `collect` to inform Polars that +you want to execute it. You can +[learn more about the lazy API in its dedicated chapter](../lazy/index.md). !!! info "Eager API" @@ -26,13 +36,15 @@ These will significantly lower the load on memory & CPU thus allowing you to fit ## When to use which -In general, the lazy API should be preferred unless you are either interested in the intermediate results or are doing exploratory work and don't know yet what your query is going to look like. +In general, the lazy API should be preferred unless you are either interested in the intermediate +results or are doing exploratory work and don't know yet what your query is going to look like. ## Previewing the query plan -When using the lazy API you can use the function `explain` to ask Polars to create a description of the query plan that will be executed once you collect the results. -This can be useful if you want to see what types of optimizations Polars performs on your queries. -We can ask Polars to explain the query `q` we defined above: +When using the lazy API you can use the function `explain` to ask Polars to create a description of +the query plan that will be executed once you collect the results. This can be useful if you want to +see what types of optimizations Polars performs on your queries. We can ask Polars to explain the +query `q` we defined above: {{code_block('user-guide/concepts/lazy-vs-eager','explain',['explain'])}} @@ -42,10 +54,13 @@ We can ask Polars to explain the query `q` we defined above: --8<-- "python/user-guide/concepts/lazy-vs-eager.py:explain" ``` -Immediately, we can see in the explanation that Polars did apply predicate pushdown, as it is only reading rows where the sepal length is greater than 5, and it did apply projection pushdown, as it is only reading the columns that are needed by the query. 
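Since the snippets above are pulled in through `code_block` macros and are not visible in this patch, here is a minimal, illustrative sketch of the lazy pattern being described; the file path and the column names (`sepal_length`, `sepal_width`, `species`) are assumptions for the sake of the example, not taken from the actual snippets:

```python
import polars as pl

# Build a lazy query; nothing is read or computed yet.
q = (
    pl.scan_csv("iris.csv")  # hypothetical file path
    .filter(pl.col("sepal_length") > 5)  # candidate for predicate pushdown
    .group_by("species")
    .agg(pl.col("sepal_width").mean())  # only these columns need to be read
)

print(q.explain())  # preview the optimized plan
df = q.collect()  # execute the query
```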
+Immediately, we can see in the explanation that Polars did apply predicate pushdown, as it is only +reading rows where the sepal length is greater than 5, and it did apply projection pushdown, as it +is only reading the columns that are needed by the query. -The function `explain` can also be used to see how expression expansion will unfold in the context of a given schema. -Consider the example expression from the [section on expression expansion](expressions-and-contexts.md#expression-expansion): +The function `explain` can also be used to see how expression expansion will unfold in the context +of a given schema. Consider the example expression from the +[section on expression expansion](expressions-and-contexts.md#expression-expansion): ```python (pl.col(pl.Float64) * 1.1).name.suffix("*1.1") diff --git a/docs/source/user-guide/ecosystem.md b/docs/source/user-guide/ecosystem.md index 9a8f96c4f72f..30a7e61635e0 100644 --- a/docs/source/user-guide/ecosystem.md +++ b/docs/source/user-guide/ecosystem.md @@ -2,7 +2,9 @@ ## Introduction -On this page you can find a non-exhaustive list of libraries and tools that support Polars. As the data ecosystem is evolving fast, more libraries will likely support Polars in the future. One of the main drivers is that Polars makes adheres its memory layout to the `Apache Arrow` spec. +On this page you can find a non-exhaustive list of libraries and tools that support Polars. As the +data ecosystem is evolving fast, more libraries will likely support Polars in the future. One of the +main drivers is that Polars makes adheres its memory layout to the `Apache Arrow` spec. ### Table of contents: @@ -16,7 +18,11 @@ On this page you can find a non-exhaustive list of libraries and tools that supp ### Apache Arrow -[Apache Arrow](https://arrow.apache.org/) enables zero-copy reads of data within the same process, meaning that data can be directly accessed in its in-memory format without the need for copying or serialisation. This enhances performance when integrating with different tools using Apache Arrow. Polars is compatible with a wide range of libraries that also make use of Apache Arrow, like Pandas and DuckDB. +[Apache Arrow](https://arrow.apache.org/) enables zero-copy reads of data within the same process, +meaning that data can be directly accessed in its in-memory format without the need for copying or +serialisation. This enhances performance when integrating with different tools using Apache Arrow. +Polars is compatible with a wide range of libraries that also make use of Apache Arrow, like Pandas +and DuckDB. ### Data visualisation @@ -26,34 +32,56 @@ See the [dedicated visualization section](misc/visualization.md). #### Delta Lake -The [Delta Lake](https://github.com/delta-io/delta-rs) project aims to unlock the power of the Deltalake for as many users and projects as possible by providing native low-level APIs aimed at developers and integrators, as well as a high-level operations API that lets you query, inspect, and operate your Delta Lake with ease. +The [Delta Lake](https://github.com/delta-io/delta-rs) project aims to unlock the power of the +Deltalake for as many users and projects as possible by providing native low-level APIs aimed at +developers and integrators, as well as a high-level operations API that lets you query, inspect, and +operate your Delta Lake with ease. -Read how to use Delta Lake with Polars [at Delta Lake](https://delta-io.github.io/delta-rs/integrations/delta-lake-polars/#reading-a-delta-lake-table-with-polars). 
+Read how to use Delta Lake with Polars +[at Delta Lake](https://delta-io.github.io/delta-rs/integrations/delta-lake-polars/#reading-a-delta-lake-table-with-polars). ### Machine Learning #### Scikit Learn -Since [Scikit Learn](https://scikit-learn.org/stable/) 1.4, all transformers support Polars output. See the change log for [more details](https://scikit-learn.org/dev/whats_new/v1.4.html#changes-impacting-all-modules). +Since [Scikit Learn](https://scikit-learn.org/stable/) 1.4, all transformers support Polars output. +See the change log for +[more details](https://scikit-learn.org/dev/whats_new/v1.4.html#changes-impacting-all-modules). ### Other #### DuckDB -[DuckDB](https://duckdb.org) is a high-performance analytical database system. It is designed to be fast, reliable, portable, and easy to use. DuckDB provides a rich SQL dialect, with support far beyond basic SQL. DuckDB supports arbitrary and nested correlated subqueries, window functions, collations, complex types (arrays, structs), and more. Read about integration with Polars [on the DuckDB website](https://duckdb.org/docs/guides/python/polars). +[DuckDB](https://duckdb.org) is a high-performance analytical database system. It is designed to be +fast, reliable, portable, and easy to use. DuckDB provides a rich SQL dialect, with support far +beyond basic SQL. DuckDB supports arbitrary and nested correlated subqueries, window functions, +collations, complex types (arrays, structs), and more. Read about integration with Polars +[on the DuckDB website](https://duckdb.org/docs/guides/python/polars). #### Great Tables -With [Great Tables](https://posit-dev.github.io/great-tables/articles/intro.html) anyone can make wonderful-looking tables in Python. Here is a [blog post](https://posit-dev.github.io/great-tables/blog/polars-styling/) on how to use Great Tables with Polars. +With [Great Tables](https://posit-dev.github.io/great-tables/articles/intro.html) anyone can make +wonderful-looking tables in Python. Here is a +[blog post](https://posit-dev.github.io/great-tables/blog/polars-styling/) on how to use Great +Tables with Polars. #### LanceDB -[LanceDB](https://lancedb.com/) is a developer-friendly, serverless vector database for AI applications. They have added a direct integration with Polars. LanceDB can ingest Polars dataframes, return results as polars dataframes, and export the entire table as a polars lazyframe. You can find a quick tutorial in their blog [LanceDB + Polars](https://blog.lancedb.com/lancedb-polars-2d5eb32a8aa3) +[LanceDB](https://lancedb.com/) is a developer-friendly, serverless vector database for AI +applications. They have added a direct integration with Polars. LanceDB can ingest Polars +dataframes, return results as polars dataframes, and export the entire table as a polars lazyframe. +You can find a quick tutorial in their blog +[LanceDB + Polars](https://blog.lancedb.com/lancedb-polars-2d5eb32a8aa3) #### Mage -[Mage](https://www.mage.ai) is an open-source data pipeline tool for transforming and integrating data. Learn about integration between Polars and Mage at [docs.mage.ai](https://docs.mage.ai/integrations/polars). +[Mage](https://www.mage.ai) is an open-source data pipeline tool for transforming and integrating +data. Learn about integration between Polars and Mage at +[docs.mage.ai](https://docs.mage.ai/integrations/polars). #### marimo -[marimo](https://marimo.io) is a reactive notebook for Python and SQL that models notebooks as dataflow graphs. 
It offers built-in support for Polars, allowing seamless integration of Polars dataframes in an interactive, reactive environment - such as displaying rich Polars tables, no-code transformations of Polars dataframes, or selecting points on a Polars-backed reactive chart. +[marimo](https://marimo.io) is a reactive notebook for Python and SQL that models notebooks as +dataflow graphs. It offers built-in support for Polars, allowing seamless integration of Polars +dataframes in an interactive, reactive environment - such as displaying rich Polars tables, no-code +transformations of Polars dataframes, or selecting points on a Polars-backed reactive chart. diff --git a/docs/source/user-guide/expressions/aggregation.md b/docs/source/user-guide/expressions/aggregation.md index 65ebfc776c00..c162b4094970 100644 --- a/docs/source/user-guide/expressions/aggregation.md +++ b/docs/source/user-guide/expressions/aggregation.md @@ -1,9 +1,11 @@ # Aggregation -The Polars [context](../concepts/expressions-and-contexts.md#contexts) `group_by` lets you apply expressions on subsets of columns, as defined by the unique values of the column over which the data is grouped. -This is a very powerful capability that we explore in this section of the user guide. +The Polars [context](../concepts/expressions-and-contexts.md#contexts) `group_by` lets you apply +expressions on subsets of columns, as defined by the unique values of the column over which the data +is grouped. This is a very powerful capability that we explore in this section of the user guide. -We start by reading in a [US congress `dataset`](https://github.com/unitedstates/congress-legislators): +We start by reading in a +[US congress `dataset`](https://github.com/unitedstates/congress-legislators): {{code_block('user-guide/expressions/aggregation','dataframe',['DataFrame','Categorical'])}} @@ -13,16 +15,19 @@ We start by reading in a [US congress `dataset`](https://github.com/unitedstates ## Basic aggregations -You can easily apply multiple expressions to your aggregated values. -Simply list all of the expressions you want inside the function `agg`. -There is no upper bound on the number of aggregations you can do and you can make any combination you want. -In the snippet below we will group the data based on the column “first_name” and then we will apply the following aggregations: +You can easily apply multiple expressions to your aggregated values. Simply list all of the +expressions you want inside the function `agg`. There is no upper bound on the number of +aggregations you can do and you can make any combination you want. In the snippet below we will +group the data based on the column “first_name” and then we will apply the following aggregations: -- count the number of rows in the group (which means we count how many people in the data set have each unique first name); -- combine the values of the column “gender” into a list by referring the column but omitting an aggregate function; and +- count the number of rows in the group (which means we count how many people in the data set have + each unique first name); +- combine the values of the column “gender” into a list by referring the column but omitting an + aggregate function; and - get the first value of the column “last_name” within the group. 
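As a rough, self-contained sketch of the aggregation described in the list above (the actual snippet is included through the `code_block` macro further down; the sample rows here are made up, and the sort-and-limit step is the one described next):

```python
import polars as pl

# Stand-in rows with the columns the text mentions: first_name, last_name, gender.
df = pl.DataFrame(
    {
        "first_name": ["John", "John", "Mary"],
        "last_name": ["Adams", "Quincy", "Todd"],
        "gender": ["M", "M", "F"],
    }
)

summary = (
    df.group_by("first_name")
    .agg(
        pl.len(),  # number of rows (people) with this first name
        pl.col("gender"),  # no aggregate function, so the values are collected into a list
        pl.first("last_name"),  # first value of last_name within the group
    )
    .sort("len", descending=True)
    .head(5)  # keep a top-five summary, as described below
)
```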
-After computing the aggregations, we immediately sort the result and limit it to the top five rows so that we have a nice summary overview: +After computing the aggregations, we immediately sort the result and limit it to the top five rows +so that we have a nice summary overview: {{code_block('user-guide/expressions/aggregation','basic',['group_by'])}} @@ -30,13 +35,12 @@ After computing the aggregations, we immediately sort the result and limit it to --8<-- "python/user-guide/expressions/aggregation.py:basic" ``` -It's that easy! -Let's turn it up a notch. +It's that easy! Let's turn it up a notch. ## Conditionals -Let's say we want to know how many delegates of a state are “Pro” or “Anti” administration. -We can query that directly in the aggregation without the need for a `lambda` or grooming the dataframe: +Let's say we want to know how many delegates of a state are “Pro” or “Anti” administration. We can +query that directly in the aggregation without the need for a `lambda` or grooming the dataframe: {{code_block('user-guide/expressions/aggregation','conditional',['group_by'])}} @@ -46,8 +50,9 @@ We can query that directly in the aggregation without the need for a `lambda` or ## Filtering -We can also filter the groups. -Let's say we want to compute a mean per group, but we don't want to include all values from that group, and we also don't want to actually filter the rows from the dataframe because we need those rows for another aggregation. +We can also filter the groups. Let's say we want to compute a mean per group, but we don't want to +include all values from that group, and we also don't want to actually filter the rows from the +dataframe because we need those rows for another aggregation. In the example below we show how this can be done. @@ -63,15 +68,18 @@ In the example below we show how this can be done. --8<-- "python/user-guide/expressions/aggregation.py:filter" ``` -Do the average age values look nonsensical? -That's because we are working with historical data that dates back to the 1800s and we are doing our computations assuming everyone represented in the dataset is still alive and kicking. +Do the average age values look nonsensical? That's because we are working with historical data that +dates back to the 1800s and we are doing our computations assuming everyone represented in the +dataset is still alive and kicking. ## Nested grouping -The two previous queries could have been done with a nested `group_by`, but that wouldn't have let us show off some of these features. 😉 -To do a nested `group_by`, simply list the columns that will be used for grouping. +The two previous queries could have been done with a nested `group_by`, but that wouldn't have let +us show off some of these features. 😉 To do a nested `group_by`, simply list the columns that will +be used for grouping. -First, we use a nested `group_by` to figure out how many delegates of a state are “Pro” or “Anti” administration: +First, we use a nested `group_by` to figure out how many delegates of a state are “Pro” or “Anti” +administration: {{code_block('user-guide/expressions/aggregation','nested',['group_by'])}} @@ -87,14 +95,14 @@ Next, we use a nested `group_by` to compute the average age of delegates per sta --8<-- "python/user-guide/expressions/aggregation.py:filter-nested" ``` -Note that we get the same results but the format of the data is different. -Depending on the situation, one format may be more suitable than the other. 
+Note that we get the same results but the format of the data is different. Depending on the +situation, one format may be more suitable than the other. ## Sorting -It is common to see a dataframe being sorted for the sole purpose of managing the ordering during a grouping operation. -Let's say that we want to get the names of the oldest and youngest politicians per state. -We could start by sorting and then grouping: +It is common to see a dataframe being sorted for the sole purpose of managing the ordering during a +grouping operation. Let's say that we want to get the names of the oldest and youngest politicians +per state. We could start by sorting and then grouping: {{code_block('user-guide/expressions/aggregation','sort',['group_by'])}} @@ -102,8 +110,9 @@ We could start by sorting and then grouping: --8<-- "python/user-guide/expressions/aggregation.py:sort" ``` -However, if we also want to sort the names alphabetically, we need to perform an extra sort operation. -Luckily, we can sort in a `group_by` context without changing the sorting of the underlying dataframe: +However, if we also want to sort the names alphabetically, we need to perform an extra sort +operation. Luckily, we can sort in a `group_by` context without changing the sorting of the +underlying dataframe: {{code_block('user-guide/expressions/aggregation','sort2',['group_by'])}} @@ -111,8 +120,9 @@ Luckily, we can sort in a `group_by` context without changing the sorting of the --8<-- "python/user-guide/expressions/aggregation.py:sort2" ``` -We can even sort a column with the order induced by another column, and this also works inside the context `group_by`. -This modification to the previous query lets us check if the delegate with the first name is male or female: +We can even sort a column with the order induced by another column, and this also works inside the +context `group_by`. This modification to the previous query lets us check if the delegate with the +first name is male or female: {{code_block('user-guide/expressions/aggregation','sort3',['group_by'])}} @@ -127,10 +137,13 @@ This modification to the previous query lets us check if the delegate with the f The following section is specific to Python, and doesn't apply to Rust. Within Rust, blocks and closures (lambdas) can, and will, be executed concurrently. -Python is generally slower than Rust. -Besides the overhead of running “slow” bytecode, Python has to remain within the constraints of the Global Interpreter Lock (GIL). -This means that if you were to use a `lambda` or a custom Python function to apply during a parallelized phase, Polars' speed is capped running Python code, preventing any multiple threads from executing the function. +Python is generally slower than Rust. Besides the overhead of running “slow” bytecode, Python has to +remain within the constraints of the Global Interpreter Lock (GIL). This means that if you were to +use a `lambda` or a custom Python function to apply during a parallelized phase, Polars' speed is +capped running Python code, preventing any multiple threads from executing the function. -Polars will try to parallelize the computation of the aggregating functions over the groups, so it is recommended that you avoid using `lambda`s and custom Python functions as much as possible. -Instead, try to stay within the realm of the Polars expression API. 
-This is not always possible, though, so if you want to learn more about using `lambda`s you can go [the user guide section on using user-defined functions](user-defined-python-functions.md). +Polars will try to parallelize the computation of the aggregating functions over the groups, so it +is recommended that you avoid using `lambda`s and custom Python functions as much as possible. +Instead, try to stay within the realm of the Polars expression API. This is not always possible, +though, so if you want to learn more about using `lambda`s you can go +[the user guide section on using user-defined functions](user-defined-python-functions.md). diff --git a/docs/source/user-guide/expressions/basic-operations.md b/docs/source/user-guide/expressions/basic-operations.md index 8cfce9e5392e..b3eaa6364850 100644 --- a/docs/source/user-guide/expressions/basic-operations.md +++ b/docs/source/user-guide/expressions/basic-operations.md @@ -1,7 +1,8 @@ # Basic operations -This section shows how to do basic operations on dataframe columns, like do basic arithmetic calculations, perform comparisons, and other general-purpose operations. -We will use the following dataframe for the examples that follow: +This section shows how to do basic operations on dataframe columns, like do basic arithmetic +calculations, perform comparisons, and other general-purpose operations. We will use the following +dataframe for the examples that follow: {{code_block('user-guide/expressions/operations', 'dataframe', ['DataFrame'])}} @@ -12,7 +13,8 @@ We will use the following dataframe for the examples that follow: ## Basic arithmetic Polars supports basic arithmetic between series of the same length, or between series and literals. -When literals are mixed with series, the literals are broadcast to match the length of the series they are being used with. +When literals are mixed with series, the literals are broadcast to match the length of the series +they are being used with. {{code_block('user-guide/expressions/operations', 'arithmetic', ['operators'])}} @@ -20,10 +22,12 @@ When literals are mixed with series, the literals are broadcast to match the len --8<-- "python/user-guide/expressions/operations.py:arithmetic" ``` -The example above shows that when an arithmetic operation takes `null` as one of its operands, the result is `null`. +The example above shows that when an arithmetic operation takes `null` as one of its operands, the +result is `null`. -Polars uses operator overloading to allow you to use your language's native arithmetic operators within your expressions. -If you prefer, in Python you can use the corresponding named functions, as the snippet below demonstrates: +Polars uses operator overloading to allow you to use your language's native arithmetic operators +within your expressions. 
If you prefer, in Python you can use the corresponding named functions, as +the snippet below demonstrates: ```python --8<-- "python/user-guide/expressions/operations.py:operator-overloading" @@ -35,7 +39,8 @@ If you prefer, in Python you can use the corresponding named functions, as the s ## Comparisons -Like with arithmetic operations, Polars supports comparisons via the overloaded operators or named functions: +Like with arithmetic operations, Polars supports comparisons via the overloaded operators or named +functions: {{code_block('user-guide/expressions/operations','comparison',['operators'])}} @@ -45,7 +50,8 @@ Like with arithmetic operations, Polars supports comparisons via the overloaded ## Boolean and bitwise operations -Depending on the language, you may use the operators `&`, `|`, and `~`, for the Boolean operations “and”, “or”, and “not”, respectively, or the functions of the same name: +Depending on the language, you may use the operators `&`, `|`, and `~`, for the Boolean operations +“and”, “or”, and “not”, respectively, or the functions of the same name: {{code_block('user-guide/expressions/operations', 'boolean', ['operators'])}} @@ -59,7 +65,8 @@ Depending on the language, you may use the operators `&`, `|`, and `~`, for the Similarly, we cannot use the keywords `and`, `or`, and `not`, as the Boolean operators because these Python keywords will interpret their operands in the context of Truthy and Falsy through the dunder method `__bool__`. Thus, we overload the bitwise operators `&`, `|`, and `~`, as the Boolean operators because they are the second best choice. -These operators/functions can also be used for the respective bitwise operations, alongside the bitwise operator `^` / function `xor`: +These operators/functions can also be used for the respective bitwise operations, alongside the +bitwise operator `^` / function `xor`: {{code_block('user-guide/expressions/operations', 'bitwise', [])}} @@ -69,10 +76,11 @@ These operators/functions can also be used for the respective bitwise operations ## Counting (unique) values -Polars has two functions to count the number of unique values in a series. -The function `n_unique` can be used to count the exact number of unique values in a series. -However, for very large data sets, this operation can be quite slow. -In those cases, if an approximation is good enough, you can use the function `approx_n_unique` that uses the algorithm [HyperLogLog++](https://en.wikipedia.org/wiki/HyperLogLog) to estimate the result. +Polars has two functions to count the number of unique values in a series. The function `n_unique` +can be used to count the exact number of unique values in a series. However, for very large data +sets, this operation can be quite slow. In those cases, if an approximation is good enough, you can +use the function `approx_n_unique` that uses the algorithm +[HyperLogLog++](https://en.wikipedia.org/wiki/HyperLogLog) to estimate the result. 
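Ahead of the documented example that follows, a quick sketch of how the two counting functions are typically called; the column and its values are made up for illustration:

```python
import polars as pl

df = pl.DataFrame({"store": ["a", "b", "a", "c", "b", "a"]})

df.select(
    pl.col("store").n_unique().alias("n_unique"),  # exact number of distinct values
    pl.col("store").approx_n_unique().alias("approx_n_unique"),  # HyperLogLog++ estimate
)
```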
The example below shows an example series where the `approx_n_unique` estimation is wrong by 0.9%: @@ -82,7 +90,8 @@ The example below shows an example series where the `approx_n_unique` estimation --8<-- "python/user-guide/expressions/operations.py:count" ``` -You can get more information about the unique values and their counts with the function `value_counts`, that Polars also provides: +You can get more information about the unique values and their counts with the function +`value_counts`, that Polars also provides: {{code_block('user-guide/expressions/operations', 'value_counts', ['value_counts'])}} @@ -90,9 +99,11 @@ You can get more information about the unique values and their counts with the f --8<-- "python/user-guide/expressions/operations.py:value_counts" ``` -The function `value_counts` returns the results in [structs, a data type that we will explore in a later section](structs.md). +The function `value_counts` returns the results in +[structs, a data type that we will explore in a later section](structs.md). -Alternatively, if you only need a series with the unique values or a series with the unique counts, they are one function away: +Alternatively, if you only need a series with the unique values or a series with the unique counts, +they are one function away: {{code_block('user-guide/expressions/operations', 'unique_counts', ['unique', 'unique_counts'])}} @@ -100,18 +111,23 @@ Alternatively, if you only need a series with the unique values or a series with --8<-- "python/user-guide/expressions/operations.py:unique_counts" ``` -Note that we need to specify `maintain_order=True` in the function `unique` so that the order of the results is consistent with the order of the results in `unique_counts`. -See the API reference for more information. +Note that we need to specify `maintain_order=True` in the function `unique` so that the order of the +results is consistent with the order of the results in `unique_counts`. See the API reference for +more information. ## Conditionals -Polars supports something akin to a ternary operator through the function `when`, which is followed by one function `then` and an optional function `otherwise`. +Polars supports something akin to a ternary operator through the function `when`, which is followed +by one function `then` and an optional function `otherwise`. -The function `when` accepts a predicate expression. -The values that evaluate to `True` are replaced by the corresponding values of the expression inside the function `then`. -The values that evaluate to `False` are replaced by the corresponding values of the expression inside the function `otherwise` or `null`, if `otherwise` is not provided. +The function `when` accepts a predicate expression. The values that evaluate to `True` are replaced +by the corresponding values of the expression inside the function `then`. The values that evaluate +to `False` are replaced by the corresponding values of the expression inside the function +`otherwise` or `null`, if `otherwise` is not provided. 
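A minimal sketch of the `when`/`then`/`otherwise` pattern just described; the values and the resulting column name are made up, although the column `nrs` mirrors the one used in the documented example below:

```python
import polars as pl

df = pl.DataFrame({"nrs": [1, 2, 3, 4, 5]})

df.with_columns(
    pl.when(pl.col("nrs") % 2 == 1)  # predicate expression
    .then(pl.lit("odd"))  # used where the predicate is True
    .otherwise(pl.lit("even"))  # used where it is False; omit to get null instead
    .alias("parity")
)
```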
-The example below applies one step of the [Collatz conjecture](https://en.wikipedia.org/wiki/Collatz_conjecture) to the numbers in the column “nrs”: +The example below applies one step of the +[Collatz conjecture](https://en.wikipedia.org/wiki/Collatz_conjecture) to the numbers in the column +“nrs”: {{code_block('user-guide/expressions/operations', 'collatz', ['when'])}} @@ -119,5 +135,7 @@ The example below applies one step of the [Collatz conjecture](https://en.wikipe --8<-- "python/user-guide/expressions/operations.py:collatz" ``` -You can also emulate a chain of an arbitrary number of conditionals, akin to Python's `elif` statement, by chaining an arbitrary number of consecutive blocks of `.when(...).then(...)`. -In those cases, and for each given value, Polars will only consider a replacement expression that is deeper within the chain if the previous predicates all failed for that value. +You can also emulate a chain of an arbitrary number of conditionals, akin to Python's `elif` +statement, by chaining an arbitrary number of consecutive blocks of `.when(...).then(...)`. In those +cases, and for each given value, Polars will only consider a replacement expression that is deeper +within the chain if the previous predicates all failed for that value. diff --git a/docs/source/user-guide/expressions/casting.md b/docs/source/user-guide/expressions/casting.md index daa11a94a31e..4642ee1148f3 100644 --- a/docs/source/user-guide/expressions/casting.md +++ b/docs/source/user-guide/expressions/casting.md @@ -1,15 +1,19 @@ # Casting -Casting converts the [underlying data type of a column](../concepts/data-types-and-structures.md) to a new one. -Casting is available through the function `cast`. +Casting converts the [underlying data type of a column](../concepts/data-types-and-structures.md) to +a new one. Casting is available through the function `cast`. -The function `cast` includes a parameter `strict` that determines how Polars behaves when it encounters a value that cannot be converted from the source data type to the target data type. -The default behaviour is `strict=True`, which means that Polars will thrown an error to notify the user of the failed conversion while also providing details on the values that couldn't be cast. -On the other hand, if `strict=False`, any values that cannot be converted to the target data type will be quietly converted to `null`. +The function `cast` includes a parameter `strict` that determines how Polars behaves when it +encounters a value that cannot be converted from the source data type to the target data type. The +default behaviour is `strict=True`, which means that Polars will thrown an error to notify the user +of the failed conversion while also providing details on the values that couldn't be cast. On the +other hand, if `strict=False`, any values that cannot be converted to the target data type will be +quietly converted to `null`. 
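A small sketch of the `strict` behaviour described above, using a made-up string column:

```python
import polars as pl

df = pl.DataFrame({"value": ["10", "20", "not a number"]})

# strict=False: values that cannot be converted become null instead of raising an error.
df.with_columns(pl.col("value").cast(pl.Int64, strict=False).alias("value_as_int"))

# With the default strict=True, the same cast raises an error that reports
# the values that could not be converted.
```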
## Basic example -Let's take a look at the following dataframe which contains both integers and floating point numbers: +Let's take a look at the following dataframe which contains both integers and floating point +numbers: {{code_block('user-guide/expressions/casting', 'dfnum', [])}} @@ -17,7 +21,8 @@ Let's take a look at the following dataframe which contains both integers and fl --8<-- "python/user-guide/expressions/casting.py:dfnum" ``` -To perform casting operations between floats and integers, or vice versa, we use the function `cast`: +To perform casting operations between floats and integers, or vice versa, we use the function +`cast`: {{code_block('user-guide/expressions/casting','castnum',['cast'])}} @@ -29,8 +34,9 @@ Note that floating point numbers are truncated when casting to an integer data t ## Downcasting numerical data types -You can reduce the memory footprint of a column by changing the precision associated with its numeric data type. -As an illustration, the code below demonstrates how casting from `Int64` to `Int16` and from `Float64` to `Float32` can be used to lower memory usage: +You can reduce the memory footprint of a column by changing the precision associated with its +numeric data type. As an illustration, the code below demonstrates how casting from `Int64` to +`Int16` and from `Float64` to `Float32` can be used to lower memory usage: {{code_block('user-guide/expressions/casting','downcast',['cast', 'estimated_size'])}} @@ -38,9 +44,11 @@ As an illustration, the code below demonstrates how casting from `Int64` to `Int --8<-- "python/user-guide/expressions/casting.py:downcast" ``` -When performing downcasting it is crucial to ensure that the chosen number of bits (such as 64, 32, or 16) is sufficient to accommodate the largest and smallest numbers in the column. -For example, a 32-bit signed integer (`Int32`) represents integers between -2147483648 and 2147483647, inclusive, while an 8-bit signed integer only represents integers between -128 and 127, inclusive. -Attempting to downcast to a data type with insufficient precision results in an error thrown by Polars: +When performing downcasting it is crucial to ensure that the chosen number of bits (such as 64, 32, +or 16) is sufficient to accommodate the largest and smallest numbers in the column. For example, a +32-bit signed integer (`Int32`) represents integers between -2147483648 and 2147483647, inclusive, +while an 8-bit signed integer only represents integers between -128 and 127, inclusive. Attempting +to downcast to a data type with insufficient precision results in an error thrown by Polars: {{code_block('user-guide/expressions/casting','overflow',['cast'])}} @@ -48,7 +56,8 @@ Attempting to downcast to a data type with insufficient precision results in an --8<-- "python/user-guide/expressions/casting.py:overflow" ``` -If you set the parameter `strict` to `False` the overflowing/underflowing values are converted to `null`: +If you set the parameter `strict` to `False` the overflowing/underflowing values are converted to +`null`: {{code_block('user-guide/expressions/casting','overflow2',['cast'])}} @@ -58,8 +67,8 @@ If you set the parameter `strict` to `False` the overflowing/underflowing values ## Converting strings to numeric data types -Strings that represent numbers can be converted to the appropriate data types via casting. -The opposite conversion is also possible: +Strings that represent numbers can be converted to the appropriate data types via casting. 
The +opposite conversion is also possible: {{code_block('user-guide/expressions/casting','strings',['cast'])}} @@ -67,8 +76,9 @@ The opposite conversion is also possible: --8<-- "python/user-guide/expressions/casting.py:strings" ``` -In case the column contains a non-numerical value, or a poorly formatted one, Polars will throw an error with details on the conversion error. -You can set `strict=False` to circumvent the error and get a `null` value instead. +In case the column contains a non-numerical value, or a poorly formatted one, Polars will throw an +error with details on the conversion error. You can set `strict=False` to circumvent the error and +get a `null` value instead. {{code_block('user-guide/expressions/casting','strings2',['cast'])}} @@ -78,10 +88,11 @@ You can set `strict=False` to circumvent the error and get a `null` value instea ## Booleans -Booleans can be expressed as either 1 (`True`) or 0 (`False`). -It's possible to perform casting operations between a numerical data type and a Boolean, and vice versa. +Booleans can be expressed as either 1 (`True`) or 0 (`False`). It's possible to perform casting +operations between a numerical data type and a Boolean, and vice versa. -When converting numbers to Booleans, the number 0 is converted to `False` and all other numbers are converted to `True`, in alignment with Python's Truthy and Falsy values for numbers: +When converting numbers to Booleans, the number 0 is converted to `False` and all other numbers are +converted to `True`, in alignment with Python's Truthy and Falsy values for numbers: {{code_block('user-guide/expressions/casting','bool',['cast'])}} @@ -91,9 +102,10 @@ When converting numbers to Booleans, the number 0 is converted to `False` and al ## Parsing / formatting temporal data types -All temporal data types are represented internally as the number of time units elapsed since a reference moment, usually referred to as the epoch. -For example, values of the data type `Date` are stored as the number of days since the epoch. -For the data type `Datetime` the time unit is the microsecond (us) and for `Time` the time unit is the nanosecond (ns). +All temporal data types are represented internally as the number of time units elapsed since a +reference moment, usually referred to as the epoch. For example, values of the data type `Date` are +stored as the number of days since the epoch. For the data type `Datetime` the time unit is the +microsecond (us) and for `Time` the time unit is the nanosecond (ns). Casting between numerical types and temporal data types is allowed and exposes this relationship: @@ -103,8 +115,10 @@ Casting between numerical types and temporal data types is allowed and exposes t --8<-- "python/user-guide/expressions/casting.py:dates" ``` -To format temporal data types as strings we can use the function `dt.to_string` and to parse temporal data types from strings we can use the function `str.to_datetime`. -Both functions adopt the [chrono format syntax](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) for formatting. +To format temporal data types as strings we can use the function `dt.to_string` and to parse +temporal data types from strings we can use the function `str.to_datetime`. Both functions adopt the +[chrono format syntax](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) for +formatting. 
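As a hand-written sketch of that round trip, with invented dates and using `str.to_date` for a
`Date` column:

```python
from datetime import date

import polars as pl

df = pl.DataFrame({"date": [date(2022, 1, 1), date(2022, 1, 2)]})

print(
    df.with_columns(
        # Format the dates as strings using chrono format codes...
        as_string=pl.col("date").dt.to_string("%Y-%m-%d"),
    ).with_columns(
        # ...and parse those strings back into the data type Date.
        round_trip=pl.col("as_string").str.to_date("%Y-%m-%d"),
    )
)
```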
{{code_block('user-guide/expressions/casting','dates2',['dt.to_string','str.to_date'])}} @@ -112,5 +126,5 @@ Both functions adopt the [chrono format syntax](https://docs.rs/chrono/latest/ch --8<-- "python/user-guide/expressions/casting.py:dates2" ``` -It's worth noting that `str.to_datetime` features additional options that support timezone functionality. -Refer to the API documentation for further information. +It's worth noting that `str.to_datetime` features additional options that support timezone +functionality. Refer to the API documentation for further information. diff --git a/docs/source/user-guide/expressions/categorical-data-and-enums.md b/docs/source/user-guide/expressions/categorical-data-and-enums.md index 4c9ce9734d3a..64ce602b40a2 100644 --- a/docs/source/user-guide/expressions/categorical-data-and-enums.md +++ b/docs/source/user-guide/expressions/categorical-data-and-enums.md @@ -1,27 +1,33 @@ # Categorical data and enums -A column that holds string values that can only take on one of a limited number of possible values is a column that holds [categorical data](https://en.wikipedia.org/wiki/Categorical_variable). -Usually, the number of possible values is much smaller than the length of the column. -Some typical examples include your nationality, the operating system of your computer, or the license that your favorite open source project uses. - -When working with categorical data you can use Polars' dedicated types, `Categorical` and `Enum`, to make your queries more performant. -Now, we will see what are the differences between the two data types `Categorical` and `Enum` and when you should use one data type or the other. -We also include some notes on [why the data types `Categorical` and `Enum` are more efficient than using the plain string values](#performance-considerations-on-categorical-data-types) in the end of this user guide section. +A column that holds string values that can only take on one of a limited number of possible values +is a column that holds [categorical data](https://en.wikipedia.org/wiki/Categorical_variable). +Usually, the number of possible values is much smaller than the length of the column. Some typical +examples include your nationality, the operating system of your computer, or the license that your +favorite open source project uses. + +When working with categorical data you can use Polars' dedicated types, `Categorical` and `Enum`, to +make your queries more performant. Now, we will see what are the differences between the two data +types `Categorical` and `Enum` and when you should use one data type or the other. We also include +some notes on +[why the data types `Categorical` and `Enum` are more efficient than using the plain string values](#performance-considerations-on-categorical-data-types) +in the end of this user guide section. ## `Enum` vs `Categorical` -In short, you should prefer `Enum` over `Categorical` whenever possible. -When the categories are fixed and known up front, use `Enum`. -When you don't know the categories or they are not fixed then you must use `Categorical`. -In case your requirements change along the way you can always cast from one to the other. +In short, you should prefer `Enum` over `Categorical` whenever possible. When the categories are +fixed and known up front, use `Enum`. When you don't know the categories or they are not fixed then +you must use `Categorical`. In case your requirements change along the way you can always cast from +one to the other. 
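For example, a rough sketch of switching between the two data types, with invented values and
assuming the `Enum` lists every category that occurs in the column, might be:

```python
import polars as pl

# The categories are not known up front, so the column starts out as Categorical.
s = pl.Series("os", ["Linux", "macOS", "Linux", "Windows"], dtype=pl.Categorical)

# Once the full set of valid categories is fixed, cast the column to an Enum...
os_enum = pl.Enum(["Linux", "macOS", "Windows"])
print(s.cast(os_enum))

# ...and casting back to Categorical (or to String) remains possible.
print(s.cast(os_enum).cast(pl.Categorical))
```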
## Data type `Enum` ### Creating an `Enum` -The data type `Enum` is an ordered categorical data type. -To use the data type `Enum` you have to specify the categories in advance to create a new data type that is a variant of an `Enum`. -Then, when creating a new series, a new dataframe, or when casting a string column, you can use that `Enum` variant. +The data type `Enum` is an ordered categorical data type. To use the data type `Enum` you have to +specify the categories in advance to create a new data type that is a variant of an `Enum`. Then, +when creating a new series, a new dataframe, or when casting a string column, you can use that +`Enum` variant. {{code_block('user-guide/expressions/categoricals', 'enum-example', ['Enum'])}} @@ -31,7 +37,8 @@ Then, when creating a new series, a new dataframe, or when casting a string colu ### Invalid values -Polars will raise an error if you try to specify a data type `Enum` whose categories do not include all the values present: +Polars will raise an error if you try to specify a data type `Enum` whose categories do not include +all the values present: {{code_block('user-guide/expressions/categoricals', 'enum-wrong-value', ['Enum'])}} @@ -39,12 +46,14 @@ Polars will raise an error if you try to specify a data type `Enum` whose catego --8<-- "python/user-guide/expressions/categoricals.py:enum-wrong-value" ``` -If you are in a position where you cannot know all of the possible values in advance and erroring on unknown values is semantically wrong, you may need to [use the data type `Categorical`](#data-type-categorical). +If you are in a position where you cannot know all of the possible values in advance and erroring on +unknown values is semantically wrong, you may need to +[use the data type `Categorical`](#data-type-categorical). ### Category ordering and comparison -The data type `Enum` is ordered and the order is induced by the order in which you specify the categories. -The example below uses log levels as an example of where an ordered `Enum` is useful: +The data type `Enum` is ordered and the order is induced by the order in which you specify the +categories. The example below uses log levels as an example of where an ordered `Enum` is useful: {{code_block('user-guide/expressions/categoricals', 'log-levels', ['Enum'])}} @@ -52,10 +61,12 @@ The example below uses log levels as an example of where an ordered `Enum` is us --8<-- "python/user-guide/expressions/categoricals.py:log-levels" ``` -This example shows that we can compare `Enum` values with a string, but this only works if the string matches one of the `Enum` values. -If we compared the column “level” with any string other than `"debug"`, `"info"`, `"warning"`, or `"error"`, Polars would raise an exception. +This example shows that we can compare `Enum` values with a string, but this only works if the +string matches one of the `Enum` values. If we compared the column “level” with any string other +than `"debug"`, `"info"`, `"warning"`, or `"error"`, Polars would raise an exception. -Columns with the data type `Enum` can also be compared with other columns that have the same data type `Enum` or columns that hold strings, but only if all the strings are valid `Enum` values. +Columns with the data type `Enum` can also be compared with other columns that have the same data +type `Enum` or columns that hold strings, but only if all the strings are valid `Enum` values. ## Data type `Categorical` @@ -63,7 +74,8 @@ The data type `Categorical` can be seen as a more flexible version of `Enum`. 
### Creating a `Categorical` series -To use the data type `Categorical`, you can cast a column of strings or specify `Categorical` as the data type of a series or dataframe column: +To use the data type `Categorical`, you can cast a column of strings or specify `Categorical` as the +data type of a series or dataframe column: {{code_block('user-guide/expressions/categoricals', 'categorical-example', ['Categorical'])}} @@ -71,48 +83,57 @@ To use the data type `Categorical`, you can cast a column of strings or specify --8<-- "python/user-guide/expressions/categoricals.py:categorical-example" ``` -Having Polars infer the categories for you may sound strictly better than listing the categories beforehand, but this inference comes with a performance cost. -That is why, whenever possible, you should use `Enum`. -You can learn more by [reading the subsection about the data type `Categorical` and its encodings](#data-type-categorical-and-encodings). +Having Polars infer the categories for you may sound strictly better than listing the categories +beforehand, but this inference comes with a performance cost. That is why, whenever possible, you +should use `Enum`. You can learn more by +[reading the subsection about the data type `Categorical` and its encodings](#data-type-categorical-and-encodings). ### Lexical comparison with strings When comparing a `Categorical` column with a string, Polars will perform a lexical comparison: -{{code_block('user-guide/expressions/categoricals', 'categorical-comparison-string', ['Categorical'])}} +{{code_block('user-guide/expressions/categoricals', 'categorical-comparison-string', +['Categorical'])}} ```python exec="on" result="text" session="expressions/categoricals" --8<-- "python/user-guide/expressions/categoricals.py:categorical-comparison-string" ``` -You can also compare a column of strings with your `Categorical` column, and the comparison will also be lexical: +You can also compare a column of strings with your `Categorical` column, and the comparison will +also be lexical: -{{code_block('user-guide/expressions/categoricals', 'categorical-comparison-string-column', ['Categorical'])}} +{{code_block('user-guide/expressions/categoricals', 'categorical-comparison-string-column', +['Categorical'])}} ```python exec="on" result="text" session="expressions/categoricals" --8<-- "python/user-guide/expressions/categoricals.py:categorical-comparison-string-column" ``` -Although it is possible to compare a string column with a categorical column, it is typically more efficient to compare two categorical columns. -We will see how to do that next. +Although it is possible to compare a string column with a categorical column, it is typically more +efficient to compare two categorical columns. We will see how to do that next. ### Comparing `Categorical` columns and the string cache -You are told that comparing columns with the data type `Categorical` is more efficient than if one of them is a string column. -So, you change your code so that the second column is also a categorical column and then you perform your comparison... -But Polars raises an exception: +You are told that comparing columns with the data type `Categorical` is more efficient than if one +of them is a string column. So, you change your code so that the second column is also a categorical +column and then you perform your comparison... 
But Polars raises an exception: -{{code_block('user-guide/expressions/categoricals', 'categorical-comparison-categorical-column', ['Categorical'])}} +{{code_block('user-guide/expressions/categoricals', 'categorical-comparison-categorical-column', +['Categorical'])}} ```python exec="on" result="text" session="expressions/categoricals" --8<-- "python/user-guide/expressions/categoricals.py:categorical-comparison-categorical-column" ``` -By default, the values in columns with the data type `Categorical` are [encoded in the order they are seen in the column](#encodings), and independently from other columns, which means that Polars cannot compare efficiently two categorical columns that were created independently. +By default, the values in columns with the data type `Categorical` are +[encoded in the order they are seen in the column](#encodings), and independently from other +columns, which means that Polars cannot compare efficiently two categorical columns that were +created independently. Enabling the Polars string cache and creating the columns with the cache enabled fixes this issue: -{{code_block('user-guide/expressions/categoricals', 'stringcache-categorical-equality', ['StringCache', 'Categorical'])}} +{{code_block('user-guide/expressions/categoricals', 'stringcache-categorical-equality', +['StringCache', 'Categorical'])}} ```python exec="on" result="text" session="expressions/categoricals" --8<-- "python/user-guide/expressions/categoricals.py:stringcache-categorical-equality" @@ -122,25 +143,31 @@ Note that using [the string cache comes at a performance cost](#using-the-global ### Combining `Categorical` columns -The string cache is also useful in any operation that combines or mixes two columns with the data type `Categorical` in any way. -An example of this is when [concatenating two dataframes vertically](../getting-started.md#concatenating-dataframes): +The string cache is also useful in any operation that combines or mixes two columns with the data +type `Categorical` in any way. An example of this is when +[concatenating two dataframes vertically](../getting-started.md#concatenating-dataframes): -{{code_block('user-guide/expressions/categoricals', 'concatenating-categoricals', ['StringCache', 'Categorical'])}} +{{code_block('user-guide/expressions/categoricals', 'concatenating-categoricals', ['StringCache', +'Categorical'])}} ```python exec="on" result="text" session="expressions/categoricals" --8<-- "python/user-guide/expressions/categoricals.py:concatenating-categoricals" ``` -In this case, Polars issues a warning complaining about an expensive reenconding that implies taking a performance hit. -Polars then suggests using the data type `Enum` if possible, or using the string cache. -To understand the issue with this operation and why Polars raises an error, please read the final section about [the performance considerations of using categorical data types](#performance-considerations-on-categorical-data-types). +In this case, Polars issues a warning complaining about an expensive reenconding that implies taking +a performance hit. Polars then suggests using the data type `Enum` if possible, or using the string +cache. To understand the issue with this operation and why Polars raises an error, please read the +final section about +[the performance considerations of using categorical data types](#performance-considerations-on-categorical-data-types). 
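A bare-bones sketch of the string cache pattern described above, using made-up data, might look
like this:

```python
import polars as pl

# Creating both columns under the same string cache gives them a shared encoding,
# so comparing or combining them does not trigger an expensive re-encoding.
with pl.StringCache():
    left = pl.Series("bear", ["Polar", "Panda", "Brown"], dtype=pl.Categorical)
    right = pl.Series("bear", ["Brown", "Panda", "Polar"], dtype=pl.Categorical)
    print(left == right)
    print(pl.concat([left.to_frame(), right.to_frame()]))
```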
### Comparison between `Categorical` columns is not lexical -When comparing two columns with data type `Categorical`, Polars does not perform lexical comparison between the values by default. -If you want lexical ordering, you need to specify so when creating the column: +When comparing two columns with data type `Categorical`, Polars does not perform lexical comparison +between the values by default. If you want lexical ordering, you need to specify so when creating +the column: -{{code_block('user-guide/expressions/categoricals', 'stringcache-categorical-comparison-lexical', ['StringCache', 'Categorical'])}} +{{code_block('user-guide/expressions/categoricals', 'stringcache-categorical-comparison-lexical', +['StringCache', 'Categorical'])}} ```python exec="on" result="text" session="expressions/categoricals" --8<-- "python/user-guide/expressions/categoricals.py:stringcache-categorical-comparison-lexical" @@ -148,7 +175,8 @@ If you want lexical ordering, you need to specify so when creating the column: Otherwise, the order is inferred together with the values: -{{code_block('user-guide/expressions/categoricals', 'stringcache-categorical-comparison-physical', ['StringCache', 'Categorical'])}} +{{code_block('user-guide/expressions/categoricals', 'stringcache-categorical-comparison-physical', +['StringCache', 'Categorical'])}} ```python exec="on" result="text" session="expressions/categoricals" --8<-- "python/user-guide/expressions/categoricals.py:stringcache-categorical-comparison-physical" @@ -163,14 +191,17 @@ This part of the user guide explains ### Encodings -Categorical data represents string data where the values in the column have a finite set of values (usually way smaller than the length of the column). -Storing these values as plain strings is a waste of memory and performance as we will be repeating the same string over and over again. +Categorical data represents string data where the values in the column have a finite set of values +(usually way smaller than the length of the column). Storing these values as plain strings is a +waste of memory and performance as we will be repeating the same string over and over again. Additionally, in operations like joins we have to perform expensive string comparisons. -Categorical data types like `Enum` and `Categorical` let you encode the string values in a cheaper way, establishing a relationship between a cheap encoding value and the original string literal. +Categorical data types like `Enum` and `Categorical` let you encode the string values in a cheaper +way, establishing a relationship between a cheap encoding value and the original string literal. -As an example of a sensible encoding, Polars could choose to represent the finite set of categories as positive integers. -With that in mind, the diagram below shows a regular string column and a possible representation of a Polars column with the categorical data type: +As an example of a sensible encoding, Polars could choose to represent the finite set of categories +as positive integers. With that in mind, the diagram below shows a regular string column and a +possible representation of a Polars column with the categorical data type: @@ -269,26 +300,27 @@ With that in mind, the diagram below shows a regular string column and a possibl
String Column Categorical Column
-The physical `0` in this case encodes (or maps) to the value 'Polar', the value `1` encodes to 'Panda', and the value `2` to 'Brown'. -This encoding has the benefit of only storing the string values once. -Additionally, when we perform operations (e.g. sorting, counting) we can work directly on the physical representation which is much faster than the working with string data. +The physical `0` in this case encodes (or maps) to the value 'Polar', the value `1` encodes to +'Panda', and the value `2` to 'Brown'. This encoding has the benefit of only storing the string +values once. Additionally, when we perform operations (e.g. sorting, counting) we can work directly +on the physical representation which is much faster than the working with string data. ### Encodings for the data type `Enum` are global -When working with the data type `Enum` we specify the categories in advance. -This way, Polars can ensure different columns and even different datasets have the same encoding and there is no need for expensive re-encoding or cache lookups. +When working with the data type `Enum` we specify the categories in advance. This way, Polars can +ensure different columns and even different datasets have the same encoding and there is no need for +expensive re-encoding or cache lookups. ### Data type `Categorical` and encodings -The fact that the categories for the data type `Categorical` are inferred come at a cost. -The main cost here is that we have no control over our encodings. +The fact that the categories for the data type `Categorical` are inferred come at a cost. The main +cost here is that we have no control over our encodings. Consider the following scenario where we append the following two categorical series: {{code_block('user-guide/concepts/data-types/categoricals','append',[])}} -Polars encodes the string values in the order they appear. -So, the series would look like this: +Polars encodes the string values in the order they appear. So, the series would look like this: @@ -404,13 +436,15 @@ So, the series would look like this:
cat_series cat2_series
-Combining the series becomes a non-trivial task which is expensive as the physical value of `0` represents something different in both series. -Polars does support these types of operations for convenience, however these should be avoided due to its slower performance as it requires making both encodings compatible first before doing any merge operations. +Combining the series becomes a non-trivial task which is expensive as the physical value of `0` +represents something different in both series. Polars does support these types of operations for +convenience, however these should be avoided due to its slower performance as it requires making +both encodings compatible first before doing any merge operations. ### Using the global string cache -One way to handle this reencoding problem is to enable the string cache. -Under the string cache, the diagram would instead look like this: +One way to handle this reencoding problem is to enable the string cache. Under the string cache, the +diagram would instead look like this: @@ -496,10 +530,12 @@ Under the string cache, the diagram would instead look like this:
Series String cache
-When you enable the string cache, strings are no longer encoded in the order they appear on a per-column basis. -Instead, the encoding is shared across columns. -The value 'Polar' will always be encoded by the same value for all categorical columns created under the string cache. -Merge operations (e.g. appends, joins) become cheap again as there is no need to make the encodings compatible first, solving the problem we had above. +When you enable the string cache, strings are no longer encoded in the order they appear on a +per-column basis. Instead, the encoding is shared across columns. The value 'Polar' will always be +encoded by the same value for all categorical columns created under the string cache. Merge +operations (e.g. appends, joins) become cheap again as there is no need to make the encodings +compatible first, solving the problem we had above. -However, the string cache does come at a small performance hit during construction of the series as we need to look up or insert the string values in the cache. -Therefore, it is preferred to use the data type `Enum` if you know your categories in advance. +However, the string cache does come at a small performance hit during construction of the series as +we need to look up or insert the string values in the cache. Therefore, it is preferred to use the +data type `Enum` if you know your categories in advance. diff --git a/docs/source/user-guide/expressions/expression-expansion.md b/docs/source/user-guide/expressions/expression-expansion.md index c40c1ddeeafd..23c1bdb3fb25 100644 --- a/docs/source/user-guide/expressions/expression-expansion.md +++ b/docs/source/user-guide/expressions/expression-expansion.md @@ -1,12 +1,18 @@ # Expression expansion -As you've seen in [the section about expressions and contexts](../concepts/expressions-and-contexts.md), expression expansion is a feature that enables you to write a single expression that can expand to multiple different expressions, possibly depending on the schema of the context in which the expression is used. +As you've seen in +[the section about expressions and contexts](../concepts/expressions-and-contexts.md), expression +expansion is a feature that enables you to write a single expression that can expand to multiple +different expressions, possibly depending on the schema of the context in which the expression is +used. -This feature isn't just decorative or syntactic sugar. -It allows for a very powerful application of [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) principles in your code: -a single expression that specifies multiple columns expands into a list of expressions, which means you can write one single expression and reuse the computation that it represents. +This feature isn't just decorative or syntactic sugar. It allows for a very powerful application of +[DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) principles in your code: a single +expression that specifies multiple columns expands into a list of expressions, which means you can +write one single expression and reuse the computation that it represents. 
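As a tiny illustration of the idea, with invented column names, the single expression below expands
into two expressions, one per listed column:

```python
import polars as pl

df = pl.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0], "label": ["x", "y"]})

# One expression expands to two: a * 10 and b * 10, each given a new suffix.
print(df.with_columns((pl.col("a", "b") * 10).name.suffix("_scaled")))
```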
-In this section we will show several forms of expression expansion and we will be using the dataframe that you can see below for that effect: +In this section we will show several forms of expression expansion and we will be using the +dataframe that you can see below for that effect: {{code_block('user-guide/expressions/expression-expansion', 'df', [])}} @@ -17,13 +23,16 @@ In this section we will show several forms of expression expansion and we will b ## Function `col` The function `col` is the most common way of making use of expression expansion features in Polars. -Typically used to refer to one column of a dataframe, in this section we explore other ways in which you can use `col` (or its variants, when in Rust). +Typically used to refer to one column of a dataframe, in this section we explore other ways in which +you can use `col` (or its variants, when in Rust). ### Explicit expansion by column name -The simplest form of expression expansion happens when you provide multiple column names to the function `col`. +The simplest form of expression expansion happens when you provide multiple column names to the +function `col`. -The example below uses a single function `col` with multiple column names to convert the values in USD to EUR: +The example below uses a single function `col` with multiple column names to convert the values in +USD to EUR: {{code_block('user-guide/expressions/expression-expansion', 'col-with-names', ['col'])}} @@ -31,8 +40,9 @@ The example below uses a single function `col` with multiple column names to con --8<-- "python/user-guide/expressions/expression-expansion.py:col-with-names" ``` -When you list the column names you want the expression to expand to, you can predict what the expression will expand to. -In this case, the expression that does the currency conversion is expanded to a list of five expressions: +When you list the column names you want the expression to expand to, you can predict what the +expression will expand to. In this case, the expression that does the currency conversion is +expanded to a list of five expressions: {{code_block('user-guide/expressions/expression-expansion', 'expression-list', ['col'])}} @@ -42,23 +52,28 @@ In this case, the expression that does the currency conversion is expanded to a ### Expansion by data type -We had to type five column names in the previous example but the function `col` can also conveniently accept one or more data types. -If you provide data types instead of column names, the expression is expanded to all columns that match one of the data types provided. +We had to type five column names in the previous example but the function `col` can also +conveniently accept one or more data types. If you provide data types instead of column names, the +expression is expanded to all columns that match one of the data types provided. The example below performs the exact same computation as before: -{{code_block('user-guide/expressions/expression-expansion', 'col-with-dtype', [], ['col'], ['dtype_col'])}} +{{code_block('user-guide/expressions/expression-expansion', 'col-with-dtype', [], ['col'], +['dtype_col'])}} ```python exec="on" result="text" session="expressions/expression-expansion" --8<-- "python/user-guide/expressions/expression-expansion.py:col-with-dtype" ``` -When we use a data type with expression expansion we cannot know, beforehand, how many columns a single expression will expand to. 
-We need the schema of the input dataframe if we want to determine what is the final list of expressions that is to be applied. +When we use a data type with expression expansion we cannot know, beforehand, how many columns a +single expression will expand to. We need the schema of the input dataframe if we want to determine +what is the final list of expressions that is to be applied. -If we weren't sure about whether the price columns where of the type `Float64` or `Float32`, we could specify both data types: +If we weren't sure about whether the price columns where of the type `Float64` or `Float32`, we +could specify both data types: -{{code_block('user-guide/expressions/expression-expansion', 'col-with-dtypes', [], ['col'], ['dtype_cols'])}} +{{code_block('user-guide/expressions/expression-expansion', 'col-with-dtypes', [], ['col'], +['dtype_cols'])}} ```python exec="on" result="text" session="expressions/expression-expansion" --8<-- "python/user-guide/expressions/expression-expansion.py:col-with-dtypes" @@ -66,9 +81,10 @@ If we weren't sure about whether the price columns where of the type `Float64` o ### Expansion by pattern matching -You can also use regular expressions to specify patterns that are used to match the column names. -To distinguish between a regular column name and expansion by pattern matching, regular expressions start and end with `^` and `$`, respectively. -This also means that the pattern must match against the whole column name string. +You can also use regular expressions to specify patterns that are used to match the column names. To +distinguish between a regular column name and expansion by pattern matching, regular expressions +start and end with `^` and `$`, respectively. This also means that the pattern must match against +the whole column name string. Regular expressions can be mixed with regular column names: @@ -80,7 +96,10 @@ Regular expressions can be mixed with regular column names: ### Arguments cannot be of mixed types -In Python, the function `col` accepts an arbitrary number of strings (as [column names](#explicit-expansion-by-column-name) or as [regular expressions](#expansion-by-pattern-matching)) or an arbitrary number of data types, but you cannot mix both in the same function call: +In Python, the function `col` accepts an arbitrary number of strings (as +[column names](#explicit-expansion-by-column-name) or as +[regular expressions](#expansion-by-pattern-matching)) or an arbitrary number of data types, but you +cannot mix both in the same function call: ```python --8<-- "python/user-guide/expressions/expression-expansion.py:col-error" @@ -106,8 +125,8 @@ Polars provides the function `all` as shorthand notation to refer to all columns ## Excluding columns -Polars also provides a mechanism to exclude certain columns from expression expansion. -For that, you use the function `exclude`, which accepts exactly the same types of arguments as `col`: +Polars also provides a mechanism to exclude certain columns from expression expansion. For that, you +use the function `exclude`, which accepts exactly the same types of arguments as `col`: {{code_block('user-guide/expressions/expression-expansion', 'all-exclude', ['exclude'])}} @@ -125,9 +144,11 @@ Naturally, the function `exclude` can also be used after the function `col`: ## Column renaming -By default, when you apply an expression to a column, the result keeps the same name as the original column. 
+By default, when you apply an expression to a column, the result keeps the same name as the original +column. -Preserving the column name can be semantically wrong and in certain cases Polars may even raise an error if duplicate names occur: +Preserving the column name can be semantically wrong and in certain cases Polars may even raise an +error if duplicate names occur: {{code_block('user-guide/expressions/expression-expansion', 'duplicate-error', [])}} @@ -135,11 +156,13 @@ Preserving the column name can be semantically wrong and in certain cases Polars --8<-- "python/user-guide/expressions/expression-expansion.py:duplicate-error" ``` -To prevent errors like this, and to allow users to rename their columns when appropriate, Polars provides a series of functions that let you change the name of a column or a group of columns. +To prevent errors like this, and to allow users to rename their columns when appropriate, Polars +provides a series of functions that let you change the name of a column or a group of columns. ### Renaming a single column with `alias` -The function `alias` has been used thoroughly in the documentation already and it lets you rename a single column: +The function `alias` has been used thoroughly in the documentation already and it lets you rename a +single column: {{code_block('user-guide/expressions/expression-expansion', 'alias', ['alias'])}} @@ -149,11 +172,14 @@ The function `alias` has been used thoroughly in the documentation already and i ### Prefixing and suffixing column names -When using expression expansion you cannot use the function `alias` because the function `alias` is designed specifically to rename a single column. +When using expression expansion you cannot use the function `alias` because the function `alias` is +designed specifically to rename a single column. -When it suffices to add a static prefix or a static suffix to the existing names, we can use the functions `prefix` and `suffix` from the namespace `name`: +When it suffices to add a static prefix or a static suffix to the existing names, we can use the +functions `prefix` and `suffix` from the namespace `name`: -{{code_block('user-guide/expressions/expression-expansion', 'prefix-suffix', ['Expr.name', 'prefix', 'suffix'])}} +{{code_block('user-guide/expressions/expression-expansion', 'prefix-suffix', ['Expr.name', 'prefix', +'suffix'])}} ```python exec="on" result="text" session="expressions/expression-expansion" --8<-- "python/user-guide/expressions/expression-expansion.py:prefix-suffix" @@ -161,7 +187,8 @@ When it suffices to add a static prefix or a static suffix to the existing names ### Dynamic name replacement -If a static prefix/suffix is not enough, the namespace `name` also provides the function `map` that accepts a callable that accepts the old column names and produces the new ones: +If a static prefix/suffix is not enough, the namespace `name` also provides the function `map` that +accepts a callable that accepts the old column names and produces the new ones: {{code_block('user-guide/expressions/expression-expansion', 'name-map', ['Expr.name', 'map'])}} @@ -173,8 +200,9 @@ See the API reference for the full contents of the namespace `name`. ## Programmatically generating expressions -Expression expansion is a very useful feature but it does not solve all of your problems. -For example, if we want to compute the day and year amplitude of the prices of the stocks in our dataframe, expression expansion won't help us. 
+Expression expansion is a very useful feature but it does not solve all of your problems. For +example, if we want to compute the day and year amplitude of the prices of the stocks in our +dataframe, expression expansion won't help us. At first, you may think about using a `for` loop: @@ -184,10 +212,9 @@ At first, you may think about using a `for` loop: --8<-- "python/user-guide/expressions/expression-expansion.py:for-with_columns" ``` -Do not do this. -Instead, generate all of the expressions you want to compute programmatically and use them only once in a context. -Loosely speaking, you want to swap the `for` loop with the context `with_columns`. -In practice, you could do something like the following: +Do not do this. Instead, generate all of the expressions you want to compute programmatically and +use them only once in a context. Loosely speaking, you want to swap the `for` loop with the context +`with_columns`. In practice, you could do something like the following: {{code_block('user-guide/expressions/expression-expansion', 'yield-expressions', [])}} @@ -195,20 +222,24 @@ In practice, you could do something like the following: --8<-- "python/user-guide/expressions/expression-expansion.py:yield-expressions" ``` -This produces the same final result and by specifying all of the expressions in one go we give Polars the opportunity to: +This produces the same final result and by specifying all of the expressions in one go we give +Polars the opportunity to: 1. do a better job at optimising the query; and 2. parallelise the execution of the actual computations. ## More flexible column selections -Polars comes with the submodule `selectors` that provides a number of functions that allow you to write more flexible column selections for expression expansion. +Polars comes with the submodule `selectors` that provides a number of functions that allow you to +write more flexible column selections for expression expansion. !!! warning This functionality is not available in Rust yet. Refer to [Polars issue #10594](https://github.com/pola-rs/polars/issues/10594). -As a first example, here is how we can use the functions `string` and `ends_with`, and the set operations that the functions from `selectors` support, to select all string columns and the columns whose names end with `"_high"`: +As a first example, here is how we can use the functions `string` and `ends_with`, and the set +operations that the functions from `selectors` support, to select all string columns and the columns +whose names end with `"_high"`: {{code_block('user-guide/expressions/expression-expansion', 'selectors', [], ['selectors'], [])}} @@ -216,9 +247,15 @@ As a first example, here is how we can use the functions `string` and `ends_with --8<-- "python/user-guide/expressions/expression-expansion.py:selectors" ``` -The submodule `selectors` provides [a number of selectors that match based on the data type of the columns](#selectors-for-data-types), of which the most useful are the functions that match a whole category of types, like `cs.numeric` for all numeric data types or `cs.temporal` for all temporal data types. +The submodule `selectors` provides +[a number of selectors that match based on the data type of the columns](#selectors-for-data-types), +of which the most useful are the functions that match a whole category of types, like `cs.numeric` +for all numeric data types or `cs.temporal` for all temporal data types. 
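For instance, a quick sketch with invented column names (and `import polars.selectors as cs`) shows
a set operation between selectors, plus ordinary expression methods chained on a selection:

```python
import polars as pl
import polars.selectors as cs

df = pl.DataFrame(
    {
        "ticker": ["A", "B"],
        "price_high": [12.5, 30.1],
        "price_low": [11.0, 28.4],
        "volume": [100, 250],
    }
)

# Set difference: every numeric column except the ones whose name ends with "_high".
print(df.select(cs.numeric() - cs.ends_with("_high")))

# Convert the selector to an expression and chain ordinary arithmetic on the selection.
print(df.select(cs.starts_with("price_").as_expr() * 1.1))
```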
-The submodule `selectors` also provides [a number of selectors that match based on patterns in the column names](#selectors-for-column-name-patterns) which make it more convenient to specify common patterns you may want to check for, like the function `cs.ends_with` that was shown above. +The submodule `selectors` also provides +[a number of selectors that match based on patterns in the column names](#selectors-for-column-name-patterns) +which make it more convenient to specify common patterns you may want to check for, like the +function `cs.ends_with` that was shown above. ### Combining selectors with set operations @@ -237,7 +274,8 @@ We can combine multiple selectors using set operations and the usual Python oper The next example matches all non-string columns that contain an underscore in the name: -{{code_block('user-guide/expressions/expression-expansion', 'selectors-set-operations', [], ['selectors'], [])}} +{{code_block('user-guide/expressions/expression-expansion', 'selectors-set-operations', [], +['selectors'], [])}} ```python exec="on" result="text" session="expressions/expression-expansion" --8<-- "python/user-guide/expressions/expression-expansion.py:selectors-set-operations" @@ -247,19 +285,25 @@ The next example matches all non-string columns that contain an underscore in th Expression functions can be chained on top of selectors: -{{code_block('user-guide/expressions/expression-expansion', 'selectors-expressions', [], ['selectors'], [])}} +{{code_block('user-guide/expressions/expression-expansion', 'selectors-expressions', [], +['selectors'], [])}} ```python exec="on" result="text" session="expressions/expression-expansion" --8<-- "python/user-guide/expressions/expression-expansion.py:selectors-expressions" ``` However, some operators have been overloaded to operate both on Polars selectors and on expressions. -For example, the operator `~` on a selector represents [the set operation “complement”](#combining-selectors-with-set-operations) and on an expression represents the Boolean operation of negation. +For example, the operator `~` on a selector represents +[the set operation “complement”](#combining-selectors-with-set-operations) and on an expression +represents the Boolean operation of negation. -When you use a selector and then want to use, in the context of an expression, one of the [operators that act as set operators for selectors](#combining-selectors-with-set-operations), you can use the function `as_expr`. +When you use a selector and then want to use, in the context of an expression, one of the +[operators that act as set operators for selectors](#combining-selectors-with-set-operations), you +can use the function `as_expr`. -Below, we want to negate the Boolean values in the columns “has_partner”, “has_kids”, and “has_tattoos”. -If we are not careful, the combination of the operator `~` and the selector `cs.starts_with("has_")` will actually select the columns that we do not care about: +Below, we want to negate the Boolean values in the columns “has_partner”, “has_kids”, and +“has_tattoos”. 
If we are not careful, the combination of the operator `~` and the selector +`cs.starts_with("has_")` will actually select the columns that we do not care about: {{code_block('user-guide/expressions/expression-expansion', 'selector-ambiguity', [], [], [])}} @@ -277,20 +321,24 @@ The correct solution uses `as_expr`: ### Debugging selectors -When you are not sure whether you have a Polars selector at hand or not, you can use the function `cs.is_selector` to check: +When you are not sure whether you have a Polars selector at hand or not, you can use the function +`cs.is_selector` to check: -{{code_block('user-guide/expressions/expression-expansion', 'is_selector', [], ['is_selector'], [])}} +{{code_block('user-guide/expressions/expression-expansion', 'is_selector', [], ['is_selector'], +[])}} ```python exec="on" result="text" session="expressions/expression-expansion" --8<-- "python/user-guide/expressions/expression-expansion.py:is_selector" ``` -This should help you avoid any ambiguous situations where you think you are operating with expressions but are in fact operating with selectors. +This should help you avoid any ambiguous situations where you think you are operating with +expressions but are in fact operating with selectors. -Another helpful debugging utility is the function `expand_selector`. -Given a target frame or schema, you can check what columns a given selector will expand to: +Another helpful debugging utility is the function `expand_selector`. Given a target frame or schema, +you can check what columns a given selector will expand to: -{{code_block('user-guide/expressions/expression-expansion', 'expand_selector', [], ['expand_selector'], [])}} +{{code_block('user-guide/expressions/expression-expansion', 'expand_selector', [], +['expand_selector'], [])}} ```python exec="on" result="text" session="expressions/expression-expansion" --8<-- "python/user-guide/expressions/expression-expansion.py:expand_selector" @@ -298,7 +346,8 @@ Given a target frame or schema, you can check what columns a given selector will ### Complete reference -The tables below group the functions available in the submodule `selectors` by their type of behaviour. +The tables below group the functions available in the submodule `selectors` by their type of +behaviour. #### Selectors for data types @@ -360,4 +409,5 @@ The submodule `selectors` also provides the following functions: | `expand_selector` | Expand selector to matching columns with respect to a specific frame or target schema | | `is_selector` | Check whether the given object/expression is a selector | -*`as_expr` isn't a function defined on the submodule `selectors`, but rather a method defined on selectors. +*`as_expr` isn't a function defined on the submodule `selectors`, but rather a method defined on +selectors. diff --git a/docs/source/user-guide/expressions/folds.md b/docs/source/user-guide/expressions/folds.md index 6fb8d56072c4..d6d7647401f1 100644 --- a/docs/source/user-guide/expressions/folds.md +++ b/docs/source/user-guide/expressions/folds.md @@ -1,10 +1,12 @@ # Folds -Polars provides many expressions to perform computations across columns, like `sum_horizontal`, `mean_horizontal`, and `min_horizontal`. -However, these are just special cases of a general algorithm called a fold, and Polars provides a general mechanism for you to compute custom folds for when the specialised versions of Polars are not enough. 
+Polars provides many expressions to perform computations across columns, like `sum_horizontal`, +`mean_horizontal`, and `min_horizontal`. However, these are just special cases of a general +algorithm called a fold, and Polars provides a general mechanism for you to compute custom folds for +when the specialised versions of Polars are not enough. -Folds computed with the function `fold` operate on the full columns for maximum speed. -They utilize the data layout very efficiently and often have vectorized execution. +Folds computed with the function `fold` operate on the full columns for maximum speed. They utilize +the data layout very efficiently and often have vectorized execution. ## Basic example @@ -16,11 +18,13 @@ As a first example, we will reimplement `sum_horizontal` with the function `fold --8<-- "python/user-guide/expressions/folds.py:mansum" ``` -The function `fold` expects a function `f` as the parameter `function` and `f` should accept two arguments. -The first argument is the accumulated result, which we initialise as zero, and the second argument takes the successive values of the expressions listed in the parameter `exprs`. -In our case, they're the two columns “a” and “b”. +The function `fold` expects a function `f` as the parameter `function` and `f` should accept two +arguments. The first argument is the accumulated result, which we initialise as zero, and the second +argument takes the successive values of the expressions listed in the parameter `exprs`. In our +case, they're the two columns “a” and “b”. -The snippet below includes a third explicit expression that represents what the function `fold` is doing above: +The snippet below includes a third explicit expression that represents what the function `fold` is +doing above: {{code_block('user-guide/expressions/folds','mansum-explicit',['fold'])}} @@ -36,8 +40,10 @@ The snippet below includes a third explicit expression that represents what the ## The initial value `acc` -The initial value chosen for the accumulator `acc` is typically, but not always, the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation you want to apply. -For example, if we wanted to multiply across the columns, we would not get the correct result if our accumulator was set to zero: +The initial value chosen for the accumulator `acc` is typically, but not always, the +[identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation you want to +apply. For example, if we wanted to multiply across the columns, we would not get the correct result +if our accumulator was set to zero: {{code_block('user-guide/expressions/folds','manprod',['fold'])}} @@ -55,7 +61,8 @@ To fix this, the accumulator `acc` should be set to `1`: ## Conditional -In the case where you'd want to apply a condition/predicate across all columns in a dataframe, a fold can be a very concise way to express this. +In the case where you'd want to apply a condition/predicate across all columns in a dataframe, a +fold can be a very concise way to express this. {{code_block('user-guide/expressions/folds','conditional',['fold'])}} @@ -67,8 +74,8 @@ The snippet above filters all rows where all columns are greater than 1. ## Folds and string data -Folds could be used to concatenate string data. -However, due to the materialization of intermediate columns, this operation will have squared complexity. +Folds could be used to concatenate string data. 
However, due to the materialization of intermediate +columns, this operation will have squared complexity. Therefore, we recommend using the function `concat_str` for this: diff --git a/docs/source/user-guide/expressions/index.md b/docs/source/user-guide/expressions/index.md index b4442d6f4289..2d07da0a98be 100644 --- a/docs/source/user-guide/expressions/index.md +++ b/docs/source/user-guide/expressions/index.md @@ -1,8 +1,9 @@ # Expressions -We [introduced the concept of “expressions” in a previous section](../concepts/expressions-and-contexts.md#expressions). -In this section we will focus on exploring the types of expressions that Polars offers. -Each section gives an overview of what they do and provides additional examples. +We +[introduced the concept of “expressions” in a previous section](../concepts/expressions-and-contexts.md#expressions). +In this section we will focus on exploring the types of expressions that Polars offers. Each section +gives an overview of what they do and provides additional examples. - Essentials: diff --git a/docs/source/user-guide/expressions/lists-and-arrays.md b/docs/source/user-guide/expressions/lists-and-arrays.md index 6ba5fd319421..bb95e3446421 100644 --- a/docs/source/user-guide/expressions/lists-and-arrays.md +++ b/docs/source/user-guide/expressions/lists-and-arrays.md @@ -1,13 +1,15 @@ # Lists and arrays -Polars has first-class support for two homogeneous container data types: `List` and `Array`. -Polars supports many operations with the two data types and their APIs overlap, so this section of the user guide has the objective of clarifying when one data type should be chosen in favour of the other. +Polars has first-class support for two homogeneous container data types: `List` and `Array`. Polars +supports many operations with the two data types and their APIs overlap, so this section of the user +guide has the objective of clarifying when one data type should be chosen in favour of the other. ## Lists vs arrays ### The data type `List` -The data type list is suitable for columns whose values are homogeneous 1D containers of varying lengths. +The data type list is suitable for columns whose values are homogeneous 1D containers of varying +lengths. The dataframe below contains three examples of columns with the data type `List`: @@ -17,12 +19,14 @@ The dataframe below contains three examples of columns with the data type `List` --8<-- "python/user-guide/expressions/lists.py:list-example" ``` -Note that the data type `List` is different from Python's type `list`, where elements can be of any type. -If you want to store true Python lists in a column, you can do so with the data type `Object` and your column will not have the list manipulation features that we're about to discuss. +Note that the data type `List` is different from Python's type `list`, where elements can be of any +type. If you want to store true Python lists in a column, you can do so with the data type `Object` +and your column will not have the list manipulation features that we're about to discuss. ### The data type `Array` -The data type `Array` is suitable for columns whose values are homogeneous containers of an arbitrary dimension with a known and fixed shape. +The data type `Array` is suitable for columns whose values are homogeneous containers of an +arbitrary dimension with a known and fixed shape. The dataframe below contains two examples of columns with the data type `Array`. 
@@ -32,11 +36,15 @@ The dataframe below contains two examples of columns with the data type `Array`. --8<-- "python/user-guide/expressions/lists.py:array-example" ``` -The example above shows how to specify that the columns “bit_flags” and “tic_tac_toe” have the data type `Array`, parametrised by the data type of the elements contained within and by the shape of each array. +The example above shows how to specify that the columns “bit_flags” and “tic_tac_toe” have the data +type `Array`, parametrised by the data type of the elements contained within and by the shape of +each array. -In general, Polars does not infer that a column has the data type `Array` for performance reasons, and defaults to the appropriate variant of the data type `List`. -In Python, an exception to this rule is when you provide a NumPy array to build a column. -In that case, Polars has the guarantee from NumPy that all subarrays have the same shape, so an array of $n + 1$ dimensions will generate a column of $n$ dimensional arrays: +In general, Polars does not infer that a column has the data type `Array` for performance reasons, +and defaults to the appropriate variant of the data type `List`. In Python, an exception to this +rule is when you provide a NumPy array to build a column. In that case, Polars has the guarantee +from NumPy that all subarrays have the same shape, so an array of $n + 1$ dimensions will generate a +column of $n$ dimensional arrays: {{code_block('user-guide/expressions/lists', 'numpy-array-inference', ['Array'])}} @@ -46,8 +54,8 @@ In that case, Polars has the guarantee from NumPy that all subarrays have the sa ### When to use each -In short, prefer the data type `Array` over `List` because it is more memory efficient and more performant. -If you cannot use `Array`, then use `List`: +In short, prefer the data type `Array` over `List` because it is more memory efficient and more +performant. If you cannot use `Array`, then use `List`: - when the values within a column do not have a fixed shape; or - when you need functions that are only available in the list API. @@ -56,8 +64,8 @@ If you cannot use `Array`, then use `List`: ### The namespace `list` -Polars provides many functions to work with values of the data type `List` and these are grouped inside the namespace `list`. -We will explore this namespace a bit now. +Polars provides many functions to work with values of the data type `List` and these are grouped +inside the namespace `list`. We will explore this namespace a bit now. !!! warning "`arr` then, `list` now" @@ -66,7 +74,8 @@ We will explore this namespace a bit now. If you find references to the namespace `arr` on StackOverflow or other sources, note that those sources _may_ be outdated. The dataframe `weather` defined below contains data from different weather stations across a region. -When the weather station is unable to get a result, an error code is recorded instead of the actual temperature at that time. +When the weather station is unable to get a result, an error code is recorded instead of the actual +temperature at that time. {{code_block('user-guide/expressions/lists', 'weather', [])}} @@ -76,9 +85,10 @@ When the weather station is unable to get a result, an error code is recorded in ### Programmatically creating lists -Given the dataframe `weather` defined previously, it is very likely we need to run some analysis on the temperatures that are captured by each station. -To make this happen, we need to first be able to get individual temperature measurements. 
-We [can use the namespace `str`](strings.md#the-string-namespace) for this: +Given the dataframe `weather` defined previously, it is very likely we need to run some analysis on +the temperatures that are captured by each station. To make this happen, we need to first be able to +get individual temperature measurements. We +[can use the namespace `str`](strings.md#the-string-namespace) for this: {{code_block('user-guide/expressions/lists', 'split', ['str.split'])}} @@ -86,7 +96,8 @@ We [can use the namespace `str`](strings.md#the-string-namespace) for this: --8<-- "python/user-guide/expressions/lists.py:split" ``` -A natural follow-up would be to explode the list of temperatures so that each measurement is in its own row: +A natural follow-up would be to explode the list of temperatures so that each measurement is in its +own row: {{code_block('user-guide/expressions/lists', 'explode', ['explode'])}} @@ -99,7 +110,8 @@ However, in Polars we often do not need to do this to operate on the list elemen ### Operating on lists Polars provides several standard operations on columns with the `List` data type. -[Similar to what you can do with strings](strings.md#slicing), lists can be sliced with the functions `head`, `tail`, and `slice`: +[Similar to what you can do with strings](strings.md#slicing), lists can be sliced with the +functions `head`, `tail`, and `slice`: {{code_block('user-guide/expressions/lists', 'list-slicing', ['Expr.list'])}} @@ -115,9 +127,10 @@ If we need to identify the stations that are giving the most number of errors we 2. count the number of non-numeric values (i.e., `null` values) in the list, by row; and 3. rename this output column as “errors” so that we can easily identify the stations. -To perform these steps, we need to perform a casting operation on each measurement within the list values. -The function `eval` is used as the entry point to perform operations on the elements of the list. -Within it, you can use the context `element` to refer to each single element of the list individually, and then you can use any Polars expression on the element: +To perform these steps, we need to perform a casting operation on each measurement within the list +values. The function `eval` is used as the entry point to perform operations on the elements of the +list. Within it, you can use the context `element` to refer to each single element of the list +individually, and then you can use any Polars expression on the element: {{code_block('user-guide/expressions/lists', 'element-wise-casting', ['element'])}} @@ -125,7 +138,8 @@ Within it, you can use the context `element` to refer to each single element of --8<-- "python/user-guide/expressions/lists.py:element-wise-casting" ``` -Another alternative would be to use a regular expression to check if a measurement starts with a letter: +Another alternative would be to use a regular expression to check if a measurement starts with a +letter: {{code_block('user-guide/expressions/lists', 'element-wise-regex', ['element'])}} @@ -133,11 +147,14 @@ Another alternative would be to use a regular expression to check if a measureme --8<-- "python/user-guide/expressions/lists.py:element-wise-regex" ``` -If you are unfamiliar with the namespace `str` or the notation `(?i)` in the regex, now is a good time to [look at how to work with strings and regular expressions in Polars](strings.md#check-for-the-existence-of-a-pattern). 
+If you are unfamiliar with the namespace `str` or the notation `(?i)` in the regex, now is a good +time to +[look at how to work with strings and regular expressions in Polars](strings.md#check-for-the-existence-of-a-pattern). ### Row-wise computations -The function `eval` gives us access to the list elements and `pl.element` refers to each individual element, but we can also use `pl.all()` to refer to all of the elements of the list. +The function `eval` gives us access to the list elements and `pl.element` refers to each individual +element, but we can also use `pl.all()` to refer to all of the elements of the list. To show this in action, we will start by creating another dataframe with some more weather data: @@ -148,8 +165,8 @@ To show this in action, we will start by creating another dataframe with some mo ``` Now, we will calculate the percentage rank of the temperatures by day, measured across stations. -Polars does not provide a function to do this directly, but because expressions are so versatile we can create our own percentage rank expression for highest temperature. -Let's try that: +Polars does not provide a function to do this directly, but because expressions are so versatile we +can create our own percentage rank expression for highest temperature. Let's try that: {{code_block('user-guide/expressions/lists', 'rank_pct', ['element', 'rank'])}} @@ -161,13 +178,15 @@ Let's try that: ### Creating an array column -As [we have seen above](#the-data-type-array), Polars usually does not infer the data type `Array` automatically. -You have to specify the data type `Array` when creating a series/dataframe or [cast a column](casting.md) explicitly unless you create the column out of a NumPy array. +As [we have seen above](#the-data-type-array), Polars usually does not infer the data type `Array` +automatically. You have to specify the data type `Array` when creating a series/dataframe or +[cast a column](casting.md) explicitly unless you create the column out of a NumPy array. ### The namespace `arr` -The data type `Array` was recently introduced and is still pretty nascent in features that it offers. -Even so, the namespace `arr` aggregates several functions that you can use to work with arrays. +The data type `Array` was recently introduced and is still pretty nascent in features that it +offers. Even so, the namespace `arr` aggregates several functions that you can use to work with +arrays. !!! warning "`arr` then, `list` now" @@ -175,7 +194,8 @@ Even so, the namespace `arr` aggregates several functions that you can use to wo `arr` is now the namespace for the data type `Array`. If you find references to the namespace `arr` on StackOverflow or other sources, note that those sources _may_ be outdated. -The API documentation should give you a good overview of the functions in the namespace `arr`, of which we present a couple: +The API documentation should give you a good overview of the functions in the namespace `arr`, of +which we present a couple: {{code_block('user-guide/expressions/lists', 'array-overview', ['Expr.arr'])}} diff --git a/docs/source/user-guide/expressions/missing-data.md b/docs/source/user-guide/expressions/missing-data.md index f1697cced489..98e1bee91fff 100644 --- a/docs/source/user-guide/expressions/missing-data.md +++ b/docs/source/user-guide/expressions/missing-data.md @@ -4,14 +4,15 @@ This section of the user guide teaches how to work with missing data in Polars. ## `null` and `NaN` values -In Polars, missing data is represented by the value `null`. 
-This missing value `null` is used for all data types, including numerical types. +In Polars, missing data is represented by the value `null`. This missing value `null` is used for +all data types, including numerical types. -Polars also supports the value `NaN` (“Not a Number”) for columns with floating point numbers. -The value `NaN` is considered to be a valid floating point value, which is different from missing data. +Polars also supports the value `NaN` (“Not a Number”) for columns with floating point numbers. The +value `NaN` is considered to be a valid floating point value, which is different from missing data. [We discuss the value `NaN` separately below](#not-a-number-or-nan-values). -When creating a series or a dataframe, you can set a value to `null` by using the appropriate construct for your language: +When creating a series or a dataframe, you can set a value to `null` by using the appropriate +construct for your language: {{code_block('user-guide/expressions/missing-data','dataframe',['DataFrame'])}} @@ -26,8 +27,9 @@ When creating a series or a dataframe, you can set a value to `null` by using th ## Missing data metadata -Polars keeps track of some metadata regarding the missing data of each series. -This metadata allows Polars to answer some basic queries about missing values in a very efficient way, namely how many values are missing and which ones are missing. +Polars keeps track of some metadata regarding the missing data of each series. This metadata allows +Polars to answer some basic queries about missing values in a very efficient way, namely how many +values are missing and which ones are missing. To determine how many values are missing from a column you can use the function `null_count`: @@ -37,13 +39,13 @@ To determine how many values are missing from a column you can use the function --8<-- "python/user-guide/expressions/missing-data.py:count" ``` -The function `null_count` can be called on a dataframe, a column from a dataframe, or on a series directly. -The function `null_count` is a cheap operation because the result is already known. +The function `null_count` can be called on a dataframe, a column from a dataframe, or on a series +directly. The function `null_count` is a cheap operation because the result is already known. -Polars uses something called a “validity bitmap” to know which values are missing in a series. -The validity bitmap is memory efficient as it is bit encoded. -If a series has length $n$, then its validity bitmap will cost $n / 8$ bytes. -The function `is_null` uses the validity bitmap to efficiently report which values are `null` and which are not: +Polars uses something called a “validity bitmap” to know which values are missing in a series. The +validity bitmap is memory efficient as it is bit encoded. If a series has length $n$, then its +validity bitmap will cost $n / 8$ bytes. The function `is_null` uses the validity bitmap to +efficiently report which values are `null` and which are not: {{code_block('user-guide/expressions/missing-data','isnull',['is_null'])}} @@ -51,8 +53,8 @@ The function `is_null` uses the validity bitmap to efficiently report which valu --8<-- "python/user-guide/expressions/missing-data.py:isnull" ``` -The function `is_null` can be used on a column of a dataframe or on a series directly. -Again, this is a cheap operation because the result is already known by Polars. +The function `is_null` can be used on a column of a dataframe or on a series directly. 
Again, this +is a cheap operation because the result is already known by Polars. ??? info "Why does Polars waste memory on a validity bitmap?" @@ -63,15 +65,16 @@ Again, this is a cheap operation because the result is already known by Polars. ## Filling missing data -Missing data in a series can be filled with the function `fill_null`. -You can specify how missing data is effectively filled in a couple of different ways: +Missing data in a series can be filled with the function `fill_null`. You can specify how missing +data is effectively filled in a couple of different ways: - a literal of the correct data type; - a Polars expression, such as replacing with values computed from another column; - a strategy based on neighbouring values, such as filling forwards or backwards; and - interpolation. -To illustrate how each of these methods work we start by defining a simple dataframe with two missing values in the second column: +To illustrate how each of these methods work we start by defining a simple dataframe with two +missing values in the second column: {{code_block('user-guide/expressions/missing-data','dataframe2',['DataFrame'])}} @@ -81,8 +84,8 @@ To illustrate how each of these methods work we start by defining a simple dataf ### Fill with a specified literal value -You can fill the missing data with a specified literal value. -This literal value will replace all of the occurrences of the value `null`: +You can fill the missing data with a specified literal value. This literal value will replace all of +the occurrences of the value `null`: {{code_block('user-guide/expressions/missing-data','fill',['fill_null'])}} @@ -90,12 +93,15 @@ This literal value will replace all of the occurrences of the value `null`: --8<-- "python/user-guide/expressions/missing-data.py:fill" ``` -However, this is actually just a special case of the general case where [the function `fill_null` replaces missing values with the corresponding values from the result of a Polars expression](#fill-with-a-strategy-based-on-neighbouring-values), as seen next. +However, this is actually just a special case of the general case where +[the function `fill_null` replaces missing values with the corresponding values from the result of a Polars expression](#fill-with-a-strategy-based-on-neighbouring-values), +as seen next. ### Fill with an expression -In the general case, the missing data can be filled by extracting the corresponding values from the result of a general Polars expression. -For example, we can fill the second column with values taken from the double of the first column: +In the general case, the missing data can be filled by extracting the corresponding values from the +result of a general Polars expression. For example, we can fill the second column with values taken +from the double of the first column: {{code_block('user-guide/expressions/missing-data','fillexpr',['fill_null'])}} @@ -106,7 +112,8 @@ For example, we can fill the second column with values taken from the double of ### Fill with a strategy based on neighbouring values You can also fill the missing data by following a fill strategy based on the neighbouring values. 
-The two simpler strategies look for the first non-`null` value that comes immediately before or immediately after the value `null` that is being filled: +The two simpler strategies look for the first non-`null` value that comes immediately before or +immediately after the value `null` that is being filled: {{code_block('user-guide/expressions/missing-data','fillstrategy',['fill_null'])}} @@ -118,7 +125,8 @@ You can find other fill strategies in the API docs. ### Fill with interpolation -Additionally, you can fill missing data with interpolation by using the function `interpolate` instead of the function `fill_null`: +Additionally, you can fill missing data with interpolation by using the function `interpolate` +instead of the function `fill_null`: {{code_block('user-guide/expressions/missing-data','fillinterpolate',['interpolate'])}} @@ -128,9 +136,9 @@ Additionally, you can fill missing data with interpolation by using the function ## Not a Number, or `NaN` values -Missing data in a series is represented by the value `null`, regardless of the data type of the series. -However, in columns that have a floating point data type, the value `NaN` can be used. -These values can be created directly: +Missing data in a series is represented by the value `null`, regardless of the data type of the +series. However, in columns that have a floating point data type, the value `NaN` can be used. These +values can be created directly: {{code_block('user-guide/expressions/missing-data','nan',['DataFrame'])}} @@ -151,17 +159,21 @@ The special value `NaN` might also arise as the result of a computation: By default, a `NaN` value in an integer column causes the column to be cast to a float data type in pandas. This does not happen in Polars; instead, an exception is raised. -`NaN` values are considered to be a type of floating point data and are **not considered to be missing data** in Polars. -This means: +`NaN` values are considered to be a type of floating point data and are **not considered to be +missing data** in Polars. This means: - `NaN` values are **not** counted with the function `null_count`; and -- `NaN` values are filled when you use the specialised function `fill_nan` method but are **not** filled with the function `fill_null`. +- `NaN` values are filled when you use the specialised function `fill_nan` method but are **not** + filled with the function `fill_null`. -Polars has the functions `is_nan` and `fill_nan`, which work in a similar way to the functions `is_null` and `fill_null`. -Unlike with missing data, Polars does not hold any metadata regarding the `NaN` values, so the function `is_nan` entails actual computation. +Polars has the functions `is_nan` and `fill_nan`, which work in a similar way to the functions +`is_null` and `fill_null`. Unlike with missing data, Polars does not hold any metadata regarding the +`NaN` values, so the function `is_nan` entails actual computation. -One further difference between the values `null` and `NaN` is that numerical aggregating functions, like `mean` and `sum`, skip the missing values when computing the result, whereas the value `NaN` is considered for the computation and typically propagates into the result. 
-If desirable, this behavior can be avoided by replacing the occurrences of the value `NaN` with the value `null`: +One further difference between the values `null` and `NaN` is that numerical aggregating functions, +like `mean` and `sum`, skip the missing values when computing the result, whereas the value `NaN` is +considered for the computation and typically propagates into the result. If desirable, this behavior +can be avoided by replacing the occurrences of the value `NaN` with the value `null`: {{code_block('user-guide/expressions/missing-data','nanfill',['fill_nan'])}} diff --git a/docs/source/user-guide/expressions/numpy-functions.md b/docs/source/user-guide/expressions/numpy-functions.md index b140d5ff458e..971500f2d943 100644 --- a/docs/source/user-guide/expressions/numpy-functions.md +++ b/docs/source/user-guide/expressions/numpy-functions.md @@ -1,9 +1,10 @@ # Numpy functions -Polars expressions support NumPy [ufuncs](https://numpy.org/doc/stable/reference/ufuncs.html). -See [the NumPy documentation for a list of all supported NumPy functions](https://numpy.org/doc/stable/reference/ufuncs.html#available-ufuncs). +Polars expressions support NumPy [ufuncs](https://numpy.org/doc/stable/reference/ufuncs.html). See +[the NumPy documentation for a list of all supported NumPy functions](https://numpy.org/doc/stable/reference/ufuncs.html#available-ufuncs). -This means that if a function is not provided by Polars, we can use NumPy and we still have fast columnar operations through the NumPy API. +This means that if a function is not provided by Polars, we can use NumPy and we still have fast +columnar operations through the NumPy API. ## Example @@ -16,9 +17,12 @@ This means that if a function is not provided by Polars, we can use NumPy and we ## Interoperability Polars' series have support for NumPy universal functions (ufuncs) and generalized ufuncs. -Element-wise functions such as `np.exp`, `np.cos`, `np.div`, etc, all work with almost zero overhead. - -However, bear in mind that [Polars keeps track of missing values with a separate bitmask](missing-data.md) and NumPy does not receive this information. -This can lead to a window function or a `np.convolve` giving flawed or incomplete results, so an error will be raised if you pass a series with missing data to a generalized ufunc. -Convert a Polars series to a NumPy array with the function `to_numpy`. -Missing values will be replaced by `np.nan` during the conversion. +Element-wise functions such as `np.exp`, `np.cos`, `np.div`, etc, all work with almost zero +overhead. + +However, bear in mind that +[Polars keeps track of missing values with a separate bitmask](missing-data.md) and NumPy does not +receive this information. This can lead to a window function or a `np.convolve` giving flawed or +incomplete results, so an error will be raised if you pass a series with missing data to a +generalized ufunc. Convert a Polars series to a NumPy array with the function `to_numpy`. Missing +values will be replaced by `np.nan` during the conversion. diff --git a/docs/source/user-guide/expressions/strings.md b/docs/source/user-guide/expressions/strings.md index facdb5b80c54..e441517cc934 100644 --- a/docs/source/user-guide/expressions/strings.md +++ b/docs/source/user-guide/expressions/strings.md @@ -1,15 +1,19 @@ # Strings -The following section discusses operations performed on string data, which is a frequently used data type when working with dataframes. -String processing functions are available in the namespace `str`. 
+The following section discusses operations performed on string data, which is a frequently used data +type when working with dataframes. String processing functions are available in the namespace `str`. -Working with strings in other dataframe libraries can be highly inefficient due to the fact that strings have unpredictable lengths. -Polars mitigates these inefficiencies by [following the Arrow Columnar Format specification](../concepts/data-types-and-structures.md#data-types-internals), so you can write performant data queries on string data too. +Working with strings in other dataframe libraries can be highly inefficient due to the fact that +strings have unpredictable lengths. Polars mitigates these inefficiencies by +[following the Arrow Columnar Format specification](../concepts/data-types-and-structures.md#data-types-internals), +so you can write performant data queries on string data too. ## The string namespace -When working with string data you will likely need to access the namespace `str`, which aggregates 40+ functions that let you work with strings. -As an example of how to access functions from within that namespace, the snippet below shows how to compute the length of the strings in a column in terms of the number of bytes and the number of characters: +When working with string data you will likely need to access the namespace `str`, which aggregates +40+ functions that let you work with strings. As an example of how to access functions from within +that namespace, the snippet below shows how to compute the length of the strings in a column in +terms of the number of bytes and the number of characters: {{code_block('user-guide/expressions/strings','df',['str.len_bytes','str.len_chars'])}} @@ -23,18 +27,21 @@ As an example of how to access functions from within that namespace, the snippet ## Parsing strings -Polars offers multiple methods for checking and parsing elements of a string column, namely checking for the existence of given substrings or patterns, and counting, extracting, or replacing, them. -We will demonstrate some of these operations in the upcoming examples. +Polars offers multiple methods for checking and parsing elements of a string column, namely checking +for the existence of given substrings or patterns, and counting, extracting, or replacing, them. We +will demonstrate some of these operations in the upcoming examples. ### Check for the existence of a pattern -We can use the function `contains` to check for the presence of a pattern within a string. -By default, the argument to the function `contains` is interpreted as a regular expression. -If you want to specify a literal substring, set the parameter `literal` to `True`. +We can use the function `contains` to check for the presence of a pattern within a string. By +default, the argument to the function `contains` is interpreted as a regular expression. If you want +to specify a literal substring, set the parameter `literal` to `True`. -For the special cases where you want to check if the strings start or end with a fixed substring, you can use the functions `starts_with` or `ends_with`, respectively. +For the special cases where you want to check if the strings start or end with a fixed substring, +you can use the functions `starts_with` or `ends_with`, respectively. 
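As a small self-contained illustration of these three checks, here is a sketch with invented fruit
names, assuming a recent Python release of Polars; the user guide's own rendered example follows:

```python
import polars as pl

df = pl.DataFrame({"fruit": ["apple", "pineapple", "banana"]})

print(
    df.select(
        pl.col("fruit"),
        pl.col("fruit").str.contains("ap.le").alias("regex_match"),  # pattern is a regex by default
        pl.col("fruit").str.contains("ana", literal=True).alias("substring_match"),
        pl.col("fruit").str.starts_with("pine").alias("starts"),
        pl.col("fruit").str.ends_with("apple").alias("ends"),
    )
)
```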
-{{code_block('user-guide/expressions/strings','existence',['str.contains', 'str.starts_with','str.ends_with'])}} +{{code_block('user-guide/expressions/strings','existence',['str.contains', +'str.starts_with','str.ends_with'])}} ```python exec="on" result="text" session="expressions/strings" --8<-- "python/user-guide/expressions/strings.py:existence" @@ -42,13 +49,16 @@ For the special cases where you want to check if the strings start or end with a ### Regex specification -Polars relies on the Rust crate `regex` to work with regular expressions, so you may need to [refer to the syntax documentation](https://docs.rs/regex/latest/regex/#syntax) to see what features and flags are supported. -In particular, note that the flavor of regex supported by Polars is different from Python's module `re`. +Polars relies on the Rust crate `regex` to work with regular expressions, so you may need to +[refer to the syntax documentation](https://docs.rs/regex/latest/regex/#syntax) to see what features +and flags are supported. In particular, note that the flavor of regex supported by Polars is +different from Python's module `re`. ### Extract a pattern -The function `extract` allows us to extract patterns from the string values in a column. -The function `extract` accepts a regex pattern with one or more capture groups and extracts the capture group specified as the second argument. +The function `extract` allows us to extract patterns from the string values in a column. The +function `extract` accepts a regex pattern with one or more capture groups and extracts the capture +group specified as the second argument. {{code_block('user-guide/expressions/strings','extract',['str.extract'])}} @@ -56,9 +66,10 @@ The function `extract` accepts a regex pattern with one or more capture groups a --8<-- "python/user-guide/expressions/strings.py:extract" ``` -To extract all occurrences of a pattern within a string, we can use the function `extract_all`. -In the example below, we extract all numbers from a string using the regex pattern `(\d+)`, which matches one or more digits. -The resulting output of the function `extract_all` is a list containing all instances of the matched pattern within the string. +To extract all occurrences of a pattern within a string, we can use the function `extract_all`. In +the example below, we extract all numbers from a string using the regex pattern `(\d+)`, which +matches one or more digits. The resulting output of the function `extract_all` is a list containing +all instances of the matched pattern within the string. {{code_block('user-guide/expressions/strings','extract_all',['str.extract_all'])}} @@ -68,9 +79,11 @@ The resulting output of the function `extract_all` is a list containing all inst ### Replace a pattern -Akin to the functions `extract` and `extract_all`, Polars provides the functions `replace` and `replace_all`. -These accept a regex pattern or a literal substring (if the parameter `literal` is set to `True`) and perform the replacements specified. -The function `replace` will make at most one replacement whereas the function `replace_all` will make all the non-overlapping replacements it finds. +Akin to the functions `extract` and `extract_all`, Polars provides the functions `replace` and +`replace_all`. These accept a regex pattern or a literal substring (if the parameter `literal` is +set to `True`) and perform the replacements specified. 
The function `replace` will make at most one
+replacement whereas the function `replace_all` will make all the non-overlapping replacements it
+finds.

{{code_block('user-guide/expressions/strings','replace',['str.replace', 'str.replace_all'])}}

```python exec="on" result="text" session="expressions/strings"
--8<-- "python/user-guide/expressions/strings.py:replace"
```

@@ -82,9 +95,11 @@ The function `replace` will make at most one replacement whereas the function `r

### Case conversion

-Converting the casing of a string is a common operation and Polars supports it out of the box with the functions `to_lowercase`, `to_titlecase`, and `to_uppercase`:
+Converting the casing of a string is a common operation and Polars supports it out of the box with
+the functions `to_lowercase`, `to_titlecase`, and `to_uppercase`:

-{{code_block('user-guide/expressions/strings','casing', ['str.to_lowercase', 'str.to_titlecase', 'str.to_uppercase'])}}
+{{code_block('user-guide/expressions/strings','casing', ['str.to_lowercase', 'str.to_titlecase',
+'str.to_uppercase'])}}

```python exec="on" result="text" session="expressions/strings"
--8<-- "python/user-guide/expressions/strings.py:casing"
```

@@ -92,7 +107,8 @@ Converting the casing of a string is a common operation and Polars supports it o

### Stripping characters from the ends

-Polars provides five functions in the namespace `str` that let you strip characters from the ends of the string:
+Polars provides five functions in the namespace `str` that let you strip characters from the ends of
+the string:

| Function | Behaviour |
| ------------------- | --------------------------------------------------------------------- |
@@ -102,28 +118,37 @@ Polars provides five functions in the namespace `str` that let you strip charact
| `strip_prefix` | Removes an exact substring prefix if present. |
| `strip_suffix` | Removes an exact substring suffix if present. |

-??? info "Similarity to Python string methods"
-`strip_chars` is similar to Python's string method `strip` and `strip_prefix`/`strip_suffix` are similar to Python's string methods `removeprefix` and `strip_suffix`, respectively.
+??? info "Similarity to Python string methods"
+    `strip_chars` is similar to Python's string method `strip` and `strip_prefix`/`strip_suffix`
+    are similar to Python's string methods `removeprefix` and `removesuffix`, respectively.

-It is important to understand that the first three functions interpret their string argument as a set of characters whereas the functions `strip_prefix` and `strip_suffix` do interpret their string argument as a literal string.
+It is important to understand that the first three functions interpret their string argument as a
+set of characters whereas the functions `strip_prefix` and `strip_suffix` do interpret their string
+argument as a literal string.

-{{code_block('user-guide/expressions/strings', 'strip', ['str.strip_chars', 'str.strip_chars_end', 'str.strip_chars_start', 'str.strip_prefix', 'str.strip_suffix'])}}
+{{code_block('user-guide/expressions/strings', 'strip', ['str.strip_chars', 'str.strip_chars_end',
+'str.strip_chars_start', 'str.strip_prefix', 'str.strip_suffix'])}}

```python exec="on" result="text" session="expressions/strings"
--8<-- "python/user-guide/expressions/strings.py:strip"
```

-If no argument is provided, the three functions `strip_chars`, `strip_chars_end`, and `strip_chars_start`, remove whitespace by default.
+If no argument is provided, the three functions `strip_chars`, `strip_chars_end`, and
+`strip_chars_start`, remove whitespace by default.
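To make the difference between the character-set functions and the literal prefix/suffix functions
concrete, here is a minimal sketch with invented data, assuming a recent Python release of Polars:

```python
import polars as pl

df = pl.DataFrame({"raw": ["  #42# ", "##polars"]})

print(
    df.select(
        pl.col("raw").str.strip_chars().alias("no_whitespace"),     # no argument: strips whitespace
        pl.col("raw").str.strip_chars(" #").alias("no_set_chars"),  # " " and "#" treated as a set
        pl.col("raw").str.strip_prefix("##").alias("no_prefix"),    # removes the exact prefix "##"
    )
)
```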
### Slicing

-Besides [extracting substrings as specified by patterns](#extract-a-pattern), you can also slice strings at specified offsets to produce substrings.
-The general-purpose function for slicing is `slice` and it takes the starting offset and the optional _length_ of the slice.
-If the length of the slice is not specified or if it's past the end of the string, Polars slices the string all the way to the end.
+Besides [extracting substrings as specified by patterns](#extract-a-pattern), you can also slice
+strings at specified offsets to produce substrings. The general-purpose function for slicing is
+`slice` and it takes the starting offset and the optional _length_ of the slice. If the length of
+the slice is not specified or if it's past the end of the string, Polars slices the string all the
+way to the end.

-The functions `head` and `tail` are specialised versions used for slicing the beginning and end of a string, respectively.
+The functions `head` and `tail` are specialised versions used for slicing the beginning and end of a
+string, respectively.

-{{code_block('user-guide/expressions/strings', 'slice', [], ['str.slice', 'str.head', 'str.tail'], ['str.str_slice', 'str.str_head', 'str.str_tail'])}}
+{{code_block('user-guide/expressions/strings', 'slice', [], ['str.slice', 'str.head', 'str.tail'],
+['str.str_slice', 'str.str_head', 'str.str_tail'])}}

```python exec="on" result="text" session="expressions/strings"
--8<-- "python/user-guide/expressions/strings.py:slice"
```

@@ -131,5 +156,6 @@ The functions `head` and `tail` are specialised versions used for slicing the be

## API documentation

-In addition to the examples covered above, Polars offers various other string manipulation functions.
-To explore these additional methods, you can go to the API documentation of your chosen programming language for Polars.
+In addition to the examples covered above, Polars offers various other string manipulation
+functions. To explore these additional methods, you can go to the API documentation of your chosen
+programming language for Polars.
diff --git a/docs/source/user-guide/expressions/structs.md b/docs/source/user-guide/expressions/structs.md
index 7643c2a70c01..e6c4afc552ba 100644
--- a/docs/source/user-guide/expressions/structs.md
+++ b/docs/source/user-guide/expressions/structs.md
@@ -2,13 +2,15 @@

The data type `Struct` is a composite data type that can store multiple fields in a single column.

-??? tip "Python analogy"
-For Python users, the data type `Struct` is kind of like a Python dictionary.
-Even better, if you are familiar with Python typing, you can think of the data type `Struct` as `typing.TypedDict`.
+??? tip "Python analogy"
+    For Python users, the data type `Struct` is kind of like a Python dictionary. Even better, if
+    you are familiar with Python typing, you can think of the data type `Struct` as `typing.TypedDict`.

-In this page of the user guide we will see situations in which the data type `Struct` arises, we will understand why it does arise, and we will see how to work with `Struct` values.
+In this page of the user guide we will see situations in which the data type `Struct` arises, we
+will understand why it does arise, and we will see how to work with `Struct` values.
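As a standalone sketch of what a `Struct` value looks like (the data below is invented and separate
from the ratings example that follows), a series built from Python dictionaries is given a `Struct`
data type automatically:

```python
import polars as pl

s = pl.Series(
    "ratings",
    [
        {"Movie": "Cars", "Theatre": "NE", "Avg_Rating": 4.5},
        {"Movie": "Toy Story", "Theatre": "ME", "Avg_Rating": 4.9},
    ],
)
print(s.dtype)  # a Struct with one field per dictionary key
```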
-Let's start with a dataframe that captures the average rating of a few movies across some states in the US: +Let's start with a dataframe that captures the average rating of a few movies across some states in +the US: {{code_block('user-guide/expressions/structs','ratings_df',['DataFrame'])}} @@ -18,8 +20,9 @@ Let's start with a dataframe that captures the average rating of a few movies ac ## Encountering the data type `Struct` -A common operation that will lead to a `Struct` column is the ever so popular `value_counts` function that is commonly used in exploratory data analysis. -Checking the number of times a state appears in the data is done as so: +A common operation that will lead to a `Struct` column is the ever so popular `value_counts` +function that is commonly used in exploratory data analysis. Checking the number of times a state +appears in the data is done as so: {{code_block('user-guide/expressions/structs','state_value_counts',['value_counts'])}} @@ -27,9 +30,9 @@ Checking the number of times a state appears in the data is done as so: --8<-- "python/user-guide/expressions/structs.py:state_value_counts" ``` -Quite unexpected an output, especially if coming from tools that do not have such a data type. -We're not in peril, though. -To get back to a more familiar output, all we need to do is use the function `unnest` on the `Struct` column: +Quite unexpected an output, especially if coming from tools that do not have such a data type. We're +not in peril, though. To get back to a more familiar output, all we need to do is use the function +`unnest` on the `Struct` column: {{code_block('user-guide/expressions/structs','struct_unnest',['unnest'])}} @@ -66,8 +69,8 @@ Subsequent incongruences can result in `null` values or in errors: ## Extracting individual values of a `Struct` -Let's say that we needed to obtain just the field `"Movie"` from the `Struct` in the series that we created above. -We can use the function `field` to do so: +Let's say that we needed to obtain just the field `"Movie"` from the `Struct` in the series that we +created above. We can use the function `field` to do so: {{code_block('user-guide/expressions/structs','series_struct_extract',['struct.field'])}} @@ -77,8 +80,8 @@ We can use the function `field` to do so: ## Renaming individual fields of a `Struct` -What if we need to rename individual fields of a `Struct` column? -We use the function `rename_fields`: +What if we need to rename individual fields of a `Struct` column? We use the function +`rename_fields`: {{code_block('user-guide/expressions/structs','series_struct_rename',['struct.rename_fields'])}} @@ -86,8 +89,9 @@ We use the function `rename_fields`: --8<-- "python/user-guide/expressions/structs.py:series_struct_rename" ``` -To be able to actually see that the field names were change, we will create a dataframe where the only column is the result and then we use the function `unnest` so that each field becomes its own column. -The column names will reflect the renaming operation we just did: +To be able to actually see that the field names were change, we will create a dataframe where the +only column is the result and then we use the function `unnest` so that each field becomes its own +column. 
The column names will reflect the renaming operation we just did: {{code_block('user-guide/expressions/structs','struct-rename-check',['struct.rename_fields'])}} @@ -99,8 +103,8 @@ The column names will reflect the renaming operation we just did: ### Identifying duplicate rows -Let's get back to the `ratings` data. -We want to identify cases where there are duplicates at a “Movie” and “Theatre” level. +Let's get back to the `ratings` data. We want to identify cases where there are duplicates at a +“Movie” and “Theatre” level. This is where the data type `Struct` shines: @@ -114,8 +118,9 @@ We can identify the unique cases at this level also with `is_unique`! ### Multi-column ranking -Suppose, given that we know there are duplicates, we want to choose which rating gets a higher priority. -We can say that the column “Count” is the most important, and if there is a tie in the column “Count” then we consider the column “Avg_Rating”. +Suppose, given that we know there are duplicates, we want to choose which rating gets a higher +priority. We can say that the column “Count” is the most important, and if there is a tie in the +column “Count” then we consider the column “Avg_Rating”. We can then do: @@ -125,14 +130,16 @@ We can then do: --8<-- "python/user-guide/expressions/structs.py:struct_ranking" ``` -That's a pretty complex set of requirements done very elegantly in Polars! -To learn more about the function `over`, used above, [see the user guide section on window functions](window-functions.md). +That's a pretty complex set of requirements done very elegantly in Polars! To learn more about the +function `over`, used above, [see the user guide section on window functions](window-functions.md). ### Using multiple columns in a single expression -As mentioned earlier, the data type `Struct` is also useful if you need to pass multiple columns as input to an expression. -As an example, suppose we want to compute [the Ackermann function](https://en.wikipedia.org/wiki/Ackermann_function) on two columns of a dataframe. -There is no way of composing Polars expressions to compute the Ackermann function[^1], so we define a custom function: +As mentioned earlier, the data type `Struct` is also useful if you need to pass multiple columns as +input to an expression. As an example, suppose we want to compute +[the Ackermann function](https://en.wikipedia.org/wiki/Ackermann_function) on two columns of a +dataframe. 
There is no way of composing Polars expressions to compute the Ackermann function[^1], so +we define a custom function: {{code_block('user-guide/expressions/structs', 'ack', [])}} @@ -140,7 +147,9 @@ There is no way of composing Polars expressions to compute the Ackermann functio --8<-- "python/user-guide/expressions/structs.py:ack" ``` -Now, to compute the values of the Ackermann function on those arguments, we start by creating a `Struct` with fields `m` and `n` and then use the function `map_elements` to apply the function `ack` to each value: +Now, to compute the values of the Ackermann function on those arguments, we start by creating a +`Struct` with fields `m` and `n` and then use the function `map_elements` to apply the function +`ack` to each value: {{code_block('user-guide/expressions/structs','struct-ack',[], ['map_elements'], ['apply'])}} @@ -148,6 +157,8 @@ Now, to compute the values of the Ackermann function on those arguments, we star --8<-- "python/user-guide/expressions/structs.py:struct-ack" ``` -Refer to [this section of the user guide to learn more about applying user-defined Python functions to your data](user-defined-python-functions.md). +Refer to +[this section of the user guide to learn more about applying user-defined Python functions to your data](user-defined-python-functions.md). -[^1]: To say that something cannot be done is quite a bold claim. If you prove us wrong, please let us know! +[^1]: To say that something cannot be done is quite a bold claim. If you prove us wrong, please let +us know! diff --git a/docs/source/user-guide/expressions/user-defined-python-functions.md b/docs/source/user-guide/expressions/user-defined-python-functions.md index b99a413e8bdd..1cf250e2d73f 100644 --- a/docs/source/user-guide/expressions/user-defined-python-functions.md +++ b/docs/source/user-guide/expressions/user-defined-python-functions.md @@ -2,18 +2,21 @@ -Polars expressions are quite powerful and flexible, so there is much less need for custom Python functions compared to other libraries. -Still, you may need to pass an expression's state to a third party library or apply your black box function to data in Polars. +Polars expressions are quite powerful and flexible, so there is much less need for custom Python +functions compared to other libraries. Still, you may need to pass an expression's state to a third +party library or apply your black box function to data in Polars. In this part of the documentation we'll be using two APIs that allows you to do this: -- [:material-api: `map_elements`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_elements.html): Call a function separately on each value in the `Series`. -- [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html): Always passes the full `Series` to the function. +- [:material-api: `map_elements`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_elements.html): + Call a function separately on each value in the `Series`. +- [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html): + Always passes the full `Series` to the function. ## Processing individual values with `map_elements()` -Let's start with the simplest case: we want to process each value in a `Series` individually. -Here is our data: +Let's start with the simplest case: we want to process each value in a `Series` individually. 
Here +is our data: {{code_block('user-guide/expressions/user-defined-functions','dataframe',[])}} @@ -32,17 +35,20 @@ We'll call `math.log()` on each individual value: While this works, `map_elements()` has two problems: -1. **Limited to individual items:** Often you'll want to have a calculation that needs to operate on the whole `Series`, rather than individual items one by one. -2. **Performance overhead:** Even if you do want to process each item individually, calling a function for each individual item is slow; all those extra function calls add a lot of overhead. +1. **Limited to individual items:** Often you'll want to have a calculation that needs to operate on + the whole `Series`, rather than individual items one by one. +2. **Performance overhead:** Even if you do want to process each item individually, calling a + function for each individual item is slow; all those extra function calls add a lot of overhead. Let's start by solving the first problem, and then we'll see how to solve the second problem. ## Processing a whole `Series` with `map_batches()` -We want to run a custom function on the contents of a whole `Series`. -For demonstration purposes, let's say we want to calculate the difference between the mean of a `Series` and each value. +We want to run a custom function on the contents of a whole `Series`. For demonstration purposes, +let's say we want to calculate the difference between the mean of a `Series` and each value. -We can use the `map_batches()` API to run this function on either the full `Series` or individual groups in a `group_by()`: +We can use the `map_batches()` API to run this function on either the full `Series` or individual +groups in a `group_by()`: {{code_block('user-guide/expressions/user-defined-functions','diff_from_mean',[])}} @@ -52,15 +58,19 @@ We can use the `map_batches()` API to run this function on either the full `Seri ## Fast operations with user-defined functions -The problem with a pure-Python implementation is that it's slow. -In general, you want to minimize how much Python code you call if you want fast results. +The problem with a pure-Python implementation is that it's slow. In general, you want to minimize +how much Python code you call if you want fast results. -To maximize speed, you'll want to make sure that you're using a function written in a compiled language. -For numeric calculations Polars supports a pair of interfaces defined by NumPy called ["ufuncs"](https://numpy.org/doc/stable/reference/ufuncs.html) and ["generalized ufuncs"](https://numpy.org/neps/nep-0005-generalized-ufuncs.html). -The former runs on each item individually, and the latter accepts a whole NumPy array, which allows for more flexible operations. +To maximize speed, you'll want to make sure that you're using a function written in a compiled +language. For numeric calculations Polars supports a pair of interfaces defined by NumPy called +["ufuncs"](https://numpy.org/doc/stable/reference/ufuncs.html) and +["generalized ufuncs"](https://numpy.org/neps/nep-0005-generalized-ufuncs.html). The former runs on +each item individually, and the latter accepts a whole NumPy array, which allows for more flexible +operations. -[NumPy](https://numpy.org/doc/stable/reference/ufuncs.html) and other libraries like [SciPy](https://docs.scipy.org/doc/scipy/reference/special.html#module-scipy.special) come with pre-written ufuncs you can use with Polars. 
-For example: +[NumPy](https://numpy.org/doc/stable/reference/ufuncs.html) and other libraries like +[SciPy](https://docs.scipy.org/doc/scipy/reference/special.html#module-scipy.special) come with +pre-written ufuncs you can use with Polars. For example: {{code_block('user-guide/expressions/user-defined-functions','np_log',[])}} @@ -68,20 +78,25 @@ For example: --8<-- "python/user-guide/expressions/user-defined-functions.py:np_log" ``` -Notice that we can use `map_batches()`, because `numpy.log()` is able to run on both individual items and on whole NumPy arrays. -This means it will run much faster than our original example, since we only have a single Python call and then all processing happens in a fast low-level language. +Notice that we can use `map_batches()`, because `numpy.log()` is able to run on both individual +items and on whole NumPy arrays. This means it will run much faster than our original example, since +we only have a single Python call and then all processing happens in a fast low-level language. ## Example: A fast custom function using Numba The pre-written functions NumPy provides are helpful, but our goal is to write our own functions. -For example, let's say we want a fast version of our `diff_from_mean()` example above. -The easiest way to write this in Python is to use [Numba](https://numba.readthedocs.io/en/stable/), which allows you to write custom functions in (a subset) of Python while still getting the benefit of compiled code. - -In particular, Numba provides a decorator called [`@guvectorize`](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-guvectorize-decorator). -This creates a generalized ufunc by compiling a Python function to fast machine code, in a way that allows it to be used by Polars. - -In the following example the `diff_from_mean_numba()` will be compiled to fast machine code at import time, which will take a little time. -After that all calls to the function will run quickly. +For example, let's say we want a fast version of our `diff_from_mean()` example above. The easiest +way to write this in Python is to use [Numba](https://numba.readthedocs.io/en/stable/), which allows +you to write custom functions in (a subset) of Python while still getting the benefit of compiled +code. + +In particular, Numba provides a decorator called +[`@guvectorize`](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-guvectorize-decorator). +This creates a generalized ufunc by compiling a Python function to fast machine code, in a way that +allows it to be used by Polars. + +In the following example the `diff_from_mean_numba()` will be compiled to fast machine code at +import time, which will take a little time. After that all calls to the function will run quickly. The `Series` will be converted to a NumPy array before being passed to the function: {{code_block('user-guide/expressions/user-defined-functions','diff_from_mean_numba',[])}} @@ -92,22 +107,27 @@ The `Series` will be converted to a NumPy array before being passed to the funct ## Missing data is not allowed when calling generalized ufuncs -Before being passed to a user-defined function like `diff_from_mean_numba()`, a `Series` will be converted to a NumPy array. -Unfortunately, NumPy arrays don't have a concept of missing data. -If there is missing data in the original `Series`, this means the resulting array won't actually match the `Series`. +Before being passed to a user-defined function like `diff_from_mean_numba()`, a `Series` will be +converted to a NumPy array. 
Unfortunately, NumPy arrays don't have a concept of missing data. If +there is missing data in the original `Series`, this means the resulting array won't actually match +the `Series`. -If you're calculating results item by item, this doesn't matter. -For example, `numpy.log()` gets called on each individual value separately, so those missing values don't change the calculation. -But if the result of a user-defined function depend on multiple values in the `Series`, it's not clear what exactly should happen with the missing values. +If you're calculating results item by item, this doesn't matter. For example, `numpy.log()` gets +called on each individual value separately, so those missing values don't change the calculation. +But if the result of a user-defined function depend on multiple values in the `Series`, it's not +clear what exactly should happen with the missing values. -Therefore, when calling generalized ufuncs such as Numba functions decorated with `@guvectorize`, Polars will raise an error if you try to pass in a `Series` with missing data. -How do you get rid of missing data? -Either [fill it in](missing-data.md) or [drop it](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.drop_nulls.html) before calling your custom function. +Therefore, when calling generalized ufuncs such as Numba functions decorated with `@guvectorize`, +Polars will raise an error if you try to pass in a `Series` with missing data. How do you get rid of +missing data? Either [fill it in](missing-data.md) or +[drop it](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.drop_nulls.html) +before calling your custom function. ## Combining multiple column values -If you want to pass multiple columns to a user-defined function, you can use `Struct`s, which are [covered in detail in a different section](structs.md). -The basic idea is to combine multiple columns into a `Struct`, and then the function can extract the columns back out: +If you want to pass multiple columns to a user-defined function, you can use `Struct`s, which are +[covered in detail in a different section](structs.md). The basic idea is to combine multiple +columns into a `Struct`, and then the function can extract the columns back out: {{code_block('user-guide/expressions/user-defined-functions','combine',[])}} @@ -117,8 +137,10 @@ The basic idea is to combine multiple columns into a `Struct`, and then the func ## Streaming calculations -Passing the full `Series` to the user-defined function has a cost: it may use a lot of memory, as its contents are copied into a NumPy array. -You can use the `is_elementwise=True` argument to [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) to stream results into the function, which means it might not get all values at once. +Passing the full `Series` to the user-defined function has a cost: it may use a lot of memory, as +its contents are copied into a NumPy array. You can use the `is_elementwise=True` argument to +[:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) +to stream results into the function, which means it might not get all values at once. !!! note @@ -128,9 +150,10 @@ You can use the `is_elementwise=True` argument to [:material-api: `map_batches`] ## Return types -Custom Python functions are often black boxes; Polars doesn't know what your function is doing or what it will return. 
-The return data type is therefore automatically inferred. We do that by waiting for the first non-null value. That value will then be used -to determine the type of the resulting `Series`. +Custom Python functions are often black boxes; Polars doesn't know what your function is doing or +what it will return. The return data type is therefore automatically inferred. We do that by waiting +for the first non-null value. That value will then be used to determine the type of the resulting +`Series`. The mapping of Python types to Polars data types is as follows: @@ -150,4 +173,6 @@ Rust types map as follows: - `String` or `str` -> `String` - `Vec` -> `List[tp]` (where the inner type is inferred with the same rules) -You can pass a `return_dtype` argument to [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) if you want to override the inferred type. +You can pass a `return_dtype` argument to +[:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) +if you want to override the inferred type. diff --git a/docs/source/user-guide/expressions/window-functions.md b/docs/source/user-guide/expressions/window-functions.md index 5cdcb13b60c6..7acc249785aa 100644 --- a/docs/source/user-guide/expressions/window-functions.md +++ b/docs/source/user-guide/expressions/window-functions.md @@ -1,8 +1,7 @@ # Window functions -Window functions are expressions with superpowers. -They allow you to perform aggregations on groups within the context `select`. -Let's get a feel for what that means. +Window functions are expressions with superpowers. They allow you to perform aggregations on groups +within the context `select`. Let's get a feel for what that means. First, we load a Pokémon dataset: @@ -14,10 +13,11 @@ First, we load a Pokémon dataset: ## Operations per group -Window functions are ideal when we want to perform an operation within a group. -For instance, suppose we want to rank our Pokémon by the column “Speed”. -However, instead of a global ranking, we want to rank the speed within each group defined by the column “Type 1”. -We write the expression to rank the data by the column “Speed” and then we add the function `over` to specify that this should happen over the unique values of the column “Type 1”: +Window functions are ideal when we want to perform an operation within a group. For instance, +suppose we want to rank our Pokémon by the column “Speed”. However, instead of a global ranking, we +want to rank the speed within each group defined by the column “Type 1”. We write the expression to +rank the data by the column “Speed” and then we add the function `over` to specify that this should +happen over the unique values of the column “Type 1”: {{code_block('user-guide/expressions/window','rank',['over'])}} @@ -25,18 +25,23 @@ We write the expression to rank the data by the column “Speed” and then we a --8<-- "python/user-guide/expressions/window.py:rank" ``` -To help visualise this operation, you may imagine that Polars selects the subsets of the data that share the same value for the column “Type 1” and then computes the ranking expression only for those values. -Then, the results for that specific group are projected back to the original rows and Polars does this for all of the existing groups. -The diagram below highlights the ranking computation for the Pokémon with “Type 1” equal to “Grass”. 
+To help visualise this operation, you may imagine that Polars selects the subsets of the data that +share the same value for the column “Type 1” and then computes the ranking expression only for those +values. Then, the results for that specific group are projected back to the original rows and Polars +does this for all of the existing groups. The diagram below highlights the ranking computation for +the Pokémon with “Type 1” equal to “Grass”.
--8<-- "docs/source/user-guide/expressions/speed_rank_by_type.svg"
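+For readers who want to see the shape of such a query without opening the snippet files, here is a
+minimal sketch; the handful of rows below are made up, and only the column names follow the Pokémon
+dataset used on this page:
+
+```python
+import polars as pl
+
+pokemon = pl.DataFrame(
+    {
+        "Name": ["Venusaur", "Golbat", "Charizard"],
+        "Type 1": ["Grass", "Poison", "Fire"],
+        "Speed": [80, 90, 100],
+    }
+)
+
+# Rank by "Speed" separately within each group defined by "Type 1".
+ranked = pokemon.with_columns(
+    pl.col("Speed").rank("dense", descending=True).over("Type 1").alias("Speed rank")
+)
+```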
-
Note how the row for the Pokémon “Golbat” has a “Speed” value of `90`, which is greater than the value `80` of the Pokémon “Venusaur”, and yet the latter was ranked 1 because “Golbat” and “Venusar” do not share the same value for the column “Type 1”.
+Note how the row for the Pokémon “Golbat” has a “Speed” value of `90`, which is greater than the
+value `80` of the Pokémon “Venusaur”, and yet the latter was ranked 1 because “Golbat” and “Venusaur”
+do not share the same value for the column “Type 1”.

-The function `over` accepts an arbitrary number of expressions to specify the groups over which to perform the computations.
-We can repeat the ranking above, but over the combination of the columns “Type 1” and “Type 2” for a more fine-grained ranking:
+The function `over` accepts an arbitrary number of expressions to specify the groups over which to
+perform the computations. We can repeat the ranking above, but over the combination of the columns
+“Type 1” and “Type 2” for a more fine-grained ranking:

{{code_block('user-guide/expressions/window','rank-multiple',['over'])}}

@@ -44,7 +49,9 @@ We can repeat the ranking above, but over the combination of the columns “Type
--8<-- "python/user-guide/expressions/window.py:rank-multiple"
```

-In general, the results you get with the function `over` can also be achieved with [an aggregation](aggregation.md) followed by a call to the function `explode`, although the rows would be in a different order:
+In general, the results you get with the function `over` can also be achieved with
+[an aggregation](aggregation.md) followed by a call to the function `explode`, although the rows
+would be in a different order:

{{code_block('user-guide/expressions/window','rank-explode',['explode'])}}

@@ -54,31 +61,35 @@ In general, the results you get with the function `over` can also be achieved wi

This shows that, usually, `group_by` and `over` produce results of different shapes:

-- `group_by` usually produces a resulting dataframe with as many rows as groups used for aggregating; and
+- `group_by` usually produces a resulting dataframe with as many rows as groups used for
+  aggregating; and
- `over` usually produces a dataframe with the same number of rows as the original.

-The function `over` does not always produce results with the same number of rows as the original dataframe, and that is what we explore next.
+The function `over` does not always produce results with the same number of rows as the original
+dataframe, and that is what we explore next.

## Mapping results to dataframe rows

-The function `over` accepts a parameter `mapping_strategy` that determines how the results of the expression over the group are mapped back to the rows of the dataframe.
+The function `over` accepts a parameter `mapping_strategy` that determines how the results of the
+expression over the group are mapped back to the rows of the dataframe.

### `group_to_rows`

-The default behaviour is `"group_to_rows"`:
-the result of the expression over the group should be the same length as the group and the results are mapped back to the rows of that group.
+The default behaviour is `"group_to_rows"`: the result of the expression over the group should be
+the same length as the group and the results are mapped back to the rows of that group.

-If the order of the rows is not relevant, the option `"explode"` is more performant.
-Instead of mapping the resulting values to the original rows, Polars creates a new dataframe where values from the same group are next to each other.
-To help understand the distinction, consider the following dataframe: +If the order of the rows is not relevant, the option `"explode"` is more performant. Instead of +mapping the resulting values to the original rows, Polars creates a new dataframe where values from +the same group are next to each other. To help understand the distinction, consider the following +dataframe: ```python exec="on" result="text" session="user-guide/window" --8<-- "python/user-guide/expressions/window.py:athletes" ``` -We can sort the athletes by rank within their own countries. -If we do so, the Dutch athletes were in the second, third, and sixth, rows, and they will remain there. -What will change is the order of the names of the athletes, which goes from “B”, “C”, and “F”, to “B”, “F”, and “C”: +We can sort the athletes by rank within their own countries. If we do so, the Dutch athletes were in +the second, third, and sixth, rows, and they will remain there. What will change is the order of the +names of the athletes, which goes from “B”, “C”, and “F”, to “B”, “F”, and “C”: {{code_block('user-guide/expressions/window','athletes-sort-over-country',['over'])}} @@ -94,15 +105,18 @@ The diagram below represents this transformation: ### `explode` -If we set the parameter `mapping_strategy` to `"explode"`, then athletes of the same country are grouped together, but the final order of the rows – with respect to the countries – will not be the same, as the diagram shows: +If we set the parameter `mapping_strategy` to `"explode"`, then athletes of the same country are +grouped together, but the final order of the rows – with respect to the countries – will not be the +same, as the diagram shows:
--8<-- "docs/source/user-guide/expressions/athletes_over_country_explode.svg"
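+A minimal sketch of how the `mapping_strategy` parameter is passed — the athletes dataframe here is
+made up and only mirrors the shape of the example above:
+
+```python
+import polars as pl
+
+athletes = pl.DataFrame(
+    {
+        "athlete": ["A", "B", "C", "D", "E", "F"],
+        "country": ["PT", "NL", "NL", "PT", "PT", "NL"],
+        "rank": [6, 1, 5, 4, 2, 3],
+    }
+)
+
+# "group_to_rows" (the default) keeps every row in its original position, whereas
+# "explode" lays the rows of each group next to each other instead.
+out = athletes.select(
+    pl.all().sort_by("rank").over("country", mapping_strategy="explode")
+)
+```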
-Because Polars does not need to keep track of the positions of the rows of each group, using `"explode"` is typically faster than `"group_to_rows"`. -However, using `"explode"` also requires more care because it implies reordering the other columns that we wish to keep. -The code that produces this result follows +Because Polars does not need to keep track of the positions of the rows of each group, using +`"explode"` is typically faster than `"group_to_rows"`. However, using `"explode"` also requires +more care because it implies reordering the other columns that we wish to keep. The code that +produces this result follows {{code_block('user-guide/expressions/window','athletes-explode',['over'])}} @@ -112,7 +126,8 @@ The code that produces this result follows ### `join` -Another possible value for the parameter `mapping_strategy` is `"join"`, which aggregates the resulting values in a list and repeats the list over all rows of the same group: +Another possible value for the parameter `mapping_strategy` is `"join"`, which aggregates the +resulting values in a list and repeats the list over all rows of the same group: {{code_block('user-guide/expressions/window','athletes-join',['over'])}} @@ -122,7 +137,8 @@ Another possible value for the parameter `mapping_strategy` is `"join"`, which a ## Windowed aggregation expressions -In case the expression applied to the values of a group produces a scalar value, the scalar is broadcast across the rows of the group: +In case the expression applied to the values of a group produces a scalar value, the scalar is +broadcast across the rows of the group: {{code_block('user-guide/expressions/window','pokemon-mean',['over'])}} @@ -136,8 +152,10 @@ For more exercises, below are some window functions for us to compute: - sort all Pokémon by type; - select the first `3` Pokémon per type as `"Type 1"`; -- sort the Pokémon within a type by speed in descending order and select the first `3` as `"fastest/group"`; -- sort the Pokémon within a type by attack in descending order and select the first `3` as `"strongest/group"`; and +- sort the Pokémon within a type by speed in descending order and select the first `3` as + `"fastest/group"`; +- sort the Pokémon within a type by attack in descending order and select the first `3` as + `"strongest/group"`; and - sort the Pokémon within a type by name and select the first `3` as `"sorted_by_alphabet"`. {{code_block('user-guide/expressions/window','examples',['over'])}} diff --git a/docs/source/user-guide/getting-started.md b/docs/source/user-guide/getting-started.md index b0c18b2562b1..a59d32c6b301 100644 --- a/docs/source/user-guide/getting-started.md +++ b/docs/source/user-guide/getting-started.md @@ -1,6 +1,10 @@ # Getting started -This chapter is here to help you get started with Polars. It covers all the fundamental features and functionalities of the library, making it easy for new users to familiarise themselves with the basics from initial installation and setup to core functionalities. If you're already an advanced user or familiar with dataframes, feel free to skip ahead to the [next chapter about installation options](installation.md). +This chapter is here to help you get started with Polars. It covers all the fundamental features and +functionalities of the library, making it easy for new users to familiarise themselves with the +basics from initial installation and setup to core functionalities. 
If you're already an advanced +user or familiar with dataframes, feel free to skip ahead to the +[next chapter about installation options](installation.md). ## Installing Polars @@ -22,7 +26,9 @@ This chapter is here to help you get started with Polars. It covers all the fund ## Reading & writing -Polars supports reading and writing for common file formats (e.g., csv, json, parquet), cloud storage (S3, Azure Blob, BigQuery) and databases (e.g., postgres, mysql). Below, we create a small dataframe and show how to write it to disk and read it back. +Polars supports reading and writing for common file formats (e.g., csv, json, parquet), cloud +storage (S3, Azure Blob, BigQuery) and databases (e.g., postgres, mysql). Below, we create a small +dataframe and show how to write it to disk and read it back. {{code_block('user-guide/getting-started','df',['DataFrame'])}} @@ -30,7 +36,8 @@ Polars supports reading and writing for common file formats (e.g., csv, json, pa --8<-- "python/user-guide/getting-started.py:df" ``` -In the example below we write the dataframe to a csv file called `output.csv`. After that, we read it back using `read_csv` and then print the result for inspection. +In the example below we write the dataframe to a csv file called `output.csv`. After that, we read +it back using `read_csv` and then print the result for inspection. {{code_block('user-guide/getting-started','csv',['read_csv','write_csv'])}} @@ -38,11 +45,13 @@ In the example below we write the dataframe to a csv file called `output.csv`. A --8<-- "python/user-guide/getting-started.py:csv" ``` -For more examples on the CSV file format and other data formats, see the [IO section](io/index.md) of the user guide. +For more examples on the CSV file format and other data formats, see the [IO section](io/index.md) +of the user guide. ## Expressions and contexts -_Expressions_ are one of the main strengths of Polars because they provide a modular and flexible way of expressing data transformations. +_Expressions_ are one of the main strengths of Polars because they provide a modular and flexible +way of expressing data transformations. Here is an example of a Polars expression: @@ -50,8 +59,10 @@ Here is an example of a Polars expression: pl.col("weight") / (pl.col("height") ** 2) ``` -As you might be able to guess, this expression takes the column named “weight” and divides its values by the square of the values in the column “height”, computing a person's BMI. -Note that the code above expresses an abstract computation: it's only inside a Polars _context_ that the expression materalizes into a series with the results. +As you might be able to guess, this expression takes the column named “weight” and divides its +values by the square of the values in the column “height”, computing a person's BMI. Note that the +code above expresses an abstract computation: it's only inside a Polars _context_ that the +expression materalizes into a series with the results. Below, we will show examples of Polars expressions inside different contexts: @@ -60,12 +71,13 @@ Below, we will show examples of Polars expressions inside different contexts: - `filter` - `group_by` -For a more [detailed exploration of expressions and contexts see the respective user guide section](concepts/expressions-and-contexts.md). +For a more +[detailed exploration of expressions and contexts see the respective user guide section](concepts/expressions-and-contexts.md). ### `select` -The context `select` allows you to select and manipulate columns from a dataframe. 
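+Before the rendered examples below, here is a minimal sketch of the earlier BMI expression being
+materialized inside `select`; the two rows of data are invented for illustration:
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"name": ["Alice", "Ben"], "weight": [57.9, 72.5], "height": [1.56, 1.77]})
+
+# The abstract expression from earlier only produces values once it runs inside a context.
+bmi = df.select((pl.col("weight") / (pl.col("height") ** 2)).alias("bmi"))
+```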
-In the simplest case, each expression you provide will map to a column in the result dataframe: +The context `select` allows you to select and manipulate columns from a dataframe. In the simplest +case, each expression you provide will map to a column in the result dataframe: {{code_block('user-guide/getting-started','select',['select','alias','Expr.dt'])}} @@ -73,9 +85,10 @@ In the simplest case, each expression you provide will map to a column in the re --8<-- "python/user-guide/getting-started.py:select" ``` -Polars also supports a feature called “expression expansion”, in which one expression acts as shorthand for multiple expressions. -In the example below, we use expression expansion to manipulate the columns “weight” and “height” with a single expression. -When using expression expansion you can use `.name.suffix` to add a suffix to the names of the original columns: +Polars also supports a feature called “expression expansion”, in which one expression acts as +shorthand for multiple expressions. In the example below, we use expression expansion to manipulate +the columns “weight” and “height” with a single expression. When using expression expansion you can +use `.name.suffix` to add a suffix to the names of the original columns: {{code_block('user-guide/getting-started','expression-expansion',['select','alias','Expr.name'])}} @@ -83,12 +96,16 @@ When using expression expansion you can use `.name.suffix` to add a suffix to th --8<-- "python/user-guide/getting-started.py:expression-expansion" ``` -You can check other sections of the user guide to learn more about [basic operations](expressions/basic-operations.md) or [column selections in expression expansion](expressions/expression-expansion.md). +You can check other sections of the user guide to learn more about +[basic operations](expressions/basic-operations.md) or +[column selections in expression expansion](expressions/expression-expansion.md). ### `with_columns` -The context `with_columns` is very similar to the context `select` but `with_columns` adds columns to the dataframe instead of selecting them. -Notice how the resulting dataframe contains the four columns of the original dataframe plus the two new columns introduced by the expressions inside `with_columns`: +The context `with_columns` is very similar to the context `select` but `with_columns` adds columns +to the dataframe instead of selecting them. Notice how the resulting dataframe contains the four +columns of the original dataframe plus the two new columns introduced by the expressions inside +`with_columns`: {{code_block('user-guide/getting-started','with_columns',['with_columns'])}} @@ -96,12 +113,14 @@ Notice how the resulting dataframe contains the four columns of the original dat --8<-- "python/user-guide/getting-started.py:with_columns" ``` -In the example above we also decided to use named expressions instead of the method `alias` to specify the names of the new columns. -Other contexts like `select` and `group_by` also accept named expressions. +In the example above we also decided to use named expressions instead of the method `alias` to +specify the names of the new columns. Other contexts like `select` and `group_by` also accept named +expressions. 
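+A minimal sketch of named expressions in `with_columns`, using invented columns; the keyword
+argument names become the names of the new columns, so no explicit `alias` is needed:
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"weight": [57.9, 72.5], "height": [1.56, 1.77]})
+
+df = df.with_columns(
+    bmi=pl.col("weight") / (pl.col("height") ** 2),
+    height_cm=pl.col("height") * 100,
+)
+```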
### `filter` -The context `filter` allows us to create a second dataframe with a subset of the rows of the original one: +The context `filter` allows us to create a second dataframe with a subset of the rows of the +original one: {{code_block('user-guide/getting-started','filter',['filter','Expr.dt'])}} @@ -109,7 +128,8 @@ The context `filter` allows us to create a second dataframe with a subset of the --8<-- "python/user-guide/getting-started.py:filter" ``` -You can also provide multiple predicate expressions as separate parameters, which is more convenient than putting them all together with `&`: +You can also provide multiple predicate expressions as separate parameters, which is more convenient +than putting them all together with `&`: {{code_block('user-guide/getting-started','filter-multiple',['filter','is_between'])}} @@ -119,8 +139,9 @@ You can also provide multiple predicate expressions as separate parameters, whic ### `group_by` -The context `group_by` can be used to group together the rows of the dataframe that share the same value across one or more expressions. -The example below counts how many people were born in each decade: +The context `group_by` can be used to group together the rows of the dataframe that share the same +value across one or more expressions. The example below counts how many people were born in each +decade: {{code_block('user-guide/getting-started','group_by',['group_by','alias','Expr.dt'])}} @@ -128,10 +149,12 @@ The example below counts how many people were born in each decade: --8<-- "python/user-guide/getting-started.py:group_by" ``` -The keyword argument `maintain_order` forces Polars to present the resulting groups in the same order as they appear in the original dataframe. -This slows down the grouping operation but is used here to ensure reproducibility of the examples. +The keyword argument `maintain_order` forces Polars to present the resulting groups in the same +order as they appear in the original dataframe. This slows down the grouping operation but is used +here to ensure reproducibility of the examples. -After using the context `group_by` we can use `agg` to compute aggregations over the resulting groups: +After using the context `group_by` we can use `agg` to compute aggregations over the resulting +groups: {{code_block('user-guide/getting-started','group_by-agg',['group_by','agg'])}} @@ -141,8 +164,9 @@ After using the context `group_by` we can use `agg` to compute aggregations over ### More complex queries -Contexts and the expressions within can be chained to create more complex queries according to your needs. -In the example below we combine some of the contexts we have seen so far to create a more complex query: +Contexts and the expressions within can be chained to create more complex queries according to your +needs. In the example below we combine some of the contexts we have seen so far to create a more +complex query: {{code_block('user-guide/getting-started','complex',['group_by','agg','select','with_columns','Expr.str','Expr.list'])}} @@ -152,13 +176,14 @@ In the example below we combine some of the contexts we have seen so far to crea ## Combining dataframes -Polars provides a number of tools to combine two dataframes. -In this section, we show an example of a join and an example of a concatenation. +Polars provides a number of tools to combine two dataframes. In this section, we show an example of +a join and an example of a concatenation. ### Joining dataframes -Polars provides many different join algorithms. 
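+As a quick sketch before the fuller example that follows (the two frames and the key column `name`
+are invented for illustration):
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"name": ["Alice", "Ben"], "height": [1.56, 1.77]})
+other_df = pl.DataFrame({"name": ["Alice", "Ben"], "has_pet": [True, False]})
+
+# A left outer join keeps every row of `df` and adds the matching columns of `other_df`.
+joined = df.join(other_df, on="name", how="left")
+```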
-The example below shows how to use a left outer join to combine two dataframes when a column can be used as a unique identifier to establish a correspondence between rows across the dataframes: +Polars provides many different join algorithms. The example below shows how to use a left outer join +to combine two dataframes when a column can be used as a unique identifier to establish a +correspondence between rows across the dataframes: {{code_block('user-guide/getting-started','join',['join'])}} @@ -166,12 +191,14 @@ The example below shows how to use a left outer join to combine two dataframes w --8<-- "python/user-guide/getting-started.py:join" ``` -Polars provides many different join algorithms that you can learn about in the [joins section of the user guide](transformations/joins.md). +Polars provides many different join algorithms that you can learn about in the +[joins section of the user guide](transformations/joins.md). ### Concatenating dataframes -Concatenating dataframes creates a taller or wider dataframe, depending on the method used. -Assuming we have a second dataframe with data from other people, we could use vertical concatenation to create a taller dataframe: +Concatenating dataframes creates a taller or wider dataframe, depending on the method used. Assuming +we have a second dataframe with data from other people, we could use vertical concatenation to +create a taller dataframe: {{code_block('user-guide/getting-started','concat',['concat'])}} @@ -179,5 +206,6 @@ Assuming we have a second dataframe with data from other people, we could use ve --8<-- "python/user-guide/getting-started.py:concat" ``` -Polars provides vertical and horizontal concatenation, as well as diagonal concatenation. -You can learn more about these in the [concatenations section of the user guide](transformations/concatenation.md). +Polars provides vertical and horizontal concatenation, as well as diagonal concatenation. You can +learn more about these in the +[concatenations section of the user guide](transformations/concatenation.md). diff --git a/docs/source/user-guide/gpu-support.md b/docs/source/user-guide/gpu-support.md index b350b94113d1..b68c2e16969b 100644 --- a/docs/source/user-guide/gpu-support.md +++ b/docs/source/user-guide/gpu-support.md @@ -1,6 +1,8 @@ # GPU Support [Open Beta] -Polars provides an in-memory, GPU-accelerated execution engine for Python users of the Lazy API on NVIDIA GPUs using [RAPIDS cuDF](https://docs.rapids.ai/api/cudf/stable/). This functionality is available in Open Beta and is undergoing rapid development. +Polars provides an in-memory, GPU-accelerated execution engine for Python users of the Lazy API on +NVIDIA GPUs using [RAPIDS cuDF](https://docs.rapids.ai/api/cudf/stable/). This functionality is +available in Open Beta and is undergoing rapid development. ### System Requirements @@ -12,7 +14,8 @@ See the [RAPIDS installation guide](https://docs.rapids.ai/install#system-req) f ### Installation -You can install the GPU backend for Polars with a feature flag as part of a normal [installation](installation.md). +You can install the GPU backend for Polars with a feature flag as part of a normal +[installation](installation.md). === ":fontawesome-brands-python: Python" @@ -31,7 +34,8 @@ pip install --extra-index-url=https://pypi.nvidia.com polars[gpu] ### Usage -Having built a query using the lazy API [as normal](lazy/index.md), GPU-enabled execution is requested by running `.collect(engine="gpu")` instead of `.collect()`. 
+Having built a query using the lazy API [as normal](lazy/index.md), GPU-enabled execution is +requested by running `.collect(engine="gpu")` instead of `.collect()`. {{ code_header("python", [], []) }} @@ -47,7 +51,9 @@ print(result) --8<-- "python/user-guide/lazy/gpu.py:simple-result" ``` -For more detailed control over the execution, for example to specify which GPU to use on a multi-GPU node, we can provide a `GPUEngine` object. By default, the GPU engine will use a configuration applicable to most use cases. +For more detailed control over the execution, for example to specify which GPU to use on a multi-GPU +node, we can provide a `GPUEngine` object. By default, the GPU engine will use a configuration +applicable to most use cases. {{ code_header("python", [], []) }} @@ -64,13 +70,18 @@ print(result) ### How It Works -When you use the GPU-accelerated engine, Polars creates and optimizes a query plan and dispatches to a [RAPIDS](https://rapids.ai/) cuDF-based physical execution engine to compute the results on NVIDIA GPUs. The final result is returned as a normal CPU-backed Polars dataframe. +When you use the GPU-accelerated engine, Polars creates and optimizes a query plan and dispatches to +a [RAPIDS](https://rapids.ai/) cuDF-based physical execution engine to compute the results on NVIDIA +GPUs. The final result is returned as a normal CPU-backed Polars dataframe. ### What's Supported on the GPU? -GPU support is currently in Open Beta and the engine is undergoing rapid development. The engine currently supports many, but not all, of the core expressions and data types. +GPU support is currently in Open Beta and the engine is undergoing rapid development. The engine +currently supports many, but not all, of the core expressions and data types. -Since expressions are composable, it's not feasible to list a full matrix of expressions supported on the GPU. Instead, we provide a list of the high-level categories of expressions and interfaces that are currently supported and not supported. +Since expressions are composable, it's not feasible to list a full matrix of expressions supported +on the GPU. Instead, we provide a list of the high-level categories of expressions and interfaces +that are currently supported and not supported. #### Supported @@ -99,9 +110,14 @@ Since expressions are composable, it's not feasible to list a full matrix of exp #### Did my query use the GPU? -The release of the GPU engine in Open Beta implies that we expect things to work well, but there are still some rough edges we're working on. In particular the full breadth of the Polars expression API is not yet supported. With fallback to the CPU, your query _should_ complete, but you might not observe any change in the time it takes to execute. There are two ways to get more information on whether the query ran on the GPU. +The release of the GPU engine in Open Beta implies that we expect things to work well, but there are +still some rough edges we're working on. In particular the full breadth of the Polars expression API +is not yet supported. With fallback to the CPU, your query _should_ complete, but you might not +observe any change in the time it takes to execute. There are two ways to get more information on +whether the query ran on the GPU. 
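+As a sketch of the first option, verbose mode can be switched on through `pl.Config` around the
+`collect` call; the tiny query below is invented, and this assumes the GPU package from the
+installation section is present:
+
+```python
+import polars as pl
+
+q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a").sum())
+
+# With verbose logging enabled, work that cannot run on the GPU is reported when it
+# falls back to the CPU engine.
+with pl.Config(verbose=True):
+    result = q.collect(engine="gpu")
+```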
-When running in verbose mode, any queries that cannot execute on the GPU will issue a `PerformanceWarning`: +When running in verbose mode, any queries that cannot execute on the GPU will issue a +`PerformanceWarning`: {{ code_header("python", [], []) }} @@ -126,7 +142,8 @@ print() print(q.collect()) ``` -To disable fallback, and have the GPU engine raise an exception if a query is unsupported, we can pass an appropriately configured `GPUEngine` object: +To disable fallback, and have the GPU engine raise an exception if a query is unsupported, we can +pass an appropriately configured `GPUEngine` object: {{ code_header("python", [], []) }} @@ -142,28 +159,47 @@ Traceback (most recent call last): polars.exceptions.ComputeError: 'cuda' conversion failed: NotImplementedError: Grouped rolling window not implemented ``` -Currently, only the proximal cause of failure to execute on the GPU is reported, we plan to extend this functionality to report all unsupported operations for a query. +Currently, only the proximal cause of failure to execute on the GPU is reported, we plan to extend +this functionality to report all unsupported operations for a query. ### Testing -The Polars and NVIDIA RAPIDS teams run comprehensive unit and integration tests to ensure that the GPU-accelerated Polars backend works smoothly. +The Polars and NVIDIA RAPIDS teams run comprehensive unit and integration tests to ensure that the +GPU-accelerated Polars backend works smoothly. -The **full** Polars test suite is run on every commit made to the GPU engine, ensuring consistency of results. +The **full** Polars test suite is run on every commit made to the GPU engine, ensuring consistency +of results. -The GPU engine currently passes 99.2% of the Polars unit tests with CPU fallback enabled. Without CPU fallback, the GPU engine passes 88.8% of the Polars unit tests. With fallback, there are approximately 100 failing tests: around 40 of these fail due to mismatching debug output; there are some cases where the GPU engine produces the a correct result but uses a different data type; the remainder are cases where we do not correctly determine that a query is unsupported and therefore fail at runtime, instead of falling back. +The GPU engine currently passes 99.2% of the Polars unit tests with CPU fallback enabled. Without +CPU fallback, the GPU engine passes 88.8% of the Polars unit tests. With fallback, there are +approximately 100 failing tests: around 40 of these fail due to mismatching debug output; there are +some cases where the GPU engine produces the a correct result but uses a different data type; the +remainder are cases where we do not correctly determine that a query is unsupported and therefore +fail at runtime, instead of falling back. ### When Should I Use a GPU? -Based on our benchmarking, you're most likely to observe speedups using the GPU engine when your workflow's profile is dominated by grouped aggregations and joins. In contrast I/O bound queries typically show similar performance on GPU and CPU. GPUs typically have less RAM than CPU systems, therefore very large datasets will fail due to out of memory errors. Based on our testing, raw datasets of 50-100 GiB fit (depending on the workflow) well with a GPU with 80GiB of memory. +Based on our benchmarking, you're most likely to observe speedups using the GPU engine when your +workflow's profile is dominated by grouped aggregations and joins. In contrast I/O bound queries +typically show similar performance on GPU and CPU. 
GPUs typically have less RAM than CPU systems, +therefore very large datasets will fail due to out of memory errors. Based on our testing, raw +datasets of 50-100 GiB fit (depending on the workflow) well with a GPU with 80GiB of memory. ### CPU-GPU Interoperability -Both the CPU and GPU engine use the Apache Arrow columnar memory specification, making it possible to quickly move data between the CPU and GPU. Additionally, files written by one engine can be read by the other engine. +Both the CPU and GPU engine use the Apache Arrow columnar memory specification, making it possible +to quickly move data between the CPU and GPU. Additionally, files written by one engine can be read +by the other engine. -When using GPU mode, your workflow won't fail if something isn't supported. When you run `collect(engine="gpu")`, the optimized query plan is inspected to see whether it can be executed on the GPU. If it can't, it will transparently fall back to the standard Polars engine and run on the CPU. +When using GPU mode, your workflow won't fail if something isn't supported. When you run +`collect(engine="gpu")`, the optimized query plan is inspected to see whether it can be executed on +the GPU. If it can't, it will transparently fall back to the standard Polars engine and run on the +CPU. -GPU execution is only available in the Lazy API, so materialized DataFrames will reside in CPU memory when the query execution finishes. +GPU execution is only available in the Lazy API, so materialized DataFrames will reside in CPU +memory when the query execution finishes. ### Providing feedback -Please report issues, and missing features, on the Polars [issue tracker](https://github.com/pola-rs/polars/issues). +Please report issues, and missing features, on the Polars +[issue tracker](https://github.com/pola-rs/polars/issues). diff --git a/docs/source/user-guide/installation.md b/docs/source/user-guide/installation.md index 0cecd7cd5f4b..8690408da179 100644 --- a/docs/source/user-guide/installation.md +++ b/docs/source/user-guide/installation.md @@ -1,6 +1,7 @@ # Installation -Polars is a library and installation is as simple as invoking the package manager of the corresponding programming language. +Polars is a library and installation is as simple as invoking the package manager of the +corresponding programming language. === ":fontawesome-brands-python: Python" @@ -23,8 +24,8 @@ Polars is a library and installation is as simple as invoking the package manage ## Big Index -By default, Polars dataframes are limited to $2^{32}$ rows (~4.3 billion). -Increase this limit to $2^{64}$ (~18 quintillion) by enabling the big index extension: +By default, Polars dataframes are limited to $2^{32}$ rows (~4.3 billion). Increase this limit to +$2^{64}$ (~18 quintillion) by enabling the big index extension: === ":fontawesome-brands-python: Python" @@ -44,7 +45,8 @@ Increase this limit to $2^{64}$ (~18 quintillion) by enabling the big index exte ## Legacy CPU -To install Polars for Python on an old CPU without [AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) support, run: +To install Polars for Python on an old CPU without +[AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) support, run: === ":fontawesome-brands-python: Python" @@ -70,11 +72,10 @@ To use the library, simply import it into your project: ## Feature flags -By using the above command you install the core of Polars onto your system. -However, depending on your use case, you might want to install the optional dependencies as well. 
-These are made optional to minimize the footprint. -The flags are different depending on the programming language. -Throughout the user guide we will mention when a functionality used requires an additional dependency. +By using the above command you install the core of Polars onto your system. However, depending on +your use case, you might want to install the optional dependencies as well. These are made optional +to minimize the footprint. The flags are different depending on the programming language. Throughout +the user guide we will mention when a functionality used requires an additional dependency. ### Python diff --git a/docs/source/user-guide/io/cloud-storage.md b/docs/source/user-guide/io/cloud-storage.md index f3b5d7a8fb09..f12ad4576ebd 100644 --- a/docs/source/user-guide/io/cloud-storage.md +++ b/docs/source/user-guide/io/cloud-storage.md @@ -1,8 +1,10 @@ # Cloud storage -Polars can read and write to AWS S3, Azure Blob Storage and Google Cloud Storage. The API is the same for all three storage providers. +Polars can read and write to AWS S3, Azure Blob Storage and Google Cloud Storage. The API is the +same for all three storage providers. -To read from cloud storage, additional dependencies may be needed depending on the use case and cloud storage provider: +To read from cloud storage, additional dependencies may be needed depending on the use case and +cloud storage provider: === ":fontawesome-brands-python: Python" @@ -24,7 +26,10 @@ Polars supports reading Parquet, CSV, IPC and NDJSON files from cloud storage: ## Scanning from cloud storage with query optimisation -Using `pl.scan_*` functions to read from cloud storage can benefit from [predicate and projection pushdowns](../lazy/optimizations.md), where the query optimizer will apply them before the file is downloaded. This can significantly reduce the amount of data that needs to be downloaded. The query evaluation is triggered by calling `collect`. +Using `pl.scan_*` functions to read from cloud storage can benefit from +[predicate and projection pushdowns](../lazy/optimizations.md), where the query optimizer will apply +them before the file is downloaded. This can significantly reduce the amount of data that needs to +be downloaded. The query evaluation is triggered by calling `collect`. {{code_block('user-guide/io/cloud-storage','scan_parquet_query',[])}} @@ -42,19 +47,23 @@ use for authentication. This can be done in a few ways: ### Using one of the available `CredentialProvider*` utility classes -- There may be a utility class `pl.CredentialProvider*` that provides the required authentication functionality. For example, `pl.CredentialProviderAWS` supports selecting AWS profiles, as well as assuming an IAM role: +- There may be a utility class `pl.CredentialProvider*` that provides the required authentication + functionality. For example, `pl.CredentialProviderAWS` supports selecting AWS profiles, as well as + assuming an IAM role: {{code_block('user-guide/io/cloud-storage','credential_provider_class',['scan_parquet'])}} ### Using a custom `credential_provider` function -- Some environments may require custom authentication logic (e.g. AWS IAM role-chaining). For these cases a Python function can be provided for Polars to use to retrieve credentials: +- Some environments may require custom authentication logic (e.g. AWS IAM role-chaining). 
For these + cases a Python function can be provided for Polars to use to retrieve credentials: {{code_block('user-guide/io/cloud-storage','credential_provider_custom_func',['scan_parquet'])}} ## Scanning with PyArrow -We can also scan from cloud storage using PyArrow. This is particularly useful for partitioned datasets such as Hive partitioning. +We can also scan from cloud storage using PyArrow. This is particularly useful for partitioned +datasets such as Hive partitioning. We first create a PyArrow dataset and then create a `LazyFrame` from the dataset. @@ -62,6 +71,7 @@ We first create a PyArrow dataset and then create a `LazyFrame` from the dataset ## Writing to cloud storage -We can write a `DataFrame` to cloud storage in Python using s3fs for S3, adlfs for Azure Blob Storage and gcsfs for Google Cloud Storage. In this example, we write a Parquet file to S3. +We can write a `DataFrame` to cloud storage in Python using s3fs for S3, adlfs for Azure Blob +Storage and gcsfs for Google Cloud Storage. In this example, we write a Parquet file to S3. {{code_block('user-guide/io/cloud-storage','write_parquet',['write_parquet'])}} diff --git a/docs/source/user-guide/io/csv.md b/docs/source/user-guide/io/csv.md index f654d970ac81..0681daddbd5e 100644 --- a/docs/source/user-guide/io/csv.md +++ b/docs/source/user-guide/io/csv.md @@ -12,10 +12,10 @@ Writing a CSV file is similar with the `write_csv` function: ## Scan -Polars allows you to _scan_ a CSV input. Scanning delays the actual parsing of the -file and instead returns a lazy computation holder called a `LazyFrame`. +Polars allows you to _scan_ a CSV input. Scanning delays the actual parsing of the file and instead +returns a lazy computation holder called a `LazyFrame`. {{code_block('user-guide/io/csv','scan',['scan_csv'])}} -If you want to know why this is desirable, you can read more about these Polars -optimizations [here](../concepts/lazy-api.md). +If you want to know why this is desirable, you can read more about these Polars optimizations +[here](../concepts/lazy-api.md). diff --git a/docs/source/user-guide/io/database.md b/docs/source/user-guide/io/database.md index 3eea8836a084..e9b3a6075799 100644 --- a/docs/source/user-guide/io/database.md +++ b/docs/source/user-guide/io/database.md @@ -6,32 +6,44 @@ Polars can read from a database using the `pl.read_database_uri` and `pl.read_da ### Difference between `read_database_uri` and `read_database` -Use `pl.read_database_uri` if you want to specify the database connection with a connection string called a `uri`. For example, the following snippet shows a query to read all columns from the `foo` table in a Postgres database where we use the `uri` to connect: +Use `pl.read_database_uri` if you want to specify the database connection with a connection string +called a `uri`. For example, the following snippet shows a query to read all columns from the `foo` +table in a Postgres database where we use the `uri` to connect: {{code_block('user-guide/io/database','read_uri',['read_database_uri'])}} -On the other hand, use `pl.read_database` if you want to connect via a connection engine created with a library like SQLAlchemy. +On the other hand, use `pl.read_database` if you want to connect via a connection engine created +with a library like SQLAlchemy. 
{{code_block('user-guide/io/database','read_cursor',['read_database'])}} -Note that `pl.read_database_uri` is likely to be faster than `pl.read_database` if you are using a SQLAlchemy or DBAPI2 connection as these connections may load the data row-wise into Python before copying the data again to the column-wise Apache Arrow format. +Note that `pl.read_database_uri` is likely to be faster than `pl.read_database` if you are using a +SQLAlchemy or DBAPI2 connection as these connections may load the data row-wise into Python before +copying the data again to the column-wise Apache Arrow format. ### Engines -Polars doesn't manage connections and data transfer from databases by itself. Instead, external libraries (known as _engines_) handle this. +Polars doesn't manage connections and data transfer from databases by itself. Instead, external +libraries (known as _engines_) handle this. -When using `pl.read_database`, you specify the engine when you create the connection object. When using `pl.read_database_uri`, you can specify one of two engines to read from the database: +When using `pl.read_database`, you specify the engine when you create the connection object. When +using `pl.read_database_uri`, you can specify one of two engines to read from the database: - [ConnectorX](https://github.com/sfu-db/connector-x) and - [ADBC](https://arrow.apache.org/docs/format/ADBC.html) -Both engines have native support for Apache Arrow and so can read data directly into a Polars `DataFrame` without copying the data. +Both engines have native support for Apache Arrow and so can read data directly into a Polars +`DataFrame` without copying the data. #### ConnectorX -ConnectorX is the default engine and [supports numerous databases](https://github.com/sfu-db/connector-x#sources) including Postgres, Mysql, SQL Server and Redshift. ConnectorX is written in Rust and stores data in Arrow format to allow for zero-copy to Polars. +ConnectorX is the default engine and +[supports numerous databases](https://github.com/sfu-db/connector-x#sources) including Postgres, +Mysql, SQL Server and Redshift. ConnectorX is written in Rust and stores data in Arrow format to +allow for zero-copy to Polars. -To read from one of the supported databases with `ConnectorX` you need to activate the additional dependency `ConnectorX` when installing Polars or install it manually with +To read from one of the supported databases with `ConnectorX` you need to activate the additional +dependency `ConnectorX` when installing Polars or install it manually with ```shell $ pip install connectorx @@ -39,15 +51,22 @@ $ pip install connectorx #### ADBC -ADBC (Arrow Database Connectivity) is an engine supported by the Apache Arrow project. ADBC aims to be both an API standard for connecting to databases and libraries implementing this standard in a range of languages. +ADBC (Arrow Database Connectivity) is an engine supported by the Apache Arrow project. ADBC aims to +be both an API standard for connecting to databases and libraries implementing this standard in a +range of languages. -It is still early days for ADBC so support for different databases is limited. At present, drivers for ADBC are only available for [Postgres](https://pypi.org/project/adbc-driver-postgresql/), [SQLite](https://pypi.org/project/adbc-driver-sqlite/) and [Snowflake](https://pypi.org/project/adbc-driver-snowflake/). To install ADBC, you need to install the driver for your database. 
For example, to install the driver for SQLite, you run: +It is still early days for ADBC so support for different databases is limited. At present, drivers +for ADBC are only available for [Postgres](https://pypi.org/project/adbc-driver-postgresql/), +[SQLite](https://pypi.org/project/adbc-driver-sqlite/) and +[Snowflake](https://pypi.org/project/adbc-driver-snowflake/). To install ADBC, you need to install +the driver for your database. For example, to install the driver for SQLite, you run: ```shell $ pip install adbc-driver-sqlite ``` -As ADBC is not the default engine, you must specify the engine as an argument to `pl.read_database_uri`. +As ADBC is not the default engine, you must specify the engine as an argument to +`pl.read_database_uri`. {{code_block('user-guide/io/database','adbc',['read_database_uri'])}} @@ -57,14 +76,16 @@ We can write to a database with Polars using the `pl.write_database` function. ### Engines -As with reading from a database above, Polars uses an _engine_ to write to a database. The currently supported engines are: +As with reading from a database above, Polars uses an _engine_ to write to a database. The currently +supported engines are: - [SQLAlchemy](https://www.sqlalchemy.org/) and - Arrow Database Connectivity (ADBC) #### SQLAlchemy -With the default engine SQLAlchemy you can write to any database supported by SQLAlchemy. To use this engine you need to install SQLAlchemy and Pandas +With the default engine SQLAlchemy you can write to any database supported by SQLAlchemy. To use +this engine you need to install SQLAlchemy and Pandas ```shell $ pip install SQLAlchemy pandas @@ -74,10 +95,13 @@ In this example, we write the `DataFrame` to a table called `records` in the dat {{code_block('user-guide/io/database','write',['write_database'])}} -In the SQLAlchemy approach, Polars converts the `DataFrame` to a Pandas `DataFrame` backed by PyArrow and then uses SQLAlchemy methods on a Pandas `DataFrame` to write to the database. +In the SQLAlchemy approach, Polars converts the `DataFrame` to a Pandas `DataFrame` backed by +PyArrow and then uses SQLAlchemy methods on a Pandas `DataFrame` to write to the database. #### ADBC -ADBC can also be used to write to a database. Writing is supported for the same databases that support reading with ADBC. As shown above, you need to install the appropriate ADBC driver for your database. +ADBC can also be used to write to a database. Writing is supported for the same databases that +support reading with ADBC. As shown above, you need to install the appropriate ADBC driver for your +database. {{code_block('user-guide/io/database','write_adbc',['write_database'])}} diff --git a/docs/source/user-guide/io/excel.md b/docs/source/user-guide/io/excel.md index f5515d789645..ce0a9a5ee208 100644 --- a/docs/source/user-guide/io/excel.md +++ b/docs/source/user-guide/io/excel.md @@ -1,19 +1,26 @@ # Excel -Polars can read and write to Excel files from Python. -From a performance perspective, we recommend using other formats if possible, such as Parquet or CSV files. +Polars can read and write to Excel files from Python. From a performance perspective, we recommend +using other formats if possible, such as Parquet or CSV files. ## Read -Polars does not have a native Excel reader. Instead, it uses an external library called an "engine" to parse Excel files into a form that Polars can parse. The available engines are: +Polars does not have a native Excel reader. 
Instead, it uses an external library called an "engine"
+to parse Excel files into a form that Polars can parse. The available engines are:

-- fastexcel: This engine is based on the Rust [calamine](https://github.com/tafia/calamine) crate and is (by far) the fastest reader.
-- xlsx2csv: This reader parses the .xlsx file to an in-memory CSV that Polars then reads with its own CSV reader.
-- openpyxl: Typically slower than xls2csv, but can provide more flexibility for files that are difficult to parse.
+- fastexcel: This engine is based on the Rust [calamine](https://github.com/tafia/calamine) crate
+  and is (by far) the fastest reader.
+- xlsx2csv: This reader parses the .xlsx file to an in-memory CSV that Polars then reads with its
+  own CSV reader.
+- openpyxl: Typically slower than xlsx2csv, but can provide more flexibility for files that are
+  difficult to parse.

-We recommend working with the default fastexcel engine. The xlsx2csv and openpyxl engines are slower but may have more features for parsing tricky data. These engines may be helpful if the fastexcel reader does not work for a specific Excel file.
+We recommend working with the default fastexcel engine. The xlsx2csv and openpyxl engines are slower
+but may have more features for parsing tricky data. These engines may be helpful if the fastexcel
+reader does not work for a specific Excel file.

-To use one of these engines, the appropriate Python package must be installed as an additional dependency.
+To use one of these engines, the appropriate Python package must be installed as an additional
+dependency.

=== ":fontawesome-brands-python: Python"

@@ -21,11 +28,14 @@ To use one of these engines, the appropriate Python package must be installed as
    $ pip install fastexcel xlsx2csv openpyxl
    ```

-The default engine for reading .xslx files is fastexcel. This engine uses the Rust calamine crate to read .xslx files into an Apache Arrow in-memory representation that Polars can read without needing to copy the data.
+The default engine for reading .xlsx files is fastexcel. This engine uses the Rust calamine crate to
+read .xlsx files into an Apache Arrow in-memory representation that Polars can read without needing
+to copy the data.

{{code_block('user-guide/io/excel','read',['read_excel'])}}

-We can specify the sheet name to read with the `sheet_name` argument. If we do not specify a sheet name, the first sheet will be read.
+We can specify the sheet name to read with the `sheet_name` argument. If we do not specify a sheet
+name, the first sheet will be read.

{{code_block('user-guide/io/excel','read_sheet_name',['read_excel'])}}

@@ -39,7 +49,8 @@ We need the xlswriter library installed as an additional dependency to write to

```shell
$ pip install xlsxwriter
```

-Writing to Excel files is not currently available in Rust Polars, though it is possible to [use this crate](https://docs.rs/crate/xlsxwriter/latest) to write to Excel files from Rust.
+Writing to Excel files is not currently available in Rust Polars, though it is possible to
+[use this crate](https://docs.rs/crate/xlsxwriter/latest) to write to Excel files from Rust.

Writing a `DataFrame` to an Excel file is done with the `write_excel` method:

{{code_block('user-guide/io/excel','write',['write_excel'])}}

The name of the worksheet can be specified with the `worksheet` argument.

{{code_block('user-guide/io/excel','write_sheet_name',['write_excel'])}}

-Polars can create rich Excel files with multiple sheets and formatting. For more details, see the API docs for `write_excel`.
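+As a small end-to-end sketch (the file name and frame are invented; writing needs xlsxwriter and
+reading uses the default fastexcel engine):
+
+```python
+import polars as pl
+
+df = pl.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})
+
+# Write a single worksheet called "data" and read the same sheet back.
+df.write_excel("output.xlsx", worksheet="data")
+df2 = pl.read_excel("output.xlsx", sheet_name="data")
+```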
+Polars can create rich Excel files with multiple sheets and formatting. For more details, see the +API docs for `write_excel`. diff --git a/docs/source/user-guide/io/hugging-face.md b/docs/source/user-guide/io/hugging-face.md index 16f705ae75fb..692586137831 100644 --- a/docs/source/user-guide/io/hugging-face.md +++ b/docs/source/user-guide/io/hugging-face.md @@ -14,11 +14,14 @@ Hugging Face: ### Path format -To scan from Hugging Face, a `hf://` path can be passed to the scan functions. The `hf://` path format is defined as `hf://BUCKET/REPOSITORY@REVISION/PATH`, where: +To scan from Hugging Face, a `hf://` path can be passed to the scan functions. The `hf://` path +format is defined as `hf://BUCKET/REPOSITORY@REVISION/PATH`, where: - `BUCKET` is one of `datasets` or `spaces` -- `REPOSITORY` is the location of the repository, this is usually in the format of `username/repo_name`. A branch can also be optionally specified by appending `@branch` -- `REVISION` is the name of the branch (or commit) to use. This is optional and defaults to `main` if not given. +- `REPOSITORY` is the location of the repository, this is usually in the format of + `username/repo_name`. A branch can also be optionally specified by appending `@branch` +- `REVISION` is the name of the branch (or commit) to use. This is optional and defaults to `main` + if not given. - `PATH` is a file or directory path, or a glob pattern from the repository root. Example `hf://` paths: @@ -32,9 +35,11 @@ Example `hf://` paths: ### Authentication -A Hugging Face API key can be passed to Polars to access private locations using either of the following methods: +A Hugging Face API key can be passed to Polars to access private locations using either of the +following methods: -- Passing a `token` in `storage_options` to the scan function, e.g. `scan_parquet(..., storage_options={'token': ''})` +- Passing a `token` in `storage_options` to the scan function, e.g. + `scan_parquet(..., storage_options={'token': ''})` - Setting the `HF_TOKEN` environment variable, e.g. 
`export HF_TOKEN=` ### Examples @@ -51,7 +56,8 @@ A Hugging Face API key can be passed to Polars to access private locations using --8<-- "python/user-guide/io/hugging-face.py:scan_iris_repr" ``` -See this file at [https://huggingface.co/datasets/nameexhaustion/polars-docs/blob/main/iris.csv](https://huggingface.co/datasets/nameexhaustion/polars-docs/blob/main/iris.csv) +See this file at +[https://huggingface.co/datasets/nameexhaustion/polars-docs/blob/main/iris.csv](https://huggingface.co/datasets/nameexhaustion/polars-docs/blob/main/iris.csv) #### NDJSON @@ -61,7 +67,8 @@ See this file at [https://huggingface.co/datasets/nameexhaustion/polars-docs/blo --8<-- "python/user-guide/io/hugging-face.py:scan_iris_repr" ``` -See this file at [https://huggingface.co/datasets/nameexhaustion/polars-docs/blob/main/iris.jsonl](https://huggingface.co/datasets/nameexhaustion/polars-docs/blob/main/iris.jsonl) +See this file at +[https://huggingface.co/datasets/nameexhaustion/polars-docs/blob/main/iris.jsonl](https://huggingface.co/datasets/nameexhaustion/polars-docs/blob/main/iris.jsonl) #### Parquet @@ -71,7 +78,8 @@ See this file at [https://huggingface.co/datasets/nameexhaustion/polars-docs/blo --8<-- "python/user-guide/io/hugging-face.py:scan_parquet_hive_repr" ``` -See this folder at [https://huggingface.co/datasets/nameexhaustion/polars-docs/tree/main/hive_dates/](https://huggingface.co/datasets/nameexhaustion/polars-docs/tree/main/hive_dates/) +See this folder at +[https://huggingface.co/datasets/nameexhaustion/polars-docs/tree/main/hive_dates/](https://huggingface.co/datasets/nameexhaustion/polars-docs/tree/main/hive_dates/) #### IPC @@ -81,4 +89,5 @@ See this folder at [https://huggingface.co/datasets/nameexhaustion/polars-docs/t --8<-- "python/user-guide/io/hugging-face.py:scan_ipc_repr" ``` -See this file at [https://huggingface.co/spaces/nameexhaustion/polars-docs/blob/main/orders.feather](https://huggingface.co/spaces/nameexhaustion/polars-docs/blob/main/orders.feather) +See this file at +[https://huggingface.co/spaces/nameexhaustion/polars-docs/blob/main/orders.feather](https://huggingface.co/spaces/nameexhaustion/polars-docs/blob/main/orders.feather) diff --git a/docs/source/user-guide/io/index.md b/docs/source/user-guide/io/index.md index 042561d284dc..a4f56d188f81 100644 --- a/docs/source/user-guide/io/index.md +++ b/docs/source/user-guide/io/index.md @@ -1,6 +1,7 @@ # IO -Reading and writing your data is crucial for a DataFrame library. In this chapter you will learn more on how to read and write to different file formats that are supported by Polars. +Reading and writing your data is crucial for a DataFrame library. In this chapter you will learn +more on how to read and write to different file formats that are supported by Polars. - [CSV](csv.md) - [Excel](excel.md) diff --git a/docs/source/user-guide/io/json.md b/docs/source/user-guide/io/json.md index ff35a708e1e6..043ffe48fd19 100644 --- a/docs/source/user-guide/io/json.md +++ b/docs/source/user-guide/io/json.md @@ -12,7 +12,8 @@ Reading a JSON file should look familiar: ### Newline Delimited JSON -JSON objects that are delimited by newlines can be read into Polars in a much more performant way than standard json. +JSON objects that are delimited by newlines can be read into Polars in a much more performant way +than standard json. 
Polars can read an NDJSON file into a `DataFrame` using the `read_ndjson` function: @@ -24,7 +25,7 @@ Polars can read an NDJSON file into a `DataFrame` using the `read_ndjson` functi ## Scan -Polars allows you to _scan_ a JSON input **only for newline delimited json**. Scanning delays the actual parsing of the -file and instead returns a lazy computation holder called a `LazyFrame`. +Polars allows you to _scan_ a JSON input **only for newline delimited json**. Scanning delays the +actual parsing of the file and instead returns a lazy computation holder called a `LazyFrame`. {{code_block('user-guide/io/json','scan',['scan_ndjson'])}} diff --git a/docs/source/user-guide/io/multiple.md b/docs/source/user-guide/io/multiple.md index e94e88319011..ab21409b254c 100644 --- a/docs/source/user-guide/io/multiple.md +++ b/docs/source/user-guide/io/multiple.md @@ -17,8 +17,8 @@ To read multiple files into a single `DataFrame`, we can use globbing patterns: --8<-- "python/user-guide/io/multiple.py:read" ``` -To see how this works we can take a look at the query plan. Below we see that all files are read separately and -concatenated into a single `DataFrame`. Polars will try to parallelize the reading. +To see how this works we can take a look at the query plan. Below we see that all files are read +separately and concatenated into a single `DataFrame`. Polars will try to parallelize the reading. {{code_block('user-guide/io/multiple','graph',['show_graph'])}} @@ -28,8 +28,8 @@ concatenated into a single `DataFrame`. Polars will try to parallelize the readi ## Reading and processing in parallel -If your files don't have to be in a single table you can also build a query plan for each file and execute them in parallel -on the Polars thread pool. +If your files don't have to be in a single table you can also build a query plan for each file and +execute them in parallel on the Polars thread pool. All query plan execution is embarrassingly parallel and doesn't require any communication. diff --git a/docs/source/user-guide/io/parquet.md b/docs/source/user-guide/io/parquet.md index e04c2bdde2e7..2906a54e3f78 100644 --- a/docs/source/user-guide/io/parquet.md +++ b/docs/source/user-guide/io/parquet.md @@ -1,8 +1,12 @@ # Parquet -Loading or writing [`Parquet` files](https://parquet.apache.org/) is lightning fast as the layout of data in a Polars `DataFrame` in memory mirrors the layout of a Parquet file on disk in many respects. +Loading or writing [`Parquet` files](https://parquet.apache.org/) is lightning fast as the layout of +data in a Polars `DataFrame` in memory mirrors the layout of a Parquet file on disk in many +respects. -Unlike CSV, Parquet is a columnar format. This means that the data is stored in columns rather than rows. This is a more efficient way of storing data as it allows for better compression and faster access to data. +Unlike CSV, Parquet is a columnar format. This means that the data is stored in columns rather than +rows. This is a more efficient way of storing data as it allows for better compression and faster +access to data. ## Read @@ -16,10 +20,15 @@ We can read a `Parquet` file into a `DataFrame` using the `read_parquet` functio ## Scan -Polars allows you to _scan_ a `Parquet` input. Scanning delays the actual parsing of the file and instead returns a lazy computation holder called a `LazyFrame`. +Polars allows you to _scan_ a `Parquet` input. Scanning delays the actual parsing of the file and +instead returns a lazy computation holder called a `LazyFrame`. 
{{code_block('user-guide/io/parquet','scan',['scan_parquet'])}} -If you want to know why this is desirable, you can read more about those Polars optimizations [here](../concepts/lazy-api.md). +If you want to know why this is desirable, you can read more about those Polars optimizations +[here](../concepts/lazy-api.md). -When we scan a `Parquet` file stored in the cloud, we can also apply predicate and projection pushdowns. This can significantly reduce the amount of data that needs to be downloaded. For scanning a Parquet file in the cloud, see [Cloud storage](cloud-storage.md/#scanning-from-cloud-storage-with-query-optimisation). +When we scan a `Parquet` file stored in the cloud, we can also apply predicate and projection +pushdowns. This can significantly reduce the amount of data that needs to be downloaded. For +scanning a Parquet file in the cloud, see +[Cloud storage](cloud-storage.md/#scanning-from-cloud-storage-with-query-optimisation). diff --git a/docs/source/user-guide/lazy/execution.md b/docs/source/user-guide/lazy/execution.md index da3e154270b2..cc70204e418c 100644 --- a/docs/source/user-guide/lazy/execution.md +++ b/docs/source/user-guide/lazy/execution.md @@ -4,7 +4,8 @@ Our example query on the Reddit dataset is: {{code_block('user-guide/lazy/execution','df',['scan_csv'])}} -If we were to run the code above on the Reddit CSV the query would not be evaluated. Instead Polars takes each line of code, adds it to the internal query graph and optimizes the query graph. +If we were to run the code above on the Reddit CSV the query would not be evaluated. Instead Polars +takes each line of code, adds it to the internal query graph and optimizes the query graph. When we execute the code Polars executes the optimized query graph by default. @@ -35,7 +36,8 @@ shape: (14_029, 6) Above we see that from the 10 million rows there are 14,029 rows that match our predicate. -With the default `collect` method Polars processes all of your data as one batch. This means that all the data has to fit into your available memory at the point of peak memory usage in your query. +With the default `collect` method Polars processes all of your data as one batch. This means that +all the data has to fit into your available memory at the point of peak memory usage in your query. !!! warning "Reusing `LazyFrame` objects" @@ -43,7 +45,9 @@ With the default `collect` method Polars processes all of your data as one batch ### Execution on larger-than-memory data -If your data requires more memory than you have available Polars may be able to process the data in batches using _streaming_ mode. To use streaming mode you simply pass the `streaming=True` argument to `collect` +If your data requires more memory than you have available Polars may be able to process the data in +batches using _streaming_ mode. To use streaming mode you simply pass the `streaming=True` argument +to `collect` {{code_block('user-guide/lazy/execution','stream',['scan_csv','collect'])}} @@ -51,10 +55,12 @@ We look at [streaming in more detail here](streaming.md). ### Execution on a partial dataset -While you're writing, optimizing or checking your query on a large dataset, querying all available data may lead to a slow development process. +While you're writing, optimizing or checking your query on a large dataset, querying all available +data may lead to a slow development process. -Instead, you can scan a subset of your partitions or use `.head`/`.collect` at the beginning and end of your query, respectively. 
-Keep in mind that the results of aggregations and filters on subsets of your data may not be representative of the result you would get on the full data. +Instead, you can scan a subset of your partitions or use `.head`/`.collect` at the beginning and end +of your query, respectively. Keep in mind that the results of aggregations and filters on subsets of +your data may not be representative of the result you would get on the full data. {{code_block('user-guide/lazy/execution','partial',['scan_csv','collect','head'])}} diff --git a/docs/source/user-guide/lazy/gpu.md b/docs/source/user-guide/lazy/gpu.md index 97529eb28f55..232bab3dbf72 100644 --- a/docs/source/user-guide/lazy/gpu.md +++ b/docs/source/user-guide/lazy/gpu.md @@ -1,8 +1,11 @@ # GPU Support -Polars provides an in-memory, GPU-accelerated execution engine for the Lazy API in Python using [RAPIDS cuDF](https://docs.rapids.ai/api/cudf/stable/) on NVIDIA GPUs. This functionality is available in Open Beta and is undergoing rapid development. +Polars provides an in-memory, GPU-accelerated execution engine for the Lazy API in Python using +[RAPIDS cuDF](https://docs.rapids.ai/api/cudf/stable/) on NVIDIA GPUs. This functionality is +available in Open Beta and is undergoing rapid development. -If you install Polars with the [GPU feature flag](../installation.md), you can trigger GPU-based execution by running `.collect(engine="gpu")` instead of `.collect()`. +If you install Polars with the [GPU feature flag](../installation.md), you can trigger GPU-based +execution by running `.collect(engine="gpu")` instead of `.collect()`. {{ code_header("python", [], []) }} diff --git a/docs/source/user-guide/lazy/index.md b/docs/source/user-guide/lazy/index.md index bbf50fb34e11..10912aaa6bbc 100644 --- a/docs/source/user-guide/lazy/index.md +++ b/docs/source/user-guide/lazy/index.md @@ -1,6 +1,8 @@ # Lazy -The Lazy chapter is a guide for working with `LazyFrames`. It covers the functionalities like how to use it and how to optimise it. You can also find more information about the query plan or gain more insight in the streaming capabilities. +The Lazy chapter is a guide for working with `LazyFrames`. It covers the functionalities like how to +use it and how to optimise it. You can also find more information about the query plan or gain more +insight in the streaming capabilities. - [Using lazy API](using.md) - [Optimisations](optimizations.md) diff --git a/docs/source/user-guide/lazy/optimizations.md b/docs/source/user-guide/lazy/optimizations.md index f86ac3041baa..70678f4dbf13 100644 --- a/docs/source/user-guide/lazy/optimizations.md +++ b/docs/source/user-guide/lazy/optimizations.md @@ -1,9 +1,10 @@ # Optimizations -If you use Polars' lazy API, Polars will run several optimizations on your query. Some of them are executed up front, -others are determined just in time as the materialized data comes in. +If you use Polars' lazy API, Polars will run several optimizations on your query. Some of them are +executed up front, others are determined just in time as the materialized data comes in. -Here is a non-complete overview of optimizations done by polars, what they do and how often they run. +Here is a non-complete overview of optimizations done by polars, what they do and how often they +run. 
| Optimization | Explanation | runs | | -------------------------- | ------------------------------------------------------------------------------------------------------------ | ----------------------------- | diff --git a/docs/source/user-guide/lazy/query-plan.md b/docs/source/user-guide/lazy/query-plan.md index 597974aea328..ae7799b751f5 100644 --- a/docs/source/user-guide/lazy/query-plan.md +++ b/docs/source/user-guide/lazy/query-plan.md @@ -5,7 +5,8 @@ For any lazy query Polars has both: - a non-optimized plan with the set of steps code as we provided it and - an optimized plan with changes made by the query optimizer -We can understand both the non-optimized and optimized query plans with visualization and by printing them as text. +We can understand both the non-optimized and optimized query plans with visualization and by +printing them as text.
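As a compact sketch of both inspection routes just mentioned (plain-text plans and the Graphviz
rendering), and assuming the Reddit CSV and `comment_karma` filter used elsewhere on this page, the
calls look roughly like this:

```python
import polars as pl

# Roughly the query considered below: uppercase the `name` column, keep positive karma.
lf = (
    pl.scan_csv("data/reddit.csv")
    .with_columns(pl.col("name").str.to_uppercase())
    .filter(pl.col("comment_karma") > 0)
)

print(lf.explain(optimized=False))  # non-optimized plan as text
print(lf.explain())                 # optimized plan as text
lf.show_graph()                     # Graphviz visualization; requires Graphviz on the PATH
```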
```python exec="on" result="text" session="user-guide/lazy/query-plan" @@ -25,7 +26,8 @@ Below we consider the following query: ### Graphviz visualization -To create visualizations of the query plan, [Graphviz should be installed](https://graphviz.org/download/) and added to your PATH. +To create visualizations of the query plan, +[Graphviz should be installed](https://graphviz.org/download/) and added to your PATH. First we visualize the non-optimized plan by setting `optimized=False`. @@ -59,7 +61,8 @@ FILTER [(col("comment_karma")) > (0)] FROM WITH_COLUMNS: PROJECT */6 COLUMNS ``` -The printed plan should also be read from bottom to top. This non-optimized plan is roughly equal to: +The printed plan should also be read from bottom to top. This non-optimized plan is roughly equal +to: - read from the `data/reddit.csv` file - read all 6 columns (where the * wildcard in PROJECT \*/6 COLUMNS means take all columns) @@ -95,4 +98,6 @@ The optimized plan is to: - apply the filter on the `comment_karma` column while the CSV is being read line-by-line - transform the `name` column to uppercase -In this case the query optimizer has identified that the `filter` can be applied while the CSV is read from disk rather than reading the whole file into memory and then applying the filter. This optimization is called _Predicate Pushdown_. +In this case the query optimizer has identified that the `filter` can be applied while the CSV is +read from disk rather than reading the whole file into memory and then applying the filter. This +optimization is called _Predicate Pushdown_. diff --git a/docs/source/user-guide/lazy/schemas.md b/docs/source/user-guide/lazy/schemas.md index d63f1af115ad..df7f8d4f6991 100644 --- a/docs/source/user-guide/lazy/schemas.md +++ b/docs/source/user-guide/lazy/schemas.md @@ -1,6 +1,7 @@ # Schema -The schema of a Polars `DataFrame` or `LazyFrame` sets out the names of the columns and their datatypes. You can see the schema with the `.collect_schema` method on a `DataFrame` or `LazyFrame` +The schema of a Polars `DataFrame` or `LazyFrame` sets out the names of the columns and their +datatypes. You can see the schema with the `.collect_schema` method on a `DataFrame` or `LazyFrame` {{code_block('user-guide/lazy/schema','schema',['LazyFrame'])}} @@ -13,13 +14,17 @@ The schema plays an important role in the lazy API. ## Type checking in the lazy API -One advantage of the lazy API is that Polars will check the schema before any data is processed. This check happens when you execute your lazy query. +One advantage of the lazy API is that Polars will check the schema before any data is processed. +This check happens when you execute your lazy query. -We see how this works in the following simple example where we call the `.round` expression on the string column `foo`. +We see how this works in the following simple example where we call the `.round` expression on the +string column `foo`. {{code_block('user-guide/lazy/schema','lazyround',['with_columns'])}} -The `.round` expression is only valid for columns with a numeric data type. Calling `.round` on a string column means the operation will raise an `InvalidOperationError` when we evaluate the query with `collect`. This schema check happens before the data is processed when we call `collect`. +The `.round` expression is only valid for columns with a numeric data type. Calling `.round` on a +string column means the operation will raise an `InvalidOperationError` when we evaluate the query +with `collect`. 
This schema check happens before the data is processed when we call `collect`. {{code_block('user-guide/lazy/schema','typecheck',[])}} @@ -28,15 +33,21 @@ The `.round` expression is only valid for columns with a numeric data type. Call --8<-- "python/user-guide/lazy/schema.py:typecheck" ``` -If we executed this query in eager mode the error would only be found once the data had been processed in all earlier steps. +If we executed this query in eager mode the error would only be found once the data had been +processed in all earlier steps. -When we execute a lazy query Polars checks for any potential `InvalidOperationError` before the time-consuming step of actually processing the data in the pipeline. +When we execute a lazy query Polars checks for any potential `InvalidOperationError` before the +time-consuming step of actually processing the data in the pipeline. ## The lazy API must know the schema -In the lazy API the Polars query optimizer must be able to infer the schema at every step of a query plan. This means that operations where the schema is not knowable in advance cannot be used with the lazy API. +In the lazy API the Polars query optimizer must be able to infer the schema at every step of a query +plan. This means that operations where the schema is not knowable in advance cannot be used with the +lazy API. -The classic example of an operation where the schema is not knowable in advance is a `.pivot` operation. In a `.pivot` the new column names come from data in one of the columns. As these column names cannot be known in advance a `.pivot` is not available in the lazy API. +The classic example of an operation where the schema is not knowable in advance is a `.pivot` +operation. In a `.pivot` the new column names come from data in one of the columns. As these column +names cannot be known in advance a `.pivot` is not available in the lazy API. ## Dealing with operations not available in the lazy API diff --git a/docs/source/user-guide/lazy/using.md b/docs/source/user-guide/lazy/using.md index d777557da550..62a58ae07ee1 100644 --- a/docs/source/user-guide/lazy/using.md +++ b/docs/source/user-guide/lazy/using.md @@ -1,6 +1,7 @@ # Usage -With the lazy API, Polars doesn't run each query line-by-line but instead processes the full query end-to-end. To get the most out of Polars it is important that you use the lazy API because: +With the lazy API, Polars doesn't run each query line-by-line but instead processes the full query +end-to-end. To get the most out of Polars it is important that you use the lazy API because: - the lazy API allows Polars to apply automatic query optimization with the query optimizer - the lazy API allows you to work with larger than memory datasets using streaming @@ -10,7 +11,8 @@ Here we see how to use the lazy API starting from either a file or an existing ` ## Using the lazy API from a file -In the ideal case we would use the lazy API right from a file as the query optimizer may help us to reduce the amount of data we read from the file. +In the ideal case we would use the lazy API right from a file as the query optimizer may help us to +reduce the amount of data we read from the file. We create a lazy query from the Reddit CSV data and apply some transformations. @@ -26,11 +28,13 @@ In this query we tell Polars that we want to: - convert the `name` column to uppercase - apply a filter to the `comment_karma` column -The lazy query will not be executed at this point. 
See this page on [executing lazy queries](execution.md) for more on running lazy queries. +The lazy query will not be executed at this point. See this page on +[executing lazy queries](execution.md) for more on running lazy queries. ## Using the lazy API from a `DataFrame` -An alternative way to access the lazy API is to call `.lazy` on a `DataFrame` that has already been created in memory. +An alternative way to access the lazy API is to call `.lazy` on a `DataFrame` that has already been +created in memory. {{code_block('user-guide/lazy/using','fromdf',['lazy'])}} diff --git a/docs/source/user-guide/migration/pandas.md b/docs/source/user-guide/migration/pandas.md index 3d1f0996bdad..61498f071295 100644 --- a/docs/source/user-guide/migration/pandas.md +++ b/docs/source/user-guide/migration/pandas.md @@ -1,54 +1,52 @@ # Coming from Pandas -Here we set out the key points that anyone who has experience with pandas and wants to -try Polars should know. We include both differences in the concepts the libraries are -built on and differences in how you should write Polars code compared to pandas -code. +Here we set out the key points that anyone who has experience with pandas and wants to try Polars +should know. We include both differences in the concepts the libraries are built on and differences +in how you should write Polars code compared to pandas code. ## Differences in concepts between Polars and pandas ### Polars does not have a multi-index/index -pandas gives a label to each row with an index. Polars does not use an index and -each row is indexed by its integer position in the table. +pandas gives a label to each row with an index. Polars does not use an index and each row is indexed +by its integer position in the table. -Polars aims to have predictable results and readable queries, as such we think an index does not help us reach that -objective. We believe the semantics of a query should not change by the state of an index or a `reset_index` call. +Polars aims to have predictable results and readable queries, as such we think an index does not +help us reach that objective. We believe the semantics of a query should not change by the state of +an index or a `reset_index` call. -In Polars a DataFrame will always be a 2D table with heterogeneous data-types. The data-types may have nesting, but the -table itself will not. -Operations like resampling will be done by specialized functions or methods that act like 'verbs' on a table explicitly -stating the columns that that 'verb' operates on. As such, it is our conviction that not having indices make things simpler, -more explicit, more readable and less error-prone. +In Polars a DataFrame will always be a 2D table with heterogeneous data-types. The data-types may +have nesting, but the table itself will not. Operations like resampling will be done by specialized +functions or methods that act like 'verbs' on a table explicitly stating the columns that that +'verb' operates on. As such, it is our conviction that not having indices make things simpler, more +explicit, more readable and less error-prone. -Note that an 'index' data structure as known in databases will be used by Polars as an optimization technique. +Note that an 'index' data structure as known in databases will be used by Polars as an optimization +technique. 
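To make the positional model concrete, here is a minimal sketch (assuming a recent Polars version
where `with_row_index` is available) of addressing rows by position and, when needed, materializing
an explicit row number as a regular column:

```python
import polars as pl

df = pl.DataFrame({"a": [10, 20, 30], "b": ["x", "y", "z"]})

# Rows are addressed by integer position, e.g. with `slice`, not by an index label.
first_two = df.slice(0, 2)

# An explicit row number, if useful, is just another column.
numbered = df.with_row_index("row_nr")
print(numbered)
```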
### Polars adheres to the Apache Arrow memory format to represent data in memory while pandas uses NumPy arrays Polars represents data in memory according to the Arrow memory spec while pandas represents data in -memory with NumPy arrays. Apache Arrow is an emerging standard for in-memory columnar -analytics that can accelerate data load times, reduce memory usage and accelerate -calculations. +memory with NumPy arrays. Apache Arrow is an emerging standard for in-memory columnar analytics that +can accelerate data load times, reduce memory usage and accelerate calculations. Polars can convert data to NumPy format with the `to_numpy` method. ### Polars has more support for parallel operations than pandas -Polars exploits the strong support for concurrency in Rust to run many operations in -parallel. While some operations in pandas are multi-threaded the core of the library -is single-threaded and an additional library such as `Dask` must be used to parallelize -operations. +Polars exploits the strong support for concurrency in Rust to run many operations in parallel. While +some operations in pandas are multi-threaded the core of the library is single-threaded and an +additional library such as `Dask` must be used to parallelize operations. ### Polars can lazily evaluate queries and apply query optimization -Eager evaluation is when code is evaluated as soon as you run the code. Lazy evaluation -is when running a line of code means that the underlying logic is added to a query plan -rather than being evaluated. +Eager evaluation is when code is evaluated as soon as you run the code. Lazy evaluation is when +running a line of code means that the underlying logic is added to a query plan rather than being +evaluated. -Polars supports eager evaluation and lazy evaluation whereas pandas only supports -eager evaluation. The lazy evaluation mode is powerful because Polars carries out -automatic query optimization when it examines the query plan and looks for ways to -accelerate the query or reduce memory usage. +Polars supports eager evaluation and lazy evaluation whereas pandas only supports eager evaluation. +The lazy evaluation mode is powerful because Polars carries out automatic query optimization when it +examines the query plan and looks for ways to accelerate the query or reduce memory usage. `Dask` also supports lazy evaluation when it generates a query plan. @@ -60,18 +58,18 @@ Users coming from pandas generally need to know one thing... polars != pandas ``` -If your Polars code looks like it could be pandas code, it might run, but it likely -runs slower than it should. +If your Polars code looks like it could be pandas code, it might run, but it likely runs slower than +it should. Let's go through some typical pandas code and see how we might rewrite it in Polars. ### Selecting data -As there is no index in Polars there is no `.loc` or `iloc` method in Polars - and -there is also no `SettingWithCopyWarning` in Polars. +As there is no index in Polars there is no `.loc` or `iloc` method in Polars - and there is also no +`SettingWithCopyWarning` in Polars. -However, the best way to select data in Polars is to use the expression API. For -example, if you want to select a column in pandas, you can do one of the following: +However, the best way to select data in Polars is to use the expression API. 
For example, if you +want to select a column in pandas, you can do one of the following: ```python df['a'] @@ -84,64 +82,59 @@ but in Polars you would use the `.select` method: df.select('a') ``` -If you want to select rows based on the values then in Polars you use the `.filter` -method: +If you want to select rows based on the values then in Polars you use the `.filter` method: ```python df.filter(pl.col('a') < 10) ``` -As noted in the section on expressions below, Polars can run operations in `.select` -and `filter` in parallel and Polars can carry out query optimization on the full set -of data selection criteria. +As noted in the section on expressions below, Polars can run operations in `.select` and `filter` in +parallel and Polars can carry out query optimization on the full set of data selection criteria. ### Be lazy -Working in lazy evaluation mode is straightforward and should be your default in -Polars as the lazy mode allows Polars to do query optimization. +Working in lazy evaluation mode is straightforward and should be your default in Polars as the lazy +mode allows Polars to do query optimization. -We can run in lazy mode by either using an implicitly lazy function (such as `scan_csv`) -or explicitly using the `lazy` method. +We can run in lazy mode by either using an implicitly lazy function (such as `scan_csv`) or +explicitly using the `lazy` method. -Take the following simple example where we read a CSV file from disk and do a group by. -The CSV file has numerous columns but we just want to do a group by on one of the id -columns (`id1`) and then sum by a value column (`v1`). In pandas this would be: +Take the following simple example where we read a CSV file from disk and do a group by. The CSV file +has numerous columns but we just want to do a group by on one of the id columns (`id1`) and then sum +by a value column (`v1`). In pandas this would be: ```python df = pd.read_csv(csv_file, usecols=['id1','v1']) grouped_df = df.loc[:,['id1','v1']].groupby('id1').sum('v1') ``` -In Polars you can build this query in lazy mode with query optimization and evaluate -it by replacing the eager pandas function `read_csv` with the implicitly lazy Polars -function `scan_csv`: +In Polars you can build this query in lazy mode with query optimization and evaluate it by replacing +the eager pandas function `read_csv` with the implicitly lazy Polars function `scan_csv`: ```python df = pl.scan_csv(csv_file) grouped_df = df.group_by('id1').agg(pl.col('v1').sum()).collect() ``` -Polars optimizes this query by identifying that only the `id1` and `v1` columns are -relevant and so will only read these columns from the CSV. By calling the `.collect` -method at the end of the second line we instruct Polars to eagerly evaluate the query. +Polars optimizes this query by identifying that only the `id1` and `v1` columns are relevant and so +will only read these columns from the CSV. By calling the `.collect` method at the end of the second +line we instruct Polars to eagerly evaluate the query. -If you do want to run this query in eager mode you can just replace `scan_csv` with -`read_csv` in the Polars code. +If you do want to run this query in eager mode you can just replace `scan_csv` with `read_csv` in +the Polars code. -Read more about working with lazy evaluation in the -[lazy API](../lazy/using.md) section. +Read more about working with lazy evaluation in the [lazy API](../lazy/using.md) section. 
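A quick way to see that optimization at work is to print the optimized plan before collecting. This
is a sketch, assuming a CSV named `data.csv` that contains `id1` and `v1` columns:

```python
import polars as pl

lazy_query = (
    pl.scan_csv("data.csv")  # hypothetical file with `id1` and `v1` columns
    .group_by("id1")
    .agg(pl.col("v1").sum())
)

# The optimized plan should show the scan narrowed to the two columns the query
# actually needs (projection pushdown), before any data is read or processed.
print(lazy_query.explain())
```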
### Express yourself -A typical pandas script consists of multiple data transformations that are executed -sequentially. However, in Polars these transformations can be executed in parallel -using expressions. +A typical pandas script consists of multiple data transformations that are executed sequentially. +However, in Polars these transformations can be executed in parallel using expressions. #### Column assignment -We have a dataframe `df` with a column called `value`. We want to add two new columns, a -column called `tenXValue` where the `value` column is multiplied by 10 and a column -called `hundredXValue` where the `value` column is multiplied by 100. +We have a dataframe `df` with a column called `value`. We want to add two new columns, a column +called `tenXValue` where the `value` column is multiplied by 10 and a column called `hundredXValue` +where the `value` column is multiplied by 100. In pandas this would be: @@ -167,9 +160,9 @@ These column assignments are executed in parallel. #### Column assignment based on predicate -In this case we have a dataframe `df` with columns `a`,`b` and `c`. We want to re-assign -the values in column `a` based on a condition. When the value in column `c` is equal to -2 then we replace the value in `a` with the value in `b`. +In this case we have a dataframe `df` with columns `a`,`b` and `c`. We want to re-assign the values +in column `a` based on a condition. When the value in column `c` is equal to 2 then we replace the +value in `a` with the value in `b`. In pandas this would be: @@ -187,8 +180,8 @@ df.with_columns( ) ``` -Polars can compute every branch of an `if -> then -> otherwise` in -parallel. This is valuable, when the branches get more expensive to compute. +Polars can compute every branch of an `if -> then -> otherwise` in parallel. This is valuable, when +the branches get more expensive to compute. #### Filtering @@ -214,14 +207,13 @@ df.filter( ) ``` -The query optimizer in Polars can also detect if you write multiple filters separately -and combine them into a single filter in the optimized plan. +The query optimizer in Polars can also detect if you write multiple filters separately and combine +them into a single filter in the optimized plan. ## pandas transform -The pandas documentation demonstrates an operation on a group by called `transform`. In -this case we have a dataframe `df` and we want a new column showing the number of rows -in each group. +The pandas documentation demonstrates an operation on a group by called `transform`. In this case we +have a dataframe `df` and we want a new column showing the number of rows in each group. In pandas we have: @@ -234,8 +226,8 @@ df = pd.DataFrame({ df["size"] = df.groupby("c")["type"].transform(len) ``` -Here pandas does a group by on `"c"`, takes column `"type"`, computes the group length -and then joins the result back to the original `DataFrame` producing: +Here pandas does a group by on `"c"`, takes column `"type"`, computes the group length and then +joins the result back to the original `DataFrame` producing: ``` c type size @@ -273,12 +265,12 @@ shape: (7, 3) └─────┴──────┴──────┘ ``` -Because we can store the whole operation in a single expression, we can combine several -`window` functions and even combine different groups! +Because we can store the whole operation in a single expression, we can combine several `window` +functions and even combine different groups! 
-Polars will cache window expressions that are applied over the same group, so storing -them in a single `with_columns` is both convenient **and** optimal. In the following example -we look at a case where we are calculating group statistics over `"c"` twice: +Polars will cache window expressions that are applied over the same group, so storing them in a +single `with_columns` is both convenient **and** optimal. In the following example we look at a case +where we are calculating group statistics over `"c"` twice: ```python df.with_columns( @@ -307,18 +299,24 @@ shape: (7, 5) ## Missing data -pandas uses `NaN` and/or `None` values to indicate missing values depending on the dtype of the column. In addition the behaviour in pandas varies depending on whether the default dtypes or optional nullable arrays are used. In Polars missing data corresponds to a `null` value for all data types. +pandas uses `NaN` and/or `None` values to indicate missing values depending on the dtype of the +column. In addition the behaviour in pandas varies depending on whether the default dtypes or +optional nullable arrays are used. In Polars missing data corresponds to a `null` value for all data +types. -For float columns Polars permits the use of `NaN` values. These `NaN` values are not considered to be missing data but instead a special floating point value. +For float columns Polars permits the use of `NaN` values. These `NaN` values are not considered to +be missing data but instead a special floating point value. -In pandas an integer column with missing values is cast to be a float column with `NaN` values for the missing values (unless using optional nullable integer dtypes). In Polars any missing values in an integer column are simply `null` values and the column remains an integer column. +In pandas an integer column with missing values is cast to be a float column with `NaN` values for +the missing values (unless using optional nullable integer dtypes). In Polars any missing values in +an integer column are simply `null` values and the column remains an integer column. See the [missing data](../expressions/missing-data.md) section for more details. ## Pipe littering -A common usage in pandas is utilizing `pipe` to apply some function to a `DataFrame`. Copying this coding style to Polars -is unidiomatic and leads to suboptimal query plans. +A common usage in pandas is utilizing `pipe` to apply some function to a `DataFrame`. Copying this +coding style to Polars is unidiomatic and leads to suboptimal query plans. The snippet below shows a common pattern in pandas. @@ -343,11 +341,12 @@ def add_ham(df: pd.DataFrame) -> pd.DataFrame: ) ``` -If we do this in polars, we would create 3 `with_columns` contexts, that forces Polars to run the 3 pipes sequentially, -utilizing zero parallelism. +If we do this in polars, we would create 3 `with_columns` contexts, that forces Polars to run the 3 +pipes sequentially, utilizing zero parallelism. -The way to get similar abstractions in polars is creating functions that create expressions. -The snippet below creates 3 expressions that run on a single context and thus are allowed to run in parallel. +The way to get similar abstractions in polars is creating functions that create expressions. The +snippet below creates 3 expressions that run on a single context and thus are allowed to run in +parallel. 
```python def get_foo(input_column: str) -> pl.Expr: @@ -367,7 +366,8 @@ df.with_columns( ) ``` -If you need the schema in the functions that generate the expressions, you can utilize a single `pipe`: +If you need the schema in the functions that generate the expressions, you can utilize a single +`pipe`: ```python from collections import OrderedDict @@ -399,5 +399,5 @@ lf.pipe(lambda lf: lf.with_columns( ) ``` -Another benefit of writing functions that return expressions, is that these functions are composable as expressions can -be chained and partially applied, leading to much more flexibility in the design. +Another benefit of writing functions that return expressions, is that these functions are composable +as expressions can be chained and partially applied, leading to much more flexibility in the design. diff --git a/docs/source/user-guide/migration/spark.md b/docs/source/user-guide/migration/spark.md index a27cc3058204..3de968b9463a 100644 --- a/docs/source/user-guide/migration/spark.md +++ b/docs/source/user-guide/migration/spark.md @@ -2,7 +2,9 @@ ## Column-based API vs. Row-based API -Whereas the `Spark` `DataFrame` is analogous to a collection of rows, a Polars `DataFrame` is closer to a collection of columns. This means that you can combine columns in Polars in ways that are not possible in `Spark`, because `Spark` preserves the relationship of the data in each row. +Whereas the `Spark` `DataFrame` is analogous to a collection of rows, a Polars `DataFrame` is closer +to a collection of columns. This means that you can combine columns in Polars in ways that are not +possible in `Spark`, because `Spark` preserves the relationship of the data in each row. Consider this sample dataset: @@ -52,9 +54,12 @@ shape: (2, 2) └─────┴─────┘ ``` -The expressions on columns `foo` and `bar` are completely independent. Since the expression on `bar` returns a single value, that value is repeated for each value output by the expression on `foo`. But `a` and `b` have no relation to the data that produced the sum of `9`. +The expressions on columns `foo` and `bar` are completely independent. Since the expression on `bar` +returns a single value, that value is repeated for each value output by the expression on `foo`. But +`a` and `b` have no relation to the data that produced the sum of `9`. -To do something similar in `Spark`, you'd need to compute the sum separately and provide it as a literal: +To do something similar in `Spark`, you'd need to compute the sum separately and provide it as a +literal: ```python from pyspark.sql.functions import col, sum, lit @@ -89,7 +94,8 @@ Output: ### Example 2: Combining Two `head`s -In Polars you can combine two different `head` expressions on the same DataFrame, provided that they return the same number of values. +In Polars you can combine two different `head` expressions on the same DataFrame, provided that they +return the same number of values. ```python df.select( @@ -113,9 +119,11 @@ shape: (3, 2) └─────┴─────┘ ``` -Again, the two `head` expressions here are completely independent, and the pairing of `a` to `5` and `b` to `4` results purely from the juxtaposition of the two columns output by the expressions. +Again, the two `head` expressions here are completely independent, and the pairing of `a` to `5` and +`b` to `4` results purely from the juxtaposition of the two columns output by the expressions. -To accomplish something similar in `Spark`, you would need to generate an artificial key that enables you to join the values in this way. 
+To accomplish something similar in `Spark`, you would need to generate an artificial key that +enables you to join the values in this way. ```python from pyspark.sql import Window diff --git a/docs/source/user-guide/misc/arrow.md b/docs/source/user-guide/misc/arrow.md index 941cbd9fb6c9..23985e9d94c6 100644 --- a/docs/source/user-guide/misc/arrow.md +++ b/docs/source/user-guide/misc/arrow.md @@ -2,8 +2,8 @@ ## Using pyarrow -Polars can move data in and out of arrow zero copy. This can be done either via pyarrow -or natively. Let's first start by showing the pyarrow solution: +Polars can move data in and out of arrow zero copy. This can be done either via pyarrow or natively. +Let's first start by showing the pyarrow solution: {{code_block('user-guide/misc/arrow','to_arrow',[])}} @@ -33,7 +33,9 @@ Importing from pyarrow can be achieved with `pl.from_arrow`. ## Using the Arrow PyCapsule Interface -As of Polars v1.3 and higher, Polars implements the [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html), a protocol for sharing Arrow data across Python libraries. +As of Polars v1.3 and higher, Polars implements the +[Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html), +a protocol for sharing Arrow data across Python libraries. ### Exporting data from Polars to pyarrow @@ -54,7 +56,8 @@ foo: [[1,2,3]] bar: [["ham","spam","jam"]] ``` -To convert a Polars `Series` to a `pyarrow.ChunkedArray`, use the `pyarrow.chunked_array` constructor. +To convert a Polars `Series` to a `pyarrow.ChunkedArray`, use the `pyarrow.chunked_array` +constructor. {{code_block('user-guide/misc/arrow_pycapsule','to_arrow_series',[])}} @@ -68,7 +71,8 @@ To convert a Polars `Series` to a `pyarrow.ChunkedArray`, use the `pyarrow.chunk ] ``` -You can also pass a `Series` to the `pyarrow.array` constructor to create a contiguous array. Note that this will not be zero-copy if the underlying `Series` had multiple chunks. +You can also pass a `Series` to the `pyarrow.array` constructor to create a contiguous array. Note +that this will not be zero-copy if the underlying `Series` had multiple chunks. {{code_block('user-guide/misc/arrow_pycapsule','to_arrow_array_rechunk',[])}} @@ -99,7 +103,8 @@ shape: (3, 2) └─────┴──────┘ ``` -Similarly, we can pass the pyarrow `ChunkedArray` or `Array` back to Polars by using the `polars.Series` constructor: +Similarly, we can pass the pyarrow `ChunkedArray` or `Array` back to Polars by using the +`polars.Series` constructor: {{code_block('user-guide/misc/arrow_pycapsule','to_polars_series',[])}} @@ -115,11 +120,16 @@ Series: '' [i64] ### Usage with other arrow libraries -There's a [growing list](https://github.com/apache/arrow/issues/39195#issuecomment-2245718008) of libraries that support the PyCapsule Interface directly. Polars `Series` and `DataFrame` objects work automatically with every such library. +There's a [growing list](https://github.com/apache/arrow/issues/39195#issuecomment-2245718008) of +libraries that support the PyCapsule Interface directly. Polars `Series` and `DataFrame` objects +work automatically with every such library. ### For library maintainers -If you're developing a library that you wish to integrate with Polars, it's suggested to implement the [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) yourself. 
This comes with a number of benefits: +If you're developing a library that you wish to integrate with Polars, it's suggested to implement +the +[Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) +yourself. This comes with a number of benefits: - Zero-copy exchange for both Polars Series and DataFrame - No required dependency on pyarrow. @@ -129,8 +139,10 @@ If you're developing a library that you wish to integrate with Polars, it's sugg ## Using Polars directly -Polars can also consume and export to and import from the [Arrow C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) -directly. This is recommended for libraries that don't support the Arrow PyCapsule Interface and want to interop with Polars without requiring a pyarrow installation. +Polars can also consume and export to and import from the +[Arrow C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) directly. This is +recommended for libraries that don't support the Arrow PyCapsule Interface and want to interop with +Polars without requiring a pyarrow installation. - To export `ArrowArray` C structs, Polars exposes: `Series._export_arrow_to_c`. - To import an `ArrowArray` C struct, Polars exposes `Series._import_arrow_from_c`. diff --git a/docs/source/user-guide/misc/comparison.md b/docs/source/user-guide/misc/comparison.md index 3ae31fe0077d..3520a26819f7 100644 --- a/docs/source/user-guide/misc/comparison.md +++ b/docs/source/user-guide/misc/comparison.md @@ -1,22 +1,38 @@ # Comparison with other tools -These are several libraries and tools that share similar functionalities with Polars. This often leads to questions from data experts about what the differences are. Below is a short comparison between some of the more popular data processing tools and Polars, to help data experts make a deliberate decision on which tool to use. +These are several libraries and tools that share similar functionalities with Polars. This often +leads to questions from data experts about what the differences are. Below is a short comparison +between some of the more popular data processing tools and Polars, to help data experts make a +deliberate decision on which tool to use. -You can find performance benchmarks (h2oai benchmark) of these tools here: [Polars blog post](https://pola.rs/posts/benchmarks/) or a more recent benchmark [done by DuckDB](https://duckdblabs.github.io/db-benchmark/) +You can find performance benchmarks (h2oai benchmark) of these tools here: +[Polars blog post](https://pola.rs/posts/benchmarks/) or a more recent benchmark +[done by DuckDB](https://duckdblabs.github.io/db-benchmark/) ### Pandas -Pandas stands as a widely-adopted and comprehensive tool in Python data analysis, renowned for its rich feature set and strong community support. However, due to its single threaded nature, it can struggle with performance and memory usage on medium and large datasets. +Pandas stands as a widely-adopted and comprehensive tool in Python data analysis, renowned for its +rich feature set and strong community support. However, due to its single threaded nature, it can +struggle with performance and memory usage on medium and large datasets. -In contrast, Polars is optimised for high-performance multithreaded computing on single nodes, providing significant improvements in speed and memory efficiency, particularly for medium to large data operations. 
Its more composable and stricter API results in greater expressiveness and fewer schema-related bugs. +In contrast, Polars is optimised for high-performance multithreaded computing on single nodes, +providing significant improvements in speed and memory efficiency, particularly for medium to large +data operations. Its more composable and stricter API results in greater expressiveness and fewer +schema-related bugs. ### Dask -Dask extends Pandas' capabilities to large, distributed datasets. Dask mimics Pandas' API, offering a familiar environment for Pandas users, but with the added benefit of parallel and distributed computing. +Dask extends Pandas' capabilities to large, distributed datasets. Dask mimics Pandas' API, offering +a familiar environment for Pandas users, but with the added benefit of parallel and distributed +computing. -While Dask excels at scaling Pandas workflows across clusters, it only supports a subset of the Pandas API and therefore cannot be used for all use cases. Polars offers a more versatile API that delivers strong performance within the constraints of a single node. +While Dask excels at scaling Pandas workflows across clusters, it only supports a subset of the +Pandas API and therefore cannot be used for all use cases. Polars offers a more versatile API that +delivers strong performance within the constraints of a single node. -The choice between Dask and Polars often comes down to familiarity with the Pandas API and the need for distributed processing for extremely large datasets versus the need for efficiency and speed in a vertically scaled environment for a wide range of use cases. +The choice between Dask and Polars often comes down to familiarity with the Pandas API and the need +for distributed processing for extremely large datasets versus the need for efficiency and speed in +a vertically scaled environment for a wide range of use cases. ### Modin @@ -24,12 +40,24 @@ Similar to Dask. In 2023, Snowflake acquired Ponder, the organisation that maint ### Spark -Spark (specifically PySpark) represents a different approach to large-scale data processing. While Polars has an optimised performance for single-node environments, Spark is designed for distributed data processing across clusters, making it suitable for extremely large datasets. +Spark (specifically PySpark) represents a different approach to large-scale data processing. While +Polars has an optimised performance for single-node environments, Spark is designed for distributed +data processing across clusters, making it suitable for extremely large datasets. -However, Spark's distributed nature can introduce complexity and overhead, especially for small datasets and tasks that can run on a single machine. Another consideration is collaboration between data scientists and engineers. As they typically work with different tools (Pandas and Pyspark), refactoring is often required by engineers to deploy data scientists' data processing pipelines. Polars offers a single syntax that, due to vertical scaling, works in local environments and on a single machine in the cloud. +However, Spark's distributed nature can introduce complexity and overhead, especially for small +datasets and tasks that can run on a single machine. Another consideration is collaboration between +data scientists and engineers. As they typically work with different tools (Pandas and Pyspark), +refactoring is often required by engineers to deploy data scientists' data processing pipelines. 
+Polars offers a single syntax that, due to vertical scaling, works in local environments and on a +single machine in the cloud. -The choice between Polars and Spark often depends on the scale of data and the specific requirements of the processing task. If you need to process TBs of data, Spark is a better choice. +The choice between Polars and Spark often depends on the scale of data and the specific requirements +of the processing task. If you need to process TBs of data, Spark is a better choice. ### DuckDB -Polars and DuckDB have many similarities. However, DuckDB is focused on providing an in-process SQL OLAP database management system, while Polars is focused on providing a scalable `DataFrame` interface to many languages. The different front-ends lead to different optimisation strategies and different algorithm prioritisation. The interoperability between both is zero-copy. DuckDB offers a guide on [how to integrate with Polars](https://duckdb.org/docs/guides/python/polars.html). +Polars and DuckDB have many similarities. However, DuckDB is focused on providing an in-process SQL +OLAP database management system, while Polars is focused on providing a scalable `DataFrame` +interface to many languages. The different front-ends lead to different optimisation strategies and +different algorithm prioritisation. The interoperability between both is zero-copy. DuckDB offers a +guide on [how to integrate with Polars](https://duckdb.org/docs/guides/python/polars.html). diff --git a/docs/source/user-guide/misc/multiprocessing.md b/docs/source/user-guide/misc/multiprocessing.md index 6a10f8d61443..2be53d3bc1b9 100644 --- a/docs/source/user-guide/misc/multiprocessing.md +++ b/docs/source/user-guide/misc/multiprocessing.md @@ -1,31 +1,37 @@ # Multiprocessing -TLDR: if you find that using Python's built-in `multiprocessing` module together with Polars results in a Polars error about multiprocessing methods, you should make sure you are using `spawn`, not `fork`, as the starting method: +TLDR: if you find that using Python's built-in `multiprocessing` module together with Polars results +in a Polars error about multiprocessing methods, you should make sure you are using `spawn`, not +`fork`, as the starting method: {{code_block('user-guide/misc/multiprocess','recommendation',[])}} ## When not to use multiprocessing -Before we dive into the details, it is important to emphasize that Polars has been built from the start to use all your CPU cores. -It does this by executing computations which can be done in parallel in separate threads. -For example, requesting two expressions in a `select` statement can be done in parallel, with the results only being combined at the end. -Another example is aggregating a value within groups using `group_by().agg()`, each group can be evaluated separately. -It is very unlikely that the `multiprocessing` module can improve your code performance in these cases. -If you're using the GPU Engine with Polars you should also avoid manual multiprocessing. When used simultaneously, they can compete -for system memory and processing power, leading to reduced performance. +Before we dive into the details, it is important to emphasize that Polars has been built from the +start to use all your CPU cores. It does this by executing computations which can be done in +parallel in separate threads. For example, requesting two expressions in a `select` statement can be +done in parallel, with the results only being combined at the end. 
Another example is aggregating a +value within groups using `group_by().agg()`, each group can be evaluated separately. It is +very unlikely that the `multiprocessing` module can improve your code performance in these cases. If +you're using the GPU Engine with Polars you should also avoid manual multiprocessing. When used +simultaneously, they can compete for system memory and processing power, leading to reduced +performance. See [the optimizations section](../lazy/optimizations.md) for more optimizations. ## When to use multiprocessing -Although Polars is multithreaded, other libraries may be single-threaded. -When the other library is the bottleneck, and the problem at hand is parallelizable, it makes sense to use multiprocessing to gain a speed up. +Although Polars is multithreaded, other libraries may be single-threaded. When the other library is +the bottleneck, and the problem at hand is parallelizable, it makes sense to use multiprocessing to +gain a speed up. ## The problem with the default multiprocessing config ### Summary -The [Python multiprocessing documentation](https://docs.python.org/3/library/multiprocessing.html) lists the three methods to create a process pool: +The [Python multiprocessing documentation](https://docs.python.org/3/library/multiprocessing.html) +lists the three methods to create a process pool: 1. spawn 1. fork @@ -33,66 +39,85 @@ The [Python multiprocessing documentation](https://docs.python.org/3/library/mul The description of fork is (as of 2022-10-15): -> The parent process uses os.fork() to fork the Python interpreter. The child process, when it begins, is effectively identical to the parent process. All resources of the parent are inherited by the child process. Note that safely forking a multithreaded process is problematic. +> The parent process uses os.fork() to fork the Python interpreter. The child process, when it +> begins, is effectively identical to the parent process. All resources of the parent are inherited +> by the child process. Note that safely forking a multithreaded process is problematic. > Available on Unix only. The default on Unix. -The short summary is: Polars is multithreaded as to provide strong performance out-of-the-box. -Thus, it cannot be combined with `fork`. -If you are on Unix (Linux, BSD, etc), you are using `fork`, unless you explicitly override it. +The short summary is: Polars is multithreaded as to provide strong performance out-of-the-box. Thus, +it cannot be combined with `fork`. If you are on Unix (Linux, BSD, etc), you are using `fork`, +unless you explicitly override it. -The reason you may not have encountered this before is that pure Python code, and most Python libraries, are (mostly) single threaded. -Alternatively, you are on Windows or MacOS, on which `fork` is not even available as a method (for MacOS it was up to Python 3.7). +The reason you may not have encountered this before is that pure Python code, and most Python +libraries, are (mostly) single threaded. Alternatively, you are on Windows or MacOS, on which `fork` +is not even available as a method (for MacOS it was up to Python 3.7). -Thus one should use `spawn`, or `forkserver`, instead. `spawn` is available on all platforms and the safest choice, and hence the recommended method. +Thus one should use `spawn`, or `forkserver`, instead. `spawn` is available on all platforms and the +safest choice, and hence the recommended method. ### Example -The problem with `fork` is in the copying of the parent's process. 
-Consider the example below, which is a slightly modified example posted on the [Polars issue tracker](https://github.com/pola-rs/polars/issues/3144): +The problem with `fork` is in the copying of the parent's process. Consider the example below, which +is a slightly modified example posted on the +[Polars issue tracker](https://github.com/pola-rs/polars/issues/3144): {{code_block('user-guide/misc/multiprocess','example1',[])}} Using `fork` as the method, instead of `spawn`, will cause a dead lock. -The fork method is equivalent to calling `os.fork()`, which is a system call as defined in [the POSIX standard](https://pubs.opengroup.org/onlinepubs/9699919799/functions/fork.html): +The fork method is equivalent to calling `os.fork()`, which is a system call as defined in +[the POSIX standard](https://pubs.opengroup.org/onlinepubs/9699919799/functions/fork.html): -> A process shall be created with a single thread. If a multi-threaded process calls fork(), the new process shall contain a replica of the calling thread and its entire address space, possibly including the states of mutexes and other resources. Consequently, to avoid errors, the child process may only execute async-signal-safe operations until such time as one of the exec functions is called. +> A process shall be created with a single thread. If a multi-threaded process calls fork(), the new +> process shall contain a replica of the calling thread and its entire address space, possibly +> including the states of mutexes and other resources. Consequently, to avoid errors, the child +> process may only execute async-signal-safe operations until such time as one of the exec functions +> is called. -In contrast, `spawn` will create a completely new fresh Python interpreter, and not inherit the state of mutexes. +In contrast, `spawn` will create a completely new fresh Python interpreter, and not inherit the +state of mutexes. -So what happens in the code example? -For reading the file with `pl.read_parquet` the file has to be locked. -Then `os.fork()` is called, copying the state of the parent process, including mutexes. -Thus all child processes will copy the file lock in an acquired state, leaving them hanging indefinitely waiting for the file lock to be released, which never happens. +So what happens in the code example? For reading the file with `pl.read_parquet` the file has to be +locked. Then `os.fork()` is called, copying the state of the parent process, including mutexes. Thus +all child processes will copy the file lock in an acquired state, leaving them hanging indefinitely +waiting for the file lock to be released, which never happens. -What makes debugging these issues tricky is that `fork` can work. -Change the example to not having the call to `pl.read_parquet`: +What makes debugging these issues tricky is that `fork` can work. Change the example to not having +the call to `pl.read_parquet`: {{code_block('user-guide/misc/multiprocess','example2',[])}} -This works fine. -Therefore debugging these issues in larger code bases, i.e. not the small toy examples here, can be a real pain, as a seemingly unrelated change can break your multiprocessing code. -In general, one should therefore never use the `fork` start method with multithreaded libraries unless there are very specific requirements that cannot be met otherwise. +This works fine. Therefore debugging these issues in larger code bases, i.e. not the small toy +examples here, can be a real pain, as a seemingly unrelated change can break your multiprocessing +code. 
In general, one should therefore never use the `fork` start method with multithreaded +libraries unless there are very specific requirements that cannot be met otherwise. ### Pro's and cons of fork Based on the example, you may think, why is `fork` available in Python to start with? -First, probably because of historical reasons: `spawn` was added to Python in version 3.4, whilst `fork` has been part of Python from the 2.x series. - -Second, there are several limitations for `spawn` and `forkserver` that do not apply to `fork`, in particular all arguments should be pickable. -See the [Python multiprocessing docs](https://docs.python.org/3/library/multiprocessing.html#the-spawn-and-forkserver-start-methods) for more information. - -Third, because it is faster to create new processes compared to `spawn`, as `spawn` is effectively `fork` + creating a brand new Python process without the locks by calling [execv](https://pubs.opengroup.org/onlinepubs/9699919799/functions/exec.html). -Hence the warning in the Python docs that it is slower: there is more overhead to `spawn`. -However, in almost all cases, one would like to use multiple processes to speed up computations that take multiple minutes or even hours, meaning the overhead is negligible in the grand scheme of things. -And more importantly, it actually works in combination with multithreaded libraries. - -Fourth, `spawn` starts a new process, and therefore it requires code to be importable, in contrast to `fork`. -In particular, this means that when using `spawn` the relevant code should not be in the global scope, such as in Jupyter notebooks or in plain scripts. -Hence in the examples above, we define functions where we spawn within, and run those functions from a `__main__` clause. -This is not an issue for typical projects, but during quick experimentation in notebooks it could fail. +First, probably because of historical reasons: `spawn` was added to Python in version 3.4, whilst +`fork` has been part of Python from the 2.x series. + +Second, there are several limitations for `spawn` and `forkserver` that do not apply to `fork`, in +particular all arguments should be pickable. See the +[Python multiprocessing docs](https://docs.python.org/3/library/multiprocessing.html#the-spawn-and-forkserver-start-methods) +for more information. + +Third, because it is faster to create new processes compared to `spawn`, as `spawn` is effectively +`fork` + creating a brand new Python process without the locks by calling +[execv](https://pubs.opengroup.org/onlinepubs/9699919799/functions/exec.html). Hence the warning in +the Python docs that it is slower: there is more overhead to `spawn`. However, in almost all cases, +one would like to use multiple processes to speed up computations that take multiple minutes or even +hours, meaning the overhead is negligible in the grand scheme of things. And more importantly, it +actually works in combination with multithreaded libraries. + +Fourth, `spawn` starts a new process, and therefore it requires code to be importable, in contrast +to `fork`. In particular, this means that when using `spawn` the relevant code should not be in the +global scope, such as in Jupyter notebooks or in plain scripts. Hence in the examples above, we +define functions where we spawn within, and run those functions from a `__main__` clause. This is +not an issue for typical projects, but during quick experimentation in notebooks it could fail. 
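Putting the pieces above together, a minimal sketch (with hypothetical Parquet file names) of
combining Polars with a `spawn`-based pool and a `__main__` guard could look like this:

```python
import multiprocessing

import polars as pl


def count_rows(path: str) -> int:
    # Each worker process runs its own multithreaded Polars computation.
    return pl.read_parquet(path).height


def main() -> None:
    paths = ["part-0.parquet", "part-1.parquet"]  # hypothetical input files
    # Request "spawn" explicitly instead of relying on the platform default.
    with multiprocessing.get_context("spawn").Pool(processes=2) as pool:
        print(pool.map(count_rows, paths))


if __name__ == "__main__":
    main()
```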
## References diff --git a/docs/source/user-guide/misc/styling.md b/docs/source/user-guide/misc/styling.md index 57a8d44d75e1..d64d3741e15c 100644 --- a/docs/source/user-guide/misc/styling.md +++ b/docs/source/user-guide/misc/styling.md @@ -1,6 +1,9 @@ # Styling -Data in a Polars `DataFrame` can be styled for presentation use the `DataFrame.style` property. This returns a `GT` object from [Great Tables](https://posit-dev.github.io/great-tables/articles/intro.html), which enables structuring, formatting, and styling for table display. +Data in a Polars `DataFrame` can be styled for presentation use the `DataFrame.style` property. This +returns a `GT` object from +[Great Tables](https://posit-dev.github.io/great-tables/articles/intro.html), which enables +structuring, formatting, and styling for table display. {{code_block('user-guide/misc/styling','dataframe',[])}} diff --git a/docs/source/user-guide/misc/visualization.md b/docs/source/user-guide/misc/visualization.md index 5832fa0c9e5f..28cbb4ee668e 100644 --- a/docs/source/user-guide/misc/visualization.md +++ b/docs/source/user-guide/misc/visualization.md @@ -2,8 +2,8 @@ Data in a Polars `DataFrame` can be visualized using common visualization libraries. -We illustrate plotting capabilities using the Iris dataset. We read a CSV and then -plot one column against another, colored by a yet another column. +We illustrate plotting capabilities using the Iris dataset. We read a CSV and then plot one column +against another, colored by a yet another column. {{code_block('user-guide/misc/visualization','dataframe',[])}} @@ -37,13 +37,12 @@ import altair as alt ) ``` -and is only provided for convenience, and to signal that Altair is known to work well with -Polars. +and is only provided for convenience, and to signal that Altair is known to work well with Polars. ## hvPlot -If you import `hvplot.polars`, then it registers a `hvplot` -method which you can use to create interactive plots using [hvPlot](https://hvplot.holoviz.org/). +If you import `hvplot.polars`, then it registers a `hvplot` method which you can use to create +interactive plots using [hvPlot](https://hvplot.holoviz.org/). {{code_block('user-guide/misc/visualization','hvplot_show_plot',[])}} @@ -53,12 +52,13 @@ method which you can use to create interactive plots using [hvPlot](https://hvpl ## Matplotlib -To create a scatter plot we can pass columns of a `DataFrame` directly to Matplotlib as a `Series` for each column. -Matplotlib does not have explicit support for Polars objects but can accept a Polars `Series` by -converting it to a NumPy array (which is zero-copy for numeric data without null values). +To create a scatter plot we can pass columns of a `DataFrame` directly to Matplotlib as a `Series` +for each column. Matplotlib does not have explicit support for Polars objects but can accept a +Polars `Series` by converting it to a NumPy array (which is zero-copy for numeric data without null +values). -Note that because the column `'species'` isn't numeric, we need to first convert it to numeric values so that -it can be passed as an argument to `c`. +Note that because the column `'species'` isn't numeric, we need to first convert it to numeric +values so that it can be passed as an argument to `c`. {{code_block('user-guide/misc/visualization','matplotlib_show_plot',[])}} @@ -68,8 +68,11 @@ it can be passed as an argument to `c`. 
## Seaborn and Plotly -[Seaborn](https://seaborn.pydata.org/) and [Plotly](https://plotly.com/) can accept a Polars `DataFrame` by leveraging the [dataframe interchange protocol](https://data-apis.org/dataframe-api/), which offers zero-copy conversion where possible. Note -that the protocol does not support all Polars data types (e.g. `List`) so your mileage may vary here. +[Seaborn](https://seaborn.pydata.org/) and [Plotly](https://plotly.com/) can accept a Polars +`DataFrame` by leveraging the +[dataframe interchange protocol](https://data-apis.org/dataframe-api/), which offers zero-copy +conversion where possible. Note that the protocol does not support all Polars data types (e.g. +`List`) so your mileage may vary here. ### Seaborn diff --git a/docs/source/user-guide/plugins/your-first-polars-plugin.md b/docs/source/user-guide/plugins/your-first-polars-plugin.md index eb95ed7115f7..cc06fbf04337 100644 --- a/docs/source/user-guide/plugins/your-first-polars-plugin.md +++ b/docs/source/user-guide/plugins/your-first-polars-plugin.md @@ -2,10 +2,10 @@ -Expression plugins are the preferred way to create user defined functions. They allow you to compile a Rust function -and register that as an expression into the Polars library. The Polars engine will dynamically link your function at runtime -and your expression will run almost as fast as native expressions. Note that this works without any interference of Python -and thus no GIL contention. +Expression plugins are the preferred way to create user defined functions. They allow you to compile +a Rust function and register that as an expression into the Polars library. The Polars engine will +dynamically link your function at runtime and your expression will run almost as fast as native +expressions. Note that this works without any interference of Python and thus no GIL contention. They will benefit from the same benefits default expressions have: @@ -17,11 +17,13 @@ To get started we will see what is needed to create a custom expression. ## Our first custom expression: Pig Latin -For our first expression we are going to create a pig latin converter. Pig latin is a silly language where in every word -the first letter is removed, added to the back and finally "ay" is added. So the word "pig" would convert to "igpay". +For our first expression we are going to create a pig latin converter. Pig latin is a silly language +where in every word the first letter is removed, added to the back and finally "ay" is added. So the +word "pig" would convert to "igpay". -We could of course already do that with expressions, e.g. `col("name").str.slice(1) + col("name").str.slice(0, 1) + "ay"`, -but a specialized function for this would perform better and allows us to learn about the plugins. +We could of course already do that with expressions, e.g. +`col("name").str.slice(1) + col("name").str.slice(0, 1) + "ay"`, but a specialized function for this +would perform better and allows us to learn about the plugins. ### Setting up @@ -46,9 +48,10 @@ serde = { version = "*", features = ["derive"] } ### Writing the expression -In this library we create a helper function that converts a `&str` to pig-latin, and we create the function that we will -expose as an expression. To expose a function we must add the `#[polars_expr(output_type=DataType)]` attribute and the function -must always accept `inputs: &[Series]` as its first argument. 
+In this library we create a helper function that converts a `&str` to pig-latin, and we create the +function that we will expose as an expression. To expose a function we must add the +`#[polars_expr(output_type=DataType)]` attribute and the function must always accept +`inputs: &[Series]` as its first argument. ```rust // src/expressions.rs @@ -70,13 +73,15 @@ fn pig_latinnify(inputs: &[Series]) -> PolarsResult { } ``` -Note that we use `apply_into_string_amortized`, as opposed to `apply_values`, to avoid allocating a new string for -each row. If your plugin takes in multiple inputs, operates elementwise, and produces a `String` output, -then you may want to look at the `binary_elementwise_into_string_amortized` utility function in `polars::prelude::arity`. +Note that we use `apply_into_string_amortized`, as opposed to `apply_values`, to avoid allocating a +new string for each row. If your plugin takes in multiple inputs, operates elementwise, and produces +a `String` output, then you may want to look at the `binary_elementwise_into_string_amortized` +utility function in `polars::prelude::arity`. -This is all that is needed on the Rust side. On the Python side we must setup a folder with the same name as defined in -the `Cargo.toml`, in this case "expression_lib". We will create a folder in the same directory as our Rust `src` folder -named `expression_lib` and we create an `expression_lib/__init__.py`. The resulting file structure should look something like this: +This is all that is needed on the Rust side. On the Python side we must setup a folder with the same +name as defined in the `Cargo.toml`, in this case "expression_lib". We will create a folder in the +same directory as our Rust `src` folder named `expression_lib` and we create an +`expression_lib/__init__.py`. The resulting file structure should look something like this: ``` ├── 📁 expression_lib/ # name must match "lib.name" in Cargo.toml @@ -90,11 +95,13 @@ named `expression_lib` and we create an `expression_lib/__init__.py`. The result └── pyproject.toml ``` -Then we create a new class `Language` that will hold the expressions for our new `expr.language` namespace. The function -name of our expression can be registered. Note that it is important that this name is correct, otherwise the main Polars -package cannot resolve the function name. Furthermore we can set additional keyword arguments that explain to Polars how -this expression behaves. In this case we tell Polars that this function is elementwise. This allows Polars to run this -expression in batches. Whereas for other operations this would not be allowed, think for instance of a sort, or a slice. +Then we create a new class `Language` that will hold the expressions for our new `expr.language` +namespace. The function name of our expression can be registered. Note that it is important that +this name is correct, otherwise the main Polars package cannot resolve the function name. +Furthermore we can set additional keyword arguments that explain to Polars how this expression +behaves. In this case we tell Polars that this function is elementwise. This allows Polars to run +this expression in batches. Whereas for other operations this would not be allowed, think for +instance of a sort, or a slice. ```python # expression_lib/__init__.py @@ -117,7 +124,8 @@ def pig_latinnify(expr: IntoExpr) -> pl.Expr: ) ``` -We can then compile this library in our environment by installing `maturin` and running `maturin develop --release`. 
+We can then compile this library in our environment by installing `maturin` and running +`maturin develop --release`. And that's it. Our expression is ready to use! @@ -133,7 +141,9 @@ df = pl.DataFrame( out = df.with_columns(pig_latin=pig_latinnify("convert")) ``` -Alternatively, you can [register a custom namespace](https://docs.pola.rs/api/python/stable/reference/api/polars.api.register_expr_namespace.html#polars.api.register_expr_namespace), which enables you to write: +Alternatively, you can +[register a custom namespace](https://docs.pola.rs/api/python/stable/reference/api/polars.api.register_expr_namespace.html#polars.api.register_expr_namespace), +which enables you to write: ```python out = df.with_columns( @@ -143,8 +153,8 @@ out = df.with_columns( ## Accepting kwargs -If you want to accept `kwargs` (keyword arguments) in a polars expression, all you have to do is define a Rust `struct` -and make sure that it derives `serde::Deserialize`. +If you want to accept `kwargs` (keyword arguments) in a polars expression, all you have to do is +define a Rust `struct` and make sure that it derives `serde::Deserialize`. ```rust /// Provide your own kwargs struct with the proper schema and accept that type @@ -208,11 +218,13 @@ def append_args( ## Output data types -Output data types of course don't have to be fixed. They often depend on the input types of an expression. To accommodate -this you can provide the `#[polars_expr()]` macro with an `output_type_func` argument that points to a function. This -function can map input fields `&[Field]` to an output `Field` (name and data type). +Output data types of course don't have to be fixed. They often depend on the input types of an +expression. To accommodate this you can provide the `#[polars_expr()]` macro with an +`output_type_func` argument that points to a function. This function can map input fields `&[Field]` +to an output `Field` (name and data type). -In the snippet below is an example where we use the utility `FieldsMapper` to help with this mapping. +In the snippet below is an example where we use the utility `FieldsMapper` to help with this +mapping. ```rust use polars_plan::dsl::FieldsMapper; @@ -246,24 +258,31 @@ fn haversine(inputs: &[Series]) -> PolarsResult { } ``` -That's all you need to know to get started. Take a look at [this repo](https://github.com/pola-rs/pyo3-polars/tree/main/example/derive_expression) to see how this all fits together, and at [this tutorial](https://marcogorelli.github.io/polars-plugins-tutorial/) -to gain a more thorough understanding. +That's all you need to know to get started. Take a look at +[this repo](https://github.com/pola-rs/pyo3-polars/tree/main/example/derive_expression) to see how +this all fits together, and at +[this tutorial](https://marcogorelli.github.io/polars-plugins-tutorial/) to gain a more thorough +understanding. ## Community plugins Here is a curated (non-exhaustive) list of community-implemented plugins. 
-- [polars-xdt](https://github.com/pola-rs/polars-xdt) Polars plugin with extra datetime-related functionality - which isn't quite in-scope for the main library -- [polars-distance](https://github.com/ion-elgreco/polars-distance) Polars plugin for pairwise distance functions -- [polars-ds](https://github.com/abstractqqq/polars_ds_extension) Polars extension aiming to simplify common numerical/string data analysis procedures -- [polars-hash](https://github.com/ion-elgreco/polars-hash) Stable non-cryptographic and cryptographic hashing functions for Polars -- [polars-reverse-geocode](https://github.com/MarcoGorelli/polars-reverse-geocode) Offline reverse geocoder for finding the closest city - to a given (latitude, longitude) pair +- [polars-xdt](https://github.com/pola-rs/polars-xdt) Polars plugin with extra datetime-related + functionality which isn't quite in-scope for the main library +- [polars-distance](https://github.com/ion-elgreco/polars-distance) Polars plugin for pairwise + distance functions +- [polars-ds](https://github.com/abstractqqq/polars_ds_extension) Polars extension aiming to + simplify common numerical/string data analysis procedures +- [polars-hash](https://github.com/ion-elgreco/polars-hash) Stable non-cryptographic and + cryptographic hashing functions for Polars +- [polars-reverse-geocode](https://github.com/MarcoGorelli/polars-reverse-geocode) Offline reverse + geocoder for finding the closest city to a given (latitude, longitude) pair ## Other material - [Ritchie Vink - Keynote on Polars Plugins](https://youtu.be/jKW-CBV7NUM) -- [Polars plugins tutorial](https://marcogorelli.github.io/polars-plugins-tutorial/) Learn how to write a plugin by - going through some very simple and minimal examples -- [cookiecutter-polars-plugin](https://github.com/MarcoGorelli/cookiecutter-polars-plugins) Project template for Polars Plugins +- [Polars plugins tutorial](https://marcogorelli.github.io/polars-plugins-tutorial/) Learn how to + write a plugin by going through some very simple and minimal examples +- [cookiecutter-polars-plugin](https://github.com/MarcoGorelli/cookiecutter-polars-plugins) Project + template for Polars Plugins diff --git a/docs/source/user-guide/sql/create.md b/docs/source/user-guide/sql/create.md index a5a1922b7f23..3fb272161ec3 100644 --- a/docs/source/user-guide/sql/create.md +++ b/docs/source/user-guide/sql/create.md @@ -1,6 +1,8 @@ # CREATE -In Polars, the `SQLContext` provides a way to execute SQL statements against `LazyFrames` and `DataFrames` using SQL syntax. One of the SQL statements that can be executed using `SQLContext` is the `CREATE TABLE` statement, which is used to create a new table. +In Polars, the `SQLContext` provides a way to execute SQL statements against `LazyFrames` and +`DataFrames` using SQL syntax. One of the SQL statements that can be executed using `SQLContext` is +the `CREATE TABLE` statement, which is used to create a new table. The syntax for the `CREATE TABLE` statement in Polars is as follows: @@ -10,7 +12,8 @@ AS SELECT ... ``` -In this syntax, `table_name` is the name of the new table that will be created, and `SELECT ...` is a SELECT statement that defines the data that will be inserted into the table. +In this syntax, `table_name` is the name of the new table that will be created, and `SELECT ...` is +a SELECT statement that defines the data that will be inserted into the table. 
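As a minimal sketch of this flow (the frame, column, and table names below are made up and simply mirror the fuller example that follows), executing such a statement through a `SQLContext` can look like:

```python
import polars as pl

# Hypothetical data registered under the name "my_table".
df = pl.DataFrame({"name": ["Alice", "Bob"], "age": [25, 40]})
ctx = pl.SQLContext(my_table=df)

# CREATE TABLE ... AS SELECT ... registers the query result under the new name.
ctx.execute("CREATE TABLE older_people AS SELECT * FROM my_table WHERE age > 30")

# The new table can then be queried like any other registered table.
print(ctx.execute("SELECT name, age FROM older_people", eager=True))
```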
Here's an example of how to use the `CREATE TABLE` statement in Polars: @@ -21,7 +24,9 @@ Here's an example of how to use the `CREATE TABLE` statement in Polars: --8<-- "python/user-guide/sql/create.py:create" ``` -In this example, we use the `execute()` method of the `SQLContext` to execute a `CREATE TABLE` statement that creates a new table called `older_people` based on a SELECT statement that selects all rows from the `my_table` DataFrame where the `age` column is greater than 30. +In this example, we use the `execute()` method of the `SQLContext` to execute a `CREATE TABLE` +statement that creates a new table called `older_people` based on a SELECT statement that selects +all rows from the `my_table` DataFrame where the `age` column is greater than 30. !!! note Result diff --git a/docs/source/user-guide/sql/cte.md b/docs/source/user-guide/sql/cte.md index 1129f6d19230..90ec01aa3e33 100644 --- a/docs/source/user-guide/sql/cte.md +++ b/docs/source/user-guide/sql/cte.md @@ -1,8 +1,12 @@ # Common Table Expressions -Common Table Expressions (CTEs) are a feature of SQL that allow you to define a temporary named result set that can be referenced within a SQL statement. CTEs provide a way to break down complex SQL queries into smaller, more manageable pieces, making them easier to read, write, and maintain. +Common Table Expressions (CTEs) are a feature of SQL that allow you to define a temporary named +result set that can be referenced within a SQL statement. CTEs provide a way to break down complex +SQL queries into smaller, more manageable pieces, making them easier to read, write, and maintain. -A CTE is defined using the `WITH` keyword followed by a comma-separated list of subqueries, each of which defines a named result set that can be used in subsequent queries. The syntax for a CTE is as follows: +A CTE is defined using the `WITH` keyword followed by a comma-separated list of subqueries, each of +which defines a named result set that can be used in subsequent queries. The syntax for a CTE is as +follows: ``` WITH cte_name AS ( @@ -11,11 +15,17 @@ WITH cte_name AS ( SELECT ... ``` -In this syntax, `cte_name` is the name of the CTE, and `subquery` is the subquery that defines the result set. The CTE can then be referenced in subsequent queries as if it were a table or view. +In this syntax, `cte_name` is the name of the CTE, and `subquery` is the subquery that defines the +result set. The CTE can then be referenced in subsequent queries as if it were a table or view. -CTEs are particularly useful when working with complex queries that involve multiple levels of subqueries, as they allow you to break down the query into smaller, more manageable pieces that are easier to understand and debug. Additionally, CTEs can help improve query performance by allowing the database to optimize and cache the results of subqueries, reducing the number of times they need to be executed. +CTEs are particularly useful when working with complex queries that involve multiple levels of +subqueries, as they allow you to break down the query into smaller, more manageable pieces that are +easier to understand and debug. Additionally, CTEs can help improve query performance by allowing +the database to optimize and cache the results of subqueries, reducing the number of times they need +to be executed. -Polars supports Common Table Expressions (CTEs) using the WITH clause in SQL syntax. Below is an example +Polars supports Common Table Expressions (CTEs) using the WITH clause in SQL syntax. 
Below is an +example {{code_block('user-guide/sql/cte','cte',['SQLregister','SQLexecute'])}} @@ -24,4 +34,7 @@ Polars supports Common Table Expressions (CTEs) using the WITH clause in SQL syn --8<-- "python/user-guide/sql/cte.py:cte" ``` -In this example, we use the `execute()` method of the `SQLContext` to execute a SQL query that includes a CTE. The CTE selects all rows from the `my_table` LazyFrame where the `age` column is greater than 30 and gives it the alias `older_people`. We then execute a second SQL query that selects all rows from the `older_people` CTE where the `name` column starts with the letter 'C'. +In this example, we use the `execute()` method of the `SQLContext` to execute a SQL query that +includes a CTE. The CTE selects all rows from the `my_table` LazyFrame where the `age` column is +greater than 30 and gives it the alias `older_people`. We then execute a second SQL query that +selects all rows from the `older_people` CTE where the `name` column starts with the letter 'C'. diff --git a/docs/source/user-guide/sql/intro.md b/docs/source/user-guide/sql/intro.md index 0b762f16cdd9..0c475c9d1cf2 100644 --- a/docs/source/user-guide/sql/intro.md +++ b/docs/source/user-guide/sql/intro.md @@ -1,9 +1,10 @@ # Introduction While Polars supports interaction with SQL, it's recommended that users familiarize themselves with -the [expression syntax](../concepts/expressions-and-contexts.md#expressions) to produce more readable and expressive code. As the DataFrame -interface is primary, new features are typically added to the expression API first. However, if you already have an -existing SQL codebase or prefer the use of SQL, Polars does offers support for this. +the [expression syntax](../concepts/expressions-and-contexts.md#expressions) to produce more +readable and expressive code. As the DataFrame interface is primary, new features are typically +added to the expression API first. However, if you already have an existing SQL codebase or prefer +the use of SQL, Polars does offers support for this. !!! note Execution @@ -11,8 +12,9 @@ existing SQL codebase or prefer the use of SQL, Polars does offers support for t ## Context -Polars uses the `SQLContext` object to manage SQL queries. The context contains a mapping of `DataFrame` and `LazyFrame` -identifier names to their corresponding datasets[^1]. The example below starts a `SQLContext`: +Polars uses the `SQLContext` object to manage SQL queries. The context contains a mapping of +`DataFrame` and `LazyFrame` identifier names to their corresponding datasets[^1]. The example below +starts a `SQLContext`: {{code_block('user-guide/sql/intro','context',['SQLContext'])}} @@ -46,7 +48,8 @@ We can also register Pandas DataFrames by converting them to Polars first. Converting a Pandas DataFrame backed by Numpy will trigger a potentially expensive conversion; however, if the Pandas DataFrame is already backed by Arrow then the conversion will be significantly cheaper (and in some cases close to free). 
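As an illustration (a minimal sketch with made-up data, assuming pandas is installed alongside Polars), the convert-then-register step can look like:

```python
import pandas as pd
import polars as pl

# A NumPy-backed pandas DataFrame; pl.from_pandas copies it into Polars memory.
pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# Convert to Polars first, then hand the resulting DataFrame to the SQLContext.
ctx = pl.SQLContext(pandas_data=pl.from_pandas(pandas_df))

print(ctx.execute("SELECT a, b FROM pandas_data WHERE a > 1", eager=True))
```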
-Once the `SQLContext` is initialized, we can register additional Dataframes or unregister existing Dataframes with: +Once the `SQLContext` is initialized, we can register additional Dataframes or unregister existing +Dataframes with: - `register` - `register_globals` @@ -55,13 +58,13 @@ Once the `SQLContext` is initialized, we can register additional Dataframes or u ## Execute queries and collect results -SQL queries are always executed in lazy mode to take advantage of the full set of query planning optimizations, so we -have two options to collect the result: +SQL queries are always executed in lazy mode to take advantage of the full set of query planning +optimizations, so we have two options to collect the result: -- Set the parameter `eager_execution` to True in `SQLContext`; this ensures that Polars automatically collects the - LazyFrame results from `execute` calls. -- Set the parameter `eager` to True when executing a query with `execute`, or explicitly collect the result - using `collect`. +- Set the parameter `eager_execution` to True in `SQLContext`; this ensures that Polars + automatically collects the LazyFrame results from `execute` calls. +- Set the parameter `eager` to True when executing a query with `execute`, or explicitly collect the + result using `collect`. We execute SQL queries by calling `execute` on a `SQLContext`. @@ -73,18 +76,17 @@ We execute SQL queries by calling `execute` on a `SQLContext`. ## Execute queries from multiple sources -SQL queries can be executed just as easily from multiple sources. -In the example below, we register: +SQL queries can be executed just as easily from multiple sources. In the example below, we register: - a CSV file (loaded lazily) - a NDJSON file (loaded lazily) - a Pandas DataFrame -And join them together using SQL. -Lazy reading allows to only load the necessary rows and columns from the files. +And join them together using SQL. Lazy reading allows to only load the necessary rows and columns +from the files. -In the same way, it's possible to register cloud datalakes (S3, Azure Data Lake). A PyArrow dataset can point to the -datalake, then Polars can read it with `scan_pyarrow_dataset`. +In the same way, it's possible to register cloud datalakes (S3, Azure Data Lake). A PyArrow dataset +can point to the datalake, then Polars can read it with `scan_pyarrow_dataset`. {{code_block('user-guide/sql/intro','execute_multiple_sources',['SQLregister','SQLexecute'])}} @@ -98,7 +100,8 @@ datalake, then Polars can read it with `scan_pyarrow_dataset`. ## Compatibility -Polars does not support the complete SQL specification, but it does support a subset of the most common statement types. +Polars does not support the complete SQL specification, but it does support a subset of the most +common statement types. !!! note Dialect @@ -107,7 +110,8 @@ Polars does not support the complete SQL specification, but it does support a su For example, here is a non-exhaustive list of some of the supported functionality: - Write a `CREATE` statements: `CREATE TABLE xxx AS ...` -- Write a `SELECT` statements containing:`WHERE`,`ORDER`,`LIMIT`,`GROUP BY`,`UNION` and `JOIN` clauses ... +- Write a `SELECT` statements containing:`WHERE`,`ORDER`,`LIMIT`,`GROUP BY`,`UNION` and `JOIN` + clauses ... 
- Write Common Table Expressions (CTE's) such as: `WITH tablename AS` - Explain a query: `EXPLAIN SELECT ...` - List registered tables: `SHOW TABLES` diff --git a/docs/source/user-guide/sql/select.md b/docs/source/user-guide/sql/select.md index b25a44aeb49d..df223705a770 100644 --- a/docs/source/user-guide/sql/select.md +++ b/docs/source/user-guide/sql/select.md @@ -1,13 +1,17 @@ # SELECT -In Polars SQL, the `SELECT` statement is used to retrieve data from a table into a `DataFrame`. The basic syntax of a `SELECT` statement in Polars SQL is as follows: +In Polars SQL, the `SELECT` statement is used to retrieve data from a table into a `DataFrame`. The +basic syntax of a `SELECT` statement in Polars SQL is as follows: ```sql SELECT column1, column2, ... FROM table_name; ``` -Here, `column1`, `column2`, etc. are the columns that you want to select from the table. You can also use the wildcard `*` to select all columns. `table_name` is the name of the table or that you want to retrieve data from. In the sections below we will cover some of the more common SELECT variants +Here, `column1`, `column2`, etc. are the columns that you want to select from the table. You can +also use the wildcard `*` to select all columns. `table_name` is the name of the table or that you +want to retrieve data from. In the sections below we will cover some of the more common SELECT +variants {{code_block('user-guide/sql/select','df',['SQLregister','SQLexecute'])}} @@ -18,7 +22,8 @@ Here, `column1`, `column2`, etc. are the columns that you want to select from th ### GROUP BY -The `GROUP BY` statement is used to group rows in a table by one or more columns and compute aggregate functions on each group. +The `GROUP BY` statement is used to group rows in a table by one or more columns and compute +aggregate functions on each group. {{code_block('user-guide/sql/select','group_by',['SQLexecute'])}} @@ -28,7 +33,8 @@ The `GROUP BY` statement is used to group rows in a table by one or more columns ### ORDER BY -The `ORDER BY` statement is used to sort the result set of a query by one or more columns in ascending or descending order. +The `ORDER BY` statement is used to sort the result set of a query by one or more columns in +ascending or descending order. {{code_block('user-guide/sql/select','orderby',['SQLexecute'])}} @@ -53,7 +59,9 @@ Polars provides a wide range of SQL functions, including: - Aggregation functions: `SUM`, `AVG`, `MIN`, `MAX`, `COUNT`, `STDDEV`, `FIRST` etc. - Array functions: `EXPLODE`, `UNNEST`,`ARRAY_SUM`,`ARRAY_REVERSE`, etc. -For a full list of supported functions go the [API documentation](https://docs.rs/polars-sql/latest/src/polars_sql/keywords.rs.html). The example below demonstrates how to use a function in a query +For a full list of supported functions go the +[API documentation](https://docs.rs/polars-sql/latest/src/polars_sql/keywords.rs.html). The example +below demonstrates how to use a function in a query {{code_block('user-guide/sql/select','functions',['SQLexecute'])}} @@ -63,7 +71,9 @@ For a full list of supported functions go the [API documentation](https://docs.r ### Table Functions -In the examples earlier we first generated a DataFrame which we registered in the `SQLContext`. Polars also support directly reading from CSV, Parquet, JSON and IPC in your SQL query using table functions `read_xxx`. +In the examples earlier we first generated a DataFrame which we registered in the `SQLContext`. 
+Polars also support directly reading from CSV, Parquet, JSON and IPC in your SQL query using table +functions `read_xxx`. {{code_block('user-guide/sql/select','tablefunctions',['SQLexecute'])}} diff --git a/docs/source/user-guide/sql/show.md b/docs/source/user-guide/sql/show.md index 70453ebcb6dd..55a0496eadd3 100644 --- a/docs/source/user-guide/sql/show.md +++ b/docs/source/user-guide/sql/show.md @@ -1,6 +1,9 @@ # SHOW TABLES -In Polars, the `SHOW TABLES` statement is used to list all the tables that have been registered in the current `SQLContext`. When you register a DataFrame with the `SQLContext`, you give it a name that can be used to refer to the DataFrame in subsequent SQL statements. The `SHOW TABLES` statement allows you to see a list of all the registered tables, along with their names. +In Polars, the `SHOW TABLES` statement is used to list all the tables that have been registered in +the current `SQLContext`. When you register a DataFrame with the `SQLContext`, you give it a name +that can be used to refer to the DataFrame in subsequent SQL statements. The `SHOW TABLES` statement +allows you to see a list of all the registered tables, along with their names. The syntax for the `SHOW TABLES` statement in Polars is as follows: @@ -17,6 +20,11 @@ Here's an example of how to use the `SHOW TABLES` statement in Polars: --8<-- "python/user-guide/sql/show.py:show" ``` -In this example, we create two DataFrames and register them with the `SQLContext` using different names. We then execute a `SHOW TABLES` statement using the `execute()` method of the `SQLContext` object, which returns a DataFrame containing a list of all the registered tables and their names. The resulting DataFrame is then printed using the `print()` function. +In this example, we create two DataFrames and register them with the `SQLContext` using different +names. We then execute a `SHOW TABLES` statement using the `execute()` method of the `SQLContext` +object, which returns a DataFrame containing a list of all the registered tables and their names. +The resulting DataFrame is then printed using the `print()` function. -Note that the `SHOW TABLES` statement only lists tables that have been registered with the current `SQLContext`. If you register a DataFrame with a different `SQLContext` or in a different Python session, it will not appear in the list of tables returned by `SHOW TABLES`. +Note that the `SHOW TABLES` statement only lists tables that have been registered with the current +`SQLContext`. If you register a DataFrame with a different `SQLContext` or in a different Python +session, it will not appear in the list of tables returned by `SHOW TABLES`. diff --git a/docs/source/user-guide/transformations/concatenation.md b/docs/source/user-guide/transformations/concatenation.md index 044853bcd923..f43f8e781991 100644 --- a/docs/source/user-guide/transformations/concatenation.md +++ b/docs/source/user-guide/transformations/concatenation.md @@ -2,13 +2,19 @@ There are a number of ways to concatenate data from separate DataFrames: -- two dataframes with **the same columns** can be **vertically** concatenated to make a **longer** dataframe -- two dataframes with **non-overlapping columns** can be **horizontally** concatenated to make a **wider** dataframe -- two dataframes with **different numbers of rows and columns** can be **diagonally** concatenated to make a dataframe which might be longer and/ or wider. Where column names overlap values will be vertically concatenated. 
Where column names do not overlap new rows and columns will be added. Missing values will be set as `null` +- two dataframes with **the same columns** can be **vertically** concatenated to make a **longer** + dataframe +- two dataframes with **non-overlapping columns** can be **horizontally** concatenated to make a + **wider** dataframe +- two dataframes with **different numbers of rows and columns** can be **diagonally** concatenated + to make a dataframe which might be longer and/ or wider. Where column names overlap values will be + vertically concatenated. Where column names do not overlap new rows and columns will be added. + Missing values will be set as `null` ## Vertical concatenation - getting longer -In a vertical concatenation you combine all of the rows from a list of `DataFrames` into a single longer `DataFrame`. +In a vertical concatenation you combine all of the rows from a list of `DataFrames` into a single +longer `DataFrame`. {{code_block('user-guide/transformations/concatenation','vertical',['concat'])}} @@ -21,7 +27,8 @@ Vertical concatenation fails when the dataframes do not have the same column nam ## Horizontal concatenation - getting wider -In a horizontal concatenation you combine all of the columns from a list of `DataFrames` into a single wider `DataFrame`. +In a horizontal concatenation you combine all of the columns from a list of `DataFrames` into a +single wider `DataFrame`. {{code_block('user-guide/transformations/concatenation','horizontal',['concat'])}} @@ -31,8 +38,8 @@ In a horizontal concatenation you combine all of the columns from a list of `Dat Horizontal concatenation fails when dataframes have overlapping columns. -When dataframes have different numbers of rows, -columns will be padded with `null` values at the end up to the maximum length. +When dataframes have different numbers of rows, columns will be padded with `null` values at the end +up to the maximum length. {{code_block('user-guide/transformations/concatenation','horizontal_different_lengths',['concat'])}} @@ -42,7 +49,8 @@ columns will be padded with `null` values at the end up to the maximum length. ## Diagonal concatenation - getting longer, wider and `null`ier -In a diagonal concatenation you combine all of the row and columns from a list of `DataFrames` into a single longer and/or wider `DataFrame`. +In a diagonal concatenation you combine all of the row and columns from a list of `DataFrames` into +a single longer and/or wider `DataFrame`. {{code_block('user-guide/transformations/concatenation','cross',['concat'])}} @@ -52,9 +60,16 @@ In a diagonal concatenation you combine all of the row and columns from a list o Diagonal concatenation generates nulls when the column names do not overlap. -When the dataframe shapes do not match and we have an overlapping semantic key then [we can join the dataframes](joins.md) instead of concatenating them. +When the dataframe shapes do not match and we have an overlapping semantic key then +[we can join the dataframes](joins.md) instead of concatenating them. ## Rechunking -Before a concatenation we have two dataframes `df1` and `df2`. Each column in `df1` and `df2` is in one or more chunks in memory. By default, during concatenation the chunks in each column are not made contiguous. This makes the concat operation faster and consume less memory but it may slow down future operations that would benefit from having the data be in contiguous memory. The process of copying the fragmented chunks into a single new chunk is known as **rechunking**. 
Rechunking is an expensive operation. Prior to version 0.20.26, the default was to perform a rechunk but in new versions, the default is not to. -If you do want Polars to rechunk the concatenated `DataFrame` you specify `rechunk = True` when doing the concatenation. +Before a concatenation we have two dataframes `df1` and `df2`. Each column in `df1` and `df2` is in +one or more chunks in memory. By default, during concatenation the chunks in each column are not +made contiguous. This makes the concat operation faster and consume less memory but it may slow down +future operations that would benefit from having the data be in contiguous memory. The process of +copying the fragmented chunks into a single new chunk is known as **rechunking**. Rechunking is an +expensive operation. Prior to version 0.20.26, the default was to perform a rechunk but in new +versions, the default is not to. If you do want Polars to rechunk the concatenated `DataFrame` you +specify `rechunk = True` when doing the concatenation. diff --git a/docs/source/user-guide/transformations/index.md b/docs/source/user-guide/transformations/index.md index fa86181eb58d..452aaeb3e892 100644 --- a/docs/source/user-guide/transformations/index.md +++ b/docs/source/user-guide/transformations/index.md @@ -1,6 +1,7 @@ # Transformations -The focus of this section is to describe different types of data transformations and provide some examples on how to use them. +The focus of this section is to describe different types of data transformations and provide some +examples on how to use them. diff --git a/docs/source/user-guide/transformations/joins.md b/docs/source/user-guide/transformations/joins.md index 5b55386b70f0..3790fd6e1e87 100644 --- a/docs/source/user-guide/transformations/joins.md +++ b/docs/source/user-guide/transformations/joins.md @@ -1,16 +1,19 @@ # Joins -A join operation combines columns from one or more dataframes into a new dataframe. -The different “joining strategies” and matching criteria used by the different types of joins influence how columns are combined and also what rows are included in the result of the join operation. +A join operation combines columns from one or more dataframes into a new dataframe. The different +“joining strategies” and matching criteria used by the different types of joins influence how +columns are combined and also what rows are included in the result of the join operation. The most common type of join is an “equi join”, in which rows are matched by a key expression. -Polars supports several joining strategies for equi joins, which determine exactly how we handle the matching of rows. -Polars also supports “non-equi joins”, a type of join where the matching criterion is not an equality, and a type of join where rows are matched by key proximity, called “asof join”. +Polars supports several joining strategies for equi joins, which determine exactly how we handle the +matching of rows. Polars also supports “non-equi joins”, a type of join where the matching criterion +is not an equality, and a type of join where rows are matched by key proximity, called “asof join”. ## Quick reference table -The table below acts as a quick reference for people who know what they are looking for. -If you want to learn about joins in general and how to work with them in Polars, feel free to skip the table and keep reading below. +The table below acts as a quick reference for people who know what they are looking for. 
If you want +to learn about joins in general and how to work with them in Polars, feel free to skip the table and +keep reading below. === ":fontawesome-brands-python: Python" @@ -41,9 +44,9 @@ If you want to learn about joins in general and how to work with them in Polars, ## Equi joins -In an equi join, rows are matched by checking equality of a key expression. -You can do an equi join with the function `join` by specifying the name of the column to be used as key. -For the examples, we will be loading some (modified) Monopoly property data. +In an equi join, rows are matched by checking equality of a key expression. You can do an equi join +with the function `join` by specifying the name of the column to be used as key. For the examples, +we will be loading some (modified) Monopoly property data. First, we load a dataframe that contains property names and their colour group in the game: @@ -62,7 +65,8 @@ Next, we load a dataframe that contains property names and their price in the ga --8<-- "python/user-guide/transformations/joins.py:props_prices" ``` -Now, we join both dataframes to create a dataframe that contains property names, colour groups, and prices: +Now, we join both dataframes to create a dataframe that contains property names, colour groups, and +prices: {{code_block('user-guide/transformations/joins','equi-join',['join'])}} @@ -70,12 +74,14 @@ Now, we join both dataframes to create a dataframe that contains property names, --8<-- "python/user-guide/transformations/joins.py:equi-join" ``` -The result has four rows but both dataframes used in the operation had five rows. -Polars uses a joining strategy to determine what happens with rows that have multiple matches or with rows that have no match at all. -By default, Polars computes an “inner join” but there are [other join strategies that we show next](#join-strategies). +The result has four rows but both dataframes used in the operation had five rows. Polars uses a +joining strategy to determine what happens with rows that have multiple matches or with rows that +have no match at all. By default, Polars computes an “inner join” but there are +[other join strategies that we show next](#join-strategies). -In the example above, the two dataframes conveniently had the column we wish to use as key with the same name and with the values in the exact same format. -Suppose, for the sake of argument, that one of the dataframes had a differently named column and the other had the property names in lower case: +In the example above, the two dataframes conveniently had the column we wish to use as key with the +same name and with the values in the exact same format. 
Suppose, for the sake of argument, that one +of the dataframes had a differently named column and the other had the property names in lower case: {{code_block('user-guide/transformations/joins','props_groups2',['Expr.str'])}} @@ -89,7 +95,9 @@ Suppose, for the sake of argument, that one of the dataframes had a differently --8<-- "python/user-guide/transformations/joins.py:props_prices2" ``` -In a situation like this, where we may want to perform the same join as before, we can leverage `join`'s flexibility and specify arbitrary expressions to compute the joining key on the left and on the right, allowing one to compute row keys dynamically: +In a situation like this, where we may want to perform the same join as before, we can leverage +`join`'s flexibility and specify arbitrary expressions to compute the joining key on the left and on +the right, allowing one to compute row keys dynamically: {{code_block('user-guide/transformations/joins', 'join-key-expression', ['join', 'Expr.str'])}} @@ -97,17 +105,20 @@ In a situation like this, where we may want to perform the same join as before, --8<-- "python/user-guide/transformations/joins.py:join-key-expression" ``` -Because we are joining on the right with an expression, Polars preserves the column “property_name” from the left and the column “name” from the right so we can have access to the original values that the key expressions were applied to. +Because we are joining on the right with an expression, Polars preserves the column “property_name” +from the left and the column “name” from the right so we can have access to the original values that +the key expressions were applied to. ## Join strategies -When computing a join with `df1.join(df2, ...)`, we can specify one of many different join strategies. -A join strategy specifies what rows to keep from each dataframe based on whether they match rows from the other dataframe. +When computing a join with `df1.join(df2, ...)`, we can specify one of many different join +strategies. A join strategy specifies what rows to keep from each dataframe based on whether they +match rows from the other dataframe. ### Inner join -In an inner join the resulting dataframe only contains the rows from the left and right dataframes that matched. -That is the default strategy used by `join` and above we can see an example of that. +In an inner join the resulting dataframe only contains the rows from the left and right dataframes +that matched. That is the default strategy used by `join` and above we can see an example of that. We repeat the example here and explicitly specify the join strategy: {{code_block('user-guide/transformations/joins','inner-join',['join'])}} @@ -116,11 +127,13 @@ We repeat the example here and explicitly specify the join strategy: --8<-- "python/user-guide/transformations/joins.py:inner-join" ``` -The result does not include the row from `props_groups` that contains “The Shire” and the result also does not include the row from `props_prices` that contains “Sesame Street”. +The result does not include the row from `props_groups` that contains “The Shire” and the result +also does not include the row from `props_prices` that contains “Sesame Street”. ### Left join -A left outer join is a join where the result contains all the rows from the left dataframe and the rows of the right dataframe that matched any rows from the left dataframe. 
+A left outer join is a join where the result contains all the rows from the left dataframe and the +rows of the right dataframe that matched any rows from the left dataframe. {{code_block('user-guide/transformations/joins','left-join',['join'])}} @@ -128,12 +141,13 @@ A left outer join is a join where the result contains all the rows from the left --8<-- "python/user-guide/transformations/joins.py:left-join" ``` -If there are any rows from the left dataframe that have no matching rows on the right dataframe, they get the value `null` on the new columns. +If there are any rows from the left dataframe that have no matching rows on the right dataframe, +they get the value `null` on the new columns. ### Right join -Computationally speaking, a right outer join is exactly the same as a left outer join, but with the arguments swapped. -Here is an example: +Computationally speaking, a right outer join is exactly the same as a left outer join, but with the +arguments swapped. Here is an example: {{code_block('user-guide/transformations/joins','right-join',['join'])}} @@ -141,7 +155,8 @@ Here is an example: --8<-- "python/user-guide/transformations/joins.py:right-join" ``` -We show that `df1.join(df2, how="right", ...)` is the same as `df2.join(df1, how="left", ...)`, up to the order of the columns of the result, with the computation below: +We show that `df1.join(df2, how="right", ...)` is the same as `df2.join(df1, how="left", ...)`, up +to the order of the columns of the result, with the computation below: {{code_block('user-guide/transformations/joins','left-right-join-equals',['join'])}} @@ -151,7 +166,8 @@ We show that `df1.join(df2, how="right", ...)` is the same as `df2.join(df1, how ### Full join -A full outer join will keep all of the rows from the left and right dataframes, even if they don't have matching rows in the other dataframe: +A full outer join will keep all of the rows from the left and right dataframes, even if they don't +have matching rows in the other dataframe: {{code_block('user-guide/transformations/joins','full-join',['join'])}} @@ -159,9 +175,11 @@ A full outer join will keep all of the rows from the left and right dataframes, --8<-- "python/user-guide/transformations/joins.py:full-join" ``` -In this case, we see that we get two columns `property_name` and `property_name_right` to make up for the fact that we are matching on the column `property_name` of both dataframes and there are some names for which there are no matches. -The two columns help differentiate the source of each row data. -If we wanted to force `join` to coalesce the two columns `property_name` into a single column, we could set `coalesce=True` explicitly: +In this case, we see that we get two columns `property_name` and `property_name_right` to make up +for the fact that we are matching on the column `property_name` of both dataframes and there are +some names for which there are no matches. The two columns help differentiate the source of each row +data. 
If we wanted to force `join` to coalesce the two columns `property_name` into a single column, +we could set `coalesce=True` explicitly: {{code_block('user-guide/transformations/joins','full-join-coalesce',['join'])}} @@ -169,13 +187,17 @@ If we wanted to force `join` to coalesce the two columns `property_name` into a --8<-- "python/user-guide/transformations/joins.py:full-join-coalesce" ``` -When not set, the parameter `coalesce` is determined automatically from the join strategy and the key(s) specified, which is why the inner, left, and right, joins acted as if `coalesce=True`, even though we didn't set it. +When not set, the parameter `coalesce` is determined automatically from the join strategy and the +key(s) specified, which is why the inner, left, and right, joins acted as if `coalesce=True`, even +though we didn't set it. ### Semi join -A semi join will return the rows of the left dataframe that have a match in the right dataframe, but we do not actually join the matching rows: +A semi join will return the rows of the left dataframe that have a match in the right dataframe, but +we do not actually join the matching rows: -{{code_block('user-guide/transformations/joins', 'semi-join', [], ['join'], ['join-semi_anti_join_flag'])}} +{{code_block('user-guide/transformations/joins', 'semi-join', [], ['join'], +['join-semi_anti_join_flag'])}} ```python exec="on" result="text" session="transformations/joins" --8<-- "python/user-guide/transformations/joins.py:semi-join" @@ -185,9 +207,11 @@ A semi join acts as a sort of row filter based on a second dataframe. ### Anti join -Conversely, an anti join will return the rows of the left dataframe that do not have a match in the right dataframe: +Conversely, an anti join will return the rows of the left dataframe that do not have a match in the +right dataframe: -{{code_block('user-guide/transformations/joins', 'anti-join', [], ['join'], ['join-semi_anti_join_flag'])}} +{{code_block('user-guide/transformations/joins', 'anti-join', [], ['join'], +['join-semi_anti_join_flag'])}} ```python exec="on" result="text" session="transformations/joins" --8<-- "python/user-guide/transformations/joins.py:anti-join" @@ -195,8 +219,9 @@ Conversely, an anti join will return the rows of the left dataframe that do not ## Non-equi joins -In a non-equi join matches between the left and right dataframes are computed differently. -Instead of looking for matches on key expressions, we provide a single predicate that determines what rows of the left dataframe can be paired up with what rows of the right dataframe. +In a non-equi join matches between the left and right dataframes are computed differently. Instead +of looking for matches on key expressions, we provide a single predicate that determines what rows +of the left dataframe can be paired up with what rows of the right dataframe. For example, consider the following Monopoly players and their current cash: @@ -206,8 +231,8 @@ For example, consider the following Monopoly players and their current cash: --8<-- "python/user-guide/transformations/joins.py:players" ``` -Using a non-equi join we can easily build a dataframe with all the possible properties that each player could be interested in buying. -We use the function `join_where` to compute a non-equi join: +Using a non-equi join we can easily build a dataframe with all the possible properties that each +player could be interested in buying. 
We use the function `join_where` to compute a non-equi join: {{code_block('user-guide/transformations/joins','non-equi',['join_where'])}} @@ -215,7 +240,8 @@ We use the function `join_where` to compute a non-equi join: --8<-- "python/user-guide/transformations/joins.py:non-equi" ``` -You can provide multiple expressions as predicates but they all must use comparison operators that evaluate to a Boolean result and must refer to columns from both dataframes. +You can provide multiple expressions as predicates but they all must use comparison operators that +evaluate to a Boolean result and must refer to columns from both dataframes. !!! note @@ -223,11 +249,11 @@ You can provide multiple expressions as predicates but they all must use compari ## Asof join -An `asof` join is like a left join except that we match on nearest key rather than equal keys. -In Polars we can do an asof join with the `join_asof` method. +An `asof` join is like a left join except that we match on nearest key rather than equal keys. In +Polars we can do an asof join with the `join_asof` method. -For the asof join we will consider a scenario inspired by the stock market. -Suppose a stock market broker has a dataframe called `df_trades` showing transactions it has made for different stocks. +For the asof join we will consider a scenario inspired by the stock market. Suppose a stock market +broker has a dataframe called `df_trades` showing transactions it has made for different stocks. {{code_block('user-guide/transformations/joins','df_trades',[])}} @@ -243,8 +269,10 @@ The broker has another dataframe called `df_quotes` showing prices it has quoted --8<-- "python/user-guide/transformations/joins.py:df_quotes" ``` -You want to produce a dataframe showing for each trade the most recent quote provided _before_ the trade. You do this with `join_asof` (using the default `strategy = "backward"`). -To avoid joining between trades on one stock with a quote on another you must specify an exact preliminary join on the stock column with `by="stock"`. +You want to produce a dataframe showing for each trade the most recent quote provided _before_ the +trade. You do this with `join_asof` (using the default `strategy = "backward"`). To avoid joining +between trades on one stock with a quote on another you must specify an exact preliminary join on +the stock column with `by="stock"`. {{code_block('user-guide/transformations/joins','asof', [], ['join_asof'], ['join_asof_by'])}} @@ -252,10 +280,12 @@ To avoid joining between trades on one stock with a quote on another you must sp --8<-- "python/user-guide/transformations/joins.py:asof" ``` -If you want to make sure that only quotes within a certain time range are joined to the trades you can specify the `tolerance` argument. -In this case we want to make sure that the last preceding quote is within 1 minute of the trade so we set `tolerance = "1m"`. +If you want to make sure that only quotes within a certain time range are joined to the trades you +can specify the `tolerance` argument. In this case we want to make sure that the last preceding +quote is within 1 minute of the trade so we set `tolerance = "1m"`. 
-{{code_block('user-guide/transformations/joins','asof-tolerance', [], ['join_asof'], ['join_asof_by'])}} +{{code_block('user-guide/transformations/joins','asof-tolerance', [], ['join_asof'], +['join_asof_by'])}} ```python exec="on" result="text" session="transformations/joins" --8<-- "python/user-guide/transformations/joins.py:asof-tolerance" @@ -263,8 +293,11 @@ In this case we want to make sure that the last preceding quote is within 1 minu ## Cartesian product -Polars allows you to compute the [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) of two dataframes, producing a dataframe where all rows of the left dataframe are paired up with all the rows of the right dataframe. -To compute the Cartesian product of two dataframes, you can pass the strategy `how="cross"` to the function `join` without specifying any of `on`, `left_on`, and `right_on`: +Polars allows you to compute the +[Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) of two dataframes, producing a +dataframe where all rows of the left dataframe are paired up with all the rows of the right +dataframe. To compute the Cartesian product of two dataframes, you can pass the strategy +`how="cross"` to the function `join` without specifying any of `on`, `left_on`, and `right_on`: {{code_block('user-guide/transformations/joins','cartesian-product',[],['join'],['cross_join'])}} diff --git a/docs/source/user-guide/transformations/pivot.md b/docs/source/user-guide/transformations/pivot.md index 161bb3f6d78e..417b053a02d5 100644 --- a/docs/source/user-guide/transformations/pivot.md +++ b/docs/source/user-guide/transformations/pivot.md @@ -9,9 +9,8 @@ Pivot a column in a `DataFrame` and perform one of the following aggregations: - mean - median -The pivot operation consists of a group by one, or multiple columns (these will be the -new y-axis), the column that will be pivoted (this will be the new x-axis) and an -aggregation. +The pivot operation consists of a group by one, or multiple columns (these will be the new y-axis), +the column that will be pivoted (this will be the new x-axis) and an aggregation. ## Dataset @@ -32,12 +31,12 @@ aggregation. ## Lazy -A Polars `LazyFrame` always need to know the schema of a computation statically (before collecting the query). -As a pivot's output schema depends on the data, and it is therefore impossible to determine the schema without -running the query. +A Polars `LazyFrame` always need to know the schema of a computation statically (before collecting +the query). As a pivot's output schema depends on the data, and it is therefore impossible to +determine the schema without running the query. -Polars could have abstracted this fact for you just like Spark does, but we don't want you to shoot yourself in the foot -with a shotgun. The cost should be clear upfront. +Polars could have abstracted this fact for you just like Spark does, but we don't want you to shoot +yourself in the foot with a shotgun. The cost should be clear upfront. {{code_block('user-guide/transformations/pivot','lazy',['pivot'])}} diff --git a/docs/source/user-guide/transformations/time-series/filter.md b/docs/source/user-guide/transformations/time-series/filter.md index 1f57d8866fbd..45c30a89cbf7 100644 --- a/docs/source/user-guide/transformations/time-series/filter.md +++ b/docs/source/user-guide/transformations/time-series/filter.md @@ -1,8 +1,10 @@ # Filtering -Filtering date columns works in the same way as with other types of columns using the `.filter` method. 
+Filtering date columns works in the same way as with other types of columns using the `.filter` +method. -Polars uses Python's native `datetime`, `date` and `timedelta` for equality comparisons between the datatypes `pl.Datetime`, `pl.Date` and `pl.Duration`. +Polars uses Python's native `datetime`, `date` and `timedelta` for equality comparisons between the +datatypes `pl.Datetime`, `pl.Date` and `pl.Duration`. In the following example we use a time series of Apple stock prices. @@ -14,8 +16,8 @@ In the following example we use a time series of Apple stock prices. ## Filtering by single dates -We can filter by a single date by casting the desired date string to a `Date` object -in a filter expression: +We can filter by a single date by casting the desired date string to a `Date` object in a filter +expression: {{code_block('user-guide/transformations/time-series/filter','filter',['filter'])}} @@ -27,7 +29,8 @@ Note we are using the lowercase `datetime` method rather than the uppercase `Dat ## Filtering by a date range -We can filter by a range of dates using the `is_between` method in a filter expression with the start and end dates: +We can filter by a range of dates using the `is_between` method in a filter expression with the +start and end dates: {{code_block('user-guide/transformations/time-series/filter','range',['filter','is_between'])}} @@ -37,9 +40,9 @@ We can filter by a range of dates using the `is_between` method in a filter expr ## Filtering with negative dates -Say you are working with an archeologist and are dealing in negative dates. -Polars can parse and store them just fine, but the Python `datetime` library -does not. So for filtering, you should use attributes in the `.dt` namespace: +Say you are working with an archeologist and are dealing in negative dates. Polars can parse and +store them just fine, but the Python `datetime` library does not. So for filtering, you should use +attributes in the `.dt` namespace: {{code_block('user-guide/transformations/time-series/filter','negative',['str.to_date'])}} diff --git a/docs/source/user-guide/transformations/time-series/parsing.md b/docs/source/user-guide/transformations/time-series/parsing.md index fa67895736b6..df91e9d022bb 100644 --- a/docs/source/user-guide/transformations/time-series/parsing.md +++ b/docs/source/user-guide/transformations/time-series/parsing.md @@ -1,19 +1,24 @@ # Parsing -Polars has native support for parsing time series data and doing more sophisticated operations such as temporal grouping and resampling. +Polars has native support for parsing time series data and doing more sophisticated operations such +as temporal grouping and resampling. ## Datatypes Polars has the following datetime datatypes: -- `Date`: Date representation e.g. 2014-07-08. It is internally represented as days since UNIX epoch encoded by a 32-bit signed integer. -- `Datetime`: Datetime representation e.g. 2014-07-08 07:00:00. It is internally represented as a 64 bit integer since the Unix epoch and can have different units such as ns, us, ms. -- `Duration`: A time delta type that is created when subtracting `Date/Datetime`. Similar to `timedelta` in Python. +- `Date`: Date representation e.g. 2014-07-08. It is internally represented as days since UNIX epoch + encoded by a 32-bit signed integer. +- `Datetime`: Datetime representation e.g. 2014-07-08 07:00:00. It is internally represented as a 64 + bit integer since the Unix epoch and can have different units such as ns, us, ms. 
+- `Duration`: A time delta type that is created when subtracting `Date/Datetime`. Similar to + `timedelta` in Python. - `Time`: Time representation, internally represented as nanoseconds since midnight. ## Parsing dates from a file -When loading from a CSV file Polars attempts to parse dates and times if the `try_parse_dates` flag is set to `True`: +When loading from a CSV file Polars attempts to parse dates and times if the `try_parse_dates` flag +is set to `True`: {{code_block('user-guide/transformations/time-series/parsing','df',['read_csv'])}} @@ -26,7 +31,8 @@ On the other hand binary formats such as parquet have a schema that is respected ## Casting strings to dates -You can also cast a column of datetimes encoded as strings to a datetime type. You do this by calling the string `str.to_date` method and passing the format of the date string: +You can also cast a column of datetimes encoded as strings to a datetime type. You do this by +calling the string `str.to_date` method and passing the format of the date string: {{code_block('user-guide/transformations/time-series/parsing','cast',['read_csv','str.to_date'])}} @@ -38,7 +44,8 @@ You can also cast a column of datetimes encoded as strings to a datetime type. Y ## Extracting date features from a date column -You can extract data features such as the year or day from a date column using the `.dt` namespace on a date column: +You can extract data features such as the year or day from a date column using the `.dt` namespace +on a date column: {{code_block('user-guide/transformations/time-series/parsing','extract',['dt.year'])}} @@ -48,8 +55,8 @@ You can extract data features such as the year or day from a date column using t ## Mixed offsets -If you have mixed offsets (say, due to crossing daylight saving time), -then you can use `utc=True` and then convert to your time zone: +If you have mixed offsets (say, due to crossing daylight saving time), then you can use `utc=True` +and then convert to your time zone: {{code_block('user-guide/transformations/time-series/parsing','mixed',['str.to_datetime','dt.convert_time_zone'])}} diff --git a/docs/source/user-guide/transformations/time-series/resampling.md b/docs/source/user-guide/transformations/time-series/resampling.md index 97f3ba36cd71..e82abe3f1d2f 100644 --- a/docs/source/user-guide/transformations/time-series/resampling.md +++ b/docs/source/user-guide/transformations/time-series/resampling.md @@ -8,7 +8,9 @@ We can resample by either: ## Downsampling to a lower frequency -Polars views downsampling as a special case of the **group_by** operation and you can do this with `group_by_dynamic` and `group_by_rolling` - [see the temporal group by page for examples](rolling.md). +Polars views downsampling as a special case of the **group_by** operation and you can do this with +`group_by_dynamic` and `group_by_rolling` - +[see the temporal group by page for examples](rolling.md). ## Upsampling to a higher frequency @@ -21,11 +23,14 @@ Let's go through an example where we generate data at 30 minute intervals: --8<-- "python/user-guide/transformations/time-series/resampling.py:df" ``` -Upsampling can be done by defining the new sampling interval. By upsampling we are adding in extra rows where we do not have data. As such upsampling by itself gives a DataFrame with nulls. These nulls can then be filled with a fill strategy or interpolation. +Upsampling can be done by defining the new sampling interval. By upsampling we are adding in extra +rows where we do not have data. 
As such upsampling by itself gives a DataFrame with nulls. These
+nulls can then be filled with a fill strategy or interpolation.

 ### Upsampling strategies

-In this example we upsample from the original 30 minutes to 15 minutes and then use a `forward` strategy to replace the nulls with the previous non-null value:
+In this example we upsample from the original 30 minutes to 15 minutes and then use a `forward`
+strategy to replace the nulls with the previous non-null value:

 {{code_block('user-guide/transformations/time-series/resampling','upsample',['upsample'])}}

diff --git a/docs/source/user-guide/transformations/time-series/rolling.md b/docs/source/user-guide/transformations/time-series/rolling.md
index e669ac92121e..126a68290900 100644
--- a/docs/source/user-guide/transformations/time-series/rolling.md
+++ b/docs/source/user-guide/transformations/time-series/rolling.md
@@ -2,11 +2,13 @@

 ## Grouping by fixed windows

-We can calculate temporal statistics using `group_by_dynamic` to group rows into days/months/years etc.
+We can calculate temporal statistics using `group_by_dynamic` to group rows into days/months/years
+etc.

 ### Annual average example

-In following simple example we calculate the annual average closing price of Apple stock prices. We first load the data from CSV:
+In the following simple example we calculate the annual average closing price of Apple stock. We
+first load the data from CSV:

 {{code_block('user-guide/transformations/time-series/rolling','df',['upsample'])}}

@@ -40,17 +42,21 @@ A dynamic window is defined by a:
 - **period**: indicates the duration of the window
 - **offset**: can be used to offset the start of the windows

-The value for `every` sets how often the groups start. The time period values are flexible - for example we could take:
+The value for `every` sets how often the groups start. The time period values are flexible - for
+example we could take:

 - the average over 2 year intervals by replacing `1y` with `2y`
 - the average over 18 month periods by replacing `1y` with `1y6mo`

-We can also use the `period` parameter to set how long the time period for each group is. For example, if we set the `every` parameter to be `1y` and the `period` parameter to be `2y` then we would get groups at one year intervals where each groups spanned two years.
+We can also use the `period` parameter to set how long the time period for each group is. For
+example, if we set the `every` parameter to be `1y` and the `period` parameter to be `2y` then we
+would get groups at one-year intervals where each group spans two years.

-If the `period` parameter is not specified then it is set equal to the `every` parameter so that if the `every` parameter is set to be `1y` then each group spans `1y` as well.
+If the `period` parameter is not specified then it is set equal to the `every` parameter so that if
+the `every` parameter is set to be `1y` then each group spans `1y` as well.

-Because _**every**_ does not have to be equal to _**period**_, we can create many groups in a very flexible way. They may overlap
-or leave boundaries between them.
+Because _**every**_ does not have to be equal to _**period**_, we can create many groups in a very
+flexible way. They may overlap or leave boundaries between them.

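+As a small hypothetical sketch of how `every` and `period` interact (toy data, not the guide's
+Apple stock example), the following starts a new window every year while each window spans two
+years, so consecutive windows overlap:
+
+```python
+from datetime import date
+
+import polars as pl
+
+# One row per day over four years of toy data.
+df = pl.DataFrame(
+    {"time": pl.date_range(date(2020, 1, 1), date(2023, 12, 31), interval="1d", eager=True)}
+)
+df = df.with_columns(value=pl.int_range(pl.len())).sort("time")
+
+# Windows start every `1y` but each covers `2y`, so they overlap.
+result = df.group_by_dynamic("time", every="1y", period="2y").agg(
+    pl.col("value").mean().alias("mean_value"),
+    pl.len().alias("days_in_window"),
+)
+print(result)
+```
+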
 Let's see how the windows for some parameter combinations would look. Let's start out boring. 🥱

@@ -87,16 +93,24 @@ data points that in these gaps will not be a member of any group

 #### `truncate`

-The `truncate` parameter is a Boolean variable that determines what datetime value is associated with each group in the output. In the example above the first data point is on 23rd February 1981. If `truncate = True` (the default) then the date for the first year in the annual average is 1st January 1981. However, if `truncate = False` then the date for the first year in the annual average is the date of the first data point on 23rd February 1981. Note that `truncate` only affects what's shown in the
-`Date` column and does not affect the window boundaries.
+The `truncate` parameter is a Boolean variable that determines what datetime value is associated
+with each group in the output. In the example above the first data point is on 23rd February 1981.
+If `truncate = True` (the default) then the date for the first year in the annual average is 1st
+January 1981. However, if `truncate = False` then the date for the first year in the annual average
+is the date of the first data point on 23rd February 1981. Note that `truncate` only affects what's
+shown in the `Date` column and does not affect the window boundaries.

 ### Using expressions in `group_by_dynamic`

-We aren't restricted to using simple aggregations like `mean` in a group by operation - we can use the full range of expressions available in Polars.
+We aren't restricted to using simple aggregations like `mean` in a group by operation - we can use
+the full range of expressions available in Polars.

-In the snippet below we create a `date range` with every **day** (`"1d"`) in 2021 and turn this into a `DataFrame`.
+In the snippet below we create a `date range` with every **day** (`"1d"`) in 2021 and turn this into
+a `DataFrame`.

-Then in the `group_by_dynamic` we create dynamic windows that start every **month** (`"1mo"`) and have a window length of `1` month. The values that match these dynamic windows are then assigned to that group and can be aggregated with the powerful expression API.
+Then in the `group_by_dynamic` we create dynamic windows that start every **month** (`"1mo"`) and
+have a window length of `1` month. The values that match these dynamic windows are then assigned to
+that group and can be aggregated with the powerful expression API.

 Below we show an example where we use **group_by_dynamic** to compute:

@@ -111,11 +125,13 @@ Below we show an example where we use **group_by_dynamic** to compute:

 ## Grouping by rolling windows

-The rolling operation, `rolling`, is another entrance to the `group_by`/`agg` context. But different from the `group_by_dynamic` where the windows are fixed by a parameter `every` and `period`. In a `rolling`, the windows are not fixed at all! They are determined
-by the values in the `index_column`.
+The rolling operation, `rolling`, is another entry point into the `group_by`/`agg` context. But
+unlike `group_by_dynamic`, where the windows are fixed by the parameters `every` and `period`, in a
+`rolling` the windows are not fixed at all! They are determined by the values in the
+`index_column`.

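+Before looking at how those windows line up, here is a minimal hypothetical sketch of a `rolling`
+call (toy data, not the guide's dataset), where each row gets its own window of length `period`:
+
+```python
+from datetime import date
+
+import polars as pl
+
+# Toy data: one row per (unevenly spaced) date.
+df = pl.DataFrame(
+    {
+        "time": [date(2021, 1, 1), date(2021, 1, 6), date(2021, 1, 10)],
+        "value": [1, 2, 3],
+    }
+).sort("time")
+
+# One window per row; each window has length `period` (5 days here) and is anchored
+# at that row's `time` value.
+result = df.rolling(index_column="time", period="5d").agg(
+    pl.col("value").sum().alias("sum_5d")
+)
+print(result)
+```
+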
-So imagine having a time column with the values `{2021-01-06, 2021-01-10}` and a `period="5d"` this would create the following -windows: +So imagine having a time column with the values `{2021-01-06, 2021-01-10}` and a `period="5d"` this +would create the following windows: ```text 2021-01-01 2021-01-06 @@ -125,8 +141,8 @@ windows: |----------| ``` -Because the windows of a rolling group by are always determined by the values in the `DataFrame` column, the number of -groups is always equal to the original `DataFrame`. +Because the windows of a rolling group by are always determined by the values in the `DataFrame` +column, the number of groups is always equal to the original `DataFrame`. ## Combining group by operations diff --git a/docs/source/user-guide/transformations/time-series/timezones.md b/docs/source/user-guide/transformations/time-series/timezones.md index de5046d4cafd..25e6c873c50b 100644 --- a/docs/source/user-guide/transformations/time-series/timezones.md +++ b/docs/source/user-guide/transformations/time-series/timezones.md @@ -9,21 +9,21 @@ hide: You really should never, ever deal with time zones if you can help it. -The `Datetime` datatype can have a time zone associated with it. -Examples of valid time zones are: +The `Datetime` datatype can have a time zone associated with it. Examples of valid time zones are: - `None`: no time zone, also known as "time zone naive". - `UTC`: Coordinated Universal Time. -- `Asia/Kathmandu`: time zone in "area/location" format. - See the [list of tz database time zones](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) - to see what's available. +- `Asia/Kathmandu`: time zone in "area/location" format. See the + [list of tz database time zones](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) to + see what's available. -Caution: Fixed offsets such as +02:00, should not be used for handling time zones. It's advised to use the "Area/Location" format mentioned above, as it can manage timezones more effectively. +Caution: Fixed offsets such as +02:00, should not be used for handling time zones. It's advised to +use the "Area/Location" format mentioned above, as it can manage timezones more effectively. -Note that, because a `Datetime` can only have a single time zone, it is -impossible to have a column with multiple time zones. If you are parsing data -with multiple offsets, you may want to pass `utc=True` to convert -them all to a common time zone (`UTC`), see [parsing dates and times](parsing.md). +Note that, because a `Datetime` can only have a single time zone, it is impossible to have a column +with multiple time zones. If you are parsing data with multiple offsets, you may want to pass +`utc=True` to convert them all to a common time zone (`UTC`), see +[parsing dates and times](parsing.md). The main methods for setting and converting between time zones are: