From 73223f13b467c6d54cdfa501e9889d61921b57ee Mon Sep 17 00:00:00 2001
From: ritchie
Date: Thu, 6 Mar 2025 11:58:46 +0100
Subject: [PATCH] docs(python): Document `read_().lazy()` antipattern

---
 py-polars/polars/io/csv/functions.py     | 10 ++++++----
 py-polars/polars/io/ipc/functions.py     |  8 ++++++++
 py-polars/polars/io/ndjson.py            | 11 +++++++++++
 py-polars/polars/io/parquet/functions.py |  9 ++++++++-
 4 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py
index 162c1080f73c..b274eb70a541 100644
--- a/py-polars/polars/io/csv/functions.py
+++ b/py-polars/polars/io/csv/functions.py
@@ -228,6 +228,12 @@ def read_csv(
     --------
     scan_csv : Lazily read from a CSV file or multiple files via glob patterns.
 
+    Warnings
+    --------
+    Calling `read_csv().lazy()` is an antipattern: it forces Polars to materialize
+    the full CSV file, so no optimizations can be pushed down into the reader.
+    Prefer ``scan_csv`` if you want to work with ``LazyFrame``s.
+
     Notes
     -----
     If the schema is inferred incorrectly (e.g. as `pl.Int64` instead of `pl.Float64`),
@@ -235,10 +241,6 @@ def read_csv(
     `infer_schema_length` or override the inferred dtype for those columns with
     `schema_overrides`.
 
-    This operation defaults to a `rechunk` operation at the end, meaning that all data
-    will be stored continuously in memory. Set `rechunk=False` if you are benchmarking
-    the csv-reader. A `rechunk` is an expensive operation.
-
     Examples
     --------
     >>> pl.read_csv("data.csv", separator="|")  # doctest: +SKIP
diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py
index c933728810c9..515150062c23 100644
--- a/py-polars/polars/io/ipc/functions.py
+++ b/py-polars/polars/io/ipc/functions.py
@@ -94,8 +94,16 @@ def read_ipc(
     -------
     DataFrame
 
+    See Also
+    --------
+    scan_ipc : Lazily read from an IPC file or multiple files via glob patterns.
+
     Warnings
     --------
+    Calling `read_ipc().lazy()` is an antipattern: it forces Polars to materialize
+    the full IPC file, so no optimizations can be pushed down into the reader.
+    Prefer ``scan_ipc`` if you want to work with ``LazyFrame``s.
+
     If `memory_map` is set, the bytes on disk are mapped 1:1 to memory.
     That means that you cannot write to the same filename.
     E.g. `pl.read_ipc("my_file.arrow").write_ipc("my_file.arrow")` will fail.
diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py
index a99e4eb4e1f5..39095240ec14 100644
--- a/py-polars/polars/io/ndjson.py
+++ b/py-polars/polars/io/ndjson.py
@@ -117,6 +117,17 @@ def read_ndjson(
     include_file_paths
         Include the path of the source file(s) as a column with this name.
 
+    See Also
+    --------
+    scan_ndjson : Lazily read from an NDJSON file or multiple files via glob patterns.
+
+    Warnings
+    --------
+    Calling `read_ndjson().lazy()` is an antipattern: it forces Polars to
+    materialize the full NDJSON file, so no optimizations can be pushed down
+    into the reader. Prefer ``scan_ndjson`` if you want to work with
+    ``LazyFrame``s.
+
     Examples
     --------
     >>> from io import StringIO
diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py
index e0a0e6ccb951..831f11d12d13 100644
--- a/py-polars/polars/io/parquet/functions.py
+++ b/py-polars/polars/io/parquet/functions.py
@@ -175,7 +175,14 @@ def read_parquet(
 
     See Also
     --------
-    scan_parquet
+    scan_parquet : Lazily read from a Parquet file or multiple files via glob patterns.
     scan_pyarrow_dataset
+
+    Warnings
+    --------
+    Calling `read_parquet().lazy()` is an antipattern: it forces Polars to
+    materialize the full Parquet file, so no optimizations can be pushed down
+    into the reader. Prefer ``scan_parquet`` if you want to work with
+    ``LazyFrame``s.
     """
     if schema is not None:
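
For reference, a minimal sketch of the antipattern these docstrings warn about, next to the preferred lazy scan; the file name `data.csv` and the columns `id` and `value` are illustrative only, not part of the patch:

    import polars as pl

    # Antipattern: read_csv() eagerly materializes the whole CSV in memory,
    # so the later .lazy() cannot push any optimizations into the reader.
    df_eager = (
        pl.read_csv("data.csv")
        .lazy()
        .filter(pl.col("value") > 0)
        .select("id", "value")
        .collect()
    )

    # Preferred: scan_csv() returns a LazyFrame up front, so the filter and
    # projection are pushed down and only the needed rows/columns are read.
    df_lazy = (
        pl.scan_csv("data.csv")
        .filter(pl.col("value") > 0)
        .select("id", "value")
        .collect()
    )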