From 5a28d2a1ede2f1dd492aa4793d2d35e069104ddb Mon Sep 17 00:00:00 2001
From: Robin
Date: Thu, 6 Mar 2025 08:47:14 +0100
Subject: [PATCH] docs: Update Polars Cloud interactive workflow examples (#21609)

---
 .../polars-cloud/run/interactive-batch.md    | 32 +++++++++----------
 docs/source/polars-cloud/run/workflow.md     | 24 ++++++--------
 .../python/polars-cloud/interactive-batch.py |  5 ++-
 3 files changed, 27 insertions(+), 34 deletions(-)

diff --git a/docs/source/polars-cloud/run/interactive-batch.md b/docs/source/polars-cloud/run/interactive-batch.md
index bc22c5126a0e..f65045257a02 100644
--- a/docs/source/polars-cloud/run/interactive-batch.md
+++ b/docs/source/polars-cloud/run/interactive-batch.md
@@ -50,21 +50,25 @@ The initial query remains the same. In the compute context the parameter `intera
 to `True`.
 
 When calling `.collect()` on your remote query execution, the output is written to a temporary
-location. These intermediate result files are automatically deleted after several hours. You can see
-the location of this folder when printing the query results.
+location. These intermediate result files are automatically deleted after several hours. The output
+of the remote query is a LazyFrame.
 
-Users can call `.lazy()` on the result and continue working with the previous result.
+```python
+print(type(res1))
+```
+
+```
+<class 'polars.lazyframe.frame.LazyFrame'>
+```
+
+If you want to inspect the results you can call collect again.
 
 ```python
-print(res1)
+print(res1.collect())
 ```
 
 ```text
-total_stages: 1
-finished_stages: 1
-total_rows: 4
-location: ['s3://polars-cloud-/query_outputs//.parquet']
-head:
+shape: (4, 3)
 ┌────────────────┬────────────┬───────────┐
 │ name           ┆ birth_year ┆ bmi       │
 │ ---            ┆ ---        ┆ ---       │
@@ -77,20 +81,16 @@
 └────────────────┴────────────┴───────────┘
 ```
 
-To continue your query we can read the result to a new LazyFrame and continue your exploration.
+To continue your exploration you can use the returned LazyFrame to build another query.
 
 {{code_block('polars-cloud/interactive-batch','interactive-next',[])}}
 
 ```python
-print(res2)
+print(res2.collect())
 ```
 
 ```text
-total_stages: 1
-finished_stages: 1
-total_rows: 2
-location: ['s3://polars-cloud-/query_outputs//.parquet']
-head:
+shape: (2, 3)
 ┌──────────────┬────────────┬───────────┐
 │ name         ┆ birth_year ┆ bmi       │
 │ ---          ┆ ---        ┆ ---       │
diff --git a/docs/source/polars-cloud/run/workflow.md b/docs/source/polars-cloud/run/workflow.md
index d767fa78ee2d..cdb77ef9e6c7 100644
--- a/docs/source/polars-cloud/run/workflow.md
+++ b/docs/source/polars-cloud/run/workflow.md
@@ -18,7 +18,7 @@ demonstrate the workflow. Here we will create the LazyFrame ourselves, but it co
 ```python
 import polars as pl
 
-lf = pl.DataFrame(
+lf = pl.LazyFrame(
     {
         "region": [
             "Australia",
@@ -102,8 +102,7 @@ import polars_cloud as pc
 ctx = pc.ComputeContext(
     workspace="environmental-analysis",
     memory=32,
-    cpus=8,
-    cluster_size=4
+    cpus=8
 )
 
 query.remote(ctx).sink_parquet("s3://bucket/result.parquet")
@@ -119,21 +118,16 @@ ctx = pc.ComputeContext(
     workspace="environmental-analysis",
     memory=32,
     cpus=8,
-    cluster_size=4,
     interactive=True,  # set interactive to True
 )
 
-result = query.remote(ctx).collect().await_result()
+result = query.remote(ctx).collect()
 
-print(result)
+print(result.collect())
 ```
 
 ```text
-total_stages: 1
-finished_stages: 1
-total_rows: 4
-location: ['s3://polars-cloud-xxx-xxx-xxx-xxx-eddc267994b8/query_outputs/22079a94-6424-4dc1-b1d7-3ea0dc4cafcc/7e966528-4238-4b34-aa52-6242be285723.parquet']
-head:
+shape: (4, 6)
 ┌───────────────┬─────────────┬──────────┬───────────┬────────────────────┬───────────┐
 │ region        ┆ temperature ┆ humidity ┆ burn_area ┆ vegetation_density ┆ fire_risk │
 │ ---           ┆ ---         ┆ ---      ┆ ---       ┆ ---                ┆ ---       │
@@ -147,15 +141,15 @@
 ```
 
 We can call `.collect()` instead of `.sink_parquet()`. This will store your results to a temporary
-location which can be used to further iterate upon.
To continue on the result from `collect` simply
-call `lazy` and you can get back a `LazyFrame` for further analysis.
+location which can be used to further iterate upon. A LazyFrame is returned that can be used in the
+next steps of the workflow.
 
 ```python
 res2 = (
-    result.lazy()
+    result
     .filter(pl.col("fire_risk") > 1)
     .sink_parquet("s3://bucket/output-interactive.parquet")
 )
 ```
 
-Finally, the results of your interactive workflow can be written to S3.
+The result of your interactive workflow can be written to S3.
diff --git a/docs/source/src/python/polars-cloud/interactive-batch.py b/docs/source/src/python/polars-cloud/interactive-batch.py
index 90d66e372695..c56770be12c8 100644
--- a/docs/source/src/python/polars-cloud/interactive-batch.py
+++ b/docs/source/src/python/polars-cloud/interactive-batch.py
@@ -42,19 +42,18 @@
     (pl.col("weight") / (pl.col("height") ** 2)).alias("bmi"),
 ).sort(by="bmi")
 
-res1 = lf.remote(ctx).collect().await_result()
+res1 = lf.remote(ctx).collect()
 
 # --8<-- [end:interactive]
 
 # --8<-- [start:interactive-next]
 res2 = (
-    res1.lazy()
+    res1
     .filter(
         pl.col("birth_year").is_in([1983, 1985]),
     )
     .remote(ctx)
     .collect()
-    .await_result()
 )
 # --8<-- [end:interactive-next]
"""