diff --git a/docs/source/_build/API_REFERENCE_LINKS.yml b/docs/source/_build/API_REFERENCE_LINKS.yml
index 060629a84b75..2a9bc80237dc 100644
--- a/docs/source/_build/API_REFERENCE_LINKS.yml
+++ b/docs/source/_build/API_REFERENCE_LINKS.yml
@@ -38,6 +38,9 @@ python:
   Expr.list:
     name: "list namespace"
     link: https://docs.pola.rs/api/python/stable/reference/expressions/list.html
+  Expr.str:
+    name: "str namespace"
+    link: https://docs.pola.rs/api/python/stable/reference/expressions/string.html
   element: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.element.html
   all: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.all.html
   exclude: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.exclude.html
@@ -156,6 +159,7 @@ python:
   Expr.name:
     name: "name namespace"
     link: https://docs.pola.rs/api/python/stable/reference/expressions/name.html
+  round: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.round.html#polars.Expr.round
 
 rust:
   DataFrame: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html
@@ -293,7 +297,10 @@ rust:
   n_unique: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.n_unique
   null_count: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.null_count
   interpolate: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.interpolate
-  is_between: https://github.com/pola-rs/polars/issues/11285
+  is_between:
+    name: is_between
+    link: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.is_between
+    feature_flags: [is_between]
   is_duplicated: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.is_duplicated
   is_null: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.is_null
   value_counts:
@@ -304,6 +311,10 @@ rust:
   Expr.list:
     name: "list namespace"
     link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ListNameSpace.html
+  Expr.str:
+    name: "str namespace"
+    link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.StringNameSpaceImpl.html
+    feature_flags: [strings]
   Series.arr: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ArrayNameSpace.html
 
   date_range:
@@ -395,3 +406,7 @@ rust:
     name: "dt namespace"
     link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/dt/struct.DateLikeNameSpace.html
     feature_flags: [temporal]
+  round:
+    name: "round"
+    link: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.round
+    feature_flags: [round_series]
diff --git a/docs/source/src/python/user-guide/getting-started/expressions.py b/docs/source/src/python/user-guide/getting-started/expressions.py
index 5d1632051675..d68207ebd60d 100644
--- a/docs/source/src/python/user-guide/getting-started/expressions.py
+++ b/docs/source/src/python/user-guide/getting-started/expressions.py
@@ -28,7 +28,7 @@
 # --8<-- [start:select]
 result = df.select(
     pl.col("name"),
-    pl.col("birthdate").dt.year().alias("birth year"),
+    pl.col("birthdate").dt.year().alias("birth_year"),
     (pl.col("weight") / (pl.col("height") ** 2)).alias("bmi"),
 )
 print(result)
@@ -37,15 +37,15 @@
 # --8<-- [start:expression-expansion]
 result = df.select(
     pl.col("name"),
-    (pl.col("weight", "height") * 0.95).round(2).name.suffix(" - 5%"),
+    (pl.col("weight", "height") * 0.95).round(2).name.suffix("-5%"),
 )
 print(result)
 # --8<-- [end:expression-expansion]
 
 # --8<-- [start:with_columns]
 result = df.with_columns(
-    pl.col("birthdate").dt.year().alias("birth year"),
-    (pl.col("weight") / (pl.col("height") ** 2)).alias("bmi"),
+    birth_year=pl.col("birthdate").dt.year(),
+    bmi=pl.col("weight") / (pl.col("height") ** 2),
 )
 print(result)
 # --8<-- [end:with_columns]
@@ -76,8 +76,8 @@
     (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
     maintain_order=True,
 ).agg(
-    pl.len().alias("sample size"),
-    pl.col("weight").mean().round(2).alias("avg weight"),
+    pl.len().alias("sample_size"),
+    pl.col("weight").mean().round(2).alias("avg_weight"),
     pl.col("height").max().alias("tallest"),
 )
 print(result)
@@ -98,7 +98,7 @@
     )
     .agg(
         pl.col("name"),
-        pl.col("weight", "height").mean().round(2).name.prefix("avg "),
+        pl.col("weight", "height").mean().round(2).name.prefix("avg_"),
     )
 )
 print(result)
@@ -131,5 +131,5 @@
     }
 )
 
-print(df.vstack(df3))
+print(pl.concat([df, df3], how="vertical"))
 # --8<-- [end:concat]
diff --git a/docs/source/src/rust/Cargo.toml b/docs/source/src/rust/Cargo.toml
index 3a41cabf2ce6..50647bb70dd4 100644
--- a/docs/source/src/rust/Cargo.toml
+++ b/docs/source/src/rust/Cargo.toml
@@ -28,10 +28,11 @@ required-features = ["polars/lazy", "polars/csv"]
 [[bin]]
 name = "user-guide-getting-started-expressions"
 path = "user-guide/getting-started/expressions.rs"
-required-features = ["polars/lazy"]
+required-features = ["polars/lazy", "polars/temporal", "polars/round_series", "polars/strings"]
 [[bin]]
 name = "user-guide-getting-started-joins"
 path = "user-guide/getting-started/joins.rs"
+required-features = ["polars/polars-ops"]
 [[bin]]
 name = "user-guide-getting-started-reading-writing"
 path = "user-guide/getting-started/reading-writing.rs"
diff --git a/docs/source/src/rust/user-guide/getting-started/expressions.rs b/docs/source/src/rust/user-guide/getting-started/expressions.rs
index 30b334c09cda..362c99b533c9 100644
--- a/docs/source/src/rust/user-guide/getting-started/expressions.rs
+++ b/docs/source/src/rust/user-guide/getting-started/expressions.rs
@@ -1,131 +1,195 @@
-use chrono::prelude::*;
-use polars::prelude::*;
-use rand::Rng;
-
 fn main() -> Result<(), Box<dyn std::error::Error>> {
     // --8<-- [start:df]
-    let mut rng = rand::thread_rng();
-
-    let df: DataFrame = df!(
-        "a" => 0..6,
-        "b"=> (0..6).map(|_| rng.gen::<f64>()).collect::<Vec<f64>>(),
-        "c"=> [
-            NaiveDate::from_ymd_opt(2025, 12, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(),
-            NaiveDate::from_ymd_opt(2025, 12, 2).unwrap().and_hms_opt(0, 0, 0).unwrap(),
-            NaiveDate::from_ymd_opt(2025, 12, 3).unwrap().and_hms_opt(0, 0, 0).unwrap(),
-            NaiveDate::from_ymd_opt(2025, 12, 4).unwrap().and_hms_opt(0, 0, 0).unwrap(),
-            NaiveDate::from_ymd_opt(2025, 12, 5).unwrap().and_hms_opt(0, 0, 0).unwrap(),
-            NaiveDate::from_ymd_opt(2025, 12, 6).unwrap().and_hms_opt(0, 0, 0).unwrap(),
+    use chrono::prelude::*;
+    use polars::prelude::*;
+
+    let mut df: DataFrame = df!(
+        "name" => ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
+        "birthdate" => [
+            NaiveDate::from_ymd_opt(1997, 1, 10).unwrap(),
+            NaiveDate::from_ymd_opt(1985, 2, 15).unwrap(),
+            NaiveDate::from_ymd_opt(1983, 3, 22).unwrap(),
+            NaiveDate::from_ymd_opt(1981, 4, 30).unwrap(),
         ],
-        "d"=> [Some(1.0), Some(2.0), None, Some(-42.), None, Some(3.1415)],
-        "e"=> ["X", "X", "Y", "X", "Z", "Y"],
+        "weight" => [57.9, 72.5, 53.6, 83.1], // (kg)
+        "height" => [1.56, 1.77, 1.65, 1.75], // (m)
     )
     .unwrap();
+    println!("{}", df);
     // --8<-- [end:df]
 
-    // --8<-- [start:select]
-    let out = df.clone().lazy().select([col("c")]).collect()?;
-    println!("{}", out);
-    // --8<-- [end:select]
+    // --8<-- [start:csv]
+    use std::fs::File;
 
-    // --8<-- [start:select2]
-    let out = df.clone().lazy().select([col("a"), col("b")]).collect()?;
- println!("{}", out); - // --8<-- [end:select2] + let mut file = File::create("../../../assets/data/output.csv").expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(true) + .with_separator(b',') + .finish(&mut df)?; + let df_csv = CsvReadOptions::default() + .with_infer_schema_length(None) + .with_has_header(true) + .with_parse_options(CsvParseOptions::default().with_try_parse_dates(true)) + .try_into_reader_with_file_path(Some("../../../assets/data/output.csv".into()))? + .finish()?; + println!("{}", df_csv); + // --8<-- [end:csv] - // --8<-- [start:filter] - let start_date = NaiveDate::from_ymd_opt(2025, 12, 2) - .unwrap() - .and_hms_opt(0, 0, 0) - .unwrap(); - let end_date = NaiveDate::from_ymd_opt(2025, 12, 3) - .unwrap() - .and_hms_opt(0, 0, 0) - .unwrap(); - let out = df + // --8<-- [start:select] + let result = df .clone() .lazy() - .filter( - col("c") - .gt_eq(lit(start_date)) - .and(col("c").lt_eq(lit(end_date))), - ) + .select([ + col("name"), + col("birthdate").dt().year().alias("birth_year"), + (col("weight") / col("height").pow(2)).alias("bmi"), + ]) .collect()?; - println!("{}", out); - // --8<-- [end:filter] + println!("{}", result); + // --8<-- [end:select] - // --8<-- [start:filter2] - let out = df + // --8<-- [start:expression-expansion] + let result = df .clone() .lazy() - .filter(col("a").lt_eq(3).and(col("d").is_not_null())) + .select([ + col("name"), + (cols(["weight", "height"]) * lit(0.95)) + .round(2) + .name() + .suffix("-5%"), + ]) .collect()?; - println!("{}", out); - // --8<-- [end:filter2] + println!("{}", result); + // --8<-- [end:expression-expansion] // --8<-- [start:with_columns] - let out = df + let result = df .clone() .lazy() - .with_columns([(col("b") + lit(42)).alias("b+42")]) + .with_columns([ + col("birthdate").dt().year().alias("birth_year"), + (col("weight") / col("height").pow(2)).alias("bmi"), + ]) .collect()?; - println!("{}", out); + println!("{}", result); // --8<-- [end:with_columns] + // --8<-- [start:filter] + let result = df + .clone() + .lazy() + .filter(col("birthdate").dt().year().lt(lit(1990))) + .collect()?; + println!("{}", result); + // --8<-- [end:filter] + + // --8<-- [start:filter-multiple] + let result = df + .clone() + .lazy() + .filter( + col("birthdate") + .is_between( + lit(NaiveDate::from_ymd_opt(1982, 12, 31).unwrap()), + lit(NaiveDate::from_ymd_opt(1996, 1, 1).unwrap()), + ClosedInterval::Both, + ) + .and(col("height").gt(lit(1.7))), + ) + .collect()?; + println!("{}", result); + // --8<-- [end:filter-multiple] + // --8<-- [start:group_by] - let out = df.clone().lazy().group_by(["e"]).agg([len()]).collect()?; - println!("{}", out); + // Use `group_by_stable` if you want the Python behaviour of `maintain_order=True`. 
+    let result = df
+        .clone()
+        .lazy()
+        .group_by([(col("birthdate").dt().year() / lit(10) * lit(10)).alias("decade")])
+        .agg([len()])
+        .collect()?;
+    println!("{}", result);
     // --8<-- [end:group_by]
 
-    // --8<-- [start:group_by2]
-    let out = df
+    // --8<-- [start:group_by-agg]
+    let result = df
         .clone()
         .lazy()
-        .group_by(["e"])
-        .agg([col("a").max().alias("max_a"), col("b").sum().alias("sum_b")])
+        .group_by([(col("birthdate").dt().year() / lit(10) * lit(10)).alias("decade")])
+        .agg([
+            len().alias("sample_size"),
+            col("weight").mean().round(2).alias("avg_weight"),
+            col("height").max().alias("tallest"),
+        ])
         .collect()?;
-    println!("{}", out);
-    // --8<-- [end:group_by2]
+    println!("{}", result);
+    // --8<-- [end:group_by-agg]
 
     // --8<-- [start:complex]
-    let start_date = NaiveDate::from_ymd_opt(2025, 12, 1)
-        .unwrap()
-        .and_hms_opt(0, 0, 0)
-        .unwrap();
-    let end_date = NaiveDate::from_ymd_opt(2025, 12, 5)
-        .unwrap()
-        .and_hms_opt(0, 0, 0)
-        .unwrap();
-    let out = df
+    let result = df
         .clone()
         .lazy()
-        .filter(
-            col("c")
-                .gt_eq(lit(start_date))
-                .and(col("c").lt_eq(lit(end_date))),
-        )
-        .group_by(["e"])
+        .with_columns([
+            (col("birthdate").dt().year() / lit(10) * lit(10)).alias("decade"),
+            col("name").str().split(lit(" ")).list().first(),
+        ])
+        .select([all().exclude(["birthdate"])])
+        .group_by([col("decade")])
         .agg([
-            len().alias("count"),
-            col("a").max().alias("max_a"),
-            col("b").sum().alias("sum_b"),
+            col("name"),
+            cols(["weight", "height"])
+                .mean()
+                .round(2)
+                .name()
+                .prefix("avg_"),
         ])
-        .with_columns([(col("max_a") * col("sum_b")).alias("times")])
-        .select([col("*").exclude(["max_a", "sum_b"])])
         .collect()?;
-    println!("{}", out);
+    println!("{}", result);
     // --8<-- [end:complex]
 
-    // --8<-- [start:csv]
-    // --8<-- [end:csv]
-    // --8<-- [start:expression-expansion]
-    // --8<-- [end:expression-expansion]
-    // --8<-- [start:filter-multiple]
-    // --8<-- [end:filter-multiple]
-    // --8<-- [start:group_by-agg]
-    // --8<-- [end:group_by-agg]
+    // --8<-- [start:join]
+    let df2: DataFrame = df!(
+        "name" => ["Ben Brown", "Daniel Donovan", "Alice Archer", "Chloe Cooper"],
+        "parent" => [true, false, false, false],
+        "siblings" => [1, 2, 3, 4],
+    )
+    .unwrap();
+
+    let result = df
+        .clone()
+        .lazy()
+        .join(
+            df2.clone().lazy(),
+            [col("name")],
+            [col("name")],
+            JoinArgs::new(JoinType::Left),
+        )
+        .collect()?;
+
+    println!("{}", result);
     // --8<-- [end:join]
+    // --8<-- [start:concat]
+    let df3: DataFrame = df!(
+        "name" => ["Ethan Edwards", "Fiona Foster", "Grace Gibson", "Henry Harris"],
+        "birthdate" => [
+            NaiveDate::from_ymd_opt(1977, 5, 10).unwrap(),
+            NaiveDate::from_ymd_opt(1975, 6, 23).unwrap(),
+            NaiveDate::from_ymd_opt(1973, 7, 22).unwrap(),
+            NaiveDate::from_ymd_opt(1971, 8, 3).unwrap(),
+        ],
+        "weight" => [67.9, 72.5, 57.6, 93.1], // (kg)
+        "height" => [1.76, 1.6, 1.66, 1.8], // (m)
+    )
+    .unwrap();
+
+    let result = concat(
+        [df.clone().lazy(), df3.clone().lazy()],
+        UnionArgs::default(),
+    )?
+    .collect()?;
+    println!("{}", result);
     // --8<-- [end:concat]
 
     Ok(())
diff --git a/docs/source/user-guide/getting-started.md b/docs/source/user-guide/getting-started.md
index e5c913b49963..b99335bc085b 100644
--- a/docs/source/user-guide/getting-started.md
+++ b/docs/source/user-guide/getting-started.md
@@ -91,12 +91,15 @@ You can check other sections of the user guide to learn more about [basic operat
 The context `with_columns` is very similar to the context `select` but `with_columns` adds columns to the dataframe instead of selecting them.
 Notice how the resulting dataframe contains the four columns of the original dataframe plus the two new columns introduced by the expressions inside `with_columns`:
 
-{{code_block('user-guide/getting-started/expressions','with_columns',['with_columns','alias'])}}
+{{code_block('user-guide/getting-started/expressions','with_columns',['with_columns'])}}
 
 ```python exec="on" result="text" session="getting-started/expressions"
 --8<-- "python/user-guide/getting-started/expressions.py:with_columns"
 ```
 
+In the example above we also decided to use named expressions instead of the method `alias` to specify the names of the new columns.
+Other contexts like `select` and `group_by` also accept named expressions.
+
 ### `filter`
 
 The context `filter` allows us to create a second dataframe with a subset of the rows of the original one:
@@ -143,7 +146,7 @@ After using the context `group_by` we can use `agg` to compute aggregations over
 Contexts and the expressions within can be chained to create more complex queries according to your needs.
 In the example below we combine some of the contexts we have seen so far to create a more complex query:
 
-{{code_block('user-guide/getting-started/expressions','complex',['group_by','agg','select','with_columns','Expr.list'])}}
+{{code_block('user-guide/getting-started/expressions','complex',['group_by','agg','select','with_columns','Expr.str','Expr.list'])}}
 
 ```python exec="on" result="text" session="getting-started/expressions"
 --8<-- "python/user-guide/getting-started/expressions.py:complex"
@@ -157,7 +160,7 @@ In this section, we show an example of a join and an example of a concatenation.
 ### Joinining dataframes
 
 Polars provides many different join algorithms.
-The example below shows how to use a left inner join to combine two dataframes when a column can be used as a unique identifier to establish a correspondence between rows across the dataframes:
+The example below shows how to use a left outer join to combine two dataframes when a column can be used as a unique identifier to establish a correspondence between rows across the dataframes:
 
 {{code_block('user-guide/getting-started/expressions','join',['join'])}}
 
@@ -172,7 +175,7 @@ Polars provides many different join algorithms that you can learn about in the [
 Concatenating dataframes creates a taller or wider dataframe, depending on the method used.
 Assuming we have a second dataframe with data from other people, we could use vertical concatenation to create a taller dataframe:
 
-{{code_block('user-guide/getting-started/expressions','concat',['vstack'])}}
+{{code_block('user-guide/getting-started/expressions','concat',['concat'])}}
 
 ```python exec="on" result="text" session="getting-started/expressions"
 --8<-- "python/user-guide/getting-started/expressions.py:concat"
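For readers reviewing the named-expression change, a minimal Python sketch (not part of the patch; the values are copied from the example frame used above) of how `with_columns` keyword arguments compare to an explicit `alias`:

```python
import polars as pl

df = pl.DataFrame({"weight": [57.9, 72.5], "height": [1.56, 1.77]})

# Named expression: the keyword `bmi` names the new column.
named = df.with_columns(bmi=pl.col("weight") / (pl.col("height") ** 2))

# Equivalent spelling with `alias`.
aliased = df.with_columns((pl.col("weight") / (pl.col("height") ** 2)).alias("bmi"))

print(named.equals(aliased))  # True
```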
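The join paragraph now says "left outer join"; a small sketch along the same lines (not taken from the patch; names and the sibling count are borrowed from its `df2` example) shows the behaviour that wording describes — every row of the left frame is kept and unmatched rows are filled with nulls:

```python
import polars as pl

people = pl.DataFrame({"name": ["Alice Archer", "Ben Brown"], "height": [1.56, 1.77]})
extra = pl.DataFrame({"name": ["Alice Archer"], "siblings": [3]})

# Left join on "name": Ben Brown has no match, so his `siblings` value is null.
print(people.join(extra, on="name", how="left"))
```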
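Finally, the switch from `DataFrame.vstack` to `pl.concat(..., how="vertical")` in the Python snippet can be pictured with this sketch (again not part of the patch): vertical concatenation appends the rows of the second frame below the first when the schemas match:

```python
import polars as pl

df_a = pl.DataFrame({"name": ["Alice Archer"], "height": [1.56]})
df_b = pl.DataFrame({"name": ["Ben Brown"], "height": [1.77]})

# Vertical concatenation produces a taller dataframe with the same columns.
print(pl.concat([df_a, df_b], how="vertical"))
```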