Skip to content

Commit

Permalink
Add to_char function implementation using chrono formats (#9181)
Browse files Browse the repository at this point in the history
* initial to_char impl #9147

* fleshed out to_char impl, added tests and docs.

* adding sqllogictests

* adding time support, improved tests in datetime_expressions.rs, fixed scalar doc example

* Doc fix.

* Prettier

* Update datafusion/physical-expr/src/datetime_expressions.rs

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>

* Updating the chrono dependency since Chrono 0.4.34 changed Display for the Duration type (which is now an alias for TimeDelta) and this broke the timestamp tests.

* Updates and fixes based on PR feedback.

* Add escape to rustdoc.

* Fixed example

* Handled null formats better - now any null formats will result in a default format being used.

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
  • Loading branch information
Omega359 and alamb authored Feb 13, 2024
1 parent 60fbca9 commit 196b718
Show file tree
Hide file tree
Showing 19 changed files with 1,078 additions and 8 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ arrow-string = { version = "50.0.0", default-features = false }
async-trait = "0.1.73"
bigdecimal = "0.4.1"
bytes = "1.4"
chrono = { version = "0.4.31", default-features = false }
chrono = { version = "0.4.34", default-features = false }
ctor = "0.2.0"
dashmap = "5.4.0"
datafusion = { path = "datafusion/core", version = "35.0.0" }
Expand Down
4 changes: 2 additions & 2 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions datafusion-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ cargo run --example csv_sql
- [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files via HTTP
- [`regexp.rs`](examples/regexp.rs): Examples of using regular expression functions
- [`rewrite_expr.rs`](examples/rewrite_expr.rs): Define and invoke a custom Query Optimizer pass
- [`to_char.rs`](examples/to_char.rs): Examples of using the to_char function
- [`to_timestamp.rs`](examples/to_timestamp.rs): Examples of using to_timestamp functions
- [`simple_udf.rs`](examples/simple_udf.rs): Define and invoke a User Defined Scalar Function (UDF)
- [`advanced_udf.rs`](examples/advanced_udf.rs): Define and invoke a more complicated User Defined Scalar Function (UDF)
Expand Down
197 changes: 197 additions & 0 deletions datafusion-examples/examples/to_char.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::array::Date32Array;
use datafusion::arrow::array::StringArray;
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::assert_batches_eq;
use datafusion::error::Result;
use datafusion::prelude::*;
use std::sync::Arc;

/// This example demonstrates how to use the `to_char` function via SQL.
///
/// `to_char` accepts a date, time, timestamp or duration value as its first
/// argument and a string format as its second. Date/time values are formatted
/// with chrono `strftime`-style patterns; durations only support the
/// `'pretty'` and `'iso8601'` formats.
///
/// Every query result is compared against the pretty-printed table that is
/// expected, so the expected rows must be padded to the full column width
/// exactly as the table formatter emits them.
///
/// # Errors
///
/// Returns an error if the record batch cannot be built, the table cannot be
/// registered, or any of the queries fails to plan or execute.
#[tokio::main]
async fn main() -> Result<()> {
    let schema = Arc::new(Schema::new(vec![
        Field::new("values", DataType::Date32, false),
        Field::new("patterns", DataType::Utf8, false),
    ]));

    // Date32 values are days since the UNIX epoch: 18506 is 2020-09-01.
    let batch = RecordBatch::try_new(
        schema,
        vec![
            Arc::new(Date32Array::from(vec![18506, 18507, 18508, 18509])),
            Arc::new(StringArray::from(vec![
                "%Y-%m-%d", "%Y:%m:%d", "%Y%m%d", "%d-%m-%Y",
            ])),
        ],
    )?;

    // declare a new context. In spark API, this corresponds to a new spark SQLsession
    let ctx = SessionContext::new();

    // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
    ctx.register_batch("t", batch)?;
    // look the table up once so a failed registration surfaces here rather
    // than inside the first query
    let _ = ctx.table("t").await?;

    // use to_char function to convert col 'values' to a string using the
    // patterns stored in col 'patterns'
    let result = ctx
        .sql("SELECT to_char(values, patterns) from t")
        .await?
        .collect()
        .await?;

    assert_batches_eq!(
        &[
            "+------------------------------+",
            "| to_char(t.values,t.patterns) |",
            "+------------------------------+",
            "| 2020-09-01                   |",
            "| 2020:09:02                   |",
            "| 20200903                     |",
            "| 04-09-2020                   |",
            "+------------------------------+",
        ],
        &result
    );

    // the date_format alias for the to_char function can be used as well;
    // the output column is still named after to_char
    let result = ctx
        .sql("SELECT date_format(values, patterns) from t")
        .await?
        .collect()
        .await?;

    assert_batches_eq!(
        &[
            "+------------------------------+",
            "| to_char(t.values,t.patterns) |",
            "+------------------------------+",
            "| 2020-09-01                   |",
            "| 2020:09:02                   |",
            "| 20200903                     |",
            "| 04-09-2020                   |",
            "+------------------------------+",
        ],
        &result
    );

    // use to_char function to convert col 'values' with a fixed format
    let result = ctx
        .sql("SELECT to_char(values, '%m-%d-%Y') FROM t")
        .await?
        .collect()
        .await?;

    assert_batches_eq!(
        &[
            "+------------------------------------+",
            "| to_char(t.values,Utf8(\"%m-%d-%Y\")) |",
            "+------------------------------------+",
            "| 09-01-2020                         |",
            "| 09-02-2020                         |",
            "| 09-03-2020                         |",
            "| 09-04-2020                         |",
            "+------------------------------------+",
        ],
        &result
    );

    // if you want to just use the default format cast to a string
    let result = ctx
        .sql("SELECT arrow_cast(values, 'Utf8') from t")
        .await?
        .collect()
        .await?;

    assert_batches_eq!(
        &[
            "+------------+",
            "| t.values   |",
            "+------------+",
            "| 2020-09-01 |",
            "| 2020-09-02 |",
            "| 2020-09-03 |",
            "| 2020-09-04 |",
            "+------------+",
        ],
        &result
    );

    // you can use literals as well (note the use of timestamp here)
    let result = ctx
        .sql("SELECT to_char(arrow_cast(TIMESTAMP '2023-08-03 14:38:50Z', 'Timestamp(Second, None)'), '%d-%m-%Y %H:%M:%S')")
        .await?
        .collect()
        .await?;

    assert_batches_eq!(
        &[
            "+-----------------------------------------------------------------+",
            "| to_char(Utf8(\"2023-08-03 14:38:50Z\"),Utf8(\"%d-%m-%Y %H:%M:%S\")) |",
            "+-----------------------------------------------------------------+",
            "| 03-08-2023 14:38:50                                             |",
            "+-----------------------------------------------------------------+",
        ],
        &result
    );

    // durations are supported though the output format is limited to two formats
    // 'pretty' and 'ISO8601'
    let result = ctx
        .sql("SELECT to_char(arrow_cast(123456, 'Duration(Second)'), 'pretty')")
        .await?
        .collect()
        .await?;

    assert_batches_eq!(
        &[
            "+---------------------------------------+",
            "| to_char(Int64(123456),Utf8(\"pretty\")) |",
            "+---------------------------------------+",
            "| 1 days 10 hours 17 mins 36 secs       |",
            "+---------------------------------------+",
        ],
        &result
    );

    // the same duration rendered with the 'iso8601' format
    let result = ctx
        .sql("SELECT to_char(arrow_cast(123456, 'Duration(Second)'), 'iso8601')")
        .await?
        .collect()
        .await?;

    assert_batches_eq!(
        &[
            "+----------------------------------------+",
            "| to_char(Int64(123456),Utf8(\"iso8601\")) |",
            "+----------------------------------------+",
            "| PT123456S                              |",
            "+----------------------------------------+",
        ],
        &result
    );

    Ok(())
}
2 changes: 1 addition & 1 deletion datafusion/execution/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ path = "src/lib.rs"

[dependencies]
arrow = { workspace = true }
chrono = { version = "0.4", default-features = false }
chrono = { workspace = true }
dashmap = { workspace = true }
datafusion-common = { workspace = true }
datafusion-expr = { workspace = true }
Expand Down
40 changes: 40 additions & 0 deletions datafusion/expr/src/built_in_function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,8 @@ pub enum BuiltinScalarFunction {
SubstrIndex,
/// find_in_set
FindInSet,
/// to_char
ToChar,
}

/// Maps the sql function name to `BuiltinScalarFunction`
Expand Down Expand Up @@ -479,6 +481,7 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Strpos => Volatility::Immutable,
BuiltinScalarFunction::Substr => Volatility::Immutable,
BuiltinScalarFunction::ToHex => Volatility::Immutable,
BuiltinScalarFunction::ToChar => Volatility::Immutable,
BuiltinScalarFunction::ToTimestamp => Volatility::Immutable,
BuiltinScalarFunction::ToTimestampMillis => Volatility::Immutable,
BuiltinScalarFunction::ToTimestampMicros => Volatility::Immutable,
Expand Down Expand Up @@ -799,6 +802,7 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::FindInSet => {
utf8_to_int_type(&input_expr_types[0], "find_in_set")
}
BuiltinScalarFunction::ToChar => Ok(Utf8),
BuiltinScalarFunction::ToTimestamp
| BuiltinScalarFunction::ToTimestampNanos => Ok(Timestamp(Nanosecond, None)),
BuiltinScalarFunction::ToTimestampMillis => Ok(Timestamp(Millisecond, None)),
Expand Down Expand Up @@ -1059,6 +1063,41 @@ impl BuiltinScalarFunction {
vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])],
self.volatility(),
),
BuiltinScalarFunction::ToChar => Signature::one_of(
vec![
Exact(vec![Date32, Utf8]),
Exact(vec![Date64, Utf8]),
Exact(vec![Time32(Millisecond), Utf8]),
Exact(vec![Time32(Second), Utf8]),
Exact(vec![Time64(Microsecond), Utf8]),
Exact(vec![Time64(Nanosecond), Utf8]),
Exact(vec![Timestamp(Second, None), Utf8]),
Exact(vec![
Timestamp(Second, Some(TIMEZONE_WILDCARD.into())),
Utf8,
]),
Exact(vec![Timestamp(Millisecond, None), Utf8]),
Exact(vec![
Timestamp(Millisecond, Some(TIMEZONE_WILDCARD.into())),
Utf8,
]),
Exact(vec![Timestamp(Microsecond, None), Utf8]),
Exact(vec![
Timestamp(Microsecond, Some(TIMEZONE_WILDCARD.into())),
Utf8,
]),
Exact(vec![Timestamp(Nanosecond, None), Utf8]),
Exact(vec![
Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())),
Utf8,
]),
Exact(vec![Duration(Second), Utf8]),
Exact(vec![Duration(Millisecond), Utf8]),
Exact(vec![Duration(Microsecond), Utf8]),
Exact(vec![Duration(Nanosecond), Utf8]),
],
self.volatility(),
),
BuiltinScalarFunction::ToTimestamp
| BuiltinScalarFunction::ToTimestampSeconds
| BuiltinScalarFunction::ToTimestampMillis
Expand Down Expand Up @@ -1517,6 +1556,7 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::DateBin => &["date_bin"],
BuiltinScalarFunction::DateTrunc => &["date_trunc", "datetrunc"],
BuiltinScalarFunction::DatePart => &["date_part", "datepart"],
BuiltinScalarFunction::ToChar => &["to_char", "date_format"],
BuiltinScalarFunction::ToTimestamp => &["to_timestamp"],
BuiltinScalarFunction::ToTimestampMillis => &["to_timestamp_millis"],
BuiltinScalarFunction::ToTimestampMicros => &["to_timestamp_micros"],
Expand Down
6 changes: 6 additions & 0 deletions datafusion/expr/src/expr_fn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -890,6 +890,12 @@ nary_scalar_expr!(
scalar_expr!(DatePart, date_part, part date, "extracts a subfield from the date");
scalar_expr!(DateTrunc, date_trunc, part date, "truncates the date to a specified level of precision");
scalar_expr!(DateBin, date_bin, stride source origin, "coerces an arbitrary timestamp to the start of the nearest specified interval");
scalar_expr!(
ToChar,
to_char,
datetime format,
"converts a date, time, timestamp or duration to a string based on the provided format"
);
nary_scalar_expr!(
ToTimestamp,
to_timestamp,
Expand Down
4 changes: 4 additions & 0 deletions datafusion/physical-expr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,7 @@ name = "to_timestamp"
[[bench]]
harness = false
name = "regexp"

[[bench]]
harness = false
name = "to_char"
Loading

0 comments on commit 196b718

Please sign in to comment.