From 5aeb21874fb4cffc1c13bb899956ef8513e78e78 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 16 Jan 2024 10:23:12 +0000 Subject: [PATCH] feat: convert fixed-offset timezones to respective Etc timezone from time zone database (#13738) --- .../src/chunked_array/temporal/mod.rs | 41 +++++++++++++++++++ crates/polars-core/src/series/from.rs | 12 ++++-- py-polars/tests/unit/interop/test_interop.py | 32 +++++++++++++-- 3 files changed, 78 insertions(+), 7 deletions(-) diff --git a/crates/polars-core/src/chunked_array/temporal/mod.rs b/crates/polars-core/src/chunked_array/temporal/mod.rs index ad2716ff7e28..f761214f85a6 100644 --- a/crates/polars-core/src/chunked_array/temporal/mod.rs +++ b/crates/polars-core/src/chunked_array/temporal/mod.rs @@ -15,6 +15,10 @@ use chrono::NaiveDateTime; use chrono::NaiveTime; #[cfg(feature = "timezones")] use chrono_tz::Tz; +#[cfg(feature = "timezones")] +use once_cell::sync::Lazy; +#[cfg(all(feature = "regex", feature = "timezones"))] +use regex::Regex; #[cfg(feature = "dtype-time")] pub use time::time_to_time64ns; @@ -26,6 +30,18 @@ pub fn unix_time() -> NaiveDateTime { NaiveDateTime::from_timestamp_opt(0, 0).unwrap() } +#[cfg(feature = "timezones")] +static FIXED_OFFSET_PATTERN: &str = r#"(?x) + ^ + (?P[-+])? # optional sign + (?P0[0-9]|1[0-4]) # hour (between 0 and 14) + :? # optional separator + 00 # minute + $ + "#; +#[cfg(feature = "timezones")] +static FIXED_OFFSET_RE: Lazy = Lazy::new(|| Regex::new(FIXED_OFFSET_PATTERN).unwrap()); + #[cfg(feature = "timezones")] pub(crate) fn validate_time_zone(tz: &str) -> PolarsResult<()> { match tz.parse::() { @@ -45,3 +61,28 @@ pub fn parse_time_zone(tz: &str) -> PolarsResult { }, } } + +/// Convert fixed offset to Etc/GMT one from time zone database +/// +/// E.g. +01:00 -> Etc/GMT-1 +/// +/// Note: the sign appears reversed, but is correct, see https://en.wikipedia.org/wiki/Tz_database#Area: +/// > In order to conform with the POSIX style, those zone names beginning with +/// > "Etc/GMT" have their sign reversed from the standard ISO 8601 convention. +/// > In the "Etc" area, zones west of GMT have a positive sign and those east +/// > have a negative sign in their name (e.g "Etc/GMT-14" is 14 hours ahead of GMT). +#[cfg(feature = "timezones")] +pub fn parse_fixed_offset(tz: &str) -> PolarsResult { + if let Some(caps) = FIXED_OFFSET_RE.captures(tz) { + let sign = match caps.name("sign").map(|s| s.as_str()) { + Some("-") => "+", + _ => "-", + }; + let hour = caps.name("hour").unwrap().as_str().parse::().unwrap(); + let etc_tz = format!("Etc/GMT{}{}", sign, hour); + if etc_tz.parse::().is_ok() { + return Ok(etc_tz); + } + } + polars_bail!(ComputeError: "unable to parse time zone: '{}'. Please check the Time Zone Database for a list of available time zones", tz) +} diff --git a/crates/polars-core/src/series/from.rs b/crates/polars-core/src/series/from.rs index c50a79a35ac0..d7fc5d033414 100644 --- a/crates/polars-core/src/series/from.rs +++ b/crates/polars-core/src/series/from.rs @@ -19,6 +19,8 @@ use crate::chunked_array::object::extension::polars_extension::PolarsExtension; #[cfg(feature = "object")] use crate::chunked_array::object::extension::EXTENSION_NAME; #[cfg(feature = "timezones")] +use crate::chunked_array::temporal::parse_fixed_offset; +#[cfg(feature = "timezones")] use crate::chunked_array::temporal::validate_time_zone; #[cfg(all(feature = "dtype-decimal", feature = "python"))] use crate::config::decimal_is_active; @@ -205,12 +207,14 @@ impl Series { let mut tz = tz.clone(); match tz.as_deref() { Some("") => tz = None, + #[cfg(feature = "timezones")] Some("+00:00") | Some("00:00") => tz = Some("UTC".to_string()), - Some(_tz) => { - #[cfg(feature = "timezones")] - validate_time_zone(_tz)?; + #[cfg(feature = "timezones")] + Some(tz_str) => match validate_time_zone(tz_str) { + Ok(_) => (), + Err(_) => tz = Some(parse_fixed_offset(tz_str)?), }, - None => (), + _ => (), } let chunks = cast_chunks(&chunks, &DataType::Int64, false).unwrap(); let s = Int64Chunked::from_chunks(name, chunks) diff --git a/py-polars/tests/unit/interop/test_interop.py b/py-polars/tests/unit/interop/test_interop.py index 440772eaa54f..f2f6d6556d7a 100644 --- a/py-polars/tests/unit/interop/test_interop.py +++ b/py-polars/tests/unit/interop/test_interop.py @@ -1,6 +1,6 @@ from __future__ import annotations -from datetime import date, datetime, time +from datetime import date, datetime, time, timezone from typing import Any, cast import numpy as np @@ -1082,12 +1082,38 @@ def test_sliced_struct_from_arrow() -> None: def test_from_arrow_invalid_time_zone() -> None: arr = pa.array( - [datetime(2021, 1, 1, 0, 0, 0, 0)], type=pa.timestamp("ns", tz="+01:00") + [datetime(2021, 1, 1, 0, 0, 0, 0)], + type=pa.timestamp("ns", tz="this-is-not-a-time-zone"), ) - with pytest.raises(ComputeError, match=r"unable to parse time zone: '\+01:00'"): + with pytest.raises( + ComputeError, match=r"unable to parse time zone: 'this-is-not-a-time-zone'" + ): pl.from_arrow(arr) +@pytest.mark.parametrize( + ("fixed_offset", "etc_tz"), + [ + ("+10:00", "Etc/GMT-10"), + ("10:00", "Etc/GMT-10"), + ("-10:00", "Etc/GMT+10"), + ("+05:00", "Etc/GMT-5"), + ("05:00", "Etc/GMT-5"), + ("-05:00", "Etc/GMT+5"), + ], +) +def test_from_arrow_fixed_offset(fixed_offset: str, etc_tz: str) -> None: + arr = pa.array( + [datetime(2021, 1, 1, 0, 0, 0, 0)], + type=pa.timestamp("us", tz=fixed_offset), + ) + result = cast(pl.Series, pl.from_arrow(arr)) + expected = pl.Series( + [datetime(2021, 1, 1, tzinfo=timezone.utc)] + ).dt.convert_time_zone(etc_tz) + assert_series_equal(result, expected) + + def test_from_avro_valid_time_zone_13032() -> None: arr = pa.array( [datetime(2021, 1, 1, 0, 0, 0, 0)], type=pa.timestamp("ns", tz="00:00")