Skip to content

Commit

Permalink
feat: convert fixed-offset timezones to respective Etc timezone from …
Browse files Browse the repository at this point in the history
…time zone database (pola-rs#13738)
  • Loading branch information
MarcoGorelli authored and r-brink committed Jan 24, 2024
1 parent c5ca69d commit 5aeb218
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 7 deletions.
41 changes: 41 additions & 0 deletions crates/polars-core/src/chunked_array/temporal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ use chrono::NaiveDateTime;
use chrono::NaiveTime;
#[cfg(feature = "timezones")]
use chrono_tz::Tz;
#[cfg(feature = "timezones")]
use once_cell::sync::Lazy;
#[cfg(all(feature = "regex", feature = "timezones"))]
use regex::Regex;
#[cfg(feature = "dtype-time")]
pub use time::time_to_time64ns;

Expand All @@ -26,6 +30,18 @@ pub fn unix_time() -> NaiveDateTime {
NaiveDateTime::from_timestamp_opt(0, 0).unwrap()
}

/// Verbose-mode (`(?x)`) regex source describing a fixed UTC offset.
///
/// It accepts an optional `+`/`-` sign, a two-digit hour from `00` to `14`,
/// an optional `:` separator and a literal `00` minute field. The pattern is
/// anchored at both ends, so offsets with non-zero minutes do not match.
#[cfg(feature = "timezones")]
static FIXED_OFFSET_PATTERN: &str = r#"(?x)
^
(?P<sign>[-+])? # optional sign
(?P<hour>0[0-9]|1[0-4]) # hour (between 0 and 14)
:? # optional separator
00 # minute
$
"#;
/// Compiled form of `FIXED_OFFSET_PATTERN`, built lazily on first access.
///
/// The `unwrap` can only panic if the constant pattern above fails to compile.
#[cfg(feature = "timezones")]
static FIXED_OFFSET_RE: Lazy<Regex> = Lazy::new(|| Regex::new(FIXED_OFFSET_PATTERN).unwrap());

#[cfg(feature = "timezones")]
pub(crate) fn validate_time_zone(tz: &str) -> PolarsResult<()> {
match tz.parse::<Tz>() {
Expand All @@ -45,3 +61,28 @@ pub fn parse_time_zone(tz: &str) -> PolarsResult<Tz> {
},
}
}

/// Map a fixed UTC offset string onto its `Etc/GMT*` name from the time zone database.
///
/// For example, `+01:00` becomes `Etc/GMT-1`.
///
/// The flipped sign is intentional. Quoting
/// <https://en.wikipedia.org/wiki/Tz_database#Area>:
/// > In order to conform with the POSIX style, those zone names beginning with
/// > "Etc/GMT" have their sign reversed from the standard ISO 8601 convention.
/// > In the "Etc" area, zones west of GMT have a positive sign and those east
/// > have a negative sign in their name (e.g "Etc/GMT-14" is 14 hours ahead of GMT).
///
/// # Errors
///
/// Returns a `ComputeError` when `tz` does not match the fixed-offset pattern,
/// or when the derived `Etc/GMT*` name does not parse as a `Tz`.
#[cfg(feature = "timezones")]
pub fn parse_fixed_offset(tz: &str) -> PolarsResult<String> {
    let resolved = FIXED_OFFSET_RE.captures(tz).and_then(|groups| {
        // An explicit `-` maps to `+`; a `+` or a missing sign maps to `-`.
        let flipped = if groups.name("sign").map(|m| m.as_str()) == Some("-") {
            "+"
        } else {
            "-"
        };
        // The `hour` group is mandatory in the pattern and is always two ASCII
        // digits, so neither unwrap is expected to fire once the regex matched.
        let hours = groups.name("hour").unwrap().as_str().parse::<i32>().unwrap();
        let candidate = format!("Etc/GMT{}{}", flipped, hours);
        // Keep the name only if it parses as a known time zone.
        candidate.parse::<Tz>().is_ok().then_some(candidate)
    });
    match resolved {
        Some(etc_tz) => Ok(etc_tz),
        None => {
            polars_bail!(ComputeError: "unable to parse time zone: '{}'. Please check the Time Zone Database for a list of available time zones", tz)
        },
    }
}
12 changes: 8 additions & 4 deletions crates/polars-core/src/series/from.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ use crate::chunked_array::object::extension::polars_extension::PolarsExtension;
#[cfg(feature = "object")]
use crate::chunked_array::object::extension::EXTENSION_NAME;
#[cfg(feature = "timezones")]
use crate::chunked_array::temporal::parse_fixed_offset;
#[cfg(feature = "timezones")]
use crate::chunked_array::temporal::validate_time_zone;
#[cfg(all(feature = "dtype-decimal", feature = "python"))]
use crate::config::decimal_is_active;
Expand Down Expand Up @@ -205,12 +207,14 @@ impl Series {
let mut tz = tz.clone();
match tz.as_deref() {
Some("") => tz = None,
#[cfg(feature = "timezones")]
Some("+00:00") | Some("00:00") => tz = Some("UTC".to_string()),
Some(_tz) => {
#[cfg(feature = "timezones")]
validate_time_zone(_tz)?;
#[cfg(feature = "timezones")]
Some(tz_str) => match validate_time_zone(tz_str) {
Ok(_) => (),
Err(_) => tz = Some(parse_fixed_offset(tz_str)?),
},
None => (),
_ => (),
}
let chunks = cast_chunks(&chunks, &DataType::Int64, false).unwrap();
let s = Int64Chunked::from_chunks(name, chunks)
Expand Down
32 changes: 29 additions & 3 deletions py-polars/tests/unit/interop/test_interop.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

from datetime import date, datetime, time
from datetime import date, datetime, time, timezone
from typing import Any, cast

import numpy as np
Expand Down Expand Up @@ -1082,12 +1082,38 @@ def test_sliced_struct_from_arrow() -> None:

def test_from_arrow_invalid_time_zone() -> None:
arr = pa.array(
[datetime(2021, 1, 1, 0, 0, 0, 0)], type=pa.timestamp("ns", tz="+01:00")
[datetime(2021, 1, 1, 0, 0, 0, 0)],
type=pa.timestamp("ns", tz="this-is-not-a-time-zone"),
)
with pytest.raises(ComputeError, match=r"unable to parse time zone: '\+01:00'"):
with pytest.raises(
ComputeError, match=r"unable to parse time zone: 'this-is-not-a-time-zone'"
):
pl.from_arrow(arr)


@pytest.mark.parametrize(
    ("fixed_offset", "etc_tz"),
    [
        ("+10:00", "Etc/GMT-10"),
        ("10:00", "Etc/GMT-10"),
        ("-10:00", "Etc/GMT+10"),
        ("+05:00", "Etc/GMT-5"),
        ("05:00", "Etc/GMT-5"),
        ("-05:00", "Etc/GMT+5"),
    ],
)
def test_from_arrow_fixed_offset(fixed_offset: str, etc_tz: str) -> None:
    """Check that an Arrow timestamp typed with a fixed offset loads under the paired Etc/GMT zone.

    The expected Series is 2021-01-01 in UTC converted to ``etc_tz``.
    """
    # Arrow array whose timestamp type carries the fixed-offset string as its tz
    arrow_type = pa.timestamp("us", tz=fixed_offset)
    arrow_values = pa.array([datetime(2021, 1, 1, 0, 0, 0, 0)], type=arrow_type)
    result = cast(pl.Series, pl.from_arrow(arrow_values))

    utc_series = pl.Series([datetime(2021, 1, 1, tzinfo=timezone.utc)])
    expected = utc_series.dt.convert_time_zone(etc_tz)
    assert_series_equal(result, expected)


def test_from_avro_valid_time_zone_13032() -> None:
arr = pa.array(
[datetime(2021, 1, 1, 0, 0, 0, 0)], type=pa.timestamp("ns", tz="00:00")
Expand Down

0 comments on commit 5aeb218

Please sign in to comment.