use arrow::array::PrimitiveArray;
use chrono::format::ParseErrorKind;
use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime};
use polars_core::prelude::*;
use super::patterns::{self, Pattern};
#[cfg(feature = "dtype-date")]
use crate::chunkedarray::date::naive_date_to_date;
use crate::chunkedarray::string::strptime;
use crate::prelude::string::strptime::StrpTimeState;
// Regular expressions used by `Pattern::is_inferable` to pre-screen values.
// All of them are written in verbose mode (`(?x)`): whitespace and `# ...`
// text inside the literals are regex comments, not part of the match.
// Each expression is anchored (`^ ... $`), tolerates one optional leading and
// trailing quote character, and exposes the month as the named group `month`.
polars_utils::regex_cache::cached_regex! {
// Day-first layout: day, month, year (four or more digits), with a mandatory
// `-`, `/` or `.` between the parts. The time part (hour, minute, optional
// second and subsecond) is optional as a whole.
static DATETIME_DMY_RE = r#"(?x)
^
['"]? # optional quotes
(?:\d{1,2}) # day
[-/\.] # separator
(?P<month>[01]?\d{1}) # month
[-/\.] # separator
(?:\d{4,}) # year
(?:
[T\ ] # separator
(?:\d{1,2}) # hour
:? # separator
(?:\d{1,2}) # minute
(?:
:? # separator
(?:\d{1,2}) # second
(?:
\.(?:\d{1,9}) # subsecond
)?
)?
)?
['"]? # optional quotes
$
"#;
// Year-first layout: year (four or more digits), month, day. Unlike the
// day-first expression, the date separators are optional here. The time part
// is optional as a whole.
static DATETIME_YMD_RE = r#"(?x)
^
['"]? # optional quotes
(?:\d{4,}) # year
[-/\.]? # separator
(?P<month>[01]?\d{1}) # month
[-/\.]? # separator
(?:\d{1,2}) # day
(?:
[T\ ] # separator
(?:\d{1,2}) # hour
:? # separator
(?:\d{1,2}) # minute
(?:
:? # separator
(?:\d{1,2}) # seconds
(?:
\.(?:\d{1,9}) # subsecond
)?
)?
)?
['"]? # optional quotes
$
"#;
// Year-first layout with a mandatory time part (two-digit hour and minute)
// followed by a mandatory UTC offset (`+hh`, `+hhmm`, `+hh:mm`, or the `-`
// equivalents) or a literal `Z`.
// NOTE(review): the inline regex comment on the group after the second date
// separator reads "year", but by position that group is the day.
static DATETIME_YMDZ_RE = r#"(?x)
^
['"]? # optional quotes
(?:\d{4,}) # year
[-/\.]? # separator
(?P<month>[01]?\d{1}) # month
[-/\.]? # separator
(?:\d{1,2}) # year
[T\ ] # separator
(?:\d{2}) # hour
:? # separator
(?:\d{2}) # minute
(?:
:? # separator
(?:\d{2}) # second
(?:
\.(?:\d{1,9}) # subsecond
)?
)?
(?:
# offset (e.g. +01:00, +0100, or +01)
[+-](?:\d{2})
(?::?\d{2})?
# or Zulu suffix
|Z
)
['"]? # optional quotes
$
"#;
}
impl Pattern {
    /// Cheap pre-check that decides whether it is worth trying the candidate
    /// formats of this pattern on `val`.
    ///
    /// The date-only and time patterns always return `true`. For the datetime
    /// patterns, `val` must match the corresponding regular expression and the
    /// captured `month` group must be a number in `1..=12`.
    ///
    /// This never panics: a month capture that is missing or that does not
    /// parse as `u8` simply yields `false`.
    pub fn is_inferable(&self, val: &str) -> bool {
        let captures = match self {
            Pattern::DateDMY | Pattern::DateYMD | Pattern::Time => return true,
            Pattern::DatetimeDMY => DATETIME_DMY_RE.captures(val),
            Pattern::DatetimeYMD => DATETIME_YMD_RE.captures(val),
            Pattern::DatetimeYMDZ => DATETIME_YMDZ_RE.captures(val),
        };
        // `\d` in the `regex` crate matches non-ASCII decimal digits by
        // default, so a captured month is not guaranteed to parse as `u8`.
        // Treat a failed parse as "not inferable" instead of unwrapping.
        captures
            .and_then(|caps| caps.name("month"))
            .is_some_and(|month| matches!(month.as_str().parse::<u8>(), Ok(1..=12)))
    }
}
/// Parser that turns a raw byte slice into a value of type `T`.
pub trait StrpTimeParser<T> {
/// Tries to parse `val`, returning `None` when no value could be produced.
///
/// `time_unit` is optional at the type level. In this file the
/// `DatetimeInfer<Int64Type>` implementation requires `Some(..)`, while the
/// `DatetimeInfer<Int32Type>` implementation ignores the argument.
fn parse_bytes(&mut self, val: &[u8], time_unit: Option<TimeUnit>) -> Option<T>;
}
#[cfg(feature = "dtype-datetime")]
impl StrpTimeParser<i64> for DatetimeInfer<Int64Type> {
    /// Parses `val` into a timestamp expressed in `time_unit`.
    ///
    /// The most recently successful format is tried first. If it fails, every
    /// candidate format is tried in order; the first one that parses becomes
    /// the new `latest_fmt` and its length is cached in `fmt_len`.
    ///
    /// Returns `None` when no format parses `val`, or when the length of
    /// `latest_fmt` cannot be determined.
    ///
    /// # Panics
    ///
    /// Panics (via `unreachable!`) when `time_unit` is `None`.
    fn parse_bytes(&mut self, val: &[u8], time_unit: Option<TimeUnit>) -> Option<i64> {
        if self.fmt_len == 0 {
            self.fmt_len = strptime::fmt_len(self.latest_fmt.as_bytes())?;
        }
        let transform = match time_unit {
            Some(TimeUnit::Nanoseconds) => datetime_to_timestamp_ns,
            Some(TimeUnit::Microseconds) => datetime_to_timestamp_us,
            Some(TimeUnit::Milliseconds) => datetime_to_timestamp_ms,
            // A time unit has to be provided for datetimes.
            _ => unreachable!(),
        };

        // SAFETY: the contract of `parse` is not visible from this file;
        // presumably `fmt_len` must belong to the format passed alongside it.
        // `self.fmt_len` is kept in sync with `self.latest_fmt`.
        let fast_path = unsafe {
            self.transform_bytes
                .parse(val, self.latest_fmt.as_bytes(), self.fmt_len)
        };
        if let Some(ndt) = fast_path {
            return Some(transform(ndt));
        }

        for fmt in self.patterns {
            // Every candidate needs its own length; reusing the length of the
            // previous format would pair `fmt` with a mismatching `fmt_len`.
            let Some(fmt_len) = strptime::fmt_len(fmt.as_bytes()) else {
                continue;
            };
            // SAFETY: `fmt_len` was computed from `fmt` just above.
            let attempt = unsafe { self.transform_bytes.parse(val, fmt.as_bytes(), fmt_len) };
            if let Some(ndt) = attempt {
                self.latest_fmt = fmt;
                self.fmt_len = fmt_len;
                // Use the requested time unit, same as on the fast path.
                return Some(transform(ndt));
            }
        }
        None
    }
}
#[cfg(feature = "dtype-date")]
impl StrpTimeParser<i32> for DatetimeInfer<Int32Type> {
    /// Parses `val` and converts the date part of the result with
    /// `naive_date_to_date`.
    ///
    /// The most recently successful format is tried first. If it fails, every
    /// candidate format is tried in order; the first one that parses becomes
    /// the new `latest_fmt` and its length is cached in `fmt_len`.
    /// `_time_unit` is ignored.
    ///
    /// Returns `None` when no format parses `val`, or when the length of
    /// `latest_fmt` cannot be determined.
    fn parse_bytes(&mut self, val: &[u8], _time_unit: Option<TimeUnit>) -> Option<i32> {
        if self.fmt_len == 0 {
            self.fmt_len = strptime::fmt_len(self.latest_fmt.as_bytes())?;
        }

        // SAFETY: the contract of `parse` is not visible from this file;
        // presumably `fmt_len` must belong to the format passed alongside it.
        // `self.fmt_len` is kept in sync with `self.latest_fmt`.
        let fast_path = unsafe {
            self.transform_bytes
                .parse(val, self.latest_fmt.as_bytes(), self.fmt_len)
        };
        if let Some(ndt) = fast_path {
            return Some(naive_date_to_date(ndt.date()));
        }

        for fmt in self.patterns {
            // Every candidate needs its own length; reusing the length of the
            // previous format would pair `fmt` with a mismatching `fmt_len`.
            let Some(fmt_len) = strptime::fmt_len(fmt.as_bytes()) else {
                continue;
            };
            // SAFETY: `fmt_len` was computed from `fmt` just above.
            let attempt = unsafe { self.transform_bytes.parse(val, fmt.as_bytes(), fmt_len) };
            if let Some(ndt) = attempt {
                self.latest_fmt = fmt;
                self.fmt_len = fmt_len;
                return Some(naive_date_to_date(ndt.date()));
            }
        }
        None
    }
}
/// Stateful parser that tries a list of candidate formats and remembers the
/// one that succeeded last, so it can be tried first on the next value.
#[derive(Clone)]
pub struct DatetimeInfer<T: PolarsNumericType> {
/// Pattern family this instance was built for; consulted by `parse` through
/// `Pattern::is_inferable` before the candidate formats are tried.
pub pattern: Pattern,
/// Candidate format strings, tried in order when `latest_fmt` fails.
patterns: &'static [&'static str],
/// Format that is tried first; replaced whenever another candidate succeeds.
latest_fmt: &'static str,
/// String-based parser: takes `(value, format)` and returns the physical value.
transform: fn(&str, &str) -> Option<T::Native>,
/// State of the byte-based parser used by `parse_bytes`.
transform_bytes: StrpTimeState,
/// Cached result of `strptime::fmt_len`; `0` means "not computed yet".
fmt_len: u16,
/// Data type that `coerce_string` casts its result to.
pub logical_type: DataType,
}
/// Fallible conversion from `T` that additionally takes an optional time unit.
pub trait TryFromWithUnit<T>: Sized {
/// Associated error type.
/// NOTE(review): the method below returns `PolarsResult<Self>` and does not
/// refer to this type.
type Error;
/// Builds `Self` from `pattern`, using `time_unit` where the implementor needs one.
fn try_from_with_unit(pattern: T, time_unit: Option<TimeUnit>) -> PolarsResult<Self>;
}
#[cfg(feature = "dtype-datetime")]
impl TryFromWithUnit<Pattern> for DatetimeInfer<Int64Type> {
    type Error = PolarsError;

    /// Builds a datetime inferer for `value` producing timestamps in `time_unit`.
    ///
    /// Date-only patterns are promoted to the matching datetime pattern.
    /// `Pattern::DatetimeYMDZ` uses the timezone-aware transform, which converts
    /// the parsed value to naive UTC; every other pattern uses the naive
    /// transform.
    ///
    /// # Errors
    ///
    /// Returns a `ComputeError` when `time_unit` is `None`, instead of panicking.
    fn try_from_with_unit(value: Pattern, time_unit: Option<TimeUnit>) -> PolarsResult<Self> {
        let time_unit = time_unit.ok_or_else(
            || polars_err!(ComputeError: "time_unit must be provided for datetime"),
        )?;

        let transform = match (time_unit, value) {
            (TimeUnit::Milliseconds, Pattern::DatetimeYMDZ) => transform_tzaware_datetime_ms,
            (TimeUnit::Milliseconds, _) => transform_datetime_ms,
            (TimeUnit::Microseconds, Pattern::DatetimeYMDZ) => transform_tzaware_datetime_us,
            (TimeUnit::Microseconds, _) => transform_datetime_us,
            (TimeUnit::Nanoseconds, Pattern::DatetimeYMDZ) => transform_tzaware_datetime_ns,
            (TimeUnit::Nanoseconds, _) => transform_datetime_ns,
        };

        let (pattern, patterns) = match value {
            Pattern::DatetimeDMY | Pattern::DateDMY => {
                (Pattern::DatetimeDMY, patterns::DATETIME_D_M_Y)
            },
            Pattern::DatetimeYMD | Pattern::DateYMD => {
                (Pattern::DatetimeYMD, patterns::DATETIME_Y_M_D)
            },
            Pattern::DatetimeYMDZ => (Pattern::DatetimeYMDZ, patterns::DATETIME_Y_M_D_Z),
            // NOTE(review): time formats are paired with a datetime transform
            // and a `Datetime` logical type here — confirm this is intended.
            Pattern::Time => (Pattern::Time, patterns::TIME_H_M_S),
        };

        Ok(DatetimeInfer {
            pattern,
            patterns,
            latest_fmt: patterns[0],
            transform,
            transform_bytes: StrpTimeState::default(),
            // Computed lazily by `parse_bytes`.
            fmt_len: 0,
            logical_type: DataType::Datetime(time_unit, None),
        })
    }
}
#[cfg(feature = "dtype-date")]
impl TryFromWithUnit<Pattern> for DatetimeInfer<Int32Type> {
    type Error = PolarsError;

    /// Builds a date inferer for one of the two date-only patterns.
    ///
    /// `_time_unit` is ignored. Any pattern other than `DateDMY` or `DateYMD`
    /// is rejected with a `ComputeError`.
    fn try_from_with_unit(value: Pattern, _time_unit: Option<TimeUnit>) -> PolarsResult<Self> {
        let (pattern, formats): (Pattern, &'static [&'static str]) = match value {
            Pattern::DateDMY => (Pattern::DateDMY, patterns::DATE_D_M_Y),
            Pattern::DateYMD => (Pattern::DateYMD, patterns::DATE_Y_M_D),
            _ => polars_bail!(ComputeError: "could not convert pattern"),
        };

        Ok(DatetimeInfer {
            pattern,
            patterns: formats,
            latest_fmt: formats[0],
            transform: transform_date,
            transform_bytes: StrpTimeState::default(),
            fmt_len: 0,
            logical_type: DataType::Date,
        })
    }
}
impl<T: PolarsNumericType> DatetimeInfer<T> {
    /// Parses `val` into the physical value.
    ///
    /// The format that succeeded last is tried first. When it fails and the
    /// pattern considers `val` inferable, all candidate formats are tried in
    /// order and the first match is remembered as the new preferred format.
    pub fn parse(&mut self, val: &str) -> Option<T::Native> {
        let transform = self.transform;

        if let Some(hit) = transform(val, self.latest_fmt) {
            return Some(hit);
        }
        if !self.pattern.is_inferable(val) {
            return None;
        }

        for candidate in self.patterns {
            // Invalidate the cached format length, since the preferred format
            // may be about to change.
            self.fmt_len = 0;
            if let Some(hit) = transform(val, candidate) {
                self.latest_fmt = candidate;
                return Some(hit);
            }
        }
        None
    }
}
impl<T: PolarsNumericType> DatetimeInfer<T> {
    /// Parses every value of `ca` with [`Self::parse`] and returns the result
    /// as a `Series` cast to `self.logical_type`, keeping the name of `ca`.
    ///
    /// Null inputs and values that fail to parse become null.
    ///
    /// # Panics
    ///
    /// Panics if the cast to `self.logical_type` fails.
    pub fn coerce_string(&mut self, ca: &StringChunked) -> Series {
        let name = ca.name().clone();

        let parsed_chunks = ca.downcast_iter().map(|arr| {
            let values = arr.into_iter().map(|maybe_str| match maybe_str {
                Some(s) => self.parse(s),
                None => None,
            });
            PrimitiveArray::from_trusted_len_iter(values)
        });
        let physical = ChunkedArray::<T>::from_chunk_iter(name.clone(), parsed_chunks);

        physical
            .into_series()
            .cast(&self.logical_type)
            .unwrap()
            .with_name(name)
    }
}
/// Parses `val` with `fmt` as a `NaiveDate` and converts it with
/// `naive_date_to_date`; returns `None` when parsing fails.
#[cfg(feature = "dtype-date")]
fn transform_date(val: &str, fmt: &str) -> Option<i32> {
    match NaiveDate::parse_from_str(val, fmt) {
        Ok(date) => Some(naive_date_to_date(date)),
        Err(_) => None,
    }
}
/// Parses `val` with `fmt` as a `NaiveDateTime`.
///
/// When the datetime parse fails with `ParseErrorKind::NotEnough`, `val` is
/// parsed as a plain date instead and midnight is used as the time. Any other
/// failure yields `None`.
pub(crate) fn parse_datetime(val: &str, fmt: &str) -> Option<NaiveDateTime> {
    match NaiveDateTime::parse_from_str(val, fmt) {
        Ok(datetime) => Some(datetime),
        Err(err) if matches!(err.kind(), ParseErrorKind::NotEnough) => {
            let date = NaiveDate::parse_from_str(val, fmt).ok()?;
            Some(date.and_hms_opt(0, 0, 0).unwrap())
        },
        Err(_) => None,
    }
}
/// Parses a `NaiveDateTime` from the start of `val` with `fmt` and returns it
/// together with the unparsed remainder of `val`.
///
/// When the datetime parse fails with `ParseErrorKind::NotEnough`, a plain
/// date is parsed instead and midnight is used as the time. Any other failure
/// yields `None`.
pub(crate) fn parse_datetime_and_remainder<'a>(
    val: &'a str,
    fmt: &str,
) -> Option<(NaiveDateTime, &'a str)> {
    match NaiveDateTime::parse_and_remainder(val, fmt) {
        Ok(found) => Some(found),
        Err(err) if matches!(err.kind(), ParseErrorKind::NotEnough) => {
            let (date, rest) = NaiveDate::parse_and_remainder(val, fmt).ok()?;
            Some((date.and_hms_opt(0, 0, 0).unwrap(), rest))
        },
        Err(_) => None,
    }
}
/// Parses `val` with `fmt` via `parse_datetime` and converts the result with
/// `datetime_to_timestamp_ns`.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn transform_datetime_ns(val: &str, fmt: &str) -> Option<i64> {
    let parsed = parse_datetime(val, fmt)?;
    Some(datetime_to_timestamp_ns(parsed))
}
/// Parses `val` with `fmt` via `parse_datetime` and converts the result with
/// `datetime_to_timestamp_us`.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn transform_datetime_us(val: &str, fmt: &str) -> Option<i64> {
    let parsed = parse_datetime(val, fmt)?;
    Some(datetime_to_timestamp_us(parsed))
}
/// Parses `val` with `fmt` via `parse_datetime` and converts the result with
/// `datetime_to_timestamp_ms`.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn transform_datetime_ms(val: &str, fmt: &str) -> Option<i64> {
    let parsed = parse_datetime(val, fmt)?;
    Some(datetime_to_timestamp_ms(parsed))
}
/// Parses `val` with `fmt` as an offset-aware `DateTime`, takes its naive UTC
/// value and converts it with `datetime_to_timestamp_ns`.
fn transform_tzaware_datetime_ns(val: &str, fmt: &str) -> Option<i64> {
    match DateTime::parse_from_str(val, fmt) {
        Ok(parsed) => Some(datetime_to_timestamp_ns(parsed.naive_utc())),
        Err(_) => None,
    }
}
/// Parses `val` with `fmt` as an offset-aware `DateTime`, takes its naive UTC
/// value and converts it with `datetime_to_timestamp_us`.
fn transform_tzaware_datetime_us(val: &str, fmt: &str) -> Option<i64> {
    match DateTime::parse_from_str(val, fmt) {
        Ok(parsed) => Some(datetime_to_timestamp_us(parsed.naive_utc())),
        Err(_) => None,
    }
}
/// Parses `val` with `fmt` as an offset-aware `DateTime`, takes its naive UTC
/// value and converts it with `datetime_to_timestamp_ms`.
fn transform_tzaware_datetime_ms(val: &str, fmt: &str) -> Option<i64> {
    match DateTime::parse_from_str(val, fmt) {
        Ok(parsed) => Some(datetime_to_timestamp_ms(parsed.naive_utc())),
        Err(_) => None,
    }
}
/// Infers the pattern of a single value, trying the date patterns first, then
/// the time pattern and finally the datetime patterns.
pub fn infer_pattern_single(val: &str) -> Option<Pattern> {
    if let Some(date_pattern) = infer_pattern_date_single(val) {
        return Some(date_pattern);
    }
    if let Some(time_pattern) = infer_pattern_time_single(val) {
        return Some(time_pattern);
    }
    infer_pattern_datetime_single(val)
}
/// Infers which datetime pattern family `val` belongs to.
///
/// The day-first formats are checked first, then the year-first formats; for
/// both, a value that parses as either a datetime or a plain date counts as a
/// match. The offset-carrying year-first formats are checked last and only as
/// datetimes.
pub fn infer_pattern_datetime_single(val: &str) -> Option<Pattern> {
    let datetime_or_date = |formats: &[&str]| {
        formats.iter().any(|fmt| {
            NaiveDateTime::parse_from_str(val, fmt).is_ok()
                || NaiveDate::parse_from_str(val, fmt).is_ok()
        })
    };

    if datetime_or_date(patterns::DATETIME_D_M_Y) {
        return Some(Pattern::DatetimeDMY);
    }
    if datetime_or_date(patterns::DATETIME_Y_M_D) {
        return Some(Pattern::DatetimeYMD);
    }

    let has_offset_format = patterns::DATETIME_Y_M_D_Z
        .iter()
        .any(|fmt| NaiveDateTime::parse_from_str(val, fmt).is_ok());
    has_offset_format.then_some(Pattern::DatetimeYMDZ)
}
/// Infers which date pattern family `val` belongs to, checking the day-first
/// formats before the year-first ones.
pub fn infer_pattern_date_single(val: &str) -> Option<Pattern> {
    let parses_with_any = |formats: &[&str]| {
        formats
            .iter()
            .any(|fmt| NaiveDate::parse_from_str(val, fmt).is_ok())
    };

    if parses_with_any(patterns::DATE_D_M_Y) {
        return Some(Pattern::DateDMY);
    }
    if parses_with_any(patterns::DATE_Y_M_D) {
        return Some(Pattern::DateYMD);
    }
    None
}
/// Returns `Pattern::Time` when `sniff_time_fmt` finds a time format for `val`.
pub fn infer_pattern_time_single(val: &str) -> Option<Pattern> {
    match sniff_time_fmt(val) {
        Some(_) => Some(Pattern::Time),
        None => None,
    }
}
/// Returns the first known time format that parses `val` as a `NaiveTime`.
pub fn sniff_time_fmt(val: &str) -> Option<&'static str> {
    for fmt in patterns::TIME_H_M_S.iter() {
        if NaiveTime::parse_from_str(val, fmt).is_ok() {
            return Some(*fmt);
        }
    }
    None
}
/// Converts `ca` to datetimes in time unit `tu` without a caller-supplied
/// time zone.
///
/// With `exact` the format is inferred through [`to_datetime`]; otherwise the
/// non-exact string method is used. When `strict` is set and the conversion
/// introduced nulls, the casting failures are reported as an error.
#[cfg(feature = "dtype-datetime")]
pub fn to_datetime_with_inferred_tz(
    ca: &StringChunked,
    tu: TimeUnit,
    strict: bool,
    exact: bool,
    ambiguous: &StringChunked,
) -> PolarsResult<DatetimeChunked> {
    use super::StringMethods;

    let converted = match exact {
        true => to_datetime(ca, tu, None, ambiguous, false)?,
        false => ca.as_datetime_not_exact(None, tu, false, None, ambiguous, false)?,
    };

    if strict {
        // A differing null count means some non-null inputs failed to parse.
        if ca.null_count() != converted.null_count() {
            let input = ca.clone().into_series();
            let output = converted.clone().into_series();
            polars_core::utils::handle_casting_failures(&input, &output)?;
        }
    }
    Ok(converted)
}
/// Converts `ca` to datetimes in time unit `tu`, inferring the format from the
/// first value that matches a known datetime pattern.
///
/// An all-null input yields an all-null datetime column with `tz` attached.
/// When no value matches a known pattern an error is returned. With the
/// `timezones` feature, an offset-carrying pattern combined with
/// `ensure_matching_time_zone` requires `tz` to be set.
#[cfg(feature = "dtype-datetime")]
pub fn to_datetime(
    ca: &StringChunked,
    tu: TimeUnit,
    tz: Option<&TimeZone>,
    ambiguous: &StringChunked,
    ensure_matching_time_zone: bool,
) -> PolarsResult<DatetimeChunked> {
    let Some(first_valid) = ca.first_non_null() else {
        let all_null = Int64Chunked::full_null(ca.name().clone(), ca.len());
        return Ok(all_null.into_datetime(tu, tz.cloned()));
    };

    // Only the values from the first non-null one onwards are inspected.
    let tail = ca.slice(first_valid as i64, ca.len());
    let pattern = tail
        .iter()
        .find_map(|maybe_val| maybe_val.and_then(infer_pattern_datetime_single))
        .ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?;
    let mut infer = DatetimeInfer::<Int64Type>::try_from_with_unit(pattern, Some(tu))?;

    #[cfg(feature = "timezones")]
    if matches!(pattern, Pattern::DatetimeYMDZ) {
        polars_ensure!(
            !ensure_matching_time_zone || tz.is_some(),
            to_datetime_tz_mismatch
        );
    }

    coerce_string_to_datetime(&mut infer, ca, tz, ambiguous)
}
/// Parses `ca` with `infer` and returns the result as a `DateChunked`.
#[cfg(feature = "dtype-date")]
pub fn coerce_string_to_date(
    infer: &mut DatetimeInfer<Int32Type>,
    ca: &StringChunked,
) -> PolarsResult<DateChunked> {
    let parsed = infer.coerce_string(ca);
    let dates = parsed.date()?;
    Ok(dates.clone())
}
/// Parses `ca` with `infer` and returns the result as a `DatetimeChunked`.
///
/// With the `timezones` feature, values parsed with the offset-carrying
/// pattern get `tz` (or UTC when `tz` is `None`) set as their time zone. For
/// every other pattern the time unit is set and, when a `tz` is given, the
/// time zone is replaced using `ambiguous` to resolve ambiguous datetimes.
///
/// # Panics
///
/// Panics (via `unreachable!`) if `infer.logical_type` is not a `Datetime`.
#[cfg(feature = "dtype-datetime")]
pub fn coerce_string_to_datetime(
    infer: &mut DatetimeInfer<Int64Type>,
    ca: &StringChunked,
    tz: Option<&TimeZone>,
    ambiguous: &StringChunked,
) -> PolarsResult<DatetimeChunked> {
    let tu = match &infer.logical_type {
        DataType::Datetime(unit, _) => *unit,
        _ => unreachable!(),
    };

    match infer.pattern {
        #[cfg(feature = "timezones")]
        Pattern::DatetimeYMDZ => {
            let parsed = infer.coerce_string(ca);
            let mut out = parsed.datetime()?.clone();
            out.set_time_unit_and_time_zone(tu, tz.cloned().unwrap_or(TimeZone::UTC))?;
            Ok(out)
        },
        _ => {
            let parsed = infer.coerce_string(ca);
            let mut out = parsed.datetime()?.clone();
            out.set_time_unit(tu);
            match tz {
                #[cfg(feature = "timezones")]
                Some(tz) => polars_ops::prelude::replace_time_zone(
                    &out,
                    Some(tz),
                    ambiguous,
                    NonExistent::Raise,
                ),
                _ => Ok(out),
            }
        },
    }
}
/// Converts `ca` to dates, inferring the format from the first value that
/// matches a known date pattern.
///
/// An all-null input yields an all-null date column.
///
/// # Errors
///
/// Returns an error when no value matches a known date pattern, or when the
/// inferer cannot be built for the inferred pattern (propagated rather than
/// unwrapped, matching `to_datetime`).
#[cfg(feature = "dtype-date")]
pub(crate) fn to_date(ca: &StringChunked) -> PolarsResult<DateChunked> {
    let Some(first_valid) = ca.first_non_null() else {
        return Ok(Int32Chunked::full_null(ca.name().clone(), ca.len()).into_date());
    };

    // Only the values from the first non-null one onwards are inspected.
    let tail = ca.slice(first_valid as i64, ca.len());
    let pattern = tail
        .iter()
        .find_map(|opt_val| opt_val.and_then(infer_pattern_date_single))
        .ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?;
    let mut infer = DatetimeInfer::<Int32Type>::try_from_with_unit(pattern, None)?;
    coerce_string_to_date(&mut infer, ca)
}