datafusion_spark/function/datetime/
to_utc_timestamp.rs1use std::sync::Arc;
19
20use arrow::array::timezone::Tz;
21use arrow::array::{Array, ArrayRef, AsArray, PrimitiveBuilder, StringArrayType};
22use arrow::datatypes::TimeUnit;
23use arrow::datatypes::{
24 ArrowTimestampType, DataType, Field, FieldRef, TimestampMicrosecondType,
25 TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
26};
27use chrono::{DateTime, Offset, TimeZone};
28use datafusion_common::types::{NativeType, logical_string};
29use datafusion_common::utils::take_function_args;
30use datafusion_common::{
31 Result, exec_datafusion_err, exec_err, internal_datafusion_err, internal_err,
32};
33use datafusion_expr::{
34 Coercion, ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
35 Signature, TypeSignatureClass, Volatility,
36};
37use datafusion_functions::utils::make_scalar_function;
38
39#[derive(Debug, PartialEq, Eq, Hash)]
49pub struct SparkToUtcTimestamp {
50 signature: Signature,
51}
52
53impl Default for SparkToUtcTimestamp {
54 fn default() -> Self {
55 Self::new()
56 }
57}
58
59impl SparkToUtcTimestamp {
60 pub fn new() -> Self {
61 Self {
62 signature: Signature::coercible(
63 vec![
64 Coercion::new_implicit(
65 TypeSignatureClass::Timestamp,
66 vec![TypeSignatureClass::Native(logical_string())],
67 NativeType::Timestamp(TimeUnit::Microsecond, None),
68 ),
69 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
70 ],
71 Volatility::Immutable,
72 ),
73 }
74 }
75}
76
77impl ScalarUDFImpl for SparkToUtcTimestamp {
78 fn name(&self) -> &str {
79 "to_utc_timestamp"
80 }
81
82 fn signature(&self) -> &Signature {
83 &self.signature
84 }
85
86 fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
87 internal_err!("return_field_from_args should be used instead")
88 }
89
90 fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
91 let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
92
93 Ok(Arc::new(Field::new(
94 self.name(),
95 args.arg_fields[0].data_type().clone(),
96 nullable,
97 )))
98 }
99
100 fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
101 make_scalar_function(to_utc_timestamp, vec![])(&args.args)
102 }
103}
104
105fn to_utc_timestamp(args: &[ArrayRef]) -> Result<ArrayRef> {
106 let [timestamp, timezone] = take_function_args("to_utc_timestamp", args)?;
107
108 match timestamp.data_type() {
109 DataType::Timestamp(TimeUnit::Nanosecond, tz_opt) => {
110 process_timestamp_with_tz_array::<TimestampNanosecondType>(
111 timestamp,
112 timezone,
113 tz_opt.clone(),
114 )
115 }
116 DataType::Timestamp(TimeUnit::Microsecond, tz_opt) => {
117 process_timestamp_with_tz_array::<TimestampMicrosecondType>(
118 timestamp,
119 timezone,
120 tz_opt.clone(),
121 )
122 }
123 DataType::Timestamp(TimeUnit::Millisecond, tz_opt) => {
124 process_timestamp_with_tz_array::<TimestampMillisecondType>(
125 timestamp,
126 timezone,
127 tz_opt.clone(),
128 )
129 }
130 DataType::Timestamp(TimeUnit::Second, tz_opt) => {
131 process_timestamp_with_tz_array::<TimestampSecondType>(
132 timestamp,
133 timezone,
134 tz_opt.clone(),
135 )
136 }
137 ts_type => {
138 exec_err!("`to_utc_timestamp`: unsupported argument types: {ts_type}")
139 }
140 }
141}
142
143fn process_timestamp_with_tz_array<T: ArrowTimestampType>(
144 ts_array: &ArrayRef,
145 tz_array: &ArrayRef,
146 tz_opt: Option<Arc<str>>,
147) -> Result<ArrayRef> {
148 match tz_array.data_type() {
149 DataType::Utf8 => {
150 process_arrays::<T, _>(tz_opt, ts_array, tz_array.as_string::<i32>())
151 }
152 DataType::LargeUtf8 => {
153 process_arrays::<T, _>(tz_opt, ts_array, tz_array.as_string::<i64>())
154 }
155 DataType::Utf8View => {
156 process_arrays::<T, _>(tz_opt, ts_array, tz_array.as_string_view())
157 }
158 other => {
159 exec_err!("`to_utc_timestamp`: timezone must be a string type, got {other}")
160 }
161 }
162}
163
164fn process_arrays<'a, T: ArrowTimestampType, S>(
165 return_tz_opt: Option<Arc<str>>,
166 ts_array: &ArrayRef,
167 tz_array: &'a S,
168) -> Result<ArrayRef>
169where
170 &'a S: StringArrayType<'a>,
171{
172 let ts_primitive = ts_array.as_primitive::<T>();
173 let mut builder = PrimitiveBuilder::<T>::with_capacity(ts_array.len());
174
175 for (ts_opt, tz_opt) in ts_primitive.iter().zip(tz_array.iter()) {
176 match (ts_opt, tz_opt) {
177 (Some(ts), Some(tz_str)) => {
178 let tz: Tz = tz_str.parse().map_err(|e| {
179 exec_datafusion_err!(
180 "`to_utc_timestamp`: invalid timezone '{tz_str}': {e}"
181 )
182 })?;
183 let val = adjust_to_utc_time::<T>(ts, tz)?;
184 builder.append_value(val);
185 }
186 _ => builder.append_null(),
187 }
188 }
189
190 builder = builder.with_timezone_opt(return_tz_opt);
191 Ok(Arc::new(builder.finish()))
192}
193
194fn adjust_to_utc_time<T: ArrowTimestampType>(ts: i64, tz: Tz) -> Result<i64> {
195 let dt = match T::UNIT {
196 TimeUnit::Nanosecond => Some(DateTime::from_timestamp_nanos(ts)),
197 TimeUnit::Microsecond => DateTime::from_timestamp_micros(ts),
198 TimeUnit::Millisecond => DateTime::from_timestamp_millis(ts),
199 TimeUnit::Second => DateTime::from_timestamp(ts, 0),
200 }
201 .ok_or_else(|| internal_datafusion_err!("Invalid timestamp"))?;
202 let naive_dt = dt.naive_utc();
203
204 let offset_seconds = tz
205 .offset_from_utc_datetime(&naive_dt)
206 .fix()
207 .local_minus_utc() as i64;
208
209 let offset_in_unit = match T::UNIT {
210 TimeUnit::Nanosecond => offset_seconds.checked_mul(1_000_000_000),
211 TimeUnit::Microsecond => offset_seconds.checked_mul(1_000_000),
212 TimeUnit::Millisecond => offset_seconds.checked_mul(1_000),
213 TimeUnit::Second => Some(offset_seconds),
214 }
215 .ok_or_else(|| internal_datafusion_err!("Offset overflow"))?;
216
217 ts.checked_sub(offset_in_unit).ok_or_else(|| {
218 internal_datafusion_err!("Timestamp overflow during timezone adjustment")
219 })
220}