use arrow::{
array::{ArrayRef, GenericStringArray, OffsetSizeTrait, PrimitiveArray},
datatypes::{ArrowNativeType, ArrowPrimitiveType},
};
use datafusion_common::{
cast::{as_generic_string_array, as_int64_array},
DataFusionError, Result,
};
use hashbrown::HashMap;
use std::cmp::{max, Ordering};
use std::sync::Arc;
use unicode_segmentation::UnicodeSegmentation;
/// Returns the number of characters (`char`s) in each string of `args[0]`.
/// character_length('josé') = 4
///
/// Null entries stay null. Any error returned while downcasting `args[0]` to a
/// string array is propagated.
///
/// # Panics
/// Panics if `args` is empty, or if a character count cannot be converted to
/// `T::Native`.
pub fn character_length<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef>
where
    T::Native: OffsetSizeTrait,
{
    let strings: &GenericStringArray<T::Native> =
        as_generic_string_array::<T::Native>(&args[0])?;

    // Counts `char`s (not bytes, not grapheme clusters) and converts to the output type.
    let count_chars = |text: &str| {
        T::Native::from_usize(text.chars().count())
            .expect("should not fail as string.chars will always return integer")
    };

    let lengths: PrimitiveArray<T> = strings
        .iter()
        .map(|value| value.map(count_chars))
        .collect();

    Ok(Arc::new(lengths) as ArrayRef)
}
/// Returns the first `n` characters of each string. When `n` is negative, returns
/// all but the last `|n|` characters.
/// left('abcde', 2) = 'ab'
///
/// `args[0]` is the string array and `args[1]` the `Int64` array of `n` values.
/// A row is null if either input is null. Errors from downcasting either
/// argument are propagated.
///
/// # Panics
/// Panics if `args` has fewer than two elements.
pub fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
    let string_array = as_generic_string_array::<T>(&args[0])?;
    let n_array = as_int64_array(&args[1])?;
    let result = string_array
        .iter()
        .zip(n_array.iter())
        .map(|(string, n)| match (string, n) {
            (Some(string), Some(n)) => match n.cmp(&0) {
                Ordering::Less => {
                    let len = string.chars().count() as i64;
                    // `len >= 0` and `n < 0`, so `len + n` cannot overflow. This
                    // avoids `n.abs()`, which overflows for `n == i64::MIN`.
                    let keep = len + n;
                    Some(if keep > 0 {
                        string.chars().take(keep as usize).collect::<String>()
                    } else {
                        String::new()
                    })
                }
                Ordering::Equal => Some(String::new()),
                Ordering::Greater => {
                    Some(string.chars().take(n as usize).collect::<String>())
                }
            },
            _ => None,
        })
        .collect::<GenericStringArray<T>>();
    Ok(Arc::new(result) as ArrayRef)
}
pub fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
match args.len() {
2 => {
let string_array = as_generic_string_array::<T>(&args[0])?;
let length_array = as_int64_array(&args[1])?;
let result = string_array
.iter()
.zip(length_array.iter())
.map(|(string, length)| match (string, length) {
(Some(string), Some(length)) => {
if length > i32::MAX as i64 {
return Err(DataFusionError::Internal(format!(
"lpad requested length {length} too large"
)));
}
let length = if length < 0 { 0 } else { length as usize };
if length == 0 {
Ok(Some("".to_string()))
} else {
let graphemes = string.graphemes(true).collect::<Vec<&str>>();
if length < graphemes.len() {
Ok(Some(graphemes[..length].concat()))
} else {
let mut s: String = " ".repeat(length - graphemes.len());
s.push_str(string);
Ok(Some(s))
}
}
}
_ => Ok(None),
})
.collect::<Result<GenericStringArray<T>>>()?;
Ok(Arc::new(result) as ArrayRef)
}
3 => {
let string_array = as_generic_string_array::<T>(&args[0])?;
let length_array = as_int64_array(&args[1])?;
let fill_array = as_generic_string_array::<T>(&args[2])?;
let result = string_array
.iter()
.zip(length_array.iter())
.zip(fill_array.iter())
.map(|((string, length), fill)| match (string, length, fill) {
(Some(string), Some(length), Some(fill)) => {
if length > i32::MAX as i64 {
return Err(DataFusionError::Internal(format!(
"lpad requested length {length} too large"
)));
}
let length = if length < 0 { 0 } else { length as usize };
if length == 0 {
Ok(Some("".to_string()))
} else {
let graphemes = string.graphemes(true).collect::<Vec<&str>>();
let fill_chars = fill.chars().collect::<Vec<char>>();
if length < graphemes.len() {
Ok(Some(graphemes[..length].concat()))
} else if fill_chars.is_empty() {
Ok(Some(string.to_string()))
} else {
let mut s = string.to_string();
let mut char_vector =
Vec::<char>::with_capacity(length - graphemes.len());
for l in 0..length - graphemes.len() {
char_vector.push(
*fill_chars.get(l % fill_chars.len()).unwrap(),
);
}
s.insert_str(
0,
char_vector.iter().collect::<String>().as_str(),
);
Ok(Some(s))
}
}
}
_ => Ok(None),
})
.collect::<Result<GenericStringArray<T>>>()?;
Ok(Arc::new(result) as ArrayRef)
}
other => Err(DataFusionError::Internal(format!(
"lpad was called with {other} arguments. It requires at least 2 and at most 3."
))),
}
}
/// Reverses the order of the characters (`char`s) in each string.
/// reverse('abcde') = 'edcba'
///
/// Null entries stay null. Errors from downcasting `args[0]` are propagated.
///
/// # Panics
/// Panics if `args` is empty.
pub fn reverse<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
    let strings = as_generic_string_array::<T>(&args[0])?;

    let reversed: GenericStringArray<T> = strings
        .iter()
        .map(|value| {
            value.map(|text: &str| {
                // Reversal keeps the byte length, so reserve it up front.
                let mut out = String::with_capacity(text.len());
                out.extend(text.chars().rev());
                out
            })
        })
        .collect();

    Ok(Arc::new(reversed) as ArrayRef)
}
/// Returns the last `n` characters of each string. When `n` is negative, returns
/// all but the first `|n|` characters.
/// right('abcde', 2) = 'de'
///
/// `args[0]` is the string array and `args[1]` the `Int64` array of `n` values.
/// A row is null if either input is null. Errors from downcasting either
/// argument are propagated.
///
/// # Panics
/// Panics if `args` has fewer than two elements.
pub fn right<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
    let strings = as_generic_string_array::<T>(&args[0])?;
    let counts = as_int64_array(&args[1])?;

    let tails: GenericStringArray<T> = strings
        .iter()
        .zip(counts.iter())
        .map(|(text, n)| -> Option<String> {
            let (text, n) = match (text, n) {
                (Some(text), Some(n)) => (text, n),
                _ => return None,
            };
            if n == 0 {
                return Some(String::new());
            }

            // Number of leading characters to drop before collecting the tail.
            let skipped = if n < 0 {
                n.unsigned_abs() as usize
            } else {
                max(text.chars().count() as i64 - n, 0) as usize
            };
            Some(text.chars().skip(skipped).collect::<String>())
        })
        .collect();

    Ok(Arc::new(tails) as ArrayRef)
}
pub fn rpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
match args.len() {
2 => {
let string_array = as_generic_string_array::<T>(&args[0])?;
let length_array = as_int64_array(&args[1])?;
let result = string_array
.iter()
.zip(length_array.iter())
.map(|(string, length)| match (string, length) {
(Some(string), Some(length)) => {
if length > i32::MAX as i64 {
return Err(DataFusionError::Internal(format!(
"rpad requested length {length} too large"
)));
}
let length = if length < 0 { 0 } else { length as usize };
if length == 0 {
Ok(Some("".to_string()))
} else {
let graphemes = string.graphemes(true).collect::<Vec<&str>>();
if length < graphemes.len() {
Ok(Some(graphemes[..length].concat()))
} else {
let mut s = string.to_string();
s.push_str(" ".repeat(length - graphemes.len()).as_str());
Ok(Some(s))
}
}
}
_ => Ok(None),
})
.collect::<Result<GenericStringArray<T>>>()?;
Ok(Arc::new(result) as ArrayRef)
}
3 => {
let string_array = as_generic_string_array::<T>(&args[0])?;
let length_array = as_int64_array(&args[1])?;
let fill_array = as_generic_string_array::<T>(&args[2])?;
let result = string_array
.iter()
.zip(length_array.iter())
.zip(fill_array.iter())
.map(|((string, length), fill)| match (string, length, fill) {
(Some(string), Some(length), Some(fill)) => {
if length > i32::MAX as i64 {
return Err(DataFusionError::Internal(format!(
"rpad requested length {length} too large"
)));
}
let length = if length < 0 { 0 } else { length as usize };
let graphemes = string.graphemes(true).collect::<Vec<&str>>();
let fill_chars = fill.chars().collect::<Vec<char>>();
if length < graphemes.len() {
Ok(Some(graphemes[..length].concat()))
} else if fill_chars.is_empty() {
Ok(Some(string.to_string()))
} else {
let mut s = string.to_string();
let mut char_vector =
Vec::<char>::with_capacity(length - graphemes.len());
for l in 0..length - graphemes.len() {
char_vector
.push(*fill_chars.get(l % fill_chars.len()).unwrap());
}
s.push_str(char_vector.iter().collect::<String>().as_str());
Ok(Some(s))
}
}
_ => Ok(None),
})
.collect::<Result<GenericStringArray<T>>>()?;
Ok(Arc::new(result) as ArrayRef)
}
other => Err(DataFusionError::Internal(format!(
"rpad was called with {other} arguments. It requires at least 2 and at most 3."
))),
}
}
/// Returns the 1-based character position at which the substring first occurs
/// in the string, or 0 when it does not occur.
/// strpos('high', 'ig') = 2
///
/// `args[0]` holds the strings and `args[1]` the substrings. A row is null if
/// either input is null, or if the position cannot be converted to `T::Native`.
/// Errors from downcasting either argument are propagated.
///
/// # Panics
/// Panics if `args` has fewer than two elements.
pub fn strpos<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef>
where
    T::Native: OffsetSizeTrait,
{
    let haystacks: &GenericStringArray<T::Native> =
        as_generic_string_array::<T::Native>(&args[0])?;
    let needles: &GenericStringArray<T::Native> =
        as_generic_string_array::<T::Native>(&args[1])?;

    let positions: PrimitiveArray<T> = haystacks
        .iter()
        .zip(needles.iter())
        .map(|(haystack, needle)| -> Option<T::Native> {
            let (haystack, needle) = match (haystack, needle) {
                (Some(haystack), Some(needle)) => (haystack, needle),
                _ => return None,
            };
            // `find` yields a byte offset; convert it to a 1-based character index.
            let position = match haystack.find(needle) {
                Some(byte_offset) => haystack[..byte_offset].chars().count() + 1,
                None => 0,
            };
            T::Native::from_usize(position)
        })
        .collect();

    Ok(Arc::new(positions) as ArrayRef)
}
/// Extracts the substring of each string starting at the `start`'th character
/// (1-based) and, when a count is given, extending for `count` characters.
/// substr('alphabet', 3) = 'phabet'
/// substr('alphabet', 3, 2) = 'ph'
///
/// `args` is `[string, start]` or `[string, start, count]`. A `start` below 1
/// refers to positions before the string: the two-argument form then returns
/// the whole string, and the three-argument form reduces `count` by the number
/// of positions that fall before the first character. A row is null if any of
/// its inputs is null.
///
/// # Errors
/// Returns `DataFusionError::Execution` for a negative `count` and
/// `DataFusionError::Internal` when `args` does not hold 2 or 3 arrays. Errors
/// from downcasting the arguments are propagated.
pub fn substr<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
    match args.len() {
        2 => {
            let string_array = as_generic_string_array::<T>(&args[0])?;
            let start_array = as_int64_array(&args[1])?;
            let result = string_array
                .iter()
                .zip(start_array.iter())
                .map(|(string, start)| match (string, start) {
                    (Some(string), Some(start)) => {
                        if start <= 0 {
                            Some(string.to_string())
                        } else {
                            // `start >= 1` here, so the subtraction cannot underflow.
                            Some(string.chars().skip(start as usize - 1).collect())
                        }
                    }
                    _ => None,
                })
                .collect::<GenericStringArray<T>>();
            Ok(Arc::new(result) as ArrayRef)
        }
        3 => {
            let string_array = as_generic_string_array::<T>(&args[0])?;
            let start_array = as_int64_array(&args[1])?;
            let count_array = as_int64_array(&args[2])?;
            let result = string_array
                .iter()
                .zip(start_array.iter())
                .zip(count_array.iter())
                .map(|((string, start), count)| match (string, start, count) {
                    (Some(string), Some(start), Some(count)) => {
                        if count < 0 {
                            Err(DataFusionError::Execution(format!(
                                "negative substring length not allowed: substr(<str>, {start}, {count})"
                            )))
                        } else {
                            // `start - 1` overflows for `start == i64::MIN`, so saturate.
                            let offset = start.saturating_sub(1);
                            let skip = max(0, offset);
                            // When `start < 1`, `offset` is negative and `count >= 0`,
                            // so this sum cannot overflow.
                            let count = max(0, count + if start < 1 { offset } else { 0 });
                            Ok(Some(
                                string
                                    .chars()
                                    .skip(skip as usize)
                                    .take(count as usize)
                                    .collect::<String>(),
                            ))
                        }
                    }
                    _ => Ok(None),
                })
                .collect::<Result<GenericStringArray<T>>>()?;
            Ok(Arc::new(result) as ArrayRef)
        }
        other => Err(DataFusionError::Internal(format!(
            "substr was called with {other} arguments. It requires 2 or 3."
        ))),
    }
}
pub fn translate<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let string_array = as_generic_string_array::<T>(&args[0])?;
let from_array = as_generic_string_array::<T>(&args[1])?;
let to_array = as_generic_string_array::<T>(&args[2])?;
let result = string_array
.iter()
.zip(from_array.iter())
.zip(to_array.iter())
.map(|((string, from), to)| match (string, from, to) {
(Some(string), Some(from), Some(to)) => {
let from_map: HashMap<&str, usize> = from
.graphemes(true)
.collect::<Vec<&str>>()
.iter()
.enumerate()
.map(|(index, c)| (c.to_owned(), index))
.collect();
let to = to.graphemes(true).collect::<Vec<&str>>();
Some(
string
.graphemes(true)
.collect::<Vec<&str>>()
.iter()
.flat_map(|c| match from_map.get(*c) {
Some(n) => to.get(*n).copied(),
None => Some(*c),
})
.collect::<Vec<&str>>()
.concat(),
)
}
_ => None,
})
.collect::<GenericStringArray<T>>();
Ok(Arc::new(result) as ArrayRef)
}