use crate::array::{
ArrayRef, GenericStringArray, GenericStringBuilder, ListBuilder, OffsetSizeTrait,
};
use crate::error::{ArrowError, Result};
use std::collections::HashMap;
use std::sync::Arc;
use regex::Regex;
/// Extracts regular-expression capture groups from each string in `array`.
///
/// Rows are processed pairwise: the value at position `i` of `array` is matched
/// against the pattern at position `i` of `regex_array`. When `flags_array` is
/// supplied, a non-null flag string at position `i` is prepended to the pattern
/// as an inline flag group, i.e. the effective pattern is `(?<flags>)<pattern>`;
/// a null flag leaves the pattern unchanged.
///
/// The result is a list array of strings with one entry per processed row:
///
/// * a null entry when the value or the pattern is null, or when the pattern
///   does not match the value;
/// * a list containing a single empty string when the effective pattern is the
///   empty string (this case is handled without compiling a regex);
/// * otherwise, the text of every capture group that participated in the match,
///   in group order. The implicit whole-match group (group 0) is skipped, and
///   groups that did not participate are omitted rather than emitted as nulls.
///
/// Iteration stops at the shortest of the input arrays, so inputs of differing
/// lengths yield a result as long as the shortest one.
///
/// # Errors
///
/// Returns [`ArrowError::ComputeError`] if an effective pattern fails to compile,
/// and propagates any error reported by the underlying array builders.
pub fn regexp_match<OffsetSize: OffsetSizeTrait>(
    array: &GenericStringArray<OffsetSize>,
    regex_array: &GenericStringArray<OffsetSize>,
    flags_array: Option<&GenericStringArray<OffsetSize>>,
) -> Result<ArrayRef> {
    use std::collections::hash_map::Entry;

    // Compiled expressions keyed by their complete (flag-prefixed) pattern text,
    // so that a pattern repeated across rows is only compiled once.
    let mut patterns: HashMap<String, Regex> = HashMap::new();
    let builder: GenericStringBuilder<OffsetSize> = GenericStringBuilder::new(0);
    let mut list_builder = ListBuilder::new(builder);

    // Per-row effective pattern: the raw pattern, optionally prefixed with its
    // inline flags. Boxed because the two branches have different iterator types.
    let complete_pattern = match flags_array {
        Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
            |(pattern, flags)| {
                pattern.map(|pattern| match flags {
                    Some(value) => format!("(?{}){}", value, pattern),
                    None => pattern.to_string(),
                })
            },
        )) as Box<dyn Iterator<Item = Option<String>>>,
        None => Box::new(
            regex_array
                .iter()
                .map(|pattern| pattern.map(|pattern| pattern.to_string())),
        ),
    };

    for (value, pattern) in array.iter().zip(complete_pattern) {
        match (value, pattern) {
            // An empty pattern is special-cased: emit a single empty string.
            (Some(_), Some(pattern)) if pattern.is_empty() => {
                list_builder.values().append_value("")?;
                list_builder.append(true)?;
            }
            (Some(value), Some(pattern)) => {
                // Single cache lookup; the regex is borrowed from the cache
                // rather than cloned for every row.
                let re: &Regex = match patterns.entry(pattern) {
                    Entry::Occupied(entry) => entry.into_mut(),
                    Entry::Vacant(entry) => {
                        let compiled =
                            Regex::new(entry.key().as_str()).map_err(|e| {
                                ArrowError::ComputeError(format!(
                                    "Regular expression did not compile: {:?}",
                                    e
                                ))
                            })?;
                        entry.insert(compiled)
                    }
                };
                match re.captures(value) {
                    Some(caps) => {
                        // Skip group 0 (the whole match); `flatten` drops
                        // groups that did not participate in the match.
                        for m in caps.iter().skip(1).flatten() {
                            list_builder.values().append_value(m.as_str())?;
                        }
                        list_builder.append(true)?
                    }
                    None => list_builder.append(false)?,
                }
            }
            // Null value and/or null pattern.
            _ => list_builder.append(false)?,
        }
    }

    Ok(Arc::new(list_builder.finish()))
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::array::{ListArray, StringArray};

    /// Builds the expected list array from a row-wise description:
    /// `Some(items)` is a valid list holding `items`, `None` is a null entry.
    fn expected_list(rows: &[Option<&[&str]>]) -> Result<ListArray> {
        let string_builder: GenericStringBuilder<i32> = GenericStringBuilder::new(0);
        let mut lists = ListBuilder::new(string_builder);
        for row in rows {
            match row {
                Some(items) => {
                    for item in items.iter() {
                        lists.values().append_value(*item)?;
                    }
                    lists.append(true)?;
                }
                None => lists.append(false)?,
            }
        }
        Ok(lists.finish())
    }

    /// Patterns without flags: covers a single capture group, a non-matching
    /// value, a null value, a non-matching two-group pattern and the empty
    /// pattern.
    #[test]
    fn match_single_group() -> Result<()> {
        let array = StringArray::from(vec![
            Some("abc-005-def"),
            Some("X-7-5"),
            Some("X545"),
            None,
            Some("foobarbequebaz"),
            Some("foobarbequebaz"),
        ]);
        let pattern = StringArray::from(vec![
            r".*-(\d*)-.*",
            r".*-(\d*)-.*",
            r".*-(\d*)-.*",
            r".*-(\d*)-.*",
            r"(bar)(bequ1e)",
            "",
        ]);

        let actual = regexp_match(&array, &pattern, None)?;

        let expected = expected_list(&[
            Some(&["005"][..]),
            Some(&["7"][..]),
            None,
            None,
            None,
            Some(&[""][..]),
        ])?;
        let result = actual.as_any().downcast_ref::<ListArray>().unwrap();
        assert_eq!(&expected, result);
        Ok(())
    }

    /// The same pattern applied with the case-insensitive flag `i` on every row.
    #[test]
    fn match_single_group_with_flags() -> Result<()> {
        let array = StringArray::from(vec![
            Some("abc-005-def"),
            Some("X-7-5"),
            Some("X545"),
            None,
        ]);
        let pattern = StringArray::from(vec![r"x.*-(\d*)-.*"; 4]);
        let flags = StringArray::from(vec!["i"; 4]);

        let actual = regexp_match(&array, &pattern, Some(&flags))?;

        let expected = expected_list(&[None, Some(&["7"][..]), None, None])?;
        let result = actual.as_any().downcast_ref::<ListArray>().unwrap();
        assert_eq!(&expected, result);
        Ok(())
    }
}