use fancy_regex::{Regex as FRegex, RegexBuilder as FRegexBuilder};
use jaq_core::native::{Filter, bome, v};
use jaq_core::{Cv, DataT, Error, RunPtr, ValR};
use jaq_std::ValT;
/// Parsed set of regex flags.
///
/// Every field defaults to `false`. The type is a plain bag of booleans, so it
/// is cheap to copy and can be compared and debug-printed directly.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
struct Flags {
    /// Global: keep searching after the first match instead of stopping.
    g: bool,
    /// Skip matches whose overall span is empty.
    n: bool,
    /// Case-insensitive matching.
    i: bool,
    /// Multi-line mode.
    m: bool,
    /// `.` also matches a newline.
    s: bool,
    /// Ignore whitespace inside the pattern.
    x: bool,
}
impl Flags {
    /// Parses a flag string, one character per flag.
    ///
    /// Recognised letters are `g`, `n`, `i`, `m`, `s`, `x`, plus:
    /// * `l` — accepted but has no effect;
    /// * `p` — shorthand that turns on both `m` and `s`.
    ///
    /// Repeated letters are harmless. On the first unrecognised character,
    /// parsing stops and that character is returned as the error.
    fn parse(s: &str) -> Result<Self, char> {
        s.chars().try_fold(Self::default(), |mut acc, letter| {
            match letter {
                'g' => acc.g = true,
                'n' => acc.n = true,
                'i' => acc.i = true,
                'm' => acc.m = true,
                's' => acc.s = true,
                'x' => acc.x = true,
                // Tolerated for compatibility; nothing to record.
                'l' => {}
                'p' => {
                    acc.m = true;
                    acc.s = true;
                }
                unknown => return Err(unknown),
            }
            Ok(acc)
        })
    }

    /// Compiles `pattern` with the options selected by these flags.
    ///
    /// Only `i`, `m`, `s` and `x` influence compilation; `g` and `n` are not
    /// read here.
    fn build(&self, pattern: &str) -> Result<FRegex, fancy_regex::Error> {
        let Self { i, m, s, x, .. } = *self;
        let mut builder = FRegexBuilder::new(pattern);
        builder.case_insensitive(i);
        builder.multi_line(m);
        builder.dot_matches_new_line(s);
        builder.ignore_whitespace(x);
        builder.build()
    }
}
/// Incremental translator from byte offsets to character offsets in a string.
///
/// It remembers the most recently queried position, so a later query only has
/// to count the characters between the old and the new offset rather than
/// rescanning from the start of the text.
struct ByteCharMap<'a> {
    /// Text whose offsets are being translated.
    text: &'a str,
    /// Byte offset of the most recent query (0 before any query).
    last_byte: usize,
    /// Number of `char`s in `text[..last_byte]`.
    last_char: usize,
}

impl<'a> ByteCharMap<'a> {
    /// Creates a map positioned at the start of `text`.
    fn new(text: &'a str) -> Self {
        Self {
            text,
            last_byte: 0,
            last_char: 0,
        }
    }

    /// Returns the number of `char`s that precede `byte_offset` in the text.
    ///
    /// Queries may arrive in any order. Moving forward adds the characters in
    /// the gap; moving backward subtracts them, so the cost is proportional to
    /// the distance from the previous query, not to the offset itself.
    ///
    /// # Panics
    ///
    /// Panics if `byte_offset` is past the end of the text or does not fall on
    /// a `char` boundary.
    fn byte_to_char(&mut self, byte_offset: usize) -> usize {
        if byte_offset >= self.last_byte {
            let gap = &self.text[self.last_byte..byte_offset];
            self.last_char += gap.chars().count();
        } else {
            // Step back by exactly the characters between the two offsets
            // instead of restarting the count from byte 0.
            let gap = &self.text[byte_offset..self.last_byte];
            self.last_char -= gap.chars().count();
        }
        self.last_byte = byte_offset;
        self.last_char
    }
}
/// Runs `re` over `text` and gathers the results into one collected value.
///
/// * `flags` — only `g` (continue past the first reported match) and `n` (skip
///   empty matches) are read here.
/// * `sm` — `(split, matches)`. With `split`, the text between consecutive
///   matches is emitted, followed by the remainder after the last match. With
///   `matches`, every match contributes one array holding an object per
///   participating capture group (`offset` and `length` in characters,
///   `string`, and `name` for named groups).
/// * `sub` — turns a slice of `text` into a value.
///
/// # Errors
///
/// Fails if the regex engine reports an error while searching, or if building
/// a capture object through `V::from_map` fails.
fn run_regex<V: ValT>(
    text: &str,
    re: &FRegex,
    flags: Flags,
    sm: (bool, bool),
    sub: impl Fn(&str) -> V,
) -> Result<V, Error<V>> {
    let (split, matches) = sm;
    let mut bc = ByteCharMap::new(text);
    let mut last_byte = 0usize;
    let mut out: Vec<V> = Vec::new();

    // Group names depend only on the pattern, so collect them once up front
    // rather than once per match; skip the work when no match objects are
    // going to be produced.
    let names: Vec<Option<&str>> = if matches {
        re.capture_names().collect()
    } else {
        Vec::new()
    };

    for cap_result in re.captures_iter(text) {
        let caps = cap_result.map_err(|e| Error::str(format_args!("regex error: {e}")))?;
        let whole = caps
            .get(0)
            .expect("captures always include the whole match at index 0");
        // A skipped empty match does not count as "the first match", so the
        // non-global early exit below is deliberately not reached for it.
        if flags.n && whole.range().is_empty() {
            continue;
        }
        if split {
            out.push(sub(&text[last_byte..whole.start()]));
            last_byte = whole.end();
        }
        if matches {
            let mut match_objs: Vec<V> = Vec::with_capacity(names.len());
            for (idx, name) in names.iter().copied().enumerate() {
                // Groups that did not take part in this match are omitted.
                let Some(m) = caps.get(idx) else {
                    continue;
                };
                let offset = bc.byte_to_char(m.start());
                let length = m.as_str().chars().count();
                let mut fields: Vec<(V, V)> = vec![
                    (V::from(String::from("offset")), V::from(offset as isize)),
                    (V::from(String::from("length")), V::from(length as isize)),
                    (V::from(String::from("string")), sub(m.as_str())),
                ];
                if let Some(n) = name {
                    fields.push((V::from(String::from("name")), V::from(n.to_string())));
                }
                match_objs.push(V::from_map(fields)?);
            }
            let arr: V = match_objs.into_iter().collect();
            out.push(arr);
        }
        if !flags.g {
            break;
        }
    }
    if split {
        // Whatever follows the last reported match (or the whole text when
        // nothing matched).
        out.push(sub(&text[last_byte..]));
    }
    Ok(out.into_iter().collect())
}
fn re_native<'a, D: DataT>(s: bool, m: bool, mut cv: Cv<'a, D>) -> ValR<D::V<'a>>
where
D::V<'a>: ValT,
{
let flags_v = cv.0.pop_var();
let pat_v = cv.0.pop_var();
let flag_bytes = flags_v.try_as_utf8_bytes()?;
let flag_str = core::str::from_utf8(flag_bytes)
.map_err(|_| Error::str(format_args!("invalid UTF-8 in regex flags")))?;
let flags =
Flags::parse(flag_str).map_err(|c| Error::str(format_args!("invalid regex flag: {c}")))?;
let pat_bytes = pat_v.try_as_utf8_bytes()?;
let pat_str = core::str::from_utf8(pat_bytes)
.map_err(|_| Error::str(format_args!("invalid UTF-8 in regex pattern")))?;
let re = flags
.build(pat_str)
.map_err(|e| Error::str(format_args!("invalid regex: {e}")))?;
let in_bytes = cv.1.try_as_utf8_bytes()?;
let text = core::str::from_utf8(in_bytes)
.map_err(|_| Error::str(format_args!("invalid UTF-8 input to regex")))?;
let input = cv.1.clone();
let sub = move |x: &str| input.as_sub_str(x.as_bytes());
run_regex::<D::V<'a>>(text, &re, flags, (s, m), sub)
}
/// The three native filter names that `funs` registers.
// NOTE(review): presumably consumed by the parent module to know which
// existing definitions these natives replace — confirm against its user.
pub(super) const SHADOWED_NATIVE_NAMES: &[&str] = &["matches", "split_matches", "split_"];
/// Returns the table of native regex filters implemented in this file.
///
/// Each entry pairs a filter name with a closure that calls `re_native` with a
/// fixed `(s, m)` pair and passes the result through `bome`.
// NOTE(review): `v(2)` presumably declares the two variable arguments that
// `re_native` pops (pattern and flags) — confirm against jaq_core.
pub(super) fn funs<D: DataT>() -> Box<[Filter<RunPtr<D>>]>
where
for<'a> D::V<'a>: ValT,
{
Box::new([
// s = false, m = true: capture objects only.
("matches", v(2), |cv| bome(re_native(false, true, cv))),
// s = true, m = true: text between matches interleaved with capture objects.
("split_matches", v(2), |cv| bome(re_native(true, true, cv))),
// s = true, m = false: only the text between matches.
("split_", v(2), |cv| bome(re_native(true, false, cv))),
])
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every supported letter, the `p`/`l` specials and the empty string parse.
    #[test]
    fn flags_parse_known() {
        for spec in ["gimsxn", "p", "l", ""] {
            assert!(Flags::parse(spec).is_ok());
        }
    }

    /// An unsupported letter is handed back as the error.
    #[test]
    fn flags_parse_unknown() {
        let rejected = Flags::parse("z").err();
        assert_eq!(rejected, Some('z'));
    }

    /// `p` is shorthand for `m` plus `s`.
    #[test]
    fn flags_p_implies_m_and_s() {
        let Flags { m, s, .. } = Flags::parse("p").unwrap();
        assert!(m);
        assert!(s);
    }

    #[test]
    fn build_simple_pattern() {
        let compiled = Flags::default().build("abc");
        assert!(compiled.is_ok());
    }

    #[test]
    fn build_lookahead() {
        let compiled = Flags::default().build(r"foo(?=bar)");
        assert!(compiled.is_ok());
    }

    #[test]
    fn build_lookbehind() {
        let compiled = Flags::default().build(r"(?<=foo)bar");
        assert!(compiled.is_ok());
    }

    #[test]
    fn build_backreference() {
        let compiled = Flags::default().build(r"(\w+) \1");
        assert!(compiled.is_ok());
    }

    #[test]
    fn build_atomic_group() {
        let compiled = Flags::default().build(r"(?>abc|abd)d");
        assert!(compiled.is_ok());
    }

    /// A syntactically broken pattern surfaces as an error, not a panic.
    #[test]
    fn build_invalid_returns_error() {
        let compiled = Flags::default().build(r"(unbalanced");
        assert!(compiled.is_err());
    }

    /// In pure ASCII text byte offsets and char offsets coincide.
    #[test]
    fn byte_to_char_ascii() {
        let mut map = ByteCharMap::new("hello");
        for (byte, expected) in [(0, 0), (3, 3)] {
            assert_eq!(map.byte_to_char(byte), expected);
        }
    }

    #[test]
    fn byte_to_char_multi_byte() {
        let mut map = ByteCharMap::new("héllo");
        assert_eq!(map.byte_to_char(0), 0);
        // 'h' is 1 byte and 'é' is 2, so byte 3 sits after two chars.
        assert_eq!(map.byte_to_char(3), 2);
    }

    #[test]
    fn byte_to_char_handles_out_of_order() {
        let mut map = ByteCharMap::new("héllo");
        let _ = map.byte_to_char(3);
        // Going back to byte 1 (just after 'h') must still give char 1.
        assert_eq!(map.byte_to_char(1), 1);
    }
}