use std::ops::Range;
use crate::encodings::utf8::ONIG_ENCODING_UTF8;
use crate::error::RegexError;
use crate::oniguruma::*;
use crate::regcomp::onig_new;
use crate::regexec::{onig_name_to_group_numbers, onig_search};
use crate::regint::RegexType;
use crate::regsyntax::OnigSyntaxOniguruma;
/// A compiled regular expression.
///
/// Wraps the engine-level [`RegexType`]. Instances created through
/// [`Regex::new`] / [`Regex::new_bytes`] are compiled with no options, the
/// UTF-8 encoding and the Oniguruma syntax; use [`Regex::builder`] to
/// customise options or syntax.
pub struct Regex {
    /// Compiled pattern handed to the search routines.
    inner: RegexType,
}

impl Regex {
    /// Compiles `pattern` with the default settings.
    ///
    /// # Errors
    ///
    /// Returns the [`RegexError`] produced by the compiler when the pattern
    /// is rejected.
    pub fn new(pattern: &str) -> Result<Regex, RegexError> {
        Regex::new_bytes(pattern.as_bytes())
    }

    /// Compiles a pattern given as raw bytes with the default settings
    /// (no options, UTF-8 encoding, Oniguruma syntax).
    ///
    /// # Errors
    ///
    /// Returns the [`RegexError`] produced by the compiler when the pattern
    /// is rejected.
    pub fn new_bytes(pattern: &[u8]) -> Result<Regex, RegexError> {
        let compiled = onig_new(
            pattern,
            ONIG_OPTION_NONE,
            &ONIG_ENCODING_UTF8,
            &OnigSyntaxOniguruma,
        )?;
        Ok(Regex { inner: compiled })
    }

    /// Starts a [`RegexBuilder`] for `pattern`.
    pub fn builder(pattern: &str) -> RegexBuilder {
        RegexBuilder::new(pattern)
    }

    /// Searches `text` from its beginning and returns the overall match, if any.
    pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> {
        self.find_bytes(text.as_bytes())
    }

    /// Byte-slice variant of [`Regex::find`].
    ///
    /// Returns `None` when the search reports a negative status, when no
    /// region comes back, or when the region holds no registers.
    pub fn find_bytes<'t>(&self, text: &'t [u8]) -> Option<Match<'t>> {
        let haystack_len = text.len();
        let (status, region) = onig_search(
            &self.inner,
            text,
            haystack_len,
            0,
            haystack_len,
            Some(OnigRegion::new()),
            ONIG_OPTION_NONE,
        );
        // Register 0 describes the whole match; it is only meaningful after
        // a successful search that filled at least one register.
        let region = region.filter(|r| status >= 0 && r.num_regs >= 1)?;
        Some(Match {
            text,
            start: region.beg[0] as usize,
            end: region.end[0] as usize,
        })
    }

    /// Reports whether the pattern matches anywhere in `text`.
    pub fn is_match(&self, text: &str) -> bool {
        self.is_match_bytes(text.as_bytes())
    }

    /// Byte-slice variant of [`Regex::is_match`].
    ///
    /// No region is requested, so only the search status is inspected.
    pub fn is_match_bytes(&self, text: &[u8]) -> bool {
        let haystack_len = text.len();
        let (status, _) = onig_search(
            &self.inner,
            text,
            haystack_len,
            0,
            haystack_len,
            None,
            ONIG_OPTION_NONE,
        );
        status >= 0
    }

    /// Searches `text` from its beginning and returns every capture group of
    /// the match, if any.
    pub fn captures<'t>(&'t self, text: &'t str) -> Option<Captures<'t>> {
        self.captures_bytes(text.as_bytes())
    }

    /// Byte-slice variant of [`Regex::captures`].
    ///
    /// The returned [`Captures`] keeps a reference to this regex so that
    /// groups can later be looked up by name.
    pub fn captures_bytes<'t>(&'t self, text: &'t [u8]) -> Option<Captures<'t>> {
        let haystack_len = text.len();
        let (status, region) = onig_search(
            &self.inner,
            text,
            haystack_len,
            0,
            haystack_len,
            Some(OnigRegion::new()),
            ONIG_OPTION_NONE,
        );
        let region = region.filter(|_| status >= 0)?;
        Some(Captures {
            text,
            region,
            regex: self,
        })
    }

    /// Returns an iterator over successive matches in `text`.
    pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindIter<'r, 't> {
        self.find_iter_bytes(text.as_bytes())
    }

    /// Byte-slice variant of [`Regex::find_iter`].
    pub fn find_iter_bytes<'r, 't>(&'r self, text: &'t [u8]) -> FindIter<'r, 't> {
        FindIter {
            regex: self,
            text,
            last_end: 0,
            last_was_empty: false,
        }
    }

    /// Returns the number of capture groups in the pattern, as recorded in
    /// the compiled regex's `num_mem` field (the implicit whole-match group
    /// is not counted: `(a)(b)(c)` reports 3).
    pub fn captures_len(&self) -> usize {
        self.inner.num_mem as usize
    }

    /// Gives read-only access to the underlying engine-level regex.
    pub fn as_raw(&self) -> &RegexType {
        &self.inner
    }
}
impl std::fmt::Debug for Regex {
    /// Formats as an opaque `Regex { .. }`; the compiled internals are not printed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut opaque = f.debug_struct("Regex");
        opaque.finish_non_exhaustive()
    }
}
/// Builder for a [`Regex`] with non-default options or syntax.
///
/// A fresh builder carries no options and the Oniguruma syntax; the pattern
/// is always compiled with the UTF-8 encoding.
pub struct RegexBuilder {
    /// Pattern bytes, copied from the string given to [`RegexBuilder::new`].
    pattern: Vec<u8>,
    /// Option bits accumulated so far.
    options: OnigOptionType,
    /// Syntax definition passed to the compiler.
    syntax: &'static OnigSyntaxType,
}

impl RegexBuilder {
    /// Creates a builder for `pattern` with no options and the Oniguruma syntax.
    pub fn new(pattern: &str) -> Self {
        RegexBuilder {
            pattern: pattern.as_bytes().to_vec(),
            options: ONIG_OPTION_NONE,
            syntax: &OnigSyntaxOniguruma,
        }
    }

    /// Sets (`enabled == true`) or clears (`enabled == false`) `flag` in the
    /// accumulated option bits.
    fn with_flag(mut self, flag: OnigOptionType, enabled: bool) -> Self {
        if enabled {
            self.options |= flag;
        } else {
            self.options &= !flag;
        }
        self
    }

    /// Enables or disables case-insensitive matching (`ONIG_OPTION_IGNORECASE`).
    pub fn case_insensitive(self, yes: bool) -> Self {
        self.with_flag(ONIG_OPTION_IGNORECASE, yes)
    }

    /// Chooses whether `.` may match a newline.
    ///
    /// This toggles `ONIG_OPTION_MULTILINE`, which is the Oniguruma name for
    /// "dot matches newline".
    pub fn dot_matches_newline(self, yes: bool) -> Self {
        self.with_flag(ONIG_OPTION_MULTILINE, yes)
    }

    /// Chooses whether `^` and `$` match at line boundaries (`true`) or only
    /// at the boundaries of the whole input (`false`).
    ///
    /// Oniguruma models this with the *inverse* flag `ONIG_OPTION_SINGLELINE`
    /// (`^` behaves like `\A`, `$` like `\Z`), so `true` clears that flag and
    /// `false` sets it.
    ///
    /// NOTE(review): a syntax that turns single-line mode on by default may
    /// still re-enable it at compile time even after `true`; confirm against
    /// the compiler's handling of syntax options.
    pub fn multi_line_anchors(self, yes: bool) -> Self {
        self.with_flag(ONIG_OPTION_SINGLELINE, !yes)
    }

    /// Enables or disables extended pattern form (`ONIG_OPTION_EXTEND`).
    pub fn extended(self, yes: bool) -> Self {
        self.with_flag(ONIG_OPTION_EXTEND, yes)
    }

    /// Adds arbitrary option bits on top of those already set.
    ///
    /// Bits can only be added through this method, never removed.
    pub fn option(mut self, flag: OnigOptionType) -> Self {
        self.options |= flag;
        self
    }

    /// Replaces the syntax definition used for compilation.
    pub fn syntax(mut self, syntax: &'static OnigSyntaxType) -> Self {
        self.syntax = syntax;
        self
    }

    /// Compiles the pattern with the accumulated options and syntax.
    ///
    /// # Errors
    ///
    /// Returns the [`RegexError`] produced by the compiler when the pattern
    /// is rejected.
    pub fn build(self) -> Result<Regex, RegexError> {
        let inner = onig_new(&self.pattern, self.options, &ONIG_ENCODING_UTF8, self.syntax)?;
        Ok(Regex { inner })
    }
}
/// A single match: a byte range inside the searched text.
#[derive(Debug, Clone, Copy)]
pub struct Match<'t> {
    /// The complete text that was searched.
    text: &'t [u8],
    /// Byte offset where the match begins (inclusive).
    start: usize,
    /// Byte offset where the match ends (exclusive).
    end: usize,
}

impl<'t> Match<'t> {
    /// Byte offset of the first byte of the match.
    pub fn start(&self) -> usize {
        self.start
    }

    /// Byte offset one past the last byte of the match.
    pub fn end(&self) -> usize {
        self.end
    }

    /// The matched span as a half-open byte range.
    pub fn range(&self) -> Range<usize> {
        Range {
            start: self.start,
            end: self.end,
        }
    }

    /// The matched bytes, borrowed from the searched text.
    pub fn as_bytes(&self) -> &'t [u8] {
        let span = self.range();
        &self.text[span]
    }

    /// The matched text as a string slice.
    ///
    /// # Panics
    ///
    /// Panics if the matched bytes are not valid UTF-8.
    pub fn as_str(&self) -> &'t str {
        let bytes = self.as_bytes();
        std::str::from_utf8(bytes).expect("match is not valid UTF-8")
    }

    /// Length of the match in bytes.
    pub fn len(&self) -> usize {
        self.end - self.start
    }

    /// Whether the match covers zero bytes.
    pub fn is_empty(&self) -> bool {
        self.end == self.start
    }
}
/// The capture groups of one successful search.
pub struct Captures<'t> {
    /// The complete text that was searched.
    text: &'t [u8],
    /// Begin/end offsets reported by the engine for each group.
    region: OnigRegion,
    /// The regex that produced this result; used for name lookups.
    regex: &'t Regex,
}

impl<'t> Captures<'t> {
    /// Returns group `i`, where group 0 is the whole match.
    ///
    /// Yields `None` when `i` is not below the region's register count or
    /// when the group's begin offset is `ONIG_REGION_NOTPOS`.
    pub fn get(&self, i: usize) -> Option<Match<'t>> {
        let group_count = self.region.num_regs as usize;
        if i >= group_count {
            return None;
        }
        let (beg, end) = (self.region.beg[i], self.region.end[i]);
        (beg != ONIG_REGION_NOTPOS).then(|| Match {
            text: self.text,
            start: beg as usize,
            end: end as usize,
        })
    }

    /// Returns the first participating group registered under `name`.
    ///
    /// Yields `None` when the name lookup fails or when none of the groups
    /// carrying that name took part in the match.
    pub fn name(&self, name: &str) -> Option<Match<'t>> {
        let group_numbers = onig_name_to_group_numbers(&self.regex.inner, name.as_bytes()).ok()?;
        group_numbers
            .into_iter()
            .find_map(|&number| self.get(number as usize))
    }

    /// Number of registers in the region (the whole-match group included).
    pub fn len(&self) -> usize {
        self.region.num_regs as usize
    }

    /// Whether the region holds no registers at all.
    pub fn is_empty(&self) -> bool {
        self.region.num_regs == 0
    }

    /// Iterates over all groups in index order, starting with group 0.
    pub fn iter(&self) -> CapturesIter<'_, 't> {
        CapturesIter {
            captures: self,
            index: 0,
        }
    }
}
impl std::fmt::Debug for Captures<'_> {
    /// Formats as a list with one `Option<Match>` entry per group.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let groups = (0..self.len()).map(|index| self.get(index));
        f.debug_list().entries(groups).finish()
    }
}
/// Iterator over the groups of a [`Captures`], in index order.
pub struct CapturesIter<'c, 't> {
    /// The capture set being walked.
    captures: &'c Captures<'t>,
    /// Index of the group returned by the next call to `next`.
    index: usize,
}

impl<'c, 't> Iterator for CapturesIter<'c, 't> {
    /// `None` items stand for groups that did not take part in the match.
    type Item = Option<Match<'t>>;

    /// Yields the next group and advances, or ends once every group was visited.
    fn next(&mut self) -> Option<Self::Item> {
        let current = self.index;
        (current < self.captures.len()).then(|| {
            self.index = current + 1;
            self.captures.get(current)
        })
    }

    /// Exact number of groups still to be yielded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let left = self.captures.len() - self.index;
        (left, Some(left))
    }
}

impl ExactSizeIterator for CapturesIter<'_, '_> {}
/// Iterator over successive matches of a [`Regex`] in one text.
pub struct FindIter<'r, 't> {
    /// The regex being applied.
    regex: &'r Regex,
    /// The complete text being searched.
    text: &'t [u8],
    /// Byte offset at which the next search starts.
    last_end: usize,
    /// Whether the most recently yielded match was empty.
    last_was_empty: bool,
}

impl<'r, 't> Iterator for FindIter<'r, 't> {
    type Item = Match<'t>;

    /// Finds the next match at or after the current position.
    ///
    /// An empty match is yielded once; if the following search again lands
    /// on an empty match, the position is moved forward by one encoded
    /// character and the search is repeated, so the iterator always makes
    /// progress.
    fn next(&mut self) -> Option<Match<'t>> {
        loop {
            let haystack = self.text;
            // Stepping over a character may move the position past the end.
            if self.last_end > haystack.len() {
                return None;
            }
            let (status, region) = onig_search(
                &self.regex.inner,
                haystack,
                haystack.len(),
                self.last_end,
                haystack.len(),
                Some(OnigRegion::new()),
                ONIG_OPTION_NONE,
            );
            if status < 0 {
                return None;
            }
            let region = region?;
            if region.num_regs < 1 {
                return None;
            }
            let start = region.beg[0] as usize;
            let end = region.end[0] as usize;
            let is_empty = start == end;

            if is_empty && self.last_was_empty {
                // Second empty match in a row: step over one character and
                // retry, unless the end of the text has been reached.
                if self.last_end >= haystack.len() {
                    return None;
                }
                let step = self.regex.inner.enc.mbc_enc_len(&haystack[self.last_end..]);
                self.last_end += step;
                self.last_was_empty = false;
                continue;
            }

            self.last_was_empty = is_empty;
            self.last_end = end;
            return Some(Match {
                text: haystack,
                start,
                end,
            });
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `find` reports the first run of digits with correct offsets.
    #[test]
    fn regex_new_and_find() {
        let digits = Regex::new(r"\d+").unwrap();
        let found = digits.find("hello 42 world").unwrap();
        assert_eq!(found.as_str(), "42");
        assert_eq!((found.start(), found.end()), (6, 8));
        assert_eq!(found.range(), 6..8);
        assert_eq!(found.len(), 2);
        assert!(!found.is_empty());
    }

    /// `find` yields `None` when nothing matches.
    #[test]
    fn regex_no_match() {
        let digits = Regex::new(r"\d+").unwrap();
        assert!(digits.find("no digits here").is_none());
    }

    /// `is_match` distinguishes matching from non-matching input.
    #[test]
    fn regex_is_match() {
        let hello = Regex::new(r"hello").unwrap();
        assert!(hello.is_match("say hello"));
        assert!(!hello.is_match("say goodbye"));
    }

    /// Numbered groups are retrievable; out-of-range indices give `None`.
    #[test]
    fn regex_captures() {
        let date = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
        let caps = date.captures("date: 2026-02-14").unwrap();
        let expected = ["2026-02-14", "2026", "02", "14"];
        for (index, want) in expected.iter().enumerate() {
            assert_eq!(caps.get(index).unwrap().as_str(), *want);
        }
        assert!(caps.get(4).is_none());
        assert_eq!(caps.len(), 4);
    }

    /// `captures_len` counts explicit groups only.
    #[test]
    fn regex_captures_len() {
        let abc = Regex::new(r"(a)(b)(c)").unwrap();
        assert_eq!(abc.captures_len(), 3);
    }

    /// `find_iter` walks every match in order.
    #[test]
    fn regex_find_iter() {
        let digits = Regex::new(r"\d+").unwrap();
        let found: Vec<&str> = digits
            .find_iter("1 + 22 = 333")
            .map(|m| m.as_str())
            .collect();
        assert_eq!(found, vec!["1", "22", "333"]);
    }

    /// The builder's case-insensitive switch affects matching.
    #[test]
    fn regex_builder_case_insensitive() {
        let hello = Regex::builder(r"hello")
            .case_insensitive(true)
            .build()
            .unwrap();
        for input in ["HELLO", "Hello"] {
            assert!(hello.is_match(input));
        }
    }

    /// An unbalanced group is reported as a syntax error.
    #[test]
    fn regex_invalid_pattern() {
        let err = Regex::new(r"(unclosed").unwrap_err();
        assert!(matches!(err, RegexError::Syntax { .. }));
    }

    /// `as_bytes` exposes the matched bytes.
    #[test]
    fn match_as_bytes() {
        let world = Regex::new(r"world").unwrap();
        let found = world.find("hello world").unwrap();
        assert_eq!(found.as_bytes(), b"world");
    }

    /// Iterating captures yields `None` for a group that did not participate.
    #[test]
    fn captures_iter() {
        let re = Regex::new(r"(a)(b)?").unwrap();
        let caps = re.captures("a").unwrap();
        let participated: Vec<bool> = caps.iter().map(|group| group.is_some()).collect();
        assert_eq!(participated, vec![true, true, false]);
    }

    /// Groups can be fetched by name; unknown names give `None`.
    #[test]
    fn named_captures() {
        let re = Regex::new(r"(?<year>\d{4})-(?<month>\d{2})").unwrap();
        let caps = re.captures("2026-02").unwrap();
        assert_eq!(caps.name("year").unwrap().as_str(), "2026");
        assert_eq!(caps.name("month").unwrap().as_str(), "02");
        assert!(caps.name("day").is_none());
    }

    /// The empty pattern matches once at every position, including the end.
    #[test]
    fn empty_match_find_iter() {
        let empty = Regex::new(r"").unwrap();
        let starts: Vec<usize> = empty.find_iter("ab").map(|m| m.start()).collect();
        assert_eq!(starts, vec![0, 1, 2]);
    }
}