/// Payload of a [`StopDecision::Continue`]: how much of the text may be released.
///
/// `safe_len` is a byte length into the text that was inspected. As produced by
/// [`StopMatcher::step`], it is the full text length minus any trailing bytes
/// that are held back because they could still grow into a stop string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ContinuePayload {
    safe_len: usize,
}

impl ContinuePayload {
    /// Creates a payload reporting that the first `safe_len` bytes are safe.
    #[must_use]
    pub fn new(safe_len: usize) -> Self {
        Self { safe_len }
    }

    /// Returns the number of leading bytes that are safe to release.
    // Plain `#[inline]` is enough for a one-field getter; forcing inlining
    // with `always` only takes the decision away from the optimizer.
    #[inline]
    #[must_use]
    pub fn safe_len(&self) -> usize {
        self.safe_len
    }
}
/// Payload of a [`StopDecision::Stop`]: where the text ends and which stop
/// string ended it.
///
/// As produced by [`StopMatcher::step`], `trimmed_len` is the byte offset at
/// which the matched stop string begins, i.e. the length of the text once the
/// stop string and everything after it are cut off.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StopPayload {
    trimmed_len: usize,
    stop: String,
}

impl StopPayload {
    /// Creates a payload for a match of `stop` that leaves `trimmed_len` bytes
    /// of text.
    ///
    /// `stop` is converted into an owned `String`.
    #[must_use]
    pub fn new(trimmed_len: usize, stop: impl Into<String>) -> Self {
        Self {
            trimmed_len,
            stop: stop.into(),
        }
    }

    /// Returns the byte length of the text that precedes the stop string.
    // Plain `#[inline]` suffices for trivial getters; `always` is unnecessary.
    #[inline]
    #[must_use]
    pub fn trimmed_len(&self) -> usize {
        self.trimmed_len
    }

    /// Returns the stop string that was matched.
    #[inline]
    #[must_use]
    pub fn stop(&self) -> &str {
        &self.stop
    }
}
/// Outcome of checking a piece of text against the configured stop strings.
///
/// The enum is `#[non_exhaustive]`, so code in other crates must include a
/// wildcard arm when matching on it.
///
/// NOTE(review): the `derive_more` derives are expected to generate `is_*`
/// predicates plus `unwrap_*` / `try_unwrap_*` accessors (by value, and the
/// `ref` / `ref_mut` forms requested below) — confirm the exact method names
/// against the `derive_more` version in use.
#[derive(
Debug, Clone, PartialEq, Eq, derive_more::IsVariant, derive_more::Unwrap, derive_more::TryUnwrap,
)]
#[unwrap(ref, ref_mut)]
#[try_unwrap(ref, ref_mut)]
#[non_exhaustive]
pub enum StopDecision {
/// No stop string was found; the payload says how many leading bytes are
/// safe to release.
Continue(ContinuePayload),
/// A stop string was found; the payload carries the trimmed length and the
/// stop string that matched.
Stop(StopPayload),
}
/// Checks accumulated text against a fixed set of stop strings.
///
/// The matcher keeps no state between calls: every [`StopMatcher::step`] call
/// inspects the complete text it is given.
#[derive(Debug, Clone)]
pub struct StopMatcher {
    /// Non-empty stop strings, in the order the caller supplied them.
    stops: Vec<String>,
    /// Byte length of the longest stop string, or 0 when there are none.
    max_len: usize,
}

impl StopMatcher {
    /// Builds a matcher from `stop_strings`, discarding empty strings.
    ///
    /// The relative order of the remaining strings is preserved; it decides
    /// which stop string is reported when several match at the same offset.
    pub fn new<I, S>(stop_strings: I) -> Self
    where
        I: IntoIterator<Item = S>,
        S: Into<String>,
    {
        let mut stops = Vec::new();
        let mut max_len = 0;
        for candidate in stop_strings {
            let candidate: String = candidate.into();
            if candidate.is_empty() {
                continue;
            }
            max_len = max_len.max(candidate.len());
            stops.push(candidate);
        }
        Self { stops, max_len }
    }

    /// Returns `true` when at least one non-empty stop string is configured.
    pub fn is_active(&self) -> bool {
        !self.stops.is_empty()
    }

    /// Evaluates `full_text` against the stop strings.
    ///
    /// * If any stop string occurs in `full_text`, returns
    ///   [`StopDecision::Stop`] for the occurrence with the smallest byte
    ///   offset. When several stop strings start at that same offset, the one
    ///   listed first at construction wins. The payload's length is that
    ///   offset.
    /// * Otherwise returns [`StopDecision::Continue`] whose safe length is
    ///   `full_text.len()` minus the longest trailing piece of `full_text`
    ///   that is a proper prefix of some stop string.
    /// * With no stop strings configured, the whole text is always safe.
    pub fn step(&self, full_text: &str) -> StopDecision {
        if !self.is_active() {
            return StopDecision::Continue(ContinuePayload::new(full_text.len()));
        }

        // `min_by_key` yields the first of several equal minima, which keeps
        // configuration order as the tie-breaker for matches at the same offset.
        let earliest = self
            .stops
            .iter()
            .filter_map(|stop| {
                full_text
                    .find(stop.as_str())
                    .map(|at| (at, stop.as_str()))
            })
            .min_by_key(|&(at, _)| at);

        match earliest {
            Some((at, stop)) => StopDecision::Stop(StopPayload::new(at, stop)),
            None => {
                let withheld = self.held_back_suffix(full_text);
                StopDecision::Continue(ContinuePayload::new(full_text.len() - withheld))
            }
        }
    }

    /// Returns the byte length of the longest suffix of `text` that is a
    /// proper prefix of some stop string, or 0 if there is none.
    ///
    /// Only suffixes that start on a char boundary are considered, so the
    /// result never splits a multi-byte character.
    fn held_back_suffix(&self, text: &str) -> usize {
        let total = text.len();
        // A held-back suffix must be strictly shorter than the stop string it
        // might grow into, so `max_len - 1` bounds the search.
        let longest = self.max_len.saturating_sub(1).min(total);

        (1..=longest)
            .rev()
            .filter(|&len| text.is_char_boundary(total - len))
            .find(|&len| {
                let tail = text[total - len..].as_bytes();
                self.stops
                    .iter()
                    .any(|stop| stop.len() > tail.len() && stop.as_bytes().starts_with(tail))
            })
            .unwrap_or(0)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a matcher from a slice of string literals.
    fn matcher(stops: &[&str]) -> StopMatcher {
        StopMatcher::new(stops.iter().copied())
    }

    /// Expected decision when `safe_len` bytes may be released.
    fn keep_going(safe_len: usize) -> StopDecision {
        StopDecision::Continue(ContinuePayload::new(safe_len))
    }

    /// Expected decision when `stop` matched at byte offset `trimmed_len`.
    fn stopped(trimmed_len: usize, stop: &str) -> StopDecision {
        StopDecision::Stop(StopPayload::new(trimmed_len, stop))
    }

    // A matcher without stop strings is inactive and releases everything.
    #[test]
    fn inert_when_empty() {
        let text = "anything at all";
        let m = matcher(&[]);
        assert!(!m.is_active());
        assert_eq!(m.step(text), keep_going(text.len()));
    }

    // Empty stop strings are filtered out at construction.
    #[test]
    fn empty_strings_are_dropped() {
        let m = matcher(&["", ""]);
        assert!(!m.is_active());
        assert_eq!(m.step("x"), keep_going(1));
    }

    // A match reports the offset at which the stop string begins.
    #[test]
    fn simple_match_trims_at_start() {
        let m = matcher(&["STOP"]);
        assert_eq!(m.step("abcSTOPdef"), stopped(3, "STOP"));
    }

    // A trailing partial prefix of a stop string is withheld.
    #[test]
    fn no_match_holds_back_partial_prefix() {
        let m = matcher(&["STOP"]);
        assert_eq!(m.step("abcST"), keep_going(3));
        assert_eq!(m.step("abc"), keep_going(3));
    }

    // Once the partial prefix can no longer complete, the text is released.
    #[test]
    fn partial_then_diverge_releases_held_text() {
        let m = matcher(&["STOP"]);
        assert_eq!(m.step("xxST"), keep_going(2));
        assert_eq!(m.step("xxSTX"), keep_going(5));
    }

    // The earliest occurrence wins regardless of configuration order.
    #[test]
    fn first_match_wins_earliest_start() {
        let m = matcher(&["bar", "foo"]);
        assert_eq!(m.step("foobar"), stopped(0, "foo"));
    }

    // Matches at the same offset are resolved by configuration order.
    #[test]
    fn first_match_wins_tie_broken_by_order() {
        let m = matcher(&["ab", "abc"]);
        assert_eq!(m.step("abc"), stopped(0, "ab"));
    }

    // The withheld suffix covers the whole two-byte character.
    #[test]
    fn multibyte_held_back_suffix_is_char_safe() {
        let m = matcher(&["é!"]);
        let decision = m.step("abé");
        assert_eq!(decision, keep_going(2));
    }

    // A match on a multi-byte stop string reports a char-boundary offset.
    #[test]
    fn multibyte_match_trims_at_char_boundary() {
        let m = matcher(&["é!"]);
        assert_eq!(m.step("abé!cd"), stopped(2, "é!"));
    }
}