daggrs 0.1.1

A fast Double-Array Aho-Corasick implementation for multi-pattern matching
Documentation
use crate::types::Match;

use super::{Trie, DEAD_STATE};

/// Iterator over matches found in text using the trie.
pub struct TrieFindIter<'a> {
    trie: &'a Trie,
    text: &'a [u8],
    pos: usize,
    state: usize,
    outpos: u32,
}

impl<'a> TrieFindIter<'a> {
    pub(super) fn new(trie: &'a Trie, text: &'a [u8]) -> Self {
        Self {
            trie,
            text,
            pos: 0,
            state: 0,
            outpos: u32::MAX,
        }
    }

    #[inline]
    fn next_overlapping(&mut self) -> Option<Match> {
        // Yield pending outputs from output chain
        if self.outpos != u32::MAX {
            let output = &self.trie.outputs[self.outpos as usize];
            self.outpos = output.parent;
            return Some(Match {
                pattern_id: output.pattern_id,
                start: self.pos - output.length as usize,
                end: self.pos,
            });
        }

        // Process input bytes
        while self.pos < self.text.len() {
            let byte = self.text[self.pos];
            self.pos += 1;

            // Follow failure links until we find a transition or reach root
            while self.state != 0 && !self.trie.states[self.state].edges.contains_key(&byte) {
                self.state = self.trie.states[self.state].fail as usize;
            }
            self.state = self.trie.states[self.state]
                .edges
                .get(&byte)
                .copied()
                .unwrap_or(0) as usize;

            // Check for outputs
            self.outpos = self.trie.states[self.state].outpos.unwrap_or(u32::MAX);
            if self.outpos != u32::MAX {
                let output = &self.trie.outputs[self.outpos as usize];
                self.outpos = output.parent;
                return Some(Match {
                    pattern_id: output.pattern_id,
                    start: self.pos - output.length as usize,
                    end: self.pos,
                });
            }
        }

        None
    }

    #[inline]
    fn next_leftmost(&mut self) -> Option<Match> {
        let mut state: usize = 0;
        let mut last_outpos: Option<u32> = None;

        for read_pos in self.pos..self.text.len() {
            let byte = self.text[read_pos];

            // Transition with failure link fallback
            loop {
                if let Some(&next) = self.trie.states[state].edges.get(&byte) {
                    state = next as usize;
                    break;
                } else if state == 0 {
                    break;
                } else {
                    let fail = self.trie.states[state].fail;
                    if fail == DEAD_STATE {
                        if let Some(outpos) = last_outpos {
                            let output = &self.trie.outputs[outpos as usize];
                            return Some(Match {
                                pattern_id: output.pattern_id,
                                start: self.pos - output.length as usize,
                                end: self.pos,
                            });
                        }
                        state = 0;
                        break;
                    }
                    state = fail as usize;
                }
            }

            if let Some(outpos) = self.trie.states[state].outpos {
                last_outpos = Some(outpos);
                self.pos = read_pos + 1;
            }
        }

        // Yield any remaining match at end of text
        if let Some(outpos) = last_outpos {
            let output = &self.trie.outputs[outpos as usize];
            return Some(Match {
                pattern_id: output.pattern_id,
                start: self.pos - output.length as usize,
                end: self.pos,
            });
        }

        None
    }
}

impl Iterator for TrieFindIter<'_> {
    type Item = Match;

    #[inline]
    fn next(&mut self) -> Option<Match> {
        match self.trie.match_kind {
            crate::types::MatchKind::Overlapping => self.next_overlapping(),
            crate::types::MatchKind::LeftmostFirst
            | crate::types::MatchKind::LeftmostLongest
            | crate::types::MatchKind::WordPiece => self.next_leftmost(),
        }
    }
}