use std::io::Read;
use super::FuzzyRegex;
use crate::engine::FuzzyBridge;
/// One match reported by the streaming / byte-oriented matchers in this module.
///
/// The value is a plain record: it stores the offsets and quality figures it
/// was constructed with and exposes them through read-only accessors.
#[derive(Debug, Clone)]
pub struct StreamingMatch {
    /// Offset at which the match begins.
    start: usize,
    /// Offset at which the match ends (`len()` is `end - start`).
    end: usize,
    /// Number of edits recorded for the match.
    edits: u8,
    /// Similarity score recorded for the match.
    similarity: f32,
}

impl StreamingMatch {
    /// Builds a match record from its four components, stored verbatim.
    #[inline]
    pub(crate) fn new(start: usize, end: usize, edits: u8, similarity: f32) -> Self {
        Self { start, end, edits, similarity }
    }

    /// Offset at which the match begins.
    #[inline]
    #[must_use]
    pub fn start(&self) -> usize {
        let Self { start, .. } = *self;
        start
    }

    /// Offset at which the match ends.
    #[inline]
    #[must_use]
    pub fn end(&self) -> usize {
        let Self { end, .. } = *self;
        end
    }

    /// Number of edits recorded for this match.
    #[inline]
    #[must_use]
    pub fn edits(&self) -> u8 {
        let Self { edits, .. } = *self;
        edits
    }

    /// Similarity score recorded for this match.
    #[inline]
    #[must_use]
    pub fn similarity(&self) -> f32 {
        let Self { similarity, .. } = *self;
        similarity
    }

    /// Length of the matched span, i.e. `end - start`.
    #[inline]
    #[must_use]
    pub fn len(&self) -> usize {
        let Self { start, end, .. } = *self;
        end - start
    }

    /// Returns `true` when the match covers no bytes (`start == end`).
    #[inline]
    #[must_use]
    pub fn is_empty(&self) -> bool {
        let Self { start, end, .. } = *self;
        end == start
    }
}
/// Incremental matcher that searches a byte stream one chunk at a time.
///
/// A bounded tail of already-fed input is carried between calls and prepended
/// to the next chunk, so a match can straddle a chunk boundary.
pub struct StreamingMatcher<'r> {
/// Pattern being searched for; borrowed for the lifetime of the matcher.
regex: &'r FuzzyRegex,
/// Tail of previously fed bytes that is prepended to the next chunk.
buffer: Vec<u8>,
/// Total number of bytes fed so far; used to turn positions inside the
/// search window into offsets relative to the start of the stream.
global_offset: usize,
/// Upper bound on the number of bytes kept in `buffer`.
max_buffer_size: usize,
/// Threshold forwarded to the fuzzy engine on every search.
threshold: f32,
/// Matches produced by the most recent `feed` / `finish` call.
pending_matches: Vec<StreamingMatch>,
}
impl<'r> StreamingMatcher<'r> {
    /// Creates a matcher for `regex`.
    ///
    /// `threshold` is forwarded to the fuzzy engine on every search; the plain
    /// string fallback does not use it.
    pub(crate) fn new(regex: &'r FuzzyRegex, threshold: f32) -> Self {
        // Overlap kept between chunks: longest pattern + allowed edits + a small
        // margin. 64 and 2 are used when the regex does not report these values.
        let max_buffer_size =
            regex.max_pattern_len().unwrap_or(64) + regex.max_edits().unwrap_or(2) as usize + 4;
        Self {
            regex,
            buffer: Vec::with_capacity(max_buffer_size),
            global_offset: 0,
            max_buffer_size,
            threshold,
            pending_matches: Vec::new(),
        }
    }

    /// Searches `chunk`, preceded by the retained tail of earlier input, and
    /// returns an iterator over the matches found by this call.
    ///
    /// Reported offsets are relative to the start of the stream. A match that
    /// ends inside the retained tail is not reported, because those bytes were
    /// already searched by an earlier call. An empty `chunk` yields no matches
    /// and leaves the matcher state untouched.
    pub fn feed(&mut self, chunk: &[u8]) -> FeedMatches<'_> {
        self.pending_matches.clear();
        if chunk.is_empty() {
            return FeedMatches {
                matches: &self.pending_matches,
                index: 0,
            };
        }

        let overlap_len = self.buffer.len();
        // `buffer` always holds the bytes immediately preceding `global_offset`.
        let window_offset = self.global_offset - overlap_len;
        if overlap_len == 0 {
            // Nothing carried over: search the caller's slice directly, no copy.
            self.search_bytes(chunk, window_offset, 0);
        } else {
            // The window must be owned because searching needs `&mut self`.
            let window = [self.buffer.as_slice(), chunk].concat();
            self.search_bytes(&window, window_offset, overlap_len);
        }

        self.retain_tail(chunk);
        self.global_offset += chunk.len();
        FeedMatches {
            matches: &self.pending_matches,
            index: 0,
        }
    }

    /// Searches whatever tail is still buffered, clears it, and returns the
    /// match found there, if any. Returns `None` when nothing is buffered.
    ///
    /// NOTE(review): the buffered tail was already part of the window searched
    /// by `feed`, so the returned match may repeat one that `feed` yielded.
    pub fn finish(&mut self) -> Option<StreamingMatch> {
        if self.buffer.is_empty() {
            return None;
        }
        self.pending_matches.clear();
        // Move the tail out instead of cloning it; searching needs `&mut self`.
        let mut tail = std::mem::take(&mut self.buffer);
        let tail_offset = self.global_offset - tail.len();
        self.search_bytes(&tail, tail_offset, 0);
        // Hand the (now emptied) allocation back so a later `feed` can reuse it.
        tail.clear();
        self.buffer = tail;
        self.pending_matches.pop()
    }

    /// Returns the matcher to its initial state: no buffered bytes, offset 0,
    /// no pending matches.
    pub fn reset(&mut self) {
        self.buffer.clear();
        self.global_offset = 0;
        self.pending_matches.clear();
    }

    /// Number of bytes fed since construction or the last `reset`.
    #[inline]
    #[must_use]
    pub fn position(&self) -> usize {
        self.global_offset
    }

    /// Updates `buffer` so it holds the last `max_buffer_size` bytes of the
    /// stream after `chunk` has been consumed.
    ///
    /// Bytes from earlier chunks are kept when `chunk` alone is shorter than
    /// the overlap window; otherwise a pattern spread over several short
    /// chunks could never be seen in one piece.
    fn retain_tail(&mut self, chunk: &[u8]) {
        let limit = self.max_buffer_size;
        if chunk.len() >= limit {
            // The chunk alone fills the window; older bytes are irrelevant.
            self.buffer.clear();
            self.buffer.extend_from_slice(&chunk[chunk.len() - limit..]);
        } else {
            // Drop just enough of the oldest bytes to make room for the chunk.
            // `buffer.len() <= limit` and `chunk.len() < limit`, so `excess`
            // never exceeds `buffer.len()`.
            let excess = (self.buffer.len() + chunk.len()).saturating_sub(limit);
            self.buffer.drain(..excess);
            self.buffer.extend_from_slice(chunk);
        }
    }

    /// Runs one search over `data`, using the fuzzy bridge when the regex has
    /// one and the string-based search otherwise.
    ///
    /// In the fallback path `data` must be valid UTF-8; if it is not, nothing
    /// is searched and no match is recorded.
    fn search_bytes(&mut self, data: &[u8], base_offset: usize, skip_before: usize) {
        if let Some(bridge) = self.regex.fuzzy_bridge() {
            self.search_with_bridge(bridge, data, base_offset, skip_before);
        } else if let Ok(text) = std::str::from_utf8(data) {
            self.search_string_fallback(text, base_offset, skip_before);
        }
    }

    /// Asks the fuzzy bridge for a hit in `data` and records it, shifted by
    /// `base_offset`, unless it ends at or before `skip_before`.
    ///
    /// NOTE(review): only a single hit per call is examined. If that hit lies
    /// entirely inside the overlap it is discarded and any later hit in the
    /// same window is not looked for — confirm whether that is intended.
    fn search_with_bridge(
        &mut self,
        bridge: &FuzzyBridge,
        data: &[u8],
        base_offset: usize,
        skip_before: usize,
    ) {
        if let Some((_pattern_idx, start, result)) =
            bridge.find_first_multi_pattern_individual(data, self.threshold, &[0])
        {
            if result.end > skip_before {
                self.pending_matches.push(StreamingMatch::new(
                    base_offset + start,
                    base_offset + result.end,
                    result.total_edits(),
                    result.similarity,
                ));
            }
        }
    }

    /// String-based counterpart of `search_with_bridge`; a recorded match gets
    /// 0 edits and a similarity of 1.0.
    fn search_string_fallback(&mut self, text: &str, base_offset: usize, skip_before: usize) {
        if let Some(m) = self.regex.find(text) {
            if m.end() > skip_before {
                self.pending_matches.push(StreamingMatch::new(
                    base_offset + m.start(),
                    base_offset + m.end(),
                    0,
                    1.0,
                ));
            }
        }
    }

    /// Consumes the matcher and returns an iterator that reads `reader` in
    /// chunks and yields the matches found.
    pub fn search_reader<R: Read>(self, reader: R) -> ReaderMatches<'r, R> {
        ReaderMatches::new(self, reader)
    }
}
/// Iterator over the matches produced by a single `StreamingMatcher::feed`
/// call. It borrows the matcher's match list and yields clones of its entries.
pub struct FeedMatches<'a> {
/// Matches found by the call that created this iterator.
matches: &'a [StreamingMatch],
/// Index of the next entry of `matches` to yield.
index: usize,
}
impl Iterator for FeedMatches<'_> {
    type Item = StreamingMatch;

    /// Yields a clone of the next stored match, or `None` once all have been
    /// handed out.
    fn next(&mut self) -> Option<Self::Item> {
        let found = self.matches.get(self.index)?.clone();
        self.index += 1;
        Some(found)
    }

    /// Exact number of matches still to be yielded.
    ///
    /// This type also implements `ExactSizeIterator`, whose contract requires
    /// `size_hint` to report the exact remaining length; the default
    /// `(0, None)` would violate it.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // `index` only advances after a successful `get`, so it never exceeds `len()`.
        let remaining = self.matches.len() - self.index;
        (remaining, Some(remaining))
    }
}
impl ExactSizeIterator for FeedMatches<'_> {
    /// Number of matches that have not been yielded yet.
    fn len(&self) -> usize {
        let total = self.matches.len();
        let consumed = self.index;
        total - consumed
    }
}
/// Iterator that pulls bytes from a reader, feeds them to a
/// `StreamingMatcher`, and yields the matches it reports.
pub struct ReaderMatches<'r, R: Read> {
/// Matcher that receives every chunk read from `reader`.
matcher: StreamingMatcher<'r>,
/// Source of the bytes to search.
reader: R,
/// Scratch buffer each `read` call fills.
buffer: Vec<u8>,
/// Length `buffer` was allocated with.
chunk_size: usize,
/// Matches from the most recent chunk, waiting to be yielded.
pending: Vec<StreamingMatch>,
/// Index of the next entry of `pending` to yield.
pending_index: usize,
/// Set once the reader has reported end of input or an error.
finished: bool,
}
impl<'r, R: Read> ReaderMatches<'r, R> {
    /// Read-buffer size used unless `with_chunk_size` overrides it.
    const DEFAULT_CHUNK_SIZE: usize = 8192;

    /// Wraps `matcher` and `reader` with a read buffer of the default size.
    fn new(matcher: StreamingMatcher<'r>, reader: R) -> Self {
        let chunk_size = Self::DEFAULT_CHUNK_SIZE;
        Self {
            matcher,
            reader,
            buffer: vec![0u8; chunk_size],
            chunk_size,
            pending: Vec::new(),
            pending_index: 0,
            finished: false,
        }
    }

    /// Replaces the read buffer with one of `size` bytes.
    ///
    /// A `size` of 0 is raised to 1: reading into an empty buffer returns
    /// `Ok(0)`, which the iterator treats as end of input, so the stream would
    /// silently never be scanned.
    #[must_use]
    pub fn with_chunk_size(mut self, size: usize) -> Self {
        let size = size.max(1);
        self.chunk_size = size;
        // Reuse the existing allocation where possible; contents are zeroed.
        self.buffer.clear();
        self.buffer.resize(size, 0);
        self
    }
}
impl<R: Read> Iterator for ReaderMatches<'_, R> {
    type Item = StreamingMatch;

    /// Yields the next match, reading further chunks from the reader as needed.
    ///
    /// Matches queued from the last chunk are drained first. When the reader
    /// reports end of input, the matcher's `finish` result (if any) is yielded
    /// and iteration ends. A read error also ends iteration; the error itself
    /// cannot be surfaced through this iterator's item type.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if let Some(queued) = self.pending.get(self.pending_index) {
                self.pending_index += 1;
                return Some(queued.clone());
            }
            if self.finished {
                return None;
            }
            match self.reader.read(&mut self.buffer) {
                Ok(0) => {
                    self.finished = true;
                    return self.matcher.finish();
                }
                Ok(n) => {
                    self.pending.clear();
                    self.pending_index = 0;
                    self.pending.extend(self.matcher.feed(&self.buffer[..n]));
                }
                // `Interrupted` is a transient condition that the `Read`
                // contract says should be retried, not treated as failure.
                Err(err) if err.kind() == std::io::ErrorKind::Interrupted => {}
                Err(_) => {
                    self.finished = true;
                    return None;
                }
            }
        }
    }
}
/// Iterator over successive matches of a regex in an in-memory byte slice.
pub struct ByteMatches<'r, 't> {
/// Pattern being searched for.
regex: &'r FuzzyRegex,
/// Complete haystack; each search looks at `text[last_end..]`.
text: &'t [u8],
/// Offset into `text` at which the next search starts.
last_end: usize,
}
impl<'r, 't> ByteMatches<'r, 't> {
    /// Creates an iterator over matches of `regex` in `text`, with the first
    /// search starting at offset 0.
    pub(crate) fn new(regex: &'r FuzzyRegex, text: &'t [u8]) -> Self {
        let last_end = 0;
        Self { regex, text, last_end }
    }
}
impl Iterator for ByteMatches<'_, '_> {
    type Item = StreamingMatch;

    /// Searches the part of the text after the previous match and yields the
    /// hit found there, with offsets relative to the start of the whole text.
    ///
    /// The fuzzy bridge is used when the regex has one (with a threshold of
    /// 0.0); otherwise the remaining bytes are searched as a string, which
    /// finds nothing if they are not valid UTF-8. String matches are reported
    /// with 0 edits and a similarity of 1.0.
    fn next(&mut self) -> Option<Self::Item> {
        let origin = self.last_end;
        if origin >= self.text.len() {
            return None;
        }
        let haystack = &self.text[origin..];

        // Reduce both search paths to (start, end, edits, similarity), with
        // positions relative to `haystack`.
        let hit: Option<(usize, usize, u8, f32)> =
            if let Some(bridge) = self.regex.fuzzy_bridge() {
                bridge
                    .find_first_multi_pattern_individual(haystack, 0.0, &[0])
                    .map(|(_pattern_idx, start, result)| {
                        (start, result.end, result.total_edits(), result.similarity)
                    })
            } else {
                std::str::from_utf8(haystack)
                    .ok()
                    .and_then(|text| self.regex.find(text))
                    .map(|m| (m.start(), m.end(), 0, 1.0))
            };

        match hit {
            Some((rel_start, rel_end, edits, similarity)) => {
                let abs_start = origin + rel_start;
                let abs_end = origin + rel_end;
                // Always advance by at least one byte so an empty match cannot
                // be returned forever.
                self.last_end = abs_end.max(origin + 1);
                Some(StreamingMatch::new(abs_start, abs_end, edits, similarity))
            }
            None => {
                // Nothing left to find: mark the iterator exhausted so later
                // calls return immediately instead of rescanning the tail.
                self.last_end = self.text.len();
                None
            }
        }
    }
}