use crate::model::filesystem::{FileSearchCursor, FileSearchOptions, FileSystem, SearchMatch};
use regex::bytes::Regex;
use std::io;
use std::path::PathBuf;
use super::LineScanChunk;
/// State carried between steps of a chunk-by-chunk regex search.
///
/// The code that advances this state lives outside this block; the notes
/// below state only what `is_done` / `progress_percent` demonstrate and mark
/// everything else as an assumption to confirm.
#[derive(Debug)]
pub struct ChunkedSearchState {
/// Chunks to scan; `is_done` compares `next_chunk` against this list's length.
pub chunks: Vec<LineScanChunk>,
/// Index into `chunks`; once it reaches `chunks.len()` the search is done.
pub next_chunk: usize,
/// NOTE(review): presumably the document byte offset of the next chunk — confirm.
pub next_doc_offset: usize,
/// Denominator of `progress_percent`; when zero, progress reports 100.
pub total_bytes: usize,
/// Numerator of `progress_percent`.
pub scanned_bytes: usize,
/// Compiled byte-oriented regex for the query.
pub regex: regex::bytes::Regex,
/// NOTE(review): presumably the matches collected so far — confirm.
pub matches: Vec<SearchMatch>,
/// NOTE(review): presumably trailing bytes of the previous chunk, kept so
/// matches spanning a chunk boundary can be found — confirm against the stepper.
pub overlap_tail: Vec<u8>,
/// NOTE(review): presumably the document offset of `overlap_tail[0]` — confirm.
pub overlap_doc_offset: usize,
/// NOTE(review): presumably the cap on `matches.len()` — confirm.
pub max_matches: usize,
/// When true, `is_done` reports completion even if chunks remain.
/// NOTE(review): presumably set when `max_matches` is reached — confirm.
pub capped: bool,
/// NOTE(review): presumably the query length in bytes, used to size the overlap — confirm.
pub query_len: usize,
/// NOTE(review): presumably the line number at the current scan position — confirm.
pub(crate) running_line: usize,
}
impl ChunkedSearchState {
    /// Returns `true` once every chunk has been consumed or the search was
    /// stopped early because `capped` is set.
    pub fn is_done(&self) -> bool {
        self.capped || self.next_chunk >= self.chunks.len()
    }

    /// Returns `scanned_bytes` as a whole-number percentage of `total_bytes`
    /// (rounded down).
    ///
    /// An empty document (`total_bytes == 0`) reports 100. The result is not
    /// clamped: if `scanned_bytes` exceeds `total_bytes` the value is above 100.
    pub fn progress_percent(&self) -> usize {
        if self.total_bytes == 0 {
            return 100;
        }
        // Widen before multiplying: `scanned_bytes * 100` overflows `usize`
        // for large inputs (already past ~42 MB on 32-bit targets), which
        // panics in debug builds and yields a bogus percentage in release.
        let percent = (self.scanned_bytes as u128 * 100) / (self.total_bytes as u128);
        percent.min(usize::MAX as u128) as usize
    }
}
/// One contiguous piece of the document that `HybridSearchPlan::execute`
/// searches, either through the file system or directly in memory.
#[derive(Debug)]
pub(crate) enum SearchRegion {
/// Region whose bytes are fetched from the file at `file_path`; `execute`
/// searches it through `FileSystem::search_file` / `read_range`.
Unloaded {
/// Byte offset of the region's start within the file.
file_offset: usize,
/// Length of the region in bytes.
bytes: usize,
/// Document offset of the region's first byte; match offsets reported
/// relative to the file are rebased onto this.
doc_offset: usize,
},
/// Region whose bytes are already in memory (`data`); `execute` runs the
/// regex over them directly. `doc_offset` is the document offset of `data[0]`.
Loaded { data: Vec<u8>, doc_offset: usize },
}
/// A search plan over a document described as an ordered list of regions,
/// some still on disk and some held in memory. See `execute`.
#[derive(Debug)]
pub struct HybridSearchPlan {
/// Path handed to `FileSystem::search_file` / `read_range` for unloaded regions.
pub(crate) file_path: PathBuf,
/// Regions visited in order by `execute`.
/// NOTE(review): the boundary-overlap logic assumes consecutive regions are
/// adjacent in document order — confirm against the plan builder.
pub(crate) regions: Vec<SearchRegion>,
}
impl HybridSearchPlan {
    /// Searches every region of the plan in order and returns at most
    /// `max_matches` matches.
    ///
    /// * `fs` – file system used to search and read unloaded regions.
    /// * `pattern` / `opts` – forwarded to `FileSystem::search_file`.
    /// * `regex` – compiled pattern used for in-memory (loaded) regions and
    ///   for matches that straddle a region boundary.
    /// * `max_matches` – overall cap on the number of returned matches.
    /// * `query_len` – the boundary overlap kept between regions is
    ///   `max(query_len, 256)` bytes.
    ///
    /// Behaviour by plan shape:
    /// * no regions: returns an empty vector;
    /// * exactly one `Unloaded` region: `search_file` is driven with a fresh
    ///   `FileSearchCursor::new()` until it reports done or the cap is hit,
    ///   and its matches are returned unmodified;
    /// * otherwise: regions are walked in order, carrying the running line
    ///   number and the tail bytes of the previous region so that matches
    ///   crossing a region boundary are still found.
    ///
    /// # Errors
    /// Propagates any error returned by `FileSystem::search_file`. Failures of
    /// `FileSystem::read_range` are tolerated: the affected boundary check is
    /// skipped (or the carried tail becomes empty).
    pub fn execute(
        &self,
        fs: &dyn FileSystem,
        pattern: &str,
        opts: &FileSearchOptions,
        regex: &Regex,
        max_matches: usize,
        query_len: usize,
    ) -> io::Result<Vec<SearchMatch>> {
        if self.regions.is_empty() {
            return Ok(Vec::new());
        }

        // Fast path: a single unloaded region is searched straight through
        // the file system with a default cursor.
        if let [SearchRegion::Unloaded { .. }] = self.regions.as_slice() {
            let mut cursor = FileSearchCursor::new();
            let mut all_matches = Vec::new();
            while !cursor.done && all_matches.len() < max_matches {
                let batch = fs.search_file(&self.file_path, pattern, opts, &mut cursor)?;
                all_matches.extend(batch);
            }
            all_matches.truncate(max_matches);
            return Ok(all_matches);
        }

        let overlap_size = query_len.max(256);
        let mut all_matches: Vec<SearchMatch> = Vec::new();
        // Line number at the start of the region about to be processed.
        let mut running_line: usize = 1;
        // Trailing bytes of the previous region, used to catch matches that
        // start in one region and end in the next.
        let mut prev_tail: Vec<u8> = Vec::new();

        for region in &self.regions {
            if all_matches.len() >= max_matches {
                break;
            }
            let remaining = max_matches - all_matches.len();

            match region {
                SearchRegion::Unloaded {
                    file_offset,
                    bytes,
                    doc_offset: region_doc_offset,
                } => {
                    // The range-bounded file search below cannot see bytes of
                    // the previous region, so boundary-spanning matches are
                    // looked for separately (best effort: skipped on read error).
                    if !prev_tail.is_empty() {
                        let overlap_read = (*bytes).min(overlap_size);
                        if let Ok(head) =
                            fs.read_range(&self.file_path, *file_offset as u64, overlap_read)
                        {
                            let boundary = search_boundary_overlap(
                                &prev_tail,
                                &head,
                                *region_doc_offset - prev_tail.len(),
                                running_line,
                                regex,
                                remaining,
                            );
                            all_matches.extend(boundary);
                        }
                    }

                    // Budget for this region = overall cap minus everything
                    // collected so far (including boundary matches just added).
                    // `remaining` was computed before the boundary search and
                    // already has the earlier matches subtracted, so it must
                    // not be reduced by `all_matches.len()` a second time.
                    let mut opts_bounded = opts.clone();
                    opts_bounded.max_matches = max_matches.saturating_sub(all_matches.len());

                    let mut cursor = FileSearchCursor::for_range(
                        *file_offset,
                        *file_offset + *bytes,
                        running_line,
                    );
                    while !cursor.done && all_matches.len() < max_matches {
                        let mut batch =
                            fs.search_file(&self.file_path, pattern, &opts_bounded, &mut cursor)?;
                        // Rebase file-relative offsets onto document offsets.
                        for m in &mut batch {
                            m.byte_offset = *region_doc_offset + (m.byte_offset - *file_offset);
                        }
                        all_matches.extend(batch);
                    }
                    running_line = cursor.running_line;

                    // Remember the end of this region for the next boundary
                    // check; a short region is kept whole.
                    prev_tail = if *bytes >= overlap_size {
                        let tail_off = *file_offset + *bytes - overlap_size;
                        fs.read_range(&self.file_path, tail_off as u64, overlap_size)
                            .unwrap_or_default()
                    } else {
                        fs.read_range(&self.file_path, *file_offset as u64, *bytes)
                            .unwrap_or_default()
                    };
                }
                SearchRegion::Loaded {
                    data,
                    doc_offset: region_doc_offset,
                } => {
                    // Search the previous tail and this region as one buffer
                    // so boundary-spanning matches are found in a single pass.
                    let mut search_buf = Vec::with_capacity(prev_tail.len() + data.len());
                    search_buf.extend_from_slice(&prev_tail);
                    search_buf.extend_from_slice(data);
                    let overlap_len = prev_tail.len();
                    let buf_doc_offset = *region_doc_offset - overlap_len;

                    // `running_line` refers to the start of `data`; step back
                    // over the newlines in the prepended tail to get the line
                    // number at the start of `search_buf`.
                    let newlines_in_overlap = search_buf[..overlap_len]
                        .iter()
                        .filter(|&&b| b == b'\n')
                        .count();
                    let mut line_at = running_line.saturating_sub(newlines_in_overlap);
                    let mut counted_to = 0usize;

                    for m in regex.find_iter(&search_buf) {
                        // Matches entirely inside the tail belong to the
                        // previous region and were already reported there.
                        if overlap_len > 0 && m.end() <= overlap_len {
                            continue;
                        }
                        if all_matches.len() >= max_matches {
                            break;
                        }
                        // Advance the line counter incrementally from the
                        // last counted position up to this match.
                        line_at += search_buf[counted_to..m.start()]
                            .iter()
                            .filter(|&&b| b == b'\n')
                            .count();
                        counted_to = m.start();

                        let line_start = search_buf[..m.start()]
                            .iter()
                            .rposition(|&b| b == b'\n')
                            .map(|p| p + 1)
                            .unwrap_or(0);
                        let line_end = search_buf[m.start()..]
                            .iter()
                            .position(|&b| b == b'\n')
                            .map(|p| m.start() + p)
                            .unwrap_or(search_buf.len());
                        let context =
                            String::from_utf8_lossy(&search_buf[line_start..line_end]).into_owned();

                        all_matches.push(SearchMatch {
                            byte_offset: buf_doc_offset + m.start(),
                            length: m.end() - m.start(),
                            line: line_at,
                            column: m.start() - line_start + 1,
                            context,
                        });
                    }

                    running_line += data.iter().filter(|&&b| b == b'\n').count();
                    let tail_start = data.len().saturating_sub(overlap_size);
                    prev_tail = data[tail_start..].to_vec();
                }
            }
        }

        all_matches.truncate(max_matches);
        Ok(all_matches)
    }
}
/// Finds matches that straddle the seam between two adjacent byte ranges.
///
/// `prev_tail` and `next_head` are joined into one buffer and searched with
/// `regex`; only matches that start inside `prev_tail` and end inside
/// `next_head` are reported, up to `max_matches` of them.
///
/// * `doc_offset` – document offset of `prev_tail[0]`; added to each match
///   start to produce `byte_offset`.
/// * `running_line` – line number at the start of `next_head`; the newlines
///   contained in `prev_tail` are subtracted (saturating) to obtain the line
///   number at the start of the joined buffer.
///
/// `column` is the 1-based byte column within the line, and `context` is the
/// (lossily decoded) line containing the match start, limited to the bytes
/// available in the joined buffer.
pub(crate) fn search_boundary_overlap(
    prev_tail: &[u8],
    next_head: &[u8],
    doc_offset: usize,
    running_line: usize,
    regex: &Regex,
    max_matches: usize,
) -> Vec<SearchMatch> {
    fn newline_count(bytes: &[u8]) -> usize {
        bytes.iter().filter(|&&b| b == b'\n').count()
    }

    let seam = prev_tail.len();
    let joined: Vec<u8> = [prev_tail, next_head].concat();
    // Line number of the first byte of `joined`.
    let first_line = running_line.saturating_sub(newline_count(prev_tail));

    let found: Vec<SearchMatch> = regex
        .find_iter(&joined)
        .filter(|m| m.start() < seam && seam < m.end())
        .take(max_matches)
        .map(|m| {
            let (start, end) = (m.start(), m.end());
            let line_start = joined[..start]
                .iter()
                .rposition(|&b| b == b'\n')
                .map_or(0, |nl| nl + 1);
            let line_end = joined[start..]
                .iter()
                .position(|&b| b == b'\n')
                .map_or(joined.len(), |nl| start + nl);
            SearchMatch {
                byte_offset: doc_offset + start,
                length: end - start,
                line: first_line + newline_count(&joined[..start]),
                column: start - line_start + 1,
                context: String::from_utf8_lossy(&joined[line_start..line_end]).into_owned(),
            }
        })
        .collect();
    found
}