use crate::error::Result;
use crate::executor::{Match, QueryOptions};
use crate::format::is_binary;
use regex::Regex;
use std::collections::{HashSet, VecDeque};
use std::io::{BufRead, BufReader, Read};
use std::path::Path;
/// Window size in bytes (16 MiB) used by `stream_file_chunked` when
/// `options.chunk_size_bytes` is 0.
const DEFAULT_CHUNK_SIZE: usize = 16 * 1024 * 1024;
/// Number of bytes the next window is moved back from the end of the previous one
/// (1/16 of the default window size); used by `stream_file_chunked` when
/// `options.chunk_overlap_bytes` is 0.
const DEFAULT_CHUNK_OVERLAP: usize = DEFAULT_CHUNK_SIZE / 16;
/// Counters filled in by the streaming search functions in this module.
///
/// All fields start at zero via `Default`; `stream_file` (and its multiline variant)
/// overwrite them once the search has finished.
#[derive(Debug, Default)]
pub struct StreamStats {
/// Number of lines the search consumed from the input.
pub lines_read: u32,
/// Number of bytes the search consumed from the input.
pub bytes_read: u64,
/// Number of matches returned by the search.
pub matches_found: u32,
}
/// Searches `reader` line by line and returns one [`Match`] per matching line.
///
/// When `options.multiline` is set the call is delegated to `stream_file_multiline`.
/// Otherwise each line is read into a reused buffer, tested with `regex.find`, and — when
/// `options.context_lines > 0` — decorated with up to that many preceding and following
/// lines (trailing whitespace trimmed).
///
/// * `path` is only copied into every returned match.
/// * `is_compressed` disables the binary sniff on the first buffered block.
/// * `options.max_results` (0 = unlimited) caps the number of returned matches. Once the cap
///   is reached no further match is accepted; reading continues only as long as already
///   accepted matches are still waiting for trailing context lines.
/// * `stats` is overwritten with the number of lines and bytes consumed and the number of
///   matches returned.
///
/// Uncompressed input that `is_binary` flags yields an empty result unless `options.binary`
/// is set.
///
/// # Errors
///
/// Propagates I/O errors from the reader. `BufRead::read_line` also reports an error when a
/// line is not valid UTF-8.
pub fn stream_file<R: Read>(
    reader: R,
    path: &Path,
    regex: &Regex,
    options: &QueryOptions,
    is_compressed: bool,
    stats: &mut StreamStats,
) -> Result<Vec<Match>> {
    if options.multiline {
        return stream_file_multiline(reader, path, regex, options, is_compressed, stats);
    }

    let mut buf_reader = BufReader::new(reader);
    if !is_compressed {
        // `fill_buf` only peeks: the sniffed bytes are still delivered by `read_line` below.
        let buffer = buf_reader.fill_buf()?;
        if is_binary(buffer) && !options.binary {
            return Ok(vec![]);
        }
    }

    // `true` once `accepted` matches exhaust a non-zero `max_results`.
    let limit_reached =
        |accepted: usize| options.max_results > 0 && accepted >= options.max_results;

    let mut matches: Vec<Match> = Vec::new();
    // Matches that still need trailing context lines before they are final.
    let mut pending_matches: Vec<Match> = Vec::new();
    let mut context_before: VecDeque<String> = VecDeque::new();
    let mut line = String::new();
    let mut line_number = 0u32;
    let mut byte_offset = 0u64;

    while buf_reader.read_line(&mut line)? > 0 {
        line_number = line_number.saturating_add(1);
        let line_len = u64::try_from(line.len()).unwrap_or(u64::MAX);
        let trimmed_line = line.trim_end();

        // The current line is trailing context for every match still waiting for some.
        for pending in &mut pending_matches {
            if pending.context_after.len() < options.context_lines {
                pending.context_after.push(trimmed_line.to_string());
            }
        }
        let (completed, still_pending): (Vec<_>, Vec<_>) = pending_matches
            .into_iter()
            .partition(|m| m.context_after.len() >= options.context_lines);
        matches.extend(completed);
        pending_matches = still_pending;

        // Check the cap BEFORE accepting a new match so that finished + pending matches can
        // never exceed `max_results`.
        if !limit_reached(matches.len() + pending_matches.len()) {
            if let Some(m) = regex.find(&line) {
                let new_match = Match {
                    file_path: path.to_path_buf(),
                    line_number,
                    col: u32::try_from(m.start() + 1).unwrap_or(u32::MAX),
                    line_content: if options.count_only {
                        String::new()
                    } else {
                        trimmed_line.to_string()
                    },
                    byte_offset: byte_offset
                        .saturating_add(u64::try_from(m.start()).unwrap_or(u64::MAX)),
                    context_before: context_before.iter().cloned().collect(),
                    context_after: vec![],
                    is_binary: false,
                };
                if options.context_lines > 0 {
                    pending_matches.push(new_match);
                } else {
                    matches.push(new_match);
                }
            }
        }

        if options.context_lines > 0 {
            if context_before.len() == options.context_lines {
                // Window is full: recycle the oldest entry's allocation for the newest line.
                if let Some(mut recycled) = context_before.pop_front() {
                    recycled.clear();
                    recycled.push_str(trimmed_line);
                    context_before.push_back(recycled);
                }
            } else {
                context_before.push_back(trimmed_line.to_string());
            }
        }

        byte_offset = byte_offset.saturating_add(line_len);
        line.clear();

        // Stop as soon as the cap is reached and nobody is waiting for more context.
        if pending_matches.is_empty() && limit_reached(matches.len()) {
            break;
        }
    }

    // EOF may leave matches with fewer trailing context lines than requested; keep them.
    matches.extend(pending_matches);
    stats.lines_read = line_number;
    stats.bytes_read = byte_offset;
    stats.matches_found = u32::try_from(matches.len()).unwrap_or(u32::MAX);
    Ok(matches)
}
/// Multiline variant of `stream_file`: reads the whole input into memory and runs `regex`
/// over it as one string, so a match may span several lines.
///
/// Every match is reported at the line and (1-based, byte-counted) column where it starts;
/// `line_content` is that first line with trailing whitespace trimmed. No context lines are
/// collected in this mode. `options.max_results` (0 = unlimited) caps the result.
///
/// `stats` is overwritten with the content length, the number of lines and the number of
/// matches returned.
///
/// Uncompressed input that `is_binary` flags yields an empty result unless `options.binary`
/// is set.
///
/// # Errors
///
/// Propagates I/O errors from the reader, including the error `read_to_string` reports for
/// input that is not valid UTF-8.
fn stream_file_multiline<R: Read>(
    reader: R,
    path: &Path,
    regex: &Regex,
    options: &QueryOptions,
    is_compressed: bool,
    stats: &mut StreamStats,
) -> Result<Vec<Match>> {
    let mut buf_reader = BufReader::new(reader);
    if !is_compressed {
        // `fill_buf` only peeks: the sniffed bytes are still part of `read_to_string` below.
        let buffer = buf_reader.fill_buf()?;
        if is_binary(buffer) && !options.binary {
            return Ok(vec![]);
        }
    }

    let mut content = String::new();
    buf_reader.read_to_string(&mut content)?;

    // Byte offset at which every line starts. Derived from the actual `\n` positions so the
    // offsets stay exact for CRLF input; a trailing newline does not open another line.
    let mut line_offsets: Vec<usize> = Vec::new();
    if !content.is_empty() {
        line_offsets.push(0);
        line_offsets.extend(
            content
                .match_indices('\n')
                .map(|(newline_at, _)| newline_at + 1)
                .filter(|&next_start| next_start < content.len()),
        );
    }

    let mut matches = Vec::new();
    for m in regex.find_iter(&content) {
        let match_start = m.start();
        // `Ok(i)`: the match begins exactly at the start of line `i`.
        // `Err(i)`: `i` is the insertion point, so the match lies inside line `i - 1`.
        let line_idx = match line_offsets.binary_search(&match_start) {
            Ok(i) => i,
            Err(i) => i.saturating_sub(1),
        };
        let line_number = u32::try_from(line_idx)
            .unwrap_or(u32::MAX)
            .saturating_add(1);
        let line_start = line_offsets.get(line_idx).copied().unwrap_or(0);
        // The next line starts one byte after this line's `\n`; the last line runs to EOF.
        let line_end = line_offsets
            .get(line_idx + 1)
            .map_or(content.len(), |&next_start| next_start - 1);
        let line_content = content
            .get(line_start..line_end)
            .unwrap_or_default()
            .trim_end();
        let col = u32::try_from(match_start.saturating_sub(line_start) + 1).unwrap_or(u32::MAX);

        matches.push(Match {
            file_path: path.to_path_buf(),
            line_number,
            col,
            line_content: if options.count_only {
                String::new()
            } else {
                line_content.to_string()
            },
            byte_offset: u64::try_from(match_start).unwrap_or(u64::MAX),
            context_before: vec![],
            context_after: vec![],
            is_binary: false,
        });
        if options.max_results > 0 && matches.len() >= options.max_results {
            break;
        }
    }

    stats.bytes_read = u64::try_from(content.len()).unwrap_or(u64::MAX);
    stats.lines_read = u32::try_from(line_offsets.len()).unwrap_or(u32::MAX);
    stats.matches_found = u32::try_from(matches.len()).unwrap_or(u32::MAX);
    Ok(matches)
}
/// Runs `regex` over the byte window `win_start..win_end` of `mmap_data` and appends every
/// hit that has not been reported before to `matches`.
///
/// Both bounds are clamped to `mmap_data.len()`; an empty window is a no-op. The window is
/// decoded with `String::from_utf8_lossy`, and when that decoding had to substitute bytes the
/// match offsets are mapped back to raw offsets through `count_bytes_before_offset`.
///
/// `seen_offsets` holds the absolute byte offsets of matches already emitted, so a hit inside
/// the overlap of two windows is reported once.
///
/// NOTE: `line_number` of the produced matches is counted from the line that contains
/// `win_start` (that line is number 1), not from the beginning of `mmap_data`. Context lines
/// are likewise limited to the lines that touch this window.
///
/// Returns `true` when `options.max_results` (non-zero) has been reached and the caller
/// should stop searching, `false` otherwise.
#[allow(clippy::unnecessary_wraps)]
fn search_window(
    mmap_data: &[u8],
    path: &Path,
    regex: &Regex,
    options: &QueryOptions,
    win_start: usize,
    win_end: usize,
    matches: &mut Vec<Match>,
    seen_offsets: &mut HashSet<u64>,
) -> bool {
    let s = win_start.min(mmap_data.len());
    let e = win_end.min(mmap_data.len());
    if s >= e {
        return false;
    }
    let window = mmap_data.get(s..e).unwrap_or_default();

    // Back up to the start of the line that contains `s`, so that line index 0 always
    // describes a complete line even when the window begins mid-line.
    let line_start = mmap_data
        .get(..s)
        .and_then(|before| before.iter().rposition(|&b| b == b'\n'))
        .map_or(0, |pos| pos + 1);
    let line_offsets =
        compute_line_offsets(mmap_data.get(line_start..e).unwrap_or_default(), line_start);

    let window_str = String::from_utf8_lossy(window);
    // `Cow::Owned` means invalid bytes were replaced, so string offsets no longer equal raw
    // byte offsets.
    let is_lossy = matches!(window_str, std::borrow::Cow::Owned(_));

    for m in regex.find_iter(&window_str) {
        let abs_byte_start = if is_lossy {
            s + count_bytes_before_offset(window, m.start())
        } else {
            s + m.start()
        };
        if !seen_offsets.insert(u64::try_from(abs_byte_start).unwrap_or(u64::MAX)) {
            continue;
        }

        // `Ok(i)`: match sits exactly on the start of line index `i`.
        // `Err(i)`: insertion point, i.e. the match is inside line index `i - 1`.
        let line_number = match line_offsets.binary_search(&abs_byte_start) {
            Ok(i) => u32::try_from(i).unwrap_or(u32::MAX).saturating_add(1),
            Err(i) => u32::try_from(i).unwrap_or(u32::MAX),
        };
        let ln_idx = usize::try_from(line_number).unwrap_or(0);
        let line_start_off = line_offsets
            .get(ln_idx.saturating_sub(1))
            .copied()
            .unwrap_or(0);
        // The last indexed line has no successor entry. Bound it by its own newline instead
        // of the end of the data; otherwise a match on the last line of a non-final window
        // would drag the whole remainder of the file into `line_content`.
        let line_end_off = line_offsets.get(ln_idx).copied().unwrap_or_else(|| {
            mmap_data
                .get(line_start_off..)
                .and_then(|rest| rest.iter().position(|&b| b == b'\n'))
                .map_or(mmap_data.len(), |newline_at| line_start_off + newline_at + 1)
        });
        let line_bytes = trim_line_end(
            mmap_data
                .get(line_start_off..line_end_off)
                .unwrap_or_default(),
        );
        let col = u32::try_from(abs_byte_start.saturating_sub(line_start_off) + 1)
            .unwrap_or(u32::MAX);

        let (context_before, context_after) = if options.context_lines > 0 {
            (
                collect_context_before(
                    &line_offsets,
                    mmap_data,
                    line_number,
                    options.context_lines,
                ),
                collect_context_after(
                    &line_offsets,
                    mmap_data,
                    line_number,
                    options.context_lines,
                ),
            )
        } else {
            (vec![], vec![])
        };

        matches.push(Match {
            file_path: path.to_path_buf(),
            line_number,
            col,
            // Only decode the line when it is actually reported.
            line_content: if options.count_only {
                String::new()
            } else {
                String::from_utf8_lossy(line_bytes).into_owned()
            },
            byte_offset: u64::try_from(abs_byte_start).unwrap_or(u64::MAX),
            context_before,
            context_after,
            is_binary: false,
        });
        if options.max_results > 0 && matches.len() >= options.max_results {
            return true;
        }
    }
    false
}
/// Searches memory-mapped file contents in overlapping windows and returns all matches.
///
/// The window size comes from `options.chunk_size_bytes` and the overlap from
/// `options.chunk_overlap_bytes`; a value of 0 selects `DEFAULT_CHUNK_SIZE` /
/// `DEFAULT_CHUNK_OVERLAP`. Data that fits into one window plus one overlap is searched in a
/// single pass. Otherwise every window is extended to the next newline so that it ends on a
/// line boundary, and the following window starts `chunk_overlap` bytes before that end.
/// Matches found twice inside an overlap are reported once.
///
/// `search_window` numbers lines from the first line of the window it is given, so the
/// matches of every window are shifted here by the number of lines that precede that window.
/// That count is maintained incrementally; no byte is counted twice.
///
/// The search stops early once `options.max_results` (non-zero) matches were collected.
///
/// # Errors
///
/// Currently never fails; the `Result` return type is kept for the callers' benefit.
pub fn stream_file_chunked(
    mmap_data: &[u8],
    path: &Path,
    regex: &Regex,
    options: &QueryOptions,
) -> Result<Vec<Match>> {
    let chunk_size = if options.chunk_size_bytes > 0 {
        options.chunk_size_bytes
    } else {
        DEFAULT_CHUNK_SIZE
    };
    let chunk_overlap = if options.chunk_overlap_bytes > 0 {
        options.chunk_overlap_bytes
    } else {
        DEFAULT_CHUNK_OVERLAP
    };

    let mut matches: Vec<Match> = Vec::new();
    let mut seen_offsets: HashSet<u64> = HashSet::new();

    // Small input: one window covers everything and its line numbers are already absolute.
    if mmap_data.len() <= chunk_size.saturating_add(chunk_overlap) {
        search_window(
            mmap_data,
            path,
            regex,
            options,
            0,
            mmap_data.len(),
            &mut matches,
            &mut seen_offsets,
        );
        return Ok(matches);
    }

    let mut start = 0usize;
    // Number of newlines in `mmap_data[..counted_upto]`, i.e. the number of complete lines in
    // front of the first line of the current window.
    let mut lines_before = 0u32;
    let mut counted_upto = 0usize;

    while start < mmap_data.len() {
        let mut end = start.saturating_add(chunk_size).min(mmap_data.len());
        if end < mmap_data.len() {
            // Never cut a line in half: extend the window through the next newline, or to the
            // end of the data when there is none.
            end = mmap_data
                .get(end..)
                .and_then(|rest| rest.iter().position(|&b| b == b'\n'))
                .map_or(mmap_data.len(), |newline_at| end + newline_at + 1);
        }

        // Start of the line containing `start` — the line `search_window` calls line 1.
        let first_line_start = mmap_data
            .get(..start)
            .and_then(|before| before.iter().rposition(|&b| b == b'\n'))
            .map_or(0, |pos| pos + 1);
        let newly_skipped = mmap_data
            .get(counted_upto..first_line_start)
            .map_or(0, |skipped| skipped.iter().filter(|&&b| b == b'\n').count());
        lines_before =
            lines_before.saturating_add(u32::try_from(newly_skipped).unwrap_or(u32::MAX));
        counted_upto = counted_upto.max(first_line_start);

        let first_new = matches.len();
        let limit_hit = search_window(
            mmap_data,
            path,
            regex,
            options,
            start,
            end,
            &mut matches,
            &mut seen_offsets,
        );
        // Turn the window-relative line numbers of the matches just added into file-relative
        // ones.
        if lines_before > 0 {
            for found in matches.iter_mut().skip(first_new) {
                found.line_number = found.line_number.saturating_add(lines_before);
            }
        }

        if limit_hit || end >= mmap_data.len() {
            break;
        }

        // An overlap that is at least as large as the window would move `start` backwards (or
        // not at all) and the loop would never terminate; resume at `end` in that case.
        let next_start = end.saturating_sub(chunk_overlap);
        start = if next_start > start { next_start } else { end };
    }
    Ok(matches)
}
/// Maps a byte offset inside `String::from_utf8_lossy(data)` back to the corresponding byte
/// offset inside the raw `data`.
///
/// Lossy decoding replaces every invalid byte sequence with one U+FFFD, which is three bytes
/// long in the decoded string while the replaced sequence is one to three bytes long in
/// `data`; offsets behind such a sequence therefore differ between the two representations.
///
/// * `str_offset` — byte offset into the lossily decoded string (e.g. a regex match start).
///
/// Returns the raw offset of the same position. An offset that points into a replacement
/// character maps to the start of the invalid sequence it stands for, and an offset past the
/// end of the decoded string maps to `data.len()`.
fn count_bytes_before_offset(data: &[u8], str_offset: usize) -> usize {
    // Encoded length of the replacement character the lossy decoder inserts.
    let replacement_len = '\u{FFFD}'.len_utf8();

    let mut raw_pos = 0usize; // bytes of `data` consumed so far
    let mut lossy_pos = 0usize; // bytes of the decoded string those correspond to
    let mut rest = data;

    loop {
        // Split `rest` into a valid prefix and the invalid sequence (if any) that follows it.
        let (valid_len, invalid_len) = match std::str::from_utf8(rest) {
            Ok(valid) => (valid.len(), 0),
            Err(err) => {
                let valid_len = err.valid_up_to();
                // `None` means the input ends inside an incomplete sequence; the lossy decoder
                // replaces the whole tail with a single U+FFFD.
                let invalid_len = err.error_len().unwrap_or(rest.len() - valid_len);
                (valid_len, invalid_len)
            }
        };

        // Valid text is byte-identical in both representations.
        let remaining = str_offset.saturating_sub(lossy_pos);
        if remaining <= valid_len {
            return raw_pos + remaining;
        }
        if invalid_len == 0 {
            // Everything left was valid and the offset lies beyond it: clamp to the end.
            return raw_pos + valid_len;
        }
        raw_pos += valid_len;
        lossy_pos += valid_len;

        // The invalid sequence became exactly one replacement character.
        if str_offset < lossy_pos + replacement_len {
            return raw_pos;
        }
        raw_pos += invalid_len;
        lossy_pos += replacement_len;
        rest = rest.get(valid_len + invalid_len..).unwrap_or_default();
    }
}
/// Lists the offsets at which the lines of `data` begin, shifted by `base_offset`.
///
/// The first entry is always `base_offset`. Every `\n` contributes the offset of the byte
/// that follows it, unless that offset would be the very end of `data` — a trailing newline
/// does not open another line.
fn compute_line_offsets(data: &[u8], base_offset: usize) -> Vec<usize> {
    let mut starts = Vec::with_capacity(64);
    starts.push(base_offset);
    starts.extend(
        data.iter()
            .enumerate()
            .filter(|&(_, &byte)| byte == b'\n')
            .map(|(idx, _)| base_offset + idx + 1)
            .filter(|&next_start| next_start < data.len() + base_offset),
    );
    starts
}
/// Returns `data` without its trailing run of `\n` and `\r` bytes.
///
/// Bytes in front of the last byte that is neither `\n` nor `\r` are left untouched; input
/// made up solely of line terminators yields an empty slice.
fn trim_line_end(data: &[u8]) -> &[u8] {
    let mut kept = data;
    // Peel line terminators off the end one byte at a time.
    while let [head @ .., b'\n' | b'\r'] = kept {
        kept = head;
    }
    kept
}
/// Collects up to `context_lines` lines that directly precede line `line_number`.
///
/// * `line_offsets` — start offsets of consecutive lines inside `data`.
/// * `line_number` — 1-based position of the matched line within `line_offsets`.
///
/// The lines are returned in file order, decoded lossily, with trailing `\n` / `\r` removed.
/// Fewer lines are returned when the match is close to the first indexed line.
fn collect_context_before(
    line_offsets: &[usize],
    data: &[u8],
    line_number: u32,
    context_lines: usize,
) -> Vec<String> {
    let match_idx = usize::try_from(line_number).unwrap_or(0).saturating_sub(1);
    let first_idx = match_idx.saturating_sub(context_lines);

    (first_idx..match_idx)
        .map(|idx| {
            let lo = line_offsets.get(idx).copied().unwrap_or(0);
            let hi = line_offsets.get(idx + 1).copied().unwrap_or(data.len());
            let raw_line = data.get(lo..hi).unwrap_or_default();
            String::from_utf8_lossy(trim_line_end(raw_line)).to_string()
        })
        .collect()
}
/// Collects up to `context_lines` lines that directly follow line `line_number`.
///
/// * `line_offsets` — start offsets of consecutive lines inside `data`.
/// * `line_number` — 1-based position of the matched line within `line_offsets`.
///
/// The lines are returned in file order, decoded lossily, with trailing `\n` / `\r` removed.
/// Only lines that have an entry in `line_offsets` are considered, so fewer lines are
/// returned when the match is close to the last indexed line. The last indexed line itself
/// is included and ends at its own newline (or at the end of `data` when it has none).
fn collect_context_after(
    line_offsets: &[usize],
    data: &[u8],
    line_number: u32,
    context_lines: usize,
) -> Vec<String> {
    // `line_number` is 1-based, so used as an index it already addresses the following line.
    let after_start = usize::try_from(line_number).unwrap_or(0);
    let end = after_start
        .saturating_add(context_lines)
        .min(line_offsets.len());

    let mut ctx = Vec::with_capacity(end.saturating_sub(after_start));
    for i in after_start..end {
        let lo = line_offsets.get(i).copied().unwrap_or(0);
        let hi = match line_offsets.get(i + 1) {
            Some(&next_start) => next_start,
            // Last indexed line: `line_offsets` may describe only a window of `data`, so stop
            // at this line's newline rather than at the end of the data.
            None => data
                .get(lo..)
                .and_then(|rest| rest.iter().position(|&b| b == b'\n'))
                .map_or(data.len(), |newline_at| lo + newline_at + 1),
        };
        let line_bytes = trim_line_end(data.get(lo..hi).unwrap_or_default());
        ctx.push(String::from_utf8_lossy(line_bytes).to_string());
    }
    ctx
}