use crate::error::Result;
use crate::executor::{Match, QueryOptions};
use crate::format::is_binary;
use regex::Regex;
use std::collections::{HashSet, VecDeque};
use std::io::{BufRead, BufReader, Read};
use std::path::Path;
/// Counters accumulated while streaming a file for matches.
///
/// NOTE(review): none of the functions visible in this file read or write
/// these fields — presumably they are maintained by callers elsewhere in the
/// crate; confirm before changing.
#[derive(Debug, Default)]
pub struct StreamStats {
    // Number of lines consumed from the reader.
    pub lines_read: u32,
    // Number of bytes consumed from the reader.
    pub bytes_read: u64,
    // Number of regex matches recorded.
    pub matches_found: u32,
}
/// Stream `reader` line by line, collecting regex matches with optional
/// before/after context.
///
/// Delegates to [`stream_file_multiline`] when `options.multiline` is set.
/// Non-compressed input is sniffed for binary content on the first buffered
/// chunk and skipped unless `options.binary` allows it.
///
/// # Errors
/// Propagates I/O errors from the underlying reader.
pub fn stream_file<R: Read>(
    reader: R,
    path: &Path,
    regex: &Regex,
    options: &QueryOptions,
    is_compressed: bool,
) -> Result<Vec<Match>> {
    if options.multiline {
        return stream_file_multiline(reader, path, regex, options, is_compressed);
    }
    let mut buf_reader = BufReader::new(reader);
    let mut matches = Vec::new();
    let mut line_number = 0u32;
    let mut byte_offset = 0u64;
    // Binary sniff on the first buffered chunk; compressed streams are
    // assumed to decompress to text.
    if !is_compressed {
        let buffer = buf_reader.fill_buf()?;
        if is_binary(buffer) && !options.binary {
            return Ok(vec![]);
        }
    }
    let mut line = String::new();
    // Ring buffer of the last `context_lines` trimmed lines (before-context).
    let mut context_before = VecDeque::new();
    // Matches still waiting for their after-context lines to stream past.
    let mut pending_matches: Vec<Match> = Vec::new();
    while buf_reader.read_line(&mut line)? > 0 {
        line_number += 1;
        let line_len = u64::try_from(line.len()).unwrap_or(u64::MAX);
        let trimmed_line = line.trim_end().to_string();
        // Feed the current line into every match still collecting after-context.
        for m in &mut pending_matches {
            if m.context_after.len() < options.context_lines {
                m.context_after.push(trimmed_line.clone());
            }
        }
        // Promote matches whose after-context is now complete.
        let (completed, still_pending): (Vec<_>, Vec<_>) = pending_matches
            .into_iter()
            .partition(|m| m.context_after.len() >= options.context_lines);
        matches.extend(completed);
        pending_matches = still_pending;
        // NOTE(review): the haystack still carries its trailing newline here,
        // which affects `$`-anchored patterns — confirm this is intended.
        if let Some(m) = regex.find(&line) {
            let context_before_vec: Vec<String> = context_before.iter().cloned().collect();
            let new_match = Match {
                file_path: path.to_path_buf(),
                line_number,
                col: u32::try_from(m.start() + 1).unwrap_or(u32::MAX),
                line_content: if options.count_only {
                    String::new()
                } else {
                    trimmed_line.clone()
                },
                byte_offset: byte_offset + u64::try_from(m.start()).unwrap_or(u64::MAX),
                context_before: context_before_vec,
                context_after: vec![],
                is_binary: false,
            };
            if options.context_lines > 0 {
                pending_matches.push(new_match);
            } else {
                matches.push(new_match);
            }
            // Stop early once the cap is reached; while matches are still
            // pending we keep reading so their after-context can fill in.
            if options.max_results > 0
                && (matches.len() + pending_matches.len()) >= options.max_results
                && (pending_matches.is_empty() || matches.len() >= options.max_results)
            {
                break;
            }
        }
        if options.context_lines > 0 {
            // Last use of `trimmed_line` in this iteration — move, don't clone.
            context_before.push_back(trimmed_line);
            if context_before.len() > options.context_lines {
                context_before.pop_front();
            }
        }
        byte_offset += line_len;
        line.clear();
    }
    // Flush matches whose after-context was cut short by EOF or the early
    // break, then enforce the cap: previously this final extend could push
    // the result count past `max_results`.
    matches.extend(pending_matches);
    if options.max_results > 0 {
        matches.truncate(options.max_results);
    }
    Ok(matches)
}
/// Regex search over the entire (decompressed) stream at once, for patterns
/// that may span line boundaries.
///
/// # Errors
/// Propagates I/O errors from the reader; non-UTF-8 input fails in
/// `read_to_string`.
fn stream_file_multiline<R: Read>(
    reader: R,
    path: &Path,
    regex: &Regex,
    options: &QueryOptions,
    is_compressed: bool,
) -> Result<Vec<Match>> {
    let mut buf_reader = BufReader::new(reader);
    // Binary sniff on the first buffered chunk; compressed streams are
    // assumed to decompress to text.
    if !is_compressed {
        let buffer = buf_reader.fill_buf()?;
        if is_binary(buffer) && !options.binary {
            return Ok(vec![]);
        }
    }
    let mut content = String::new();
    buf_reader.read_to_string(&mut content)?;
    // Byte offset of each line start. Scanning raw bytes for '\n' (instead of
    // `lines()`, which also strips '\r') keeps offsets exact for CRLF input,
    // which previously drifted by one byte per line. A trailing newline does
    // not open a new (empty) line entry.
    let mut line_offsets: Vec<usize> = vec![0];
    for (i, b) in content.bytes().enumerate() {
        if b == b'\n' && i + 1 < content.len() {
            line_offsets.push(i + 1);
        }
    }
    let mut matches = Vec::new();
    for m in regex.find_iter(&content) {
        let match_start = m.start();
        // Line containing the match: `Err(i)` from binary_search means the
        // offset sorts between line starts i-1 and i, i.e. inside line i-1.
        // (Fixes a prior off-by-one that assigned any match not at a line
        // start to the following line and then underflowed the column math.)
        let line_idx = match line_offsets.binary_search(&match_start) {
            Ok(i) => i,
            Err(i) => i.saturating_sub(1),
        };
        let line_number = u32::try_from(line_idx + 1).unwrap_or(u32::MAX);
        let line_start = line_offsets.get(line_idx).copied().unwrap_or(0);
        // End of the line, excluding its '\n' when one exists.
        let line_end = line_offsets
            .get(line_idx + 1)
            .map_or(content.len(), |&off| off - 1);
        let line_content = content
            .get(line_start..line_end)
            .unwrap_or_default()
            .trim_end()
            .to_string();
        let col = u32::try_from(match_start - line_start + 1).unwrap_or(u32::MAX);
        matches.push(Match {
            file_path: path.to_path_buf(),
            line_number,
            col,
            line_content: if options.count_only {
                String::new()
            } else {
                line_content
            },
            byte_offset: u64::try_from(match_start).unwrap_or(u64::MAX),
            context_before: vec![],
            context_after: vec![],
            is_binary: false,
        });
        if options.max_results > 0 && matches.len() >= options.max_results {
            break;
        }
    }
    Ok(matches)
}
/// Search pre-selected byte windows of a memory-mapped file.
///
/// Windows may overlap, so matches are deduplicated by absolute byte offset.
/// For each window, the line-offset table is extended back to the start of
/// the line containing the window so line numbers and columns are exact.
///
/// # Errors
/// Currently infallible; the `Result` keeps the signature uniform with the
/// other streaming entry points.
pub fn stream_file_with_windows(
    mmap_data: &[u8],
    path: &Path,
    regex: &Regex,
    options: &QueryOptions,
    windows: &[(usize, usize)],
) -> Result<Vec<Match>> {
    if windows.is_empty() {
        return Ok(vec![]);
    }
    let mut matches = Vec::new();
    let mut seen_offsets: HashSet<u64> = HashSet::new();
    for &(win_start, win_end) in windows {
        let s = win_start.min(mmap_data.len());
        let e = win_end.min(mmap_data.len());
        if s >= e {
            continue;
        }
        let window = mmap_data.get(s..e).unwrap_or_default();
        // Back up to the start of the line containing `s`.
        let line_start = if s == 0 {
            0
        } else {
            mmap_data
                .get(..s)
                .and_then(|before| before.iter().rposition(|&b| b == b'\n'))
                .map_or(0, |pos| pos + 1)
        };
        let line_offsets =
            compute_line_offsets(mmap_data.get(line_start..e).unwrap_or_default(), line_start);
        let window_str = String::from_utf8_lossy(window);
        let is_lossy = matches!(window_str, std::borrow::Cow::Owned(_));
        for m in regex.find_iter(&window_str) {
            // Map the match offset back into raw bytes; for lossy windows this
            // is approximate (see `count_bytes_before_offset`).
            let abs_byte_start = if is_lossy {
                s + count_bytes_before_offset(window, m.start())
            } else {
                s + m.start()
            };
            if !seen_offsets.insert(u64::try_from(abs_byte_start).unwrap_or(u64::MAX)) {
                continue;
            }
            // Line containing the match: `Err(i)` from binary_search means the
            // offset falls inside line i-1. (Fixes a prior off-by-one that
            // shifted non-line-start matches down a line and could underflow
            // the column calculation.)
            let line_idx = match line_offsets.binary_search(&abs_byte_start) {
                Ok(i) => i,
                Err(i) => i.saturating_sub(1),
            };
            let line_number = u32::try_from(line_idx + 1).unwrap_or(u32::MAX);
            let line_start_off = line_offsets.get(line_idx).copied().unwrap_or(0);
            let line_end_off = line_offsets
                .get(line_idx + 1)
                .map_or(mmap_data.len(), |&off| off);
            let line_bytes =
                trim_line_end(mmap_data.get(line_start_off..line_end_off).unwrap_or_default());
            let line_str = String::from_utf8_lossy(line_bytes).to_string();
            let col = u32::try_from(abs_byte_start.saturating_sub(line_start_off) + 1)
                .unwrap_or(u32::MAX);
            let (context_before, context_after) = if options.context_lines > 0 {
                (
                    collect_context_before(&line_offsets, mmap_data, line_number, options.context_lines),
                    collect_context_after(&line_offsets, mmap_data, line_number, options.context_lines),
                )
            } else {
                (vec![], vec![])
            };
            matches.push(Match {
                file_path: path.to_path_buf(),
                line_number,
                col,
                line_content: if options.count_only {
                    String::new()
                } else {
                    line_str
                },
                byte_offset: u64::try_from(abs_byte_start).unwrap_or(u64::MAX),
                context_before,
                context_after,
                is_binary: false,
            });
            if options.max_results > 0 && matches.len() >= options.max_results {
                return Ok(matches);
            }
        }
    }
    Ok(matches)
}
/// Map a byte offset within `String::from_utf8_lossy(data)` back to a byte
/// offset within the raw `data`.
///
/// Exact for valid-UTF-8 prefixes. Where the lossy conversion substituted
/// U+FFFD for an invalid sequence, the replacement counts as 3 bytes, so the
/// result is approximate in lossy regions (the invalid sequence's original
/// length is not recoverable from the lossy string alone).
fn count_bytes_before_offset(data: &[u8], str_offset: usize) -> usize {
    let s = String::from_utf8_lossy(data);
    let mut byte_pos = 0;
    // Compare against byte positions (`char_indices`), not char counts:
    // `str_offset` comes from `regex::Match::start`, which is a byte offset.
    // The previous code compared the char index and over-counted whenever a
    // multi-byte character preceded the match.
    for (idx, c) in s.char_indices() {
        if idx >= str_offset {
            break;
        }
        byte_pos += c.len_utf8();
    }
    byte_pos
}
/// Absolute byte offsets (shifted by `base_offset`) of each line start in
/// `data`. The first line starts at `base_offset`; a trailing newline does
/// not open a new (empty) line entry.
fn compute_line_offsets(data: &[u8], base_offset: usize) -> Vec<usize> {
    let interior_newlines = data
        .iter()
        .enumerate()
        .filter(|&(i, &b)| b == b'\n' && i + 1 < data.len())
        .map(|(i, _)| base_offset + i + 1);
    let mut offsets = Vec::with_capacity(64);
    offsets.push(base_offset);
    offsets.extend(interior_newlines);
    offsets
}
/// Strip every trailing `\n` / `\r` byte from `data` (handles LF, CRLF, and
/// repeated terminators) and return the remaining prefix.
fn trim_line_end(data: &[u8]) -> &[u8] {
    let mut end = data.len();
    while end > 0 && matches!(data[end - 1], b'\n' | b'\r') {
        end -= 1;
    }
    &data[..end]
}
/// Collect up to `context_lines` display lines immediately preceding the
/// matched line (`line_number` is 1-based), each trimmed of its line
/// terminator and lossily decoded.
fn collect_context_before(
    line_offsets: &[usize],
    data: &[u8],
    line_number: u32,
    context_lines: usize,
) -> Vec<String> {
    // 0-based index of the matched line; lines [first, match_idx) precede it.
    let match_idx = usize::try_from(line_number).unwrap_or(0).saturating_sub(1);
    let first = match_idx.saturating_sub(context_lines);
    (first..match_idx)
        .map(|i| {
            let lo = line_offsets.get(i).copied().unwrap_or(0);
            let hi = line_offsets.get(i + 1).copied().unwrap_or(data.len());
            let raw = data.get(lo..hi).unwrap_or_default();
            String::from_utf8_lossy(trim_line_end(raw)).to_string()
        })
        .collect()
}
/// Collect up to `context_lines` display lines immediately following the
/// matched line (`line_number` is 1-based), each trimmed of its line
/// terminator and lossily decoded.
fn collect_context_after(
    line_offsets: &[usize],
    data: &[u8],
    line_number: u32,
    context_lines: usize,
) -> Vec<String> {
    // `line_number` is 1-based, so as a 0-based index it already names the
    // line right after the match.
    let first = usize::try_from(line_number).unwrap_or(0);
    // NOTE(review): capping at len-1 means the very last entry of
    // `line_offsets` is never emitted as after-context — behavior preserved
    // as-is; confirm this is intentional.
    let last = (first + context_lines).min(line_offsets.len().saturating_sub(1));
    (first..last)
        .map(|i| {
            let lo = line_offsets.get(i).copied().unwrap_or(0);
            let hi = line_offsets.get(i + 1).copied().unwrap_or(data.len());
            let raw = data.get(lo..hi).unwrap_or_default();
            String::from_utf8_lossy(trim_line_end(raw)).to_string()
        })
        .collect()
}