moeix 0.5.0 - Docs.rs

use crate::error::Result;
use crate::executor::{Match, QueryOptions};
use crate::format::is_binary;
use regex::Regex;
use std::collections::{HashSet, VecDeque};
use std::io::{BufRead, BufReader, Read};
use std::path::Path;

/// Counters for a streaming search pass.
///
/// NOTE(review): nothing in the visible part of this file constructs or
/// updates this struct, so the field descriptions below are inferred from the
/// field names only — confirm against the code that fills them in.
#[derive(Debug, Default)]
pub struct StreamStats {
    /// Presumably the number of lines read from the input — TODO confirm.
    pub lines_read: u32,
    /// Presumably the number of bytes read from the input — TODO confirm.
    pub bytes_read: u64,
    /// Presumably the number of matches found — TODO confirm.
    pub matches_found: u32,
}

/// Stream search through a reader line by line, returning all matches.
///
/// When `options.multiline` is set the whole call is delegated to
/// `stream_file_multiline`. Otherwise the input is read one line at a time and
/// `regex` is applied to each line (at most one match per line is reported,
/// the first one found).
///
/// * `reader` - source of the text to search.
/// * `path` - copied into every returned [`Match`] as `file_path`.
/// * `regex` - pattern applied to each raw line (including its terminator).
/// * `options` - `binary`, `context_lines`, `count_only`, `max_results` and
///   `multiline` are consulted here.
/// * `is_compressed` - when `true` the binary sniff on the first buffered
///   chunk is skipped.
///
/// Unless `is_compressed` is set, the first buffered chunk is passed to
/// `is_binary`; if it reports binary data and `options.binary` is not set, an
/// empty result is returned without reading further.
///
/// If `options.max_results` is non-zero, at most that many matches are
/// returned. Reading continues past the last accepted match only as long as
/// needed to fill its `context_after`.
///
/// # Errors
///
/// Returns an I/O error if reading fails. `BufRead::read_line` also reports
/// an error when the data is not valid UTF-8.
pub fn stream_file<R: Read>(
    reader: R,
    path: &Path,
    regex: &Regex,
    options: &QueryOptions,
    is_compressed: bool,
) -> Result<Vec<Match>> {
    if options.multiline {
        return stream_file_multiline(reader, path, regex, options, is_compressed);
    }

    let mut buf_reader = BufReader::new(reader);

    // `fill_buf` only peeks: the sniffed bytes are still delivered to `read_line` below.
    if !is_compressed {
        let head = buf_reader.fill_buf()?;
        if is_binary(head) && !options.binary {
            return Ok(Vec::new());
        }
    }

    let context_lines = options.context_lines;
    let max_results = options.max_results;

    let mut matches: Vec<Match> = Vec::new();
    // Matches still waiting for trailing context, kept in file order.
    let mut pending_matches: Vec<Match> = Vec::new();
    // Sliding window of the last `context_lines` trimmed lines.
    let mut context_before: VecDeque<String> = VecDeque::new();

    let mut line = String::new();
    let mut line_number = 0u32;
    let mut byte_offset = 0u64;

    while buf_reader.read_line(&mut line)? > 0 {
        line_number = line_number.saturating_add(1);
        let line_len = u64::try_from(line.len()).unwrap_or(u64::MAX);
        let trimmed = line.trim_end();

        // Feed this line to every match that still lacks trailing context.
        for pending in &mut pending_matches {
            if pending.context_after.len() < context_lines {
                pending.context_after.push(trimmed.to_string());
            }
        }

        // Older matches always hold at least as much trailing context as newer
        // ones, so the completed matches form a prefix of the pending list.
        let completed = pending_matches
            .iter()
            .take_while(|m| m.context_after.len() >= context_lines)
            .count();
        matches.extend(pending_matches.drain(..completed));

        let limit_reached =
            max_results > 0 && matches.len() + pending_matches.len() >= max_results;

        if limit_reached {
            // Enough matches have been accepted. Stop as soon as none of them
            // is waiting for context; never accept another match, otherwise
            // the result would exceed `max_results`.
            if pending_matches.is_empty() {
                break;
            }
        } else if let Some(m) = regex.find(&line) {
            // NOTE(review): `is_binary` is always reported as `false`, even when
            // the sniff above detected binary data and `options.binary` allowed
            // the search — confirm whether that is intended.
            let new_match = Match {
                file_path: path.to_path_buf(),
                line_number,
                col: u32::try_from(m.start() + 1).unwrap_or(u32::MAX),
                line_content: if options.count_only {
                    String::new()
                } else {
                    trimmed.to_string()
                },
                byte_offset: byte_offset
                    .saturating_add(u64::try_from(m.start()).unwrap_or(u64::MAX)),
                context_before: context_before.iter().cloned().collect(),
                context_after: vec![],
                is_binary: false,
            };

            if context_lines > 0 {
                pending_matches.push(new_match);
            } else {
                matches.push(new_match);
                // Without trailing context the match is final, so the limit
                // can be enforced immediately.
                if max_results > 0 && matches.len() >= max_results {
                    break;
                }
            }
        }

        if context_lines > 0 {
            context_before.push_back(trimmed.to_string());
            if context_before.len() > context_lines {
                context_before.pop_front();
            }
        }

        byte_offset = byte_offset.saturating_add(line_len);
        line.clear();
    }

    // End of input: whatever is still pending keeps the context it collected.
    matches.extend(pending_matches);
    Ok(matches)
}

/// Search the whole input as one string so that `regex` may match across
/// line boundaries.
///
/// The complete input is read into memory. Every match reported by
/// `regex.find_iter` becomes one [`Match`] whose `line_number`, `col` and
/// `line_content` describe the line on which the match *starts*;
/// `byte_offset` is the match start within the input. No context lines are
/// collected in this mode.
///
/// Unless `is_compressed` is set, the first buffered chunk is passed to
/// `is_binary`; if it reports binary data and `options.binary` is not set, an
/// empty result is returned. A non-zero `options.max_results` caps the number
/// of returned matches.
///
/// # Errors
///
/// Returns an I/O error if reading fails; `read_to_string` also fails when
/// the input is not valid UTF-8.
fn stream_file_multiline<R: Read>(
    reader: R,
    path: &Path,
    regex: &Regex,
    options: &QueryOptions,
    is_compressed: bool,
) -> Result<Vec<Match>> {
    let mut buf_reader = BufReader::new(reader);

    // `fill_buf` only peeks, so the sniffed bytes are still part of `content` below.
    if !is_compressed {
        let head = buf_reader.fill_buf()?;
        if is_binary(head) && !options.binary {
            return Ok(Vec::new());
        }
    }

    let mut content = String::new();
    buf_reader.read_to_string(&mut content)?;

    // Byte offset at which each line starts: 0, plus the byte after every '\n'.
    // Built from the real newline positions so CRLF input and varying line
    // lengths are handled correctly.
    let line_starts: Vec<usize> = std::iter::once(0)
        .chain(content.match_indices('\n').map(|(pos, _)| pos + 1))
        .collect();

    let mut matches = Vec::new();

    for m in regex.find_iter(&content) {
        let match_start = m.start();

        // `Ok(i)`: the match begins exactly at the start of line `i`.
        // `Err(i)`: `i` is the insertion point, i.e. the first line starting
        // *after* the match, so the containing line is `i - 1`.
        let line_idx = match line_starts.binary_search(&match_start) {
            Ok(idx) => idx,
            Err(idx) => idx.saturating_sub(1),
        };

        let line_start = line_starts.get(line_idx).copied().unwrap_or(0);
        // The next line start sits one past this line's '\n'; drop that byte.
        let line_end = line_starts
            .get(line_idx + 1)
            .map_or(content.len(), |&next| next - 1);
        let line_content = content
            .get(line_start..line_end)
            .unwrap_or_default()
            .trim_end()
            .to_string();

        let line_number = u32::try_from(line_idx + 1).unwrap_or(u32::MAX);
        let col =
            u32::try_from(match_start.saturating_sub(line_start) + 1).unwrap_or(u32::MAX);

        matches.push(Match {
            file_path: path.to_path_buf(),
            line_number,
            col,
            line_content: if options.count_only {
                String::new()
            } else {
                line_content
            },
            byte_offset: u64::try_from(match_start).unwrap_or(u64::MAX),
            context_before: vec![],
            context_after: vec![],
            is_binary: false,
        });

        if options.max_results > 0 && matches.len() >= options.max_results {
            break;
        }
    }

    Ok(matches)
}

/// Stream search within byte-range windows of mmap data.
///
/// For each `(start, end)` window (clamped to the data length, empty windows
/// skipped) the slice is decoded lossily as UTF-8, `regex.find_iter` is run
/// on it, and every match offset is converted back to an absolute byte
/// position in `mmap_data`. A match start that was already reported for an
/// earlier window is skipped, so overlapping windows do not produce
/// duplicates.
///
/// For every match:
/// * `line_number` is the 1-based line number within the whole of
///   `mmap_data`, and `col` is the 1-based byte column within that line;
/// * `line_content` is the full line containing the match start, even when
///   the line extends beyond the window;
/// * context lines (when `options.context_lines > 0`) are taken only from
///   lines that start inside the span from the window's first line to the
///   window end.
///
/// A non-zero `options.max_results` caps the number of returned matches.
///
/// # Errors
///
/// The visible implementation has no failing path; the `Result` return type
/// is kept for interface compatibility.
pub fn stream_file_with_windows(
    mmap_data: &[u8],
    path: &Path,
    regex: &Regex,
    options: &QueryOptions,
    windows: &[(usize, usize)],
) -> Result<Vec<Match>> {
    if windows.is_empty() {
        return Ok(vec![]);
    }

    let mut matches = Vec::new();
    let mut seen_offsets: HashSet<u64> = HashSet::new();

    // Running count of '\n' bytes in `mmap_data[..counted_to]`. Windows given
    // in ascending order only pay for the bytes between them; an out-of-order
    // window restarts the count from the beginning.
    let mut counted_to = 0usize;
    let mut newlines_before = 0usize;

    for &(win_start, win_end) in windows {
        let s = win_start.min(mmap_data.len());
        let e = win_end.min(mmap_data.len());
        if s >= e {
            continue;
        }

        let window = mmap_data.get(s..e).unwrap_or_default();

        // Start of the line that contains the first byte of the window.
        let first_line_start = mmap_data
            .get(..s)
            .and_then(|before| before.iter().rposition(|&b| b == b'\n'))
            .map_or(0, |pos| pos + 1);

        // Number of lines that precede `first_line_start` in the whole file.
        if first_line_start < counted_to {
            counted_to = 0;
            newlines_before = 0;
        }
        newlines_before += mmap_data
            .get(counted_to..first_line_start)
            .unwrap_or_default()
            .iter()
            .filter(|&&b| b == b'\n')
            .count();
        counted_to = first_line_start;

        let line_offsets = compute_line_offsets(
            mmap_data.get(first_line_start..e).unwrap_or_default(),
            first_line_start,
        );

        let window_str = String::from_utf8_lossy(window);
        // An owned result means replacement characters were inserted, so
        // string offsets no longer equal byte offsets in `window`.
        let is_lossy = matches!(window_str, std::borrow::Cow::Owned(_));

        for m in regex.find_iter(&window_str) {
            let abs_byte_start = if is_lossy {
                s + count_bytes_before_offset(window, m.start())
            } else {
                s + m.start()
            };

            if !seen_offsets.insert(u64::try_from(abs_byte_start).unwrap_or(u64::MAX)) {
                continue;
            }

            // `Ok(i)`: the match begins exactly at the start of line `i`.
            // `Err(i)`: insertion point, so the containing line is `i - 1`.
            let rel_idx = match line_offsets.binary_search(&abs_byte_start) {
                Ok(idx) => idx,
                Err(idx) => idx.saturating_sub(1),
            };

            let line_start_off = line_offsets
                .get(rel_idx)
                .copied()
                .unwrap_or(first_line_start);
            // `line_offsets` stops at the window end, so the last known line
            // may continue past it: look for its real terminator instead of
            // assuming it runs to the end of the file.
            let line_end_off = line_offsets.get(rel_idx + 1).copied().unwrap_or_else(|| {
                mmap_data
                    .get(line_start_off..)
                    .and_then(|rest| rest.iter().position(|&b| b == b'\n'))
                    .map_or(mmap_data.len(), |pos| line_start_off + pos)
            });

            let line_bytes =
                trim_line_end(mmap_data.get(line_start_off..line_end_off).unwrap_or_default());
            let line_str = String::from_utf8_lossy(line_bytes).to_string();

            let col = u32::try_from(abs_byte_start.saturating_sub(line_start_off) + 1)
                .unwrap_or(u32::MAX);

            // The context helpers index into `line_offsets`, so they need the
            // 1-based line number relative to the window's first line.
            let rel_line_number = u32::try_from(rel_idx + 1).unwrap_or(u32::MAX);
            // The reported line number is relative to the start of the file.
            let line_number =
                u32::try_from(newlines_before + rel_idx + 1).unwrap_or(u32::MAX);

            let (context_before, context_after) = if options.context_lines > 0 {
                let before = collect_context_before(
                    &line_offsets,
                    mmap_data,
                    rel_line_number,
                    options.context_lines,
                );
                let after = collect_context_after(
                    &line_offsets,
                    mmap_data,
                    rel_line_number,
                    options.context_lines,
                );
                (before, after)
            } else {
                (vec![], vec![])
            };

            matches.push(Match {
                file_path: path.to_path_buf(),
                line_number,
                col,
                line_content: if options.count_only {
                    String::new()
                } else {
                    line_str
                },
                byte_offset: u64::try_from(abs_byte_start).unwrap_or(u64::MAX),
                context_before,
                context_after,
                is_binary: false,
            });

            if options.max_results > 0 && matches.len() >= options.max_results {
                return Ok(matches);
            }
        }
    }

    Ok(matches)
}

/// Map a byte offset in `String::from_utf8_lossy(data)` back to the
/// corresponding byte offset in `data`.
///
/// Lossy decoding copies valid UTF-8 runs unchanged and replaces each invalid
/// byte sequence with a single U+FFFD (three bytes in UTF-8), so offsets in
/// the decoded string drift away from offsets in the source bytes. This walks
/// the same valid/invalid segmentation and undoes that drift.
///
/// * `data` - the original, possibly invalid, UTF-8 bytes.
/// * `str_offset` - byte offset into the lossily decoded string.
///
/// Returns the source byte offset. An offset that falls inside a replacement
/// character maps to the start of the invalid sequence it replaced; an offset
/// past the end of the decoded string maps to `data.len()`.
fn count_bytes_before_offset(data: &[u8], str_offset: usize) -> usize {
    // UTF-8 length of U+FFFD REPLACEMENT CHARACTER.
    const REPLACEMENT_LEN: usize = 3;

    let mut src_pos = 0usize; // position in `data`
    let mut lossy_pos = 0usize; // matching position in the decoded string

    while src_pos < data.len() {
        let rest = &data[src_pos..];
        // Length of the next valid run and of the invalid sequence after it.
        let (valid_len, invalid_len) = match std::str::from_utf8(rest) {
            Ok(_) => (rest.len(), 0),
            Err(err) => {
                let valid = err.valid_up_to();
                // `None` means the input ended mid-sequence: the whole tail is
                // replaced by one U+FFFD.
                (valid, err.error_len().unwrap_or(rest.len() - valid))
            }
        };

        // Inside (or right at the end of) a valid run offsets map one-to-one.
        if str_offset <= lossy_pos + valid_len {
            return src_pos + (str_offset - lossy_pos);
        }
        src_pos += valid_len;
        lossy_pos += valid_len;

        if invalid_len == 0 {
            break;
        }

        // Offset lands inside the replacement character.
        if str_offset < lossy_pos + REPLACEMENT_LEN {
            return src_pos;
        }
        src_pos += invalid_len;
        lossy_pos += REPLACEMENT_LEN;
    }

    src_pos
}

/// List the absolute start offset of every line in `data`.
///
/// `base_offset` is the absolute position of `data[0]`; it is always the
/// first entry. Each `\n` contributes the offset of the byte that follows it,
/// unless that would be the end of `data` (a trailing newline does not open a
/// new line).
fn compute_line_offsets(data: &[u8], base_offset: usize) -> Vec<usize> {
    let mut starts = Vec::with_capacity(64);
    starts.push(base_offset);
    starts.extend(
        data.iter()
            .enumerate()
            .filter(|&(_, &byte)| byte == b'\n')
            .map(|(idx, _)| base_offset + idx + 1)
            .filter(|&start| start < data.len() + base_offset),
    );
    starts
}

/// Strip every trailing `\n` and `\r` byte from `data`.
///
/// Returns the leading sub-slice that ends at the last byte which is neither
/// `\n` nor `\r`; the result is empty when `data` holds only such bytes.
fn trim_line_end(data: &[u8]) -> &[u8] {
    let mut kept = data;
    while let [head @ .., b'\n' | b'\r'] = kept {
        kept = head;
    }
    kept
}

/// Gather up to `context_lines` lines that precede the matched line.
///
/// `line_number` is 1-based and indexes into `line_offsets` (entry
/// `line_number - 1` is the matched line). Each context line spans from its
/// own entry to the next entry in `line_offsets`, falling back to `0` /
/// `data.len()` when an entry is missing. Lines are returned in file order,
/// with trailing `\r`/`\n` removed and decoded lossily as UTF-8.
fn collect_context_before(
    line_offsets: &[usize],
    data: &[u8],
    line_number: u32,
    context_lines: usize,
) -> Vec<String> {
    let matched = usize::try_from(line_number).unwrap_or(0).saturating_sub(1);
    let first = matched.saturating_sub(context_lines);

    (first..matched)
        .map(|idx| {
            let begin = line_offsets.get(idx).copied().unwrap_or(0);
            let finish = line_offsets.get(idx + 1).copied().unwrap_or(data.len());
            let raw = data.get(begin..finish).unwrap_or_default();
            String::from_utf8_lossy(trim_line_end(raw)).to_string()
        })
        .collect()
}

/// Gather up to `context_lines` lines that follow the matched line.
///
/// `line_number` is 1-based and indexes into `line_offsets`, so entry
/// `line_number` is the first line after the match. The range stops before
/// the last entry of `line_offsets`, so that final line is never returned as
/// context. Lines are returned in file order, with trailing `\r`/`\n` removed
/// and decoded lossily as UTF-8.
fn collect_context_after(
    line_offsets: &[usize],
    data: &[u8],
    line_number: u32,
    context_lines: usize,
) -> Vec<String> {
    let first = usize::try_from(line_number).unwrap_or(0);
    let last_known = line_offsets.len().saturating_sub(1);
    let stop = (first + context_lines).min(last_known);

    (first..stop)
        .map(|idx| {
            let begin = line_offsets.get(idx).copied().unwrap_or(0);
            let finish = line_offsets.get(idx + 1).copied().unwrap_or(data.len());
            let raw = data.get(begin..finish).unwrap_or_default();
            String::from_utf8_lossy(trim_line_end(raw)).to_string()
        })
        .collect()
}