// moeix 0.4.2 - Docs.rs

//! Query executor — search through the index and verify results.
//!
//! Handles literal searches, indexed regex, and full scans.

use crate::decompress::maybe_decompress;
use crate::error::Result;
use crate::format::is_binary;
use crate::planner::QueryPlan;
use crate::reader::{FileInfo, Reader};
use crate::trigram::Trigram;
use rayon::prelude::*;
use regex::Regex;
use std::collections::HashSet;
use std::fs::File;
use std::io::{BufRead, BufReader, Cursor, Read};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};

/// A single regex match found in a file.
///
/// Derives `Clone`, `PartialEq` and `Eq` so callers can retain, deduplicate
/// and compare results (for example in tests) without hand-written glue.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Match {
    /// Path of the file containing the match, as recorded for the searched file.
    pub file_path: PathBuf,
    /// 1-based line number.
    pub line_number: u32,
    /// 1-based column of the match start within the line, counted in bytes.
    pub col: u32,
    /// The matching line with trailing whitespace removed
    /// (left empty when only counts were requested).
    pub line_content: String,
    /// Byte offset of the match start from the start of the file.
    pub byte_offset: u64,
    /// Lines preceding the match (context), oldest first.
    pub context_before: Vec<String>,
    /// Lines following the match (context), in file order.
    pub context_after: Vec<String>,
    /// Whether the file was detected as binary.
    pub is_binary: bool,
}

/// Performance counters collected during query execution.
///
/// All counters start at zero (`Default`). The type is `Clone` and comparable
/// so snapshots can be kept and asserted on.
#[derive(Default, Debug, Clone, PartialEq, Eq)]
pub struct QueryStats {
    /// Number of trigrams looked up in the trigram table.
    pub trigrams_queried: u32,
    /// Number of posting lists that were fully decoded from disk.
    pub posting_lists_decoded: u32,
    /// Number of candidate files after intersection and bloom filtering.
    pub candidate_files: u32,
    /// Number of files whose content was verified against the regex.
    pub files_verified: u32,
    /// Sum of the indexed sizes (in bytes) of the files handed to verification.
    pub bytes_verified: u64,
    /// Total number of matches produced.
    pub total_matches: u32,
}

/// Tunable knobs that control query execution behaviour.
///
/// `Default` yields: no limits, no filters, no context, every flag off.
#[derive(Debug, Default, Clone)]
#[allow(clippy::struct_excessive_bools)]
pub struct QueryOptions {
    /// Only report match counts (not line content).
    ///
    /// When set, matches are still produced but their `line_content` is empty.
    pub count_only: bool,
    /// Only list file paths containing matches.
    ///
    /// When set, the executor neither stops early nor truncates the result
    /// list, so that every matching file is reported.
    pub files_only: bool,
    /// Maximum number of results to return (0 = unlimited).
    pub max_results: usize,
    /// File extensions to restrict the search to.
    ///
    /// Entries are compared for exact equality with the path's extension
    /// (no leading dot); an empty list disables the filter.
    pub type_filter: Vec<String>,
    /// Number of context lines to show before and after each match.
    pub context_lines: usize,
    /// Transparently decompress archives (e.g. `.gz`) when scanning.
    pub decompress: bool,
    /// Number of Rayon threads to use for parallel work.
    // NOTE(review): not read by the executor in this file — presumably consumed
    // by whoever configures the Rayon pool; confirm against the caller.
    pub threads: usize,
    /// Dot-matches-newline mode for regex matching.
    // NOTE(review): not read by the executor in this file — presumably applied
    // when the regex is built; confirm against the planner.
    pub multiline: bool,
    /// Search inside archive files (zip, tar.gz).
    // NOTE(review): not read by the executor in this file — confirm where it is honoured.
    pub archive: bool,
    /// Search binary files as if they were text.
    ///
    /// When unset, content detected as binary is skipped without producing matches.
    pub binary: bool,
}

/// Query executor that searches through an open index and verifies
/// candidate files against the original regex.
///
/// The executor borrows the index for `'a` and holds no other state.
pub struct Executor<'a> {
    /// Index used for trigram lookups, posting lists, bloom checks and file metadata.
    index: &'a Reader,
}

impl<'a> Executor<'a> {
    /// Create an executor backed by the given index reader.
    ///
    /// Only stores the borrow; no index data is read here.
    #[must_use]
    pub const fn new(index: &'a Reader) -> Self {
        Self { index }
    }

    /// Run `plan` against the index and return the matches together with
    /// the counters gathered along the way.
    ///
    /// Each plan variant is dispatched to its dedicated strategy: literal
    /// lookup, trigram-assisted regex, case-insensitive lookup, or a scan of
    /// every indexed file.
    ///
    /// # Errors
    ///
    /// Returns an error if a posting list cannot be decoded while executing a
    /// literal or trigram-assisted regex plan. Files that cannot be looked up,
    /// opened or read during verification are skipped rather than reported.
    pub fn execute(
        &self,
        plan: &QueryPlan,
        options: &QueryOptions,
    ) -> Result<(Vec<Match>, QueryStats)> {
        let outcome = match plan {
            QueryPlan::Literal {
                pattern,
                trigrams,
                regex,
            } => self.execute_literal(pattern, trigrams, regex, options)?,
            QueryPlan::RegexWithLiterals {
                regex,
                required_trigram_sets,
            } => self.execute_regex_indexed(regex, required_trigram_sets, options)?,
            // The remaining strategies are infallible.
            QueryPlan::CaseInsensitive {
                regex,
                trigram_groups,
            } => self.execute_case_insensitive(regex, trigram_groups, options),
            QueryPlan::FullScan { regex } => self.execute_full_scan(regex, options),
        };
        Ok(outcome)
    }

    #[allow(clippy::as_conversions)] // match counts: len()→u32 fits within range
    #[allow(clippy::indexing_slicing)] // infos sorted+checked: .get(0) always valid
    fn execute_literal(
        &self,
        _pattern: &[u8],
        trigrams: &[Trigram],
        regex: &Regex,
        options: &QueryOptions,
    ) -> Result<(Vec<Match>, QueryStats)> {
        let mut stats = QueryStats::default();

        let mut infos = Vec::new();
        for &tri in trigrams {
            stats.trigrams_queried += 1;
            if let Some(info) = self.index.get_trigram(tri) {
                infos.push((tri, info));
            } else {
                return Ok((vec![], stats));
            }
        }

        // Sort by doc_frequency (rarest first)
        infos.sort_by_key(|(_, info)| info.doc_frequency);

        tracing::debug!(
            "literal search: {} trigrams, rarities: {:?}",
            infos.len(),
            infos.iter().map(|(t, i)| (format!("0x{t:06x}"), i.doc_frequency)).collect::<Vec<_>>()
        );

        // ── Step 1: Decode rarest posting list ──
        let (_, rarest_info) = &infos[0];
        let postings = self.index.decode_postings(rarest_info)?;
        stats.posting_lists_decoded += 1;

        let mut candidates: HashSet<u32> = postings.entries.iter().map(|e| e.file_id).collect();
        tracing::debug!("step 1 (rarest): {} candidates", candidates.len());

        // ── Step 2: Intersect with next rarest lists if candidate set is large ──
        // Only decode up to 3 lists to avoid excessive I/O
        for (_, info) in infos.iter().take(infos.len().min(3)).skip(1) {
            if candidates.len() < 100 {
                tracing::debug!("step 2: breaking early, {} candidates < 100", candidates.len());
                break;
            }

            let next_postings = self.index.decode_postings(info)?;
            stats.posting_lists_decoded += 1;

            let next_set: HashSet<u32> = next_postings.entries.iter().map(|e| e.file_id).collect();
            candidates.retain(|fid| next_set.contains(fid));
        }

        // ── Step 3: Filter remaining using Bloom filters ──
        for &(tri, _) in &infos[1..] {
            if candidates.is_empty() {
                break;
            }
            candidates.retain(|&fid| self.index.bloom_may_contain(fid, tri));
        }

        stats.candidate_files = candidates.len() as u32;

        // Parallel verification
        let files_verified = AtomicU32::new(0);
        let bytes_verified = std::sync::atomic::AtomicU64::new(0);
        let matches_found = AtomicU32::new(0);

        let candidate_list: Vec<u32> = candidates.into_iter().collect();

        let mut all_matches: Vec<Match> = candidate_list
            .into_par_iter()
            .filter_map(|fid| {
                // Don't early-terminate when listing files - verify ALL candidates
                // for complete results with common terms (fixes charlie_summaries bug)
                let should_early_terminate = options.max_results > 0
                    && !options.files_only
                    && matches_found.load(Ordering::Relaxed) >= options.max_results as u32;
                if should_early_terminate {
                    return None;
                }

                let file_info = self.index.get_file(fid).ok()?;

                // Filter by extension
                if !options.type_filter.is_empty() {
                    let ext = file_info
                        .path
                        .extension()
                        .and_then(|e: &std::ffi::OsStr| e.to_str())
                        .unwrap_or("");
                    if !options.type_filter.iter().any(|e: &String| e == ext) {
                        return None;
                    }
                }

                files_verified.fetch_add(1, Ordering::Relaxed);
                bytes_verified.fetch_add(file_info.size_bytes, Ordering::Relaxed);

                let matches = Self::verify_file(&file_info, regex, options).ok()?;
                matches_found.fetch_add(matches.len() as u32, Ordering::Relaxed);
                Some(matches)
            })
            .flatten()
            .collect();

        stats.files_verified = files_verified.into_inner();
        stats.bytes_verified = bytes_verified.into_inner();

        // Don't truncate when listing files - need all unique files for complete results
        if options.max_results > 0 && !options.files_only && all_matches.len() > options.max_results {
            all_matches.truncate(options.max_results);
        }

        stats.total_matches = all_matches.len() as u32;

        Ok((all_matches, stats))
    }

    /// Execute a regex query whose required literal fragments can be looked
    /// up in the trigram index.
    ///
    /// For every fragment (one trigram set) a candidate set is computed the
    /// same way as for a literal query: rarest posting list, up to two further
    /// intersections while the set is large, then bloom filtering. The
    /// per-fragment sets are intersected and the survivors verified against
    /// `regex` in parallel.
    ///
    /// A fragment with no trigrams imposes no constraint and is skipped. If no
    /// fragment constrains the search at all, the query falls back to a full
    /// scan rather than returning an empty result.
    ///
    /// # Errors
    ///
    /// Returns an error if a posting list cannot be decoded. Files that cannot
    /// be looked up or verified are skipped.
    #[allow(clippy::as_conversions)] // len()→u32 fits within range; u32→usize is widening
    fn execute_regex_indexed(
        &self,
        regex: &Regex,
        required_trigram_sets: &[Vec<Trigram>],
        options: &QueryOptions,
    ) -> Result<(Vec<Match>, QueryStats)> {
        let mut stats = QueryStats::default();

        // For each required literal fragment, find candidate files
        let mut fragment_candidates: Vec<HashSet<u32>> = Vec::new();
        for trigram_set in required_trigram_sets {
            let mut infos = Vec::with_capacity(trigram_set.len());
            for &tri in trigram_set {
                stats.trigrams_queried += 1;
                let Some(info) = self.index.get_trigram(tri) else {
                    // A required trigram missing from the index rules out every file.
                    return Ok((vec![], stats));
                };
                infos.push((tri, info));
            }

            infos.sort_by_key(|(_, info)| info.doc_frequency);

            // Intersection within fragment
            let Some((_, rarest_info)) = infos.first() else {
                // Empty fragment: nothing to look up, so it cannot exclude any file.
                continue;
            };
            let postings = self.index.decode_postings(rarest_info)?;
            stats.posting_lists_decoded += 1;
            let mut set_candidates: HashSet<u32> =
                postings.entries.iter().map(|e| e.file_id).collect();

            // Intersect with up to 2 more lists if large
            for (_, info) in infos.iter().take(3).skip(1) {
                if set_candidates.len() < 100 {
                    break;
                }
                let next_postings = self.index.decode_postings(info)?;
                stats.posting_lists_decoded += 1;
                let next_set: HashSet<u32> =
                    next_postings.entries.iter().map(|e| e.file_id).collect();
                set_candidates.retain(|fid| next_set.contains(fid));
            }

            for &(tri, _) in infos.iter().skip(1) {
                set_candidates.retain(|&fid| self.index.bloom_may_contain(fid, tri));
            }
            fragment_candidates.push(set_candidates);
        }

        // Intersect candidates from all fragments
        let Some(mut final_candidates) = fragment_candidates.pop() else {
            // No fragment narrowed the search: every file is a candidate.
            return Ok(self.execute_full_scan(regex, options));
        };
        for set in fragment_candidates {
            final_candidates.retain(|fid: &u32| set.contains(fid));
        }

        stats.candidate_files = final_candidates.len() as u32;

        let files_verified = AtomicU32::new(0);
        let bytes_verified = AtomicU64::new(0);
        let matches_found = AtomicU32::new(0);

        let candidate_list: Vec<u32> = final_candidates.into_iter().collect();

        let mut all_matches: Vec<Match> = candidate_list
            .into_par_iter()
            .filter_map(|fid| {
                // Don't early-terminate when listing files - verify ALL candidates.
                // The limit is compared as usize so a large `max_results` is never
                // truncated to a small (or zero) u32.
                let limit_reached = options.max_results > 0
                    && !options.files_only
                    && (matches_found.load(Ordering::Relaxed) as usize) >= options.max_results;
                if limit_reached {
                    return None;
                }

                // Best effort: unknown ids and unreadable files are skipped.
                let file_info = self.index.get_file(fid).ok()?;

                // Filter by extension
                if !options.type_filter.is_empty() {
                    let ext = file_info
                        .path
                        .extension()
                        .and_then(|e: &std::ffi::OsStr| e.to_str())
                        .unwrap_or("");
                    if !options.type_filter.iter().any(|e: &String| e == ext) {
                        return None;
                    }
                }

                files_verified.fetch_add(1, Ordering::Relaxed);
                bytes_verified.fetch_add(file_info.size_bytes, Ordering::Relaxed);

                let file_matches = Self::verify_file(&file_info, regex, options).ok()?;
                matches_found.fetch_add(file_matches.len() as u32, Ordering::Relaxed);
                Some(file_matches)
            })
            .flatten()
            .collect();

        stats.files_verified = files_verified.into_inner();
        stats.bytes_verified = bytes_verified.into_inner();

        // Don't truncate when listing files
        if options.max_results > 0 && !options.files_only && all_matches.len() > options.max_results {
            all_matches.truncate(options.max_results);
        }

        stats.total_matches = all_matches.len() as u32;
        Ok((all_matches, stats))
    }

    #[allow(clippy::as_conversions)] // match counts: len()→u32 fits within range
    fn execute_case_insensitive(
        &self,
        regex: &Regex,
        trigram_groups: &[Vec<Trigram>],
        options: &QueryOptions,
    ) -> (Vec<Match>, QueryStats) {
        let mut stats = QueryStats::default();

        // For each position group: UNION posting lists of all variants found
        let mut group_candidates = Vec::new();
        for group in trigram_groups {
            let mut union_set: HashSet<u32> = HashSet::new();
            for &tri in group {
                stats.trigrams_queried += 1;
                if let Some(info) = self.index.get_trigram(tri)
                    && let Ok(postings) = self.index.decode_postings(&info)
                {
                    stats.posting_lists_decoded += 1;
                    for entry in &postings.entries {
                        union_set.insert(entry.file_id);
                    }
                }
                // Missing variant = skip, not abort
            }
            if !union_set.is_empty() {
                group_candidates.push(union_set);
            }
        }

        // Intersect across position groups
        let final_candidates = if let Some(mut base) = group_candidates.pop() {
            for set in group_candidates {
                base.retain(|fid| set.contains(fid));
            }
            base
        } else {
            // No trigrams found at all — fall back to all files
            let all: HashSet<u32> = (0..self.index.header.file_count).collect();
            all
        };

        stats.candidate_files = final_candidates.len() as u32;

        let files_verified = AtomicU32::new(0);
        let bytes_verified = AtomicU64::new(0);
        let matches_found = AtomicU32::new(0);

        let candidate_list: Vec<u32> = final_candidates.into_iter().collect();

        let mut all_matches: Vec<Match> = candidate_list
            .into_par_iter()
            .filter_map(|fid| {
                // Don't early-terminate when listing files - verify ALL candidates
                let should_early_terminate = options.max_results > 0
                    && !options.files_only
                    && matches_found.load(Ordering::Relaxed) >= options.max_results as u32;
                if should_early_terminate {
                    return None;
                }

                let file_info = self.index.get_file(fid).ok()?;

                if !options.type_filter.is_empty() {
                    let ext = file_info
                        .path
                        .extension()
                        .and_then(|e| e.to_str())
                        .unwrap_or("");
                    if !options.type_filter.iter().any(|e| e == ext) {
                        return None;
                    }
                }

                files_verified.fetch_add(1, Ordering::Relaxed);
                bytes_verified.fetch_add(file_info.size_bytes, Ordering::Relaxed);

                let file_matches = Self::verify_file(&file_info, regex, options).ok()?;
                matches_found.fetch_add(file_matches.len() as u32, Ordering::Relaxed);
                Some(file_matches)
            })
            .flatten()
            .collect();

        stats.files_verified = files_verified.into_inner();
        stats.bytes_verified = bytes_verified.into_inner();

        // Don't truncate when listing files
        if options.max_results > 0 && !options.files_only && all_matches.len() > options.max_results {
            all_matches.truncate(options.max_results);
        }

        stats.total_matches = all_matches.len() as u32;
        (all_matches, stats)
    }

    /// Verify every file known to the index against `regex`, without using
    /// the trigram index to narrow the search.
    ///
    /// File ids `0..file_count` are verified in parallel. Files that cannot be
    /// looked up or verified are skipped. Unless `files_only` is set, work
    /// stops being scheduled once `max_results` matches were seen and the
    /// final list is truncated to that limit.
    #[allow(clippy::as_conversions)] // len()→u32 fits within range; u32→usize is widening
    fn execute_full_scan(
        &self,
        regex: &Regex,
        options: &QueryOptions,
    ) -> (Vec<Match>, QueryStats) {
        let file_count = self.index.header.file_count;

        let files_verified = AtomicU32::new(0);
        let bytes_verified = AtomicU64::new(0);
        let matches_found = AtomicU32::new(0);

        let mut all_matches: Vec<Match> = (0..file_count)
            .into_par_iter()
            .filter_map(|fid| {
                // Don't early-terminate when listing files - verify ALL candidates.
                // The limit is compared as usize so a large `max_results` is never
                // truncated to a small (or zero) u32.
                let limit_reached = options.max_results > 0
                    && !options.files_only
                    && (matches_found.load(Ordering::Relaxed) as usize) >= options.max_results;
                if limit_reached {
                    return None;
                }

                // Best effort: unknown ids and unreadable files are skipped.
                let file_info = self.index.get_file(fid).ok()?;

                // Filter by extension
                if !options.type_filter.is_empty() {
                    let ext = file_info
                        .path
                        .extension()
                        .and_then(|e: &std::ffi::OsStr| e.to_str())
                        .unwrap_or("");
                    if !options.type_filter.iter().any(|e: &String| e == ext) {
                        return None;
                    }
                }

                files_verified.fetch_add(1, Ordering::Relaxed);
                bytes_verified.fetch_add(file_info.size_bytes, Ordering::Relaxed);

                let file_matches = Self::verify_file(&file_info, regex, options).ok()?;
                matches_found.fetch_add(file_matches.len() as u32, Ordering::Relaxed);
                Some(file_matches)
            })
            .flatten()
            .collect();

        // Don't truncate when listing files
        if options.max_results > 0 && !options.files_only && all_matches.len() > options.max_results {
            all_matches.truncate(options.max_results);
        }

        // No trigram work happens here, so those counters stay at their defaults.
        let stats = QueryStats {
            candidate_files: file_count,
            files_verified: files_verified.into_inner(),
            bytes_verified: bytes_verified.into_inner(),
            total_matches: all_matches.len() as u32,
            ..Default::default()
        };
        (all_matches, stats)
    }

    /// Exposed for integration testing of the streaming logic.
    ///
    /// Forwards all arguments unchanged to the private line-by-line verifier;
    /// `self` is not used. `path` is only copied into the produced matches.
    ///
    /// # Errors
    ///
    /// Returns an error if reading from `reader` fails.
    pub fn verify_stream_for_test<R: Read>(
        &self,
        reader: R,
        path: &Path,
        regex: &Regex,
        options: &QueryOptions,
    ) -> Result<Vec<Match>> {
        Self::verify_stream(reader, path, regex, options)
    }

    /// Scan `reader` line by line and collect the lines matching `regex`.
    ///
    /// * The first buffered chunk is checked with `is_binary`; binary content
    ///   is skipped (empty result) unless `options.binary` is set. The outcome
    ///   of that check is recorded in every match's `is_binary` field.
    /// * Lines are read as raw bytes and decoded lossily, so invalid UTF-8
    ///   (binary data, legacy encodings) no longer aborts the whole file;
    ///   undecodable bytes appear as U+FFFD.
    /// * At most one match is reported per line (the first one found).
    /// * With `options.context_lines > 0`, a match is held back until its
    ///   trailing context is complete or the stream ends.
    /// * `path` is only copied into the produced matches.
    ///
    /// `byte_offset` advances by the raw length of each line. The match
    /// position inside a line is measured on the decoded text, so `col` and
    /// `byte_offset` can be shifted on lines that contained invalid UTF-8
    /// before the match.
    ///
    /// # Errors
    ///
    /// Returns an error if reading from `reader` fails.
    #[allow(clippy::as_conversions)] // line lengths / match offsets: usize→u32/u64 fits within range
    fn verify_stream<R: Read>(
        reader: R,
        path: &Path,
        regex: &Regex,
        options: &QueryOptions,
    ) -> Result<Vec<Match>> {
        let mut buf_reader = BufReader::new(reader);
        let mut matches = Vec::new();
        let mut line_number = 0u32;
        let mut byte_offset = 0u64;

        // Binary check on the first buffered chunk (peeked, not consumed)
        let is_bin = is_binary(buf_reader.fill_buf()?);
        if is_bin && !options.binary {
            return Ok(vec![]);
        }

        // Raw bytes of the current line, reused across iterations.
        let mut raw_line: Vec<u8> = Vec::new();
        let mut context_before = std::collections::VecDeque::new();
        let mut pending_matches: Vec<Match> = Vec::new();

        while buf_reader.read_until(b'\n', &mut raw_line)? > 0 {
            line_number += 1;
            let line_len = raw_line.len() as u64;
            // Borrowed when the bytes are valid UTF-8, otherwise a repaired copy.
            let line = String::from_utf8_lossy(&raw_line);
            let trimmed_line = line.trim_end().to_string();

            // Fill context_after for pending matches
            for m in &mut pending_matches {
                if m.context_after.len() < options.context_lines {
                    m.context_after.push(trimmed_line.clone());
                }
            }

            // Move completed matches to final list
            let (completed, still_pending): (Vec<_>, Vec<_>) = pending_matches
                .into_iter()
                .partition(|m| m.context_after.len() >= options.context_lines);
            matches.extend(completed);
            pending_matches = still_pending;

            if let Some(m) = regex.find(&line) {
                let new_match = Match {
                    file_path: path.to_path_buf(),
                    line_number,
                    col: (m.start() + 1) as u32,
                    line_content: if options.count_only {
                        String::new()
                    } else {
                        trimmed_line.clone()
                    },
                    byte_offset: byte_offset + m.start() as u64,
                    context_before: context_before.iter().cloned().collect(),
                    context_after: vec![],
                    is_binary: is_bin,
                };

                if options.context_lines > 0 {
                    pending_matches.push(new_match);
                } else {
                    matches.push(new_match);
                }

                // Stop once the limit is reached, but keep reading while held-back
                // matches still wait for their trailing context.
                if options.max_results > 0
                    && (matches.len() + pending_matches.len()) >= options.max_results
                    && (pending_matches.is_empty() || matches.len() >= options.max_results)
                {
                    break;
                }
            }

            if options.context_lines > 0 {
                context_before.push_back(trimmed_line);
                if context_before.len() > options.context_lines {
                    context_before.pop_front();
                }
            }

            byte_offset += line_len;
            raw_line.clear();
        }

        // Matches near the end of the stream keep whatever context was available.
        matches.extend(pending_matches);
        Ok(matches)
    }

    /// Open the file described by `info`, memory-map it and run the streaming
    /// verifier over its content.
    ///
    /// When `options.decompress` is set and `maybe_decompress` yields a reader
    /// for this file, that reader is scanned instead of the raw bytes. When
    /// only file names are wanted and no explicit limit was given, the
    /// per-file limit is forced to one match.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened or mapped, if
    /// `maybe_decompress` fails, or if streaming verification fails.
    fn verify_file(
        info: &FileInfo,
        regex: &Regex,
        options: &QueryOptions,
    ) -> Result<Vec<Match>> {
        let handle = File::open(&info.path)?;
        // SAFETY: the mapping is only read and is dropped before this function
        // returns. Soundness still relies on no other process truncating or
        // rewriting the file while it is mapped; nothing here enforces that.
        let mapped = unsafe { memmap2::Mmap::map(&handle)? };

        let mut effective_options = options.clone();
        if options.files_only && options.max_results == 0 {
            effective_options.max_results = 1;
        }

        // Only consult the decompressor when the caller asked for it.
        let decompressed = if options.decompress {
            maybe_decompress(&info.path, &mapped)?
        } else {
            None
        };

        match decompressed {
            Some(reader) => {
                Self::verify_stream(reader, info.path.as_ref(), regex, &effective_options)
            }
            None => Self::verify_stream(
                Cursor::new(&mapped[..]),
                info.path.as_ref(),
                regex,
                &effective_options,
            ),
        }
    }
}