// keyhog-scanner 0.5.40
//
// keyhog-scanner: high-performance SIMD-accelerated secret detection engine
// Documentation
use super::*;
use std::collections::{HashSet, VecDeque};

impl CompiledScanner {
    /// Scans a large chunk as a sequence of overlapping windows.
    ///
    /// The chunk text is cut into windows of at most `MAX_SCAN_CHUNK_BYTES`
    /// (extended forward to the next UTF-8 character boundary). Each window is
    /// scanned with `scan_inner`, using the backend that
    /// `select_backend_for_file` picks for the window's size. The next window
    /// starts `WINDOW_OVERLAP_BYTES` before the end of the previous one.
    /// Every match is rebased to a chunk-absolute offset and de-duplicated by
    /// [`record_window_match`], so a match found in two overlapping windows is
    /// reported once.
    ///
    /// # Returns
    ///
    /// The accepted matches in discovery order. The result is empty (and a
    /// warning is logged) when the chunk is larger than 512 MiB. When
    /// `deadline` has passed, scanning stops before the next window and the
    /// matches collected so far are returned.
    pub(super) fn scan_windowed(
        &self,
        chunk: &Chunk,
        deadline: Option<std::time::Instant>,
    ) -> Vec<RawMatch> {
        let chunk_text = &chunk.data;
        if chunk_text.len() > 512 * 1024 * 1024 {
            tracing::warn!(
                "Chunk from {} exceeds 512MB limit ({} bytes), skipping to prevent OOM.",
                chunk.metadata.path.as_deref().unwrap_or("unknown"),
                chunk_text.len()
            );
            return Vec::new();
        }
        let mut all_matches = Vec::with_capacity((chunk_text.len() / 4096).max(16));
        let mut seen = HashSet::new();
        let mut seen_order = VecDeque::new();
        let mut offset = 0usize;

        while offset < chunk_text.len() {
            // Checked once per window; `scan_inner` also receives the deadline.
            if let Some(deadline) = deadline {
                if std::time::Instant::now() > deadline {
                    break;
                }
            }
            let end = window_end_offset(chunk_text, offset, MAX_SCAN_CHUNK_BYTES);
            let window_chunk = window_chunk(chunk, offset, end);
            let backend = self.select_backend_for_file(window_chunk.data.len() as u64);
            for mut raw_match in self.scan_inner(&window_chunk, backend, deadline) {
                if record_window_match(
                    chunk_text,
                    offset,
                    &mut raw_match,
                    &mut seen,
                    &mut seen_order,
                ) {
                    all_matches.push(raw_match);
                }
            }
            if end >= chunk_text.len() {
                break;
            }
            let next = next_window_offset(chunk_text, end, WINDOW_OVERLAP_BYTES);
            // Guarantee forward progress: if the overlap is at least as large
            // as the window, stepping back by it would rescan the same window
            // forever. In that case drop the overlap and continue from `end`.
            offset = if next > offset { next } else { end };
        }

        all_matches
    }
}

/// Returns the exclusive end byte offset of the window that starts at `start`.
///
/// The end is `start + max_len`, capped at `text.len()`. If that position
/// falls inside a multi-byte UTF-8 sequence it is moved forward to the next
/// character boundary, so a window can be up to three bytes longer than
/// `max_len`.
///
/// The addition saturates, so very large `start` or `max_len` values yield
/// `text.len()` instead of overflowing. When `start` is past the end of
/// `text` the result is `text.len()`.
pub fn window_end_offset(text: &str, start: usize, max_len: usize) -> usize {
    let mut end = start.saturating_add(max_len).min(text.len());
    while end < text.len() && !text.is_char_boundary(end) {
        end += 1;
    }
    end
}

/// Returns the byte offset at which the window after `current_end` starts.
///
/// The start is `current_end - overlap` (saturating at zero), moved forward
/// to the nearest UTF-8 character boundary of `text` when it lands inside a
/// multi-byte sequence. A candidate beyond `text.len()` is returned as is.
pub fn next_window_offset(text: &str, current_end: usize, overlap: usize) -> usize {
    let candidate = current_end.saturating_sub(overlap);
    // `text.len()` itself counts as a boundary, so the search only comes up
    // empty when the candidate already lies past the end of the text.
    (candidate..=text.len())
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(candidate)
}

/// Builds a standalone `Chunk` holding the bytes `start..end` of `chunk`.
///
/// The window text is copied into a new allocation and the metadata is
/// cloned from the source chunk.
///
/// # Panics
///
/// Panics if `start..end` is out of range or does not fall on UTF-8
/// character boundaries of the chunk's text.
pub fn window_chunk(chunk: &Chunk, start: usize, end: usize) -> Chunk {
    let window_text = &chunk.data.as_str()[start..end];
    Chunk {
        data: window_text.to_owned().into(),
        metadata: chunk.metadata.clone(),
    }
}

/// Rebases a window-relative match onto the full text and de-duplicates it.
///
/// `m.location.offset` is shifted by `window_offset`. If the match carries a
/// line number, it is recomputed against `text` at the new offset. The match
/// is then keyed by `(detector_id, credential, offset)`.
///
/// `seen` holds the keys accepted so far and `seen_order` records their
/// insertion order. Once `seen` holds `MAX_WINDOW_DEDUP_ENTRIES` or more
/// keys, the oldest recorded key is evicted before the new one is stored; an
/// evicted key is no longer recognised as a duplicate.
///
/// Returns `true` when the match is new and should be kept, `false` when the
/// same key is already present in `seen`.
pub fn record_window_match(
    text: &str,
    window_offset: usize,
    m: &mut RawMatch,
    seen: &mut HashSet<(Arc<str>, Arc<str>, usize)>,
    seen_order: &mut VecDeque<(Arc<str>, Arc<str>, usize)>,
) -> bool {
    m.location.offset += window_offset;
    let absolute_offset = m.location.offset;
    if let Some(line) = m.location.line.as_mut() {
        *line = line_number_for_offset(text, absolute_offset);
    }

    let dedup_key = (
        m.detector_id.clone(),
        m.credential.clone(),
        absolute_offset,
    );
    if seen.contains(&dedup_key) {
        return false;
    }

    // Make room first so the set never grows past the configured bound.
    let at_capacity = seen.len() >= MAX_WINDOW_DEDUP_ENTRIES;
    if at_capacity {
        if let Some(evicted) = seen_order.pop_front() {
            seen.remove(&evicted);
        }
    }

    seen_order.push_back(dedup_key.clone());
    seen.insert(dedup_key);
    true
}

/// Returns the 1-based line number of the byte at `offset` in `text`.
///
/// The result is one more than the number of `'\n'` characters before
/// `offset`. Offsets past the end of `text` are clamped to its length.
pub fn line_number_for_offset(text: &str, offset: usize) -> usize {
    let limit = offset.min(text.len());
    // Counting raw bytes needs no char-boundary adjustment: no byte of a
    // multi-byte UTF-8 sequence equals b'\n', so an offset that falls inside
    // a character yields the same count as the start of that character.
    let newlines_before = text.as_bytes()[..limit]
        .iter()
        .filter(|&&byte| byte == b'\n')
        .count();
    newlines_before + 1
}

/// Returns the largest UTF-8 character boundary of `text` that is `<= index`.
///
/// Indices at or beyond the end of `text` yield `text.len()`.
pub fn floor_char_boundary(text: &str, index: usize) -> usize {
    let clamped = index.min(text.len());
    // Offset 0 is always a boundary, so the backwards search cannot fail;
    // the fallback only satisfies the type.
    (0..=clamped)
        .rev()
        .find(|&pos| text.is_char_boundary(pos))
        .unwrap_or(0)
}