Skip to main content

keyhog_scanner/engine/
windowed.rs

1use super::*;
2use std::collections::{HashSet, VecDeque};
3
4impl CompiledScanner {
5    pub(super) fn scan_windowed(
6        &self,
7        chunk: &Chunk,
8        deadline: Option<std::time::Instant>,
9    ) -> Vec<RawMatch> {
10        let chunk_text = &chunk.data;
11        if chunk_text.len() > 512 * 1024 * 1024 {
12            tracing::warn!(
13                "Chunk from {} exceeds 512MB limit ({} bytes), skipping to prevent OOM.",
14                chunk.metadata.path.as_deref().unwrap_or("unknown"),
15                chunk_text.len()
16            );
17            return Vec::new();
18        }
19        let mut all_matches = Vec::with_capacity((chunk_text.len() / 4096).max(16));
20        let mut seen = HashSet::new();
21        let mut seen_order = VecDeque::new();
22        let mut offset = 0usize;
23
24        while offset < chunk_text.len() {
25            if let Some(deadline) = deadline {
26                if std::time::Instant::now() > deadline {
27                    break;
28                }
29            }
30            let end = window_end_offset(chunk_text, offset, MAX_SCAN_CHUNK_BYTES);
31            let window_chunk = window_chunk(chunk, offset, end);
32            let backend = self.select_backend_for_file(window_chunk.data.len() as u64);
33            for mut raw_match in self.scan_inner(&window_chunk, backend, deadline) {
34                if record_window_match(
35                    chunk_text,
36                    offset,
37                    &mut raw_match,
38                    &mut seen,
39                    &mut seen_order,
40                ) {
41                    all_matches.push(raw_match);
42                }
43            }
44            if end >= chunk_text.len() {
45                break;
46            }
47            offset = next_window_offset(chunk_text, end, WINDOW_OVERLAP_BYTES);
48        }
49
50        all_matches
51    }
52}
53
/// Returns the exclusive end offset of a window that begins at `start` and
/// covers at most `max_len` bytes of `text`.
///
/// The end is clamped to `text.len()`. If the clamped position falls inside a
/// multi-byte UTF-8 character it is moved forward to the next char boundary,
/// so the window may be a few bytes longer than `max_len` but is always safe
/// to use as a `str` slice end.
///
/// `start + max_len` is computed with saturating arithmetic, so a very large
/// `max_len` (for example `usize::MAX` meaning "no limit") clamps to the end
/// of the text instead of overflowing.
pub fn window_end_offset(text: &str, start: usize, max_len: usize) -> usize {
    let len = text.len();
    let tentative = start.saturating_add(max_len).min(len);
    // `len` itself is always a char boundary, so the search cannot come up empty.
    (tentative..=len)
        .find(|&pos| text.is_char_boundary(pos))
        .unwrap_or(len)
}
61
/// Returns the start offset of the window that follows one ending at
/// `current_end`, stepping back by `overlap` bytes.
///
/// The step back saturates at zero. If the resulting position lies inside a
/// multi-byte UTF-8 character it is advanced to the next char boundary; a
/// position at or beyond `text.len()` is returned unchanged.
pub fn next_window_offset(text: &str, current_end: usize, overlap: usize) -> usize {
    let rewound = current_end.saturating_sub(overlap);
    // An empty range (rewound past the end) falls back to `rewound` itself.
    (rewound..=text.len())
        .find(|&pos| text.is_char_boundary(pos))
        .unwrap_or(rewound)
}
69
70pub fn window_chunk(chunk: &Chunk, start: usize, end: usize) -> Chunk {
71    Chunk {
72        data: chunk.data.as_str()[start..end].to_string().into(),
73        metadata: chunk.metadata.clone(),
74    }
75}
76
77pub fn record_window_match(
78    text: &str,
79    window_offset: usize,
80    m: &mut RawMatch,
81    seen: &mut HashSet<(Arc<str>, Arc<str>, usize)>,
82    seen_order: &mut VecDeque<(Arc<str>, Arc<str>, usize)>,
83) -> bool {
84    m.location.offset += window_offset;
85    if m.location.line.is_some() {
86        m.location.line = Some(line_number_for_offset(text, m.location.offset));
87    }
88
89    let key = (
90        m.detector_id.clone(),
91        m.credential.clone(),
92        m.location.offset,
93    );
94    if seen.contains(&key) {
95        return false;
96    }
97
98    if seen.len() >= MAX_WINDOW_DEDUP_ENTRIES {
99        if let Some(oldest) = seen_order.pop_front() {
100            seen.remove(&oldest);
101        }
102    }
103    seen.insert(key.clone());
104    seen_order.push_back(key);
105    true
106}
107
108pub fn line_number_for_offset(text: &str, offset: usize) -> usize {
109    let safe_offset = floor_char_boundary(text, offset.min(text.len()));
110    text[..safe_offset].chars().filter(|&ch| ch == '\n').count() + 1
111}
112
/// Returns the largest char boundary of `text` that is less than or equal to
/// `index`.
///
/// Any `index` at or beyond `text.len()` yields `text.len()`.
pub fn floor_char_boundary(text: &str, index: usize) -> usize {
    let len = text.len();
    if index >= len {
        return len;
    }
    // Offset 0 is always a char boundary, so the backwards search always hits.
    (0..=index)
        .rev()
        .find(|&pos| text.is_char_boundary(pos))
        .unwrap_or(0)
}