Skip to main content

keyhog_scanner/engine/
windowed.rs

1use super::*;
2
3impl CompiledScanner {
4    pub(super) fn scan_windowed(
5        &self,
6        chunk: &Chunk,
7        deadline: Option<std::time::Instant>,
8    ) -> Vec<RawMatch> {
9        let chunk_text = &chunk.data;
10        if chunk_text.len() > 512 * 1024 * 1024 {
11            tracing::warn!(
12                "Chunk from {} exceeds 512MB limit ({} bytes), skipping to prevent OOM.",
13                chunk.metadata.path.as_deref().unwrap_or("unknown"),
14                chunk_text.len()
15            );
16            return Vec::new();
17        }
18        let mut all_matches = Vec::with_capacity((chunk_text.len() / 4096).max(16));
19        let mut seen = HashSet::new();
20        let mut seen_order = VecDeque::new();
21        let mut offset = 0usize;
22
23        while offset < chunk_text.len() {
24            if let Some(deadline) = deadline
25                && std::time::Instant::now() > deadline
26            {
27                break;
28            }
29            let end = window_end_offset(chunk_text, offset, MAX_SCAN_CHUNK_BYTES);
30            let window_chunk = window_chunk(chunk, offset, end);
31            let backend = self.select_backend_for_file(window_chunk.data.len() as u64);
32            for mut raw_match in self.scan_inner(&window_chunk, backend, deadline) {
33                if record_window_match(
34                    chunk_text,
35                    offset,
36                    &mut raw_match,
37                    &mut seen,
38                    &mut seen_order,
39                ) {
40                    all_matches.push(raw_match);
41                }
42            }
43            if end >= chunk_text.len() {
44                break;
45            }
46            offset = next_window_offset(chunk_text, end, WINDOW_OVERLAP_BYTES);
47        }
48
49        all_matches
50    }
51}
52
/// Returns the exclusive end byte offset of a window that begins at `start`
/// and covers at most `max_len` bytes of `text`.
///
/// The tentative end `start + max_len` is clamped to `text.len()` and then
/// moved forward to the next UTF-8 character boundary, so a window may exceed
/// `max_len` by up to three bytes rather than split a character.
///
/// The addition saturates, so a very large `max_len` (for example
/// `usize::MAX`) means "up to the end of `text`" instead of overflowing.
pub fn window_end_offset(text: &str, start: usize, max_len: usize) -> usize {
    let tentative = start.saturating_add(max_len).min(text.len());
    // `text.len()` is always a boundary, so the search cannot come up empty;
    // `unwrap_or` only exists to avoid a panic path.
    (tentative..=text.len())
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(tentative)
}
60
/// Computes the start offset of the window that follows one ending at
/// `current_end`.
///
/// The start is `current_end - overlap` (saturating at zero). If that byte
/// index falls inside a multi-byte character of `text`, it is moved forward
/// to the next UTF-8 character boundary. An index at or beyond `text.len()`
/// is returned unchanged.
pub fn next_window_offset(text: &str, current_end: usize, overlap: usize) -> usize {
    let rewound = current_end.saturating_sub(overlap);
    // The range is empty when `rewound` is past the end of `text`, in which
    // case the rewound index itself is the answer.
    (rewound..=text.len())
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(rewound)
}
68
69pub fn window_chunk(chunk: &Chunk, start: usize, end: usize) -> Chunk {
70    Chunk {
71        data: chunk.data[start..end].to_string(),
72        metadata: chunk.metadata.clone(),
73    }
74}
75
76pub fn record_window_match(
77    text: &str,
78    window_offset: usize,
79    m: &mut RawMatch,
80    seen: &mut HashSet<(Arc<str>, Arc<str>, usize)>,
81    seen_order: &mut VecDeque<(Arc<str>, Arc<str>, usize)>,
82) -> bool {
83    m.location.offset += window_offset;
84    if m.location.line.is_some() {
85        m.location.line = Some(line_number_for_offset(text, m.location.offset));
86    }
87
88    let key = (
89        m.detector_id.clone(),
90        m.credential.clone(),
91        m.location.offset,
92    );
93    if seen.contains(&key) {
94        return false;
95    }
96
97    if seen.len() >= MAX_WINDOW_DEDUP_ENTRIES
98        && let Some(oldest) = seen_order.pop_front()
99    {
100        seen.remove(&oldest);
101    }
102    seen.insert(key.clone());
103    seen_order.push_back(key);
104    true
105}
106
107pub fn line_number_for_offset(text: &str, offset: usize) -> usize {
108    let safe_offset = floor_char_boundary(text, offset.min(text.len()));
109    text[..safe_offset].chars().filter(|&ch| ch == '\n').count() + 1
110}
111
/// Rounds `index` down to the nearest UTF-8 character boundary of `text`.
///
/// Indices at or beyond `text.len()` yield `text.len()`. Otherwise the
/// result is the largest boundary that is less than or equal to `index`.
pub fn floor_char_boundary(text: &str, index: usize) -> usize {
    if index >= text.len() {
        return text.len();
    }
    // Index 0 is always a boundary, so the backwards search always succeeds.
    (0..=index)
        .rev()
        .find(|&candidate| text.is_char_boundary(candidate))
        .unwrap_or(0)
}
121}