keyhog_scanner/engine/
windowed.rs1use super::*;
2use std::collections::{HashSet, VecDeque};
3
4impl CompiledScanner {
5 pub(super) fn scan_windowed(
6 &self,
7 chunk: &Chunk,
8 deadline: Option<std::time::Instant>,
9 ) -> Vec<RawMatch> {
10 let chunk_text = &chunk.data;
11 if chunk_text.len() > 512 * 1024 * 1024 {
12 tracing::warn!(
13 "Chunk from {} exceeds 512MB limit ({} bytes), skipping to prevent OOM.",
14 chunk.metadata.path.as_deref().unwrap_or("unknown"),
15 chunk_text.len()
16 );
17 return Vec::new();
18 }
19 let mut all_matches = Vec::with_capacity((chunk_text.len() / 4096).max(16));
20 let mut seen = HashSet::new();
21 let mut seen_order = VecDeque::new();
22 let mut offset = 0usize;
23
24 while offset < chunk_text.len() {
25 if let Some(deadline) = deadline {
26 if std::time::Instant::now() > deadline {
27 break;
28 }
29 }
30 let end = window_end_offset(chunk_text, offset, MAX_SCAN_CHUNK_BYTES);
31 let window_chunk = window_chunk(chunk, offset, end);
32 let backend = self.select_backend_for_file(window_chunk.data.len() as u64);
33 for mut raw_match in self.scan_inner(&window_chunk, backend, deadline) {
34 if record_window_match(
35 chunk_text,
36 offset,
37 &mut raw_match,
38 &mut seen,
39 &mut seen_order,
40 ) {
41 all_matches.push(raw_match);
42 }
43 }
44 if end >= chunk_text.len() {
45 break;
46 }
47 offset = next_window_offset(chunk_text, end, WINDOW_OVERLAP_BYTES);
48 }
49
50 all_matches
51 }
52}
53
/// Returns the end offset of a window that begins at `start` and spans at
/// most `max_len` bytes of `text`.
///
/// The result is clamped to `text.len()`. If the tentative end falls inside a
/// multi-byte character it is moved forward to the next character boundary,
/// so the window may exceed `max_len` by a few bytes but is always safe to
/// slice at.
pub fn window_end_offset(text: &str, start: usize, max_len: usize) -> usize {
    let len = text.len();
    // Saturate instead of overflowing when `start + max_len` exceeds `usize::MAX`.
    let tentative = start.saturating_add(max_len).min(len);
    // `tentative <= len`, so an empty search range means we are already at the end.
    (tentative..len)
        .find(|&pos| text.is_char_boundary(pos))
        .unwrap_or(len)
}
61
/// Returns the start offset of the window that follows one ending at
/// `current_end`, stepping back by `overlap` bytes.
///
/// The subtraction saturates at zero. If the resulting offset lies inside a
/// multi-byte character of `text`, it is advanced to the next character
/// boundary (or to `text.len()` if there is none). An offset at or beyond
/// `text.len()` is returned unchanged.
pub fn next_window_offset(text: &str, current_end: usize, overlap: usize) -> usize {
    let len = text.len();
    let candidate = current_end.saturating_sub(overlap);
    if candidate >= len {
        return candidate;
    }
    (candidate..len)
        .find(|&pos| text.is_char_boundary(pos))
        .unwrap_or(len)
}
69
70pub fn window_chunk(chunk: &Chunk, start: usize, end: usize) -> Chunk {
71 Chunk {
72 data: chunk.data.as_str()[start..end].to_string().into(),
73 metadata: chunk.metadata.clone(),
74 }
75}
76
77pub fn record_window_match(
78 text: &str,
79 window_offset: usize,
80 m: &mut RawMatch,
81 seen: &mut HashSet<(Arc<str>, Arc<str>, usize)>,
82 seen_order: &mut VecDeque<(Arc<str>, Arc<str>, usize)>,
83) -> bool {
84 m.location.offset += window_offset;
85 if m.location.line.is_some() {
86 m.location.line = Some(line_number_for_offset(text, m.location.offset));
87 }
88
89 let key = (
90 m.detector_id.clone(),
91 m.credential.clone(),
92 m.location.offset,
93 );
94 if seen.contains(&key) {
95 return false;
96 }
97
98 if seen.len() >= MAX_WINDOW_DEDUP_ENTRIES {
99 if let Some(oldest) = seen_order.pop_front() {
100 seen.remove(&oldest);
101 }
102 }
103 seen.insert(key.clone());
104 seen_order.push_back(key);
105 true
106}
107
/// Returns the 1-based line number of the byte at `offset` in `text`.
///
/// The line number is one plus the count of `'\n'` bytes that precede
/// `offset`. Offsets past the end of `text` are clamped to `text.len()`.
///
/// Newlines are counted on the raw bytes: `b'\n'` never occurs inside a
/// multi-byte UTF-8 sequence, so an `offset` that lands mid-character gives
/// the same answer as the preceding character boundary, with no UTF-8
/// decoding and no slicing panic path.
pub fn line_number_for_offset(text: &str, offset: usize) -> usize {
    let prefix = &text.as_bytes()[..offset.min(text.len())];
    prefix.iter().filter(|&&byte| byte == b'\n').count() + 1
}
112
/// Returns the largest character boundary of `text` that is less than or
/// equal to `index`.
///
/// Any `index` at or beyond the end of `text` yields `text.len()`.
pub fn floor_char_boundary(text: &str, index: usize) -> usize {
    let upper = index.min(text.len());
    // Offset 0 is always a boundary, so the search cannot come up empty.
    (0..=upper)
        .rev()
        .find(|&pos| text.is_char_boundary(pos))
        .unwrap_or(0)
}