keyhog_scanner/engine/
windowed.rs1use super::*;
2
3impl CompiledScanner {
4 pub(super) fn scan_windowed(
5 &self,
6 chunk: &Chunk,
7 deadline: Option<std::time::Instant>,
8 ) -> Vec<RawMatch> {
9 let chunk_text = &chunk.data;
10 if chunk_text.len() > 512 * 1024 * 1024 {
11 tracing::warn!(
12 "Chunk from {} exceeds 512MB limit ({} bytes), skipping to prevent OOM.",
13 chunk.metadata.path.as_deref().unwrap_or("unknown"),
14 chunk_text.len()
15 );
16 return Vec::new();
17 }
18 let mut all_matches = Vec::with_capacity((chunk_text.len() / 4096).max(16));
19 let mut seen = HashSet::new();
20 let mut seen_order = VecDeque::new();
21 let mut offset = 0usize;
22
23 while offset < chunk_text.len() {
24 if let Some(deadline) = deadline
25 && std::time::Instant::now() > deadline
26 {
27 break;
28 }
29 let end = window_end_offset(chunk_text, offset, MAX_SCAN_CHUNK_BYTES);
30 let window_chunk = window_chunk(chunk, offset, end);
31 let backend = self.select_backend_for_file(window_chunk.data.len() as u64);
32 for mut raw_match in self.scan_inner(&window_chunk, backend, deadline) {
33 if record_window_match(
34 chunk_text,
35 offset,
36 &mut raw_match,
37 &mut seen,
38 &mut seen_order,
39 ) {
40 all_matches.push(raw_match);
41 }
42 }
43 if end >= chunk_text.len() {
44 break;
45 }
46 offset = next_window_offset(chunk_text, end, WINDOW_OVERLAP_BYTES);
47 }
48
49 all_matches
50 }
51}
52
/// Returns the exclusive end byte offset of the window that begins at `start`.
///
/// The window is at most `max_len` bytes long and is clamped to `text.len()`. If that
/// position falls inside a multi-byte UTF-8 character, the end is moved forward to the
/// next character boundary, so the window can exceed `max_len` by up to three bytes.
///
/// `start + max_len` saturates instead of overflowing, so a very large `max_len`
/// (for example `usize::MAX`) simply means "up to the end of the text". When `start` is
/// past the end of `text`, the result is `text.len()`.
pub fn window_end_offset(text: &str, start: usize, max_len: usize) -> usize {
    let limit = text.len();
    let tentative = start.saturating_add(max_len).min(limit);
    // First character boundary at or after the tentative end; `limit` itself is always
    // a boundary, so it is the fallback.
    (tentative..limit)
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(limit)
}
60
/// Computes where the next window should start so that it overlaps the previous one.
///
/// The start is `current_end` rewound by `overlap` bytes (never below zero). If that
/// position is inside a multi-byte UTF-8 character it is moved forward to the next
/// character boundary, or to `text.len()` if none is found first. A rewound position
/// at or past the end of `text` is returned unchanged.
pub fn next_window_offset(text: &str, current_end: usize, overlap: usize) -> usize {
    let rewound = current_end.saturating_sub(overlap);
    let limit = text.len();
    match (rewound..limit).find(|&idx| text.is_char_boundary(idx)) {
        Some(boundary) => boundary,
        // Either the range was empty (rewound >= limit) or no boundary was found before
        // the end of the text; `max` covers both cases.
        None => rewound.max(limit),
    }
}
68
69pub fn window_chunk(chunk: &Chunk, start: usize, end: usize) -> Chunk {
70 Chunk {
71 data: chunk.data[start..end].to_string(),
72 metadata: chunk.metadata.clone(),
73 }
74}
75
76pub fn record_window_match(
77 text: &str,
78 window_offset: usize,
79 m: &mut RawMatch,
80 seen: &mut HashSet<(Arc<str>, Arc<str>, usize)>,
81 seen_order: &mut VecDeque<(Arc<str>, Arc<str>, usize)>,
82) -> bool {
83 m.location.offset += window_offset;
84 if m.location.line.is_some() {
85 m.location.line = Some(line_number_for_offset(text, m.location.offset));
86 }
87
88 let key = (
89 m.detector_id.clone(),
90 m.credential.clone(),
91 m.location.offset,
92 );
93 if seen.contains(&key) {
94 return false;
95 }
96
97 if seen.len() >= MAX_WINDOW_DEDUP_ENTRIES
98 && let Some(oldest) = seen_order.pop_front()
99 {
100 seen.remove(&oldest);
101 }
102 seen.insert(key.clone());
103 seen_order.push_back(key);
104 true
105}
106
/// Returns the 1-based line number of the line containing byte `offset` in `text`.
///
/// The result is one plus the number of `\n` bytes strictly before `offset`. Offsets
/// past the end of `text` are clamped to `text.len()`.
///
/// Newlines are counted on the raw bytes: the byte `0x0A` never occurs inside a
/// multi-byte UTF-8 sequence, so this gives the same answer as counting `'\n'` chars
/// without decoding the prefix, and `offset` does not need to be a character boundary.
pub fn line_number_for_offset(text: &str, offset: usize) -> usize {
    let prefix = &text.as_bytes()[..offset.min(text.len())];
    prefix.iter().filter(|&&byte| byte == b'\n').count() + 1
}
111
/// Rounds `index` down to the nearest UTF-8 character boundary of `text`.
///
/// Returns `text.len()` when `index` is at or beyond the end of the text. Otherwise
/// returns the largest boundary that is less than or equal to `index`.
pub fn floor_char_boundary(text: &str, index: usize) -> usize {
    if index < text.len() {
        // Offset 0 is always a boundary, so the search cannot come up empty; the
        // fallback only satisfies the type.
        (0..=index)
            .rev()
            .find(|&candidate| text.is_char_boundary(candidate))
            .unwrap_or(0)
    } else {
        text.len()
    }
}
121}