use keyhog_core::{Chunk, ChunkMetadata, RawMatch};
use super::{floor_char_boundary, CompiledScanner};
/// Maximum number of bytes taken from each side of a chunk seam when building
/// the boundary window: at most this many bytes from the end of the earlier
/// chunk and at most this many from the start of the later chunk.
const MAX_BOUNDARY: usize = 1024;
/// Re-scans the seams between neighbouring chunks of the same source so that
/// matches split across two chunks are not lost.
///
/// Chunks are bucketed by `(source_type, path)`; chunks without a path are
/// ignored. Inside each bucket the chunks are ordered by `base_offset`, and
/// every adjacent pair is passed to `scan_one_pair`, which may append matches
/// to the later chunk's entry in `per_chunk_results`.
///
/// `per_chunk_results` is indexed in parallel with `chunks` (checked with a
/// debug assertion). With fewer than two chunks this is a no-op.
pub fn scan_chunk_boundaries(
    scanner: &CompiledScanner,
    chunks: &[Chunk],
    per_chunk_results: &mut [Vec<RawMatch>],
) {
    use std::collections::HashMap;

    // A seam needs two chunks.
    if chunks.len() < 2 {
        return;
    }
    debug_assert_eq!(chunks.len(), per_chunk_results.len());

    // Bucket chunk indices by the source they came from.
    let mut by_source: HashMap<(&str, &str), Vec<usize>> = HashMap::new();
    for (idx, chunk) in chunks.iter().enumerate() {
        if let Some(path) = chunk.metadata.path.as_deref() {
            let key = (chunk.metadata.source_type.as_str(), path);
            by_source.entry(key).or_default().push(idx);
        }
    }

    // Only buckets holding at least two chunks can contain a seam.
    for mut members in by_source.into_values().filter(|m| m.len() >= 2) {
        // Stable sort: chunks sharing a base offset keep their input order.
        members.sort_by_key(|&idx| chunks[idx].metadata.base_offset);
        for neighbours in members.windows(2) {
            let (ai, bi) = (neighbours[0], neighbours[1]);
            scan_one_pair(scanner, &chunks[ai], &chunks[bi], ai, bi, per_chunk_results);
        }
    }
}
/// Scans the seam between chunk `a` and the chunk `b` that directly follows it.
///
/// A window is assembled from up to `MAX_BOUNDARY` bytes at the end of `a` and
/// up to `MAX_BOUNDARY` bytes at the start of `b`, then scanned as a synthetic
/// chunk that reuses `b`'s metadata with `base_offset` moved to where the
/// window starts. A match is kept only when it straddles the seam and no match
/// with the same offset, detector id and credential hash is already recorded
/// for either chunk; kept matches are appended to `per_chunk_results[bi]`.
///
/// Returns without doing anything when an index is out of range, the chunks
/// are not byte-contiguous, either chunk (or either window half) is empty, or
/// an offset computation would overflow.
fn scan_one_pair(
    scanner: &CompiledScanner,
    a: &Chunk,
    b: &Chunk,
    ai: usize,
    bi: usize,
    per_chunk_results: &mut [Vec<RawMatch>],
) {
    let slots = per_chunk_results.len();
    if ai >= slots || bi >= slots {
        return;
    }

    let left: &str = a.data.as_ref();
    let right: &str = b.data.as_ref();

    // `b` must begin exactly where `a` ends, and both must carry data.
    let left_end = a.metadata.base_offset.saturating_add(left.len());
    if left_end != b.metadata.base_offset || left.is_empty() || right.is_empty() {
        return;
    }

    // NOTE(review): `floor_char_boundary` presumably snaps the index down to a
    // UTF-8 character boundary so the slices below cannot panic — confirm.
    let tail_start = floor_char_boundary(left, left.len().saturating_sub(MAX_BOUNDARY));
    let head_end = floor_char_boundary(right, right.len().min(MAX_BOUNDARY));
    let (tail, head) = (&left[tail_start..], &right[..head_end]);
    if tail.is_empty() || head.is_empty() {
        return;
    }

    let Some(window_base) = a.metadata.base_offset.checked_add(tail_start) else {
        return;
    };

    // The seam sits `tail.len()` bytes into the window.
    let window = [tail, head].concat();
    let synthetic = Chunk {
        data: window.into(),
        metadata: ChunkMetadata {
            base_offset: window_base,
            ..b.metadata.clone()
        },
    };
    let candidates = scanner.scan(&synthetic);

    let Some(seam) = window_base.checked_add(tail.len()) else {
        return;
    };

    for candidate in candidates {
        // NOTE(review): assumes `location.offset` is expressed in the same
        // coordinate space as `base_offset` — confirm against the scanner.
        let start = candidate.location.offset;
        let end = start.saturating_add(candidate.credential.as_ref().len());
        let straddles_seam = start < seam && end > seam;
        if !straddles_seam {
            continue;
        }

        // Checked against the live vectors so that matches pushed earlier in
        // this loop also count as already recorded.
        let duplicate = [ai, bi].iter().any(|&slot| {
            per_chunk_results[slot].iter().any(|known| {
                known.location.offset == candidate.location.offset
                    && known.detector_id == candidate.detector_id
                    && known.credential_hash == candidate.credential_hash
            })
        });
        if !duplicate {
            per_chunk_results[bi].push(candidate);
        }
    }
}