use memmap2::MmapOptions;
use std::fs::File;
use std::path::Path;
/// Reads `path` with an ordinary buffered read and decodes it as text.
///
/// Returns `None` when the file cannot be read through `read_file_safe`, or
/// when `decode_text_file` rejects the bytes.
pub(super) fn read_file_buffered(path: &Path) -> Option<String> {
    read_file_safe(path)
        .ok()
        .and_then(|contents| decode_text_file(&contents))
}
/// Opens `path` read-only while refusing to traverse a symlink at the final
/// path component.
///
/// * On Unix the open itself carries `O_NOFOLLOW`.
/// * On Windows the path is inspected with `symlink_metadata` first and a
///   symlink is rejected with `PermissionDenied`; a failed metadata lookup is
///   ignored and the open is attempted anyway.
///
/// # Errors
/// Propagates whatever `OpenOptions::open` reports, plus the Windows symlink
/// rejection described above.
fn open_file_safe(path: &Path) -> std::io::Result<File> {
    #[cfg(windows)]
    {
        let is_symlink = matches!(
            std::fs::symlink_metadata(path),
            Ok(meta) if meta.file_type().is_symlink()
        );
        if is_symlink {
            return Err(std::io::Error::new(
                std::io::ErrorKind::PermissionDenied,
                "refusing to follow symlink (Windows safety guard)",
            ));
        }
    }

    let mut opener = std::fs::OpenOptions::new();
    opener.read(true);

    #[cfg(unix)]
    {
        use std::os::unix::fs::OpenOptionsExt;
        opener.custom_flags(libc::O_NOFOLLOW);
    }

    opener.open(path)
}
/// Reads the whole of `path` into memory via the symlink-safe open.
///
/// On Linux the kernel is told the descriptor will be read sequentially
/// before the read starts; the result of that hint is ignored.
///
/// # Errors
/// Returns the error from `open_file_safe` or from reading the file.
pub(super) fn read_file_safe(path: &Path) -> std::io::Result<Vec<u8>> {
    use std::io::Read;

    let mut handle = open_file_safe(path)?;

    #[cfg(target_os = "linux")]
    {
        use std::os::unix::io::AsRawFd;
        // SAFETY: the descriptor belongs to `handle`, which is alive for the
        // duration of the call; the call is advisory only.
        unsafe { libc::posix_fadvise(handle.as_raw_fd(), 0, 0, libc::POSIX_FADV_SEQUENTIAL) };
    }

    let mut contents = Vec::new();
    handle.read_to_end(&mut contents)?;
    Ok(contents)
}
pub(super) fn read_file_mmap(path: &Path) -> Option<String> {
let mut file = open_file_safe(path).ok()?;
#[cfg(unix)]
{
use std::os::unix::io::AsRawFd;
let fd = file.as_raw_fd();
if unsafe { libc::flock(fd, libc::LOCK_SH | libc::LOCK_NB) } != 0 {
let mut bytes = Vec::new();
if std::io::Read::read_to_end(&mut file, &mut bytes).is_ok() {
return decode_text_file(&bytes);
}
return None;
}
}
let mmap = match unsafe { MmapOptions::new().map(&file) } {
Ok(m) => m,
Err(_) => {
let mut bytes = Vec::new();
if std::io::Read::read_to_end(&mut file, &mut bytes).is_ok() {
return decode_text_file(&bytes);
}
return None;
}
};
#[cfg(unix)]
{
unsafe {
libc::madvise(
mmap.as_ptr() as *mut libc::c_void,
mmap.len(),
libc::MADV_SEQUENTIAL,
);
}
}
let result = decode_text_file(&mmap);
#[cfg(unix)]
{
use std::os::unix::io::AsRawFd;
let fd = file.as_raw_fd();
unsafe { libc::flock(fd, libc::LOCK_UN) };
}
result
}
/// Contents of a file, held either as a memory map or as an owned buffer.
///
/// `read_file_for_compressed_input` returns the `Mmap` variant when mapping
/// succeeds and the `Owned` variant for empty files and fallback reads.
pub(super) enum FileBytes {
/// Contents backed by a `memmap2::Mmap` of the file.
Mmap(memmap2::Mmap),
/// Contents copied into a heap-allocated byte vector.
Owned(Vec<u8>),
}
impl FileBytes {
pub fn as_slice(&self) -> &[u8] {
match self {
FileBytes::Mmap(m) => m,
FileBytes::Owned(v) => v,
}
}
#[cfg(test)]
pub fn len(&self) -> usize {
self.as_slice().len()
}
}
pub(super) fn read_file_for_compressed_input(
path: &Path,
size_cap: u64,
) -> Option<FileBytes> {
let file = open_file_safe(path).ok()?;
let metadata = file.metadata().ok()?;
if metadata.len() > size_cap {
tracing::warn!(
path = %path.display(),
size = metadata.len(),
cap = size_cap,
"compressed file exceeds size cap; refusing to map"
);
return None;
}
if metadata.len() == 0 {
return Some(FileBytes::Owned(Vec::new()));
}
#[cfg(unix)]
{
use std::os::unix::io::AsRawFd;
if unsafe { libc::flock(file.as_raw_fd(), libc::LOCK_SH | libc::LOCK_NB) } != 0 {
return std::fs::read(path).ok().map(FileBytes::Owned);
}
}
match unsafe { MmapOptions::new().map(&file) } {
Ok(mmap) => {
#[cfg(unix)]
{
unsafe {
libc::madvise(
mmap.as_ptr() as *mut libc::c_void,
mmap.len(),
libc::MADV_SEQUENTIAL,
);
}
use std::os::unix::io::AsRawFd;
unsafe { libc::flock(file.as_raw_fd(), libc::LOCK_UN) };
}
Some(FileBytes::Mmap(mmap))
}
Err(_) => {
#[cfg(unix)]
{
use std::os::unix::io::AsRawFd;
unsafe { libc::flock(file.as_raw_fd(), libc::LOCK_UN) };
}
std::fs::read(path).ok().map(FileBytes::Owned)
}
}
}
/// One window of a file's contents, as produced by `slice_into_windows`.
pub(super) struct FileWindow {
/// Byte offset of the window's first byte within the source bytes.
pub offset: usize,
/// The window's bytes decoded with lossy UTF-8 conversion.
pub text: String,
}
/// Memory-maps `path` and cuts it into overlapping text windows.
///
/// `window_size` must be greater than `overlap`; this is checked with a
/// `debug_assert!` here and asserted again by `slice_into_windows`.
///
/// Returns `None` when the file cannot be opened, when (on Unix) the
/// non-blocking shared `flock` cannot be taken, or when mapping fails. Unlike
/// `read_file_mmap`, there is no read-based fallback in this function.
pub(super) fn read_file_windowed_mmap(
    path: &Path,
    window_size: usize,
    overlap: usize,
) -> Option<Vec<FileWindow>> {
    debug_assert!(window_size > overlap, "window must exceed overlap");

    let handle = open_file_safe(path).ok()?;

    #[cfg(unix)]
    {
        use std::os::unix::io::AsRawFd;
        // SAFETY: the descriptor belongs to `handle`, which is alive here.
        let lock_status =
            unsafe { libc::flock(handle.as_raw_fd(), libc::LOCK_SH | libc::LOCK_NB) };
        if lock_status != 0 {
            return None;
        }
    }

    // SAFETY: NOTE(review) — soundness relies on the file not being changed
    // underneath the mapping; the advisory lock above is best-effort only.
    let mapped = unsafe { MmapOptions::new().map(&handle) };
    let Ok(mapping) = mapped else {
        #[cfg(unix)]
        {
            use std::os::unix::io::AsRawFd;
            // SAFETY: same descriptor that was locked above.
            unsafe { libc::flock(handle.as_raw_fd(), libc::LOCK_UN) };
        }
        return None;
    };

    #[cfg(unix)]
    {
        // SAFETY: pointer and length both come from the live mapping; the
        // call is a hint and its result is ignored.
        unsafe {
            libc::madvise(
                mapping.as_ptr() as *mut libc::c_void,
                mapping.len(),
                libc::MADV_SEQUENTIAL,
            );
        }
    }

    let windows = slice_into_windows(&mapping, window_size, overlap);

    #[cfg(unix)]
    {
        use std::os::unix::io::AsRawFd;
        // SAFETY: same descriptor that was locked above.
        unsafe { libc::flock(handle.as_raw_fd(), libc::LOCK_UN) };
    }

    Some(windows)
}
/// Cuts `bytes` into windows of at most `window_size` bytes, where each
/// window starts `window_size - overlap` bytes after the previous one.
///
/// Consecutive windows therefore share `overlap` bytes. Each window is
/// decoded with lossy UTF-8 conversion, so a multi-byte sequence split by a
/// window edge shows up as U+FFFD in that window. The last window ends
/// exactly at the end of the input; empty input yields no windows.
///
/// # Panics
/// Panics if `window_size` is not strictly greater than `overlap`.
pub(super) fn slice_into_windows(
    bytes: &[u8],
    window_size: usize,
    overlap: usize,
) -> Vec<FileWindow> {
    assert!(window_size > overlap, "window must exceed overlap");

    let total = bytes.len();
    if total == 0 {
        return Vec::new();
    }

    let stride = window_size - overlap;
    // Upper bound on the number of windows that will be produced.
    let mut windows = Vec::with_capacity(total.div_ceil(stride));

    let mut start = 0usize;
    loop {
        let stop = total.min(start + window_size);
        windows.push(FileWindow {
            offset: start,
            text: String::from_utf8_lossy(&bytes[start..stop]).into_owned(),
        });
        // Once a window reaches the end of the input there is nothing left.
        if stop == total {
            break;
        }
        start += stride;
    }

    windows
}
/// Decodes raw file bytes into text, or returns `None` for binary content.
///
/// Order of checks:
/// 1. Known binary magic headers are rejected.
/// 2. BOM-prefixed UTF-16 is decoded. This must happen *before* the BOM-based
///    rejection in step 3: `has_utf16_nul_pattern` matches any BOM-prefixed
///    input of four or more bytes, so testing it first would reject every
///    real UTF-16 file and leave the decoder unreachable. Decoded text still
///    has to pass the same control-character density check as UTF-8 text.
/// 3. Input that carries a UTF-16 BOM but failed to decode is rejected.
/// 4. A UTF-8 BOM is stripped; valid UTF-8 is returned unless its header
///    looks binary.
/// 5. Otherwise the bytes are lossily decoded unless `looks_binary` rejects
///    them.
fn decode_text_file(bytes: &[u8]) -> Option<String> {
    if has_binary_magic(bytes) {
        return None;
    }

    if let Some(text) = decode_utf16(bytes) {
        if looks_binary_header_check(text.as_bytes()) {
            return None;
        }
        return Some(text);
    }
    // A BOM was present but the payload was not valid UTF-16.
    if has_utf16_nul_pattern(bytes) {
        return None;
    }

    let bytes = bytes.strip_prefix(&[0xEF, 0xBB, 0xBF]).unwrap_or(bytes);
    match std::str::from_utf8(bytes) {
        Ok(text) => {
            if looks_binary_header_check(bytes) {
                None
            } else {
                Some(text.to_owned())
            }
        }
        Err(_) => {
            if looks_binary(bytes) {
                None
            } else {
                Some(String::from_utf8_lossy(bytes).into_owned())
            }
        }
    }
}
/// Control-character density check over the first 4096 bytes of `bytes`.
///
/// Counts bytes below `0x20` other than newline, carriage return, tab and
/// form feed, and reports `true` when they make up more than 5% of the
/// inspected prefix. Empty input is never considered binary.
fn looks_binary_header_check(bytes: &[u8]) -> bool {
    let header = bytes.get(..4096).unwrap_or(bytes);
    let control_count = header
        .iter()
        .filter(|&&b| b < 0x20 && !matches!(b, b'\n' | b'\r' | b'\t' | 0x0C))
        .count();
    control_count * 20 > header.len()
}
/// Heuristic binary detector used when the input is not valid UTF-8.
///
/// Reports `true` when any of the following holds:
/// * the input starts with a known binary magic header or a UTF-16 BOM;
/// * the first NUL byte sits within the first 1024 bytes and the first two
///   bytes do not look like a BOM-less UTF-16 code unit (exactly one of them
///   zero, with at least four bytes of input);
/// * more than 5% of all bytes are control bytes other than newline,
///   carriage return, tab and form feed.
///
/// The density scan stops early: it returns `true` as soon as the threshold
/// is crossed, and after every full 4096-byte block it returns `false` if the
/// threshold can no longer be reached even if every remaining byte counted.
fn looks_binary(bytes: &[u8]) -> bool {
    const CHECKPOINT: usize = 0x1000;

    if has_binary_magic(bytes) || has_utf16_nul_pattern(bytes) {
        return true;
    }

    let nul_near_start = matches!(memchr::memchr(0, bytes), Some(pos) if pos < 1024);
    if nul_near_start {
        // Exactly one of the first two bytes being zero is the BOM-less
        // UTF-16 shape that is allowed through.
        let utf16_like = bytes.len() >= 4 && (bytes[0] == 0) != (bytes[1] == 0);
        if !utf16_like {
            return true;
        }
    }

    let total = bytes.len() as u64;
    if total == 0 {
        return false;
    }

    let mut suspicious: u64 = 0;
    let mut scanned: u64 = 0;
    for block in bytes.chunks(CHECKPOINT) {
        for &byte in block {
            if byte < 0x20 && !matches!(byte, b'\n' | b'\r' | b'\t' | 0x0C) {
                suspicious += 1;
                if suspicious * 20 > total {
                    return true;
                }
            }
        }
        scanned += block.len() as u64;
        // Checkpoint only after full blocks.
        if block.len() == CHECKPOINT && (suspicious + (total - scanned)) * 20 <= total {
            return false;
        }
    }

    suspicious * 20 > total
}
/// Reports whether `bytes` begins with one of the recognised binary magic
/// headers: `%PDF-`, `PK\x03\x04`, the 8-byte PNG signature, or
/// `D0 CF 11 E0`.
fn has_binary_magic(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'%', b'P', b'D', b'F', b'-', ..]
            | [b'P', b'K', 0x03, 0x04, ..]
            | [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1A, b'\n', ..]
            | [0xD0, 0xCF, 0x11, 0xE0, ..]
    )
}
/// Reports whether `bytes` is at least four bytes long and starts with a
/// UTF-16 byte-order mark (`FF FE` or `FE FF`).
///
/// Despite the name, only the BOM and the minimum length are examined.
fn has_utf16_nul_pattern(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [0xFF, 0xFE, _, _, ..] | [0xFE, 0xFF, _, _, ..]
    )
}
/// Decodes BOM-prefixed UTF-16 (`FF FE` little-endian, `FE FF` big-endian).
///
/// Returns `None` when there is no BOM, when the payload after the BOM has an
/// odd number of bytes, or when it contains an unpaired surrogate. A bare BOM
/// decodes to the empty string.
fn decode_utf16(bytes: &[u8]) -> Option<String> {
    let (little_endian, payload) = match bytes {
        [0xFF, 0xFE, rest @ ..] => (true, rest),
        [0xFE, 0xFF, rest @ ..] => (false, rest),
        _ => return None,
    };
    if payload.len() % 2 != 0 {
        return None;
    }

    let to_unit: fn([u8; 2]) -> u16 = if little_endian {
        u16::from_le_bytes
    } else {
        u16::from_be_bytes
    };
    let units = payload
        .chunks_exact(2)
        .map(|pair| to_unit([pair[0], pair[1]]));

    // Any decoding error (unpaired surrogate) collapses the result to `None`.
    char::decode_utf16(units)
        .collect::<Result<String, _>>()
        .ok()
}
// Unit tests for the binary heuristics, the text decoders, the windowing
// helper and the mmap-backed readers defined in this file.
#[cfg(test)]
mod tests {
use super::*;
// --- looks_binary -----------------------------------------------------
#[test]
fn looks_binary_empty_input_is_text() {
assert!(!looks_binary(&[]));
}
#[test]
fn looks_binary_clean_ascii_is_text() {
let s = "hello world\nfoo = bar\n".repeat(1024);
assert!(!looks_binary(s.as_bytes()));
}
// 200 control bytes out of 1024 is well above the 5% threshold.
#[test]
fn looks_binary_dense_controls_is_binary() {
let mut bytes = vec![b'a'; 1024];
for b in bytes.iter_mut().take(200) {
*b = 0x03; }
assert!(looks_binary(&bytes));
}
// 50 control bytes out of 1000 is exactly 5%, which is not "more than" 5%.
#[test]
fn looks_binary_sparse_controls_is_text() {
let mut bytes = vec![b'a'; 1000];
for b in bytes.iter_mut().take(50) {
*b = 0x03;
}
assert!(!looks_binary(&bytes));
}
// The early-exit checkpoints must give the same answer as counting every
// control byte and applying the threshold once at the end.
#[test]
fn looks_binary_short_circuit_matches_full_scan() {
for size in [1, 100, 4095, 4096, 4097, 8192, 16384, 100_000] {
for density in [0u8, 1, 4, 5, 6, 50] {
let mut bytes = vec![b'.'; size];
for i in (0..size)
.step_by(100usize.saturating_div(density.max(1) as usize).max(1))
.take((size * density as usize) / 100)
{
bytes[i] = 0x03;
}
let suspicious = bytes
.iter()
.filter(|&&b| b < 0x20 && !matches!(b, b'\n' | b'\r' | b'\t' | 0x0C))
.count() as u64;
let expected = suspicious * 20 > bytes.len().max(1) as u64;
assert_eq!(
looks_binary(&bytes),
expected,
"size={size} density={density}"
);
}
}
}
// --- decode_utf16 -----------------------------------------------------
#[test]
fn decode_utf16_le_round_trip() {
let s = "hello, 世界! 🌍";
let mut bytes = vec![0xFF, 0xFE];
for u in s.encode_utf16() {
bytes.extend_from_slice(&u.to_le_bytes());
}
assert_eq!(decode_utf16(&bytes).as_deref(), Some(s));
}
#[test]
fn decode_utf16_be_round_trip() {
let s = "hello, 世界! 🌍";
let mut bytes = vec![0xFE, 0xFF];
for u in s.encode_utf16() {
bytes.extend_from_slice(&u.to_be_bytes());
}
assert_eq!(decode_utf16(&bytes).as_deref(), Some(s));
}
#[test]
fn decode_utf16_no_bom_is_none() {
let s = "hello";
let mut bytes = Vec::new();
for u in s.encode_utf16() {
bytes.extend_from_slice(&u.to_le_bytes());
}
assert!(decode_utf16(&bytes).is_none());
}
#[test]
fn decode_utf16_odd_length_payload_is_none() {
let bytes = [0xFF, 0xFE, 0x68];
assert!(decode_utf16(&bytes).is_none());
}
// 0xD800 (a high surrogate) followed by 'a' is an unpaired surrogate.
#[test]
fn decode_utf16_unpaired_surrogate_is_none() {
let bytes = [0xFF, 0xFE, 0x00, 0xD8, b'a', 0x00];
assert!(decode_utf16(&bytes).is_none());
}
// --- decode_text_file -------------------------------------------------
#[test]
fn decode_text_file_valid_utf8_takes_fast_path() {
let s = "let x = 1;\nfn main() {}\n".repeat(500);
assert_eq!(decode_text_file(s.as_bytes()).as_deref(), Some(s.as_str()));
}
#[test]
fn decode_text_file_with_bom_strips_bom() {
let mut bytes = vec![0xEF, 0xBB, 0xBF];
bytes.extend_from_slice(b"hello world");
assert_eq!(decode_text_file(&bytes).as_deref(), Some("hello world"));
}
#[test]
fn decode_text_file_pdf_magic_is_rejected() {
let mut bytes = b"%PDF-1.7\n".to_vec();
bytes.extend_from_slice(&vec![b'a'; 4096]);
assert!(decode_text_file(&bytes).is_none());
}
// A lone 0xFF byte is invalid UTF-8 and must surface as U+FFFD.
#[test]
fn decode_text_file_invalid_utf8_falls_back_to_lossy() {
let mut bytes = b"valid prefix ".to_vec();
bytes.push(0xFF); bytes.extend_from_slice(b" suffix");
let decoded = decode_text_file(&bytes).expect("lossy fallback runs");
assert!(decoded.contains("valid prefix"));
assert!(decoded.contains("suffix"));
assert!(decoded.contains('\u{FFFD}'));
}
// 400 control bytes in a 4096-byte header is above the 5% threshold.
#[test]
fn decode_text_file_dense_controls_in_header_rejected() {
let mut bytes = vec![b'a'; 4096];
for b in bytes.iter_mut().take(400) {
*b = 0x01;
}
assert!(decode_text_file(&bytes).is_none());
}
// --- slice_into_windows -----------------------------------------------
#[test]
fn slice_into_windows_empty_input_returns_empty() {
assert!(slice_into_windows(&[], 64, 8).is_empty());
}
#[test]
fn slice_into_windows_smaller_than_window_yields_one_window() {
let bytes = b"hello, world";
let ws = slice_into_windows(bytes, 64, 8);
assert_eq!(ws.len(), 1);
assert_eq!(ws[0].offset, 0);
assert_eq!(ws[0].text, "hello, world");
}
#[test]
fn slice_into_windows_exactly_one_window_size() {
let bytes = vec![b'a'; 64];
let ws = slice_into_windows(&bytes, 64, 8);
assert_eq!(ws.len(), 1);
assert_eq!(ws[0].offset, 0);
assert_eq!(ws[0].text.len(), 64);
}
// Stride is 64 - 8 = 56, so the second window covers bytes 56..65.
#[test]
fn slice_into_windows_one_byte_over_window_emits_two_windows() {
let bytes: Vec<u8> = (0..65u8).collect();
let ws = slice_into_windows(&bytes, 64, 8);
assert_eq!(ws.len(), 2);
assert_eq!(ws[0].offset, 0);
assert_eq!(ws[0].text.len(), 64);
assert_eq!(ws[1].offset, 56);
assert_eq!(ws[1].text.len(), 9);
}
#[test]
fn slice_into_windows_overlap_bytes_match_between_neighbours() {
let bytes: Vec<u8> = b"0123456789abcdefghijklmnopqrstuvwxyz".iter().copied().cycle().take(200).collect();
let ws = slice_into_windows(&bytes, 100, 16);
assert!(ws.len() >= 2);
for pair in ws.windows(2) {
let prev = &pair[0];
let next = &pair[1];
let prev_tail = &prev.text.as_bytes()[prev.text.len() - 16..];
let next_head = &next.text.as_bytes()[..16];
assert_eq!(prev_tail, next_head, "overlap mismatch at {}", next.offset);
assert_eq!(next.offset - prev.offset, 100 - 16);
}
}
#[test]
fn slice_into_windows_offsets_cover_the_whole_input() {
let bytes: Vec<u8> = (b'a'..=b'z').cycle().take(10_000).collect();
let ws = slice_into_windows(&bytes, 256, 32);
let mut covered = vec![false; bytes.len()];
for w in &ws {
assert_eq!(
w.text.len(),
(w.offset + w.text.len()).min(bytes.len()) - w.offset,
"ASCII input → text len equals slice len"
);
let end = (w.offset + w.text.len()).min(bytes.len());
covered[w.offset..end].fill(true);
}
assert!(
covered.iter().all(|&c| c),
"every byte must be covered by some window"
);
}
// A value that crosses the end of window 0 must still appear whole in both
// windows thanks to the overlap.
#[test]
fn slice_into_windows_secret_straddling_cut_present_in_both_windows() {
let mut bytes = vec![b'.'; 200];
let secret = b"AKIAIOSFODNN7EXAMPLE";
bytes[100..100 + secret.len()].copy_from_slice(secret);
let ws = slice_into_windows(&bytes, 128, 32);
assert_eq!(ws.len(), 2, "expected exactly 2 windows for len=200, ws=128, ov=32");
let s = std::str::from_utf8(secret).unwrap();
assert!(ws[0].text.contains(s), "window 0 must carry the straddling secret");
assert!(ws[1].text.contains(s), "window 1 must carry the straddling secret");
}
// The three-byte snowman starts at byte 63: window 0 (bytes 0..64) sees only
// its first byte, window 1 (starting at byte 56) sees all three.
#[test]
fn slice_into_windows_invalid_utf8_at_boundary_decodes_lossy() {
let mut bytes = vec![b'a'; 120];
bytes[63] = 0xE2;
bytes[64] = 0x98;
bytes[65] = 0x83;
let ws = slice_into_windows(&bytes, 64, 8);
assert_eq!(ws.len(), 2, "expected 2 windows for len=120, ws=64, ov=8");
assert!(ws[0].text.ends_with('\u{FFFD}'));
assert!(ws[1].text.contains('☃'));
}
// Stride is 1024 - 64 = 960; the fifth window starts at 3840 and holds the
// remaining 256 bytes.
#[test]
fn slice_into_windows_large_input_window_count_matches_formula() {
let bytes = vec![b'x'; 4096];
let ws = slice_into_windows(&bytes, 1024, 64);
assert_eq!(ws.len(), 5);
assert_eq!(ws[0].offset, 0);
assert_eq!(ws[1].offset, 960);
assert_eq!(ws[2].offset, 1920);
assert_eq!(ws[3].offset, 2880);
assert_eq!(ws[4].offset, 3840);
assert_eq!(ws[4].text.len(), 256);
}
#[test]
#[should_panic(expected = "window must exceed overlap")]
fn slice_into_windows_panics_when_overlap_geq_window() {
slice_into_windows(b"abc", 16, 16);
}
// --- file-backed readers ----------------------------------------------
// The mmap-backed reader must produce the same windows as the pure helper
// run over the same bytes.
#[test]
fn read_file_windowed_mmap_roundtrip_matches_pure_helper() {
let dir = tempfile::tempdir().unwrap();
let path = dir.path().join("big.txt");
let bytes: Vec<u8> = (0..u8::MAX).cycle().take(8192).collect();
std::fs::write(&path, &bytes).unwrap();
let pure = slice_into_windows(&bytes, 1024, 32);
let mapped = read_file_windowed_mmap(&path, 1024, 32).expect("mmap windows");
assert_eq!(pure.len(), mapped.len());
for (a, b) in pure.iter().zip(mapped.iter()) {
assert_eq!(a.offset, b.offset);
assert_eq!(a.text, b.text);
}
}
#[test]
fn read_file_for_compressed_input_returns_full_contents_via_mmap() {
let dir = tempfile::tempdir().unwrap();
let path = dir.path().join("blob.bin");
let payload: Vec<u8> = (0..=255u8).cycle().take(8192).collect();
std::fs::write(&path, &payload).unwrap();
let fb = read_file_for_compressed_input(&path, 1024 * 1024).expect("read ok");
assert_eq!(fb.as_slice(), &payload[..]);
assert_eq!(fb.len(), payload.len());
}
#[test]
fn read_file_for_compressed_input_handles_empty_file() {
let dir = tempfile::tempdir().unwrap();
let path = dir.path().join("empty.bin");
std::fs::write(&path, b"").unwrap();
let fb = read_file_for_compressed_input(&path, 1024).expect("empty ok");
assert!(fb.as_slice().is_empty());
assert_eq!(fb.len(), 0);
}
// The cap is inclusive: a file of exactly `size_cap` bytes is accepted.
#[test]
fn read_file_for_compressed_input_refuses_oversize_input() {
let dir = tempfile::tempdir().unwrap();
let path = dir.path().join("big.bin");
std::fs::write(&path, vec![0u8; 4096]).unwrap();
let fb = read_file_for_compressed_input(&path, 1024);
assert!(fb.is_none(), "input exceeding size_cap must return None");
let fb = read_file_for_compressed_input(&path, 4096);
assert!(fb.is_some(), "input at-or-below size_cap must succeed");
}
#[test]
fn read_file_for_compressed_input_returns_none_for_missing_path() {
let fb = read_file_for_compressed_input(
std::path::Path::new("/nonexistent/keyhog/test/path"),
1024,
);
assert!(fb.is_none());
}
// Either outcome is accepted for an empty file: `None`, or `Some` with no
// windows.
#[test]
fn read_file_windowed_mmap_handles_empty_file() {
let dir = tempfile::tempdir().unwrap();
let path = dir.path().join("empty.txt");
std::fs::write(&path, b"").unwrap();
if let Some(v) = read_file_windowed_mmap(&path, 1024, 32) {
assert!(v.is_empty());
}
}
}