use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
use std::sync::OnceLock;
/// Running total of HTML bytes accounted as held in memory. Adjusted by
/// `track_bytes_add` / `track_bytes_sub` and read by `total_bytes_in_memory`.
static TOTAL_HTML_BYTES_IN_MEMORY: AtomicUsize = AtomicUsize::new(0);
/// Number of pages currently counted as spooled to disk. Adjusted by
/// `track_page_spooled` / `track_page_unspooled` and read by `pages_on_disk`.
static PAGES_ON_DISK: AtomicUsize = AtomicUsize::new(0);
/// Incrementing counter that `next_spool_path` uses to name spool files.
static SPOOL_FILE_COUNTER: AtomicU64 = AtomicU64::new(0);
/// Spool directory state, created lazily by the first call to `spool_dir`.
static SPOOL_DIR: OnceLock<SpoolDirHandle> = OnceLock::new();
/// Spool directory state stored in the `SPOOL_DIR` static.
struct SpoolDirHandle {
/// Owning handle for a temporary directory. Nothing in this file reads it;
/// it is only held so the `TempDir` value is not dropped while the handle exists.
_dir: tempfile::TempDir,
/// Directory that `spool_dir` returns and `cleanup_spool_dir` removes.
path: PathBuf,
}
/// Base in-memory HTML budget, in bytes.
///
/// The value is read once from the `SPIDER_HTML_MEMORY_BUDGET` environment variable and
/// cached for the rest of the process. A missing or unparsable value falls back to
/// 512 MiB.
fn base_memory_budget() -> usize {
    const DEFAULT_BUDGET: usize = 512 * 1024 * 1024;
    static CACHED: OnceLock<usize> = OnceLock::new();

    *CACHED.get_or_init(|| match std::env::var("SPIDER_HTML_MEMORY_BUDGET") {
        Ok(raw) => raw.parse().unwrap_or(DEFAULT_BUDGET),
        Err(_) => DEFAULT_BUDGET,
    })
}
/// Base per-page size threshold, in bytes.
///
/// The value is read once from the `SPIDER_HTML_PAGE_SPOOL_SIZE` environment variable and
/// cached for the rest of the process. A missing or unparsable value falls back to 2 MiB.
fn base_per_page_threshold() -> usize {
    const DEFAULT_THRESHOLD: usize = 2 * 1024 * 1024;
    static CACHED: OnceLock<usize> = OnceLock::new();

    let resolve = || -> usize {
        let Ok(raw) = std::env::var("SPIDER_HTML_PAGE_SPOOL_SIZE") else {
            return DEFAULT_THRESHOLD;
        };
        raw.parse().unwrap_or(DEFAULT_THRESHOLD)
    };
    *CACHED.get_or_init(resolve)
}
#[inline]
fn effective_per_page_threshold(mem_state: i8) -> usize {
match mem_state {
s if s >= 2 => 0,
1 => base_per_page_threshold() / 2,
_ => base_per_page_threshold(),
}
}
#[inline]
fn effective_budget(mem_state: i8) -> usize {
match mem_state {
s if s >= 2 => 0,
1 => base_memory_budget() * 3 / 4,
_ => base_memory_budget(),
}
}
#[inline]
pub fn track_bytes_add(n: usize) {
TOTAL_HTML_BYTES_IN_MEMORY.fetch_add(n, Ordering::Relaxed);
}
#[inline]
pub fn track_bytes_sub(n: usize) {
let _ = TOTAL_HTML_BYTES_IN_MEMORY.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| {
Some(cur.saturating_sub(n))
});
}
#[inline]
pub fn total_bytes_in_memory() -> usize {
TOTAL_HTML_BYTES_IN_MEMORY.load(Ordering::Relaxed)
}
/// Records that one more page has been spooled to disk.
#[inline]
pub fn track_page_spooled() {
    const ONE_PAGE: usize = 1;
    let _previous = PAGES_ON_DISK.fetch_add(ONE_PAGE, Ordering::Relaxed);
}
/// Records that one spooled page is no longer on disk. The counter never goes below zero.
#[inline]
pub fn track_page_unspooled() {
    // `checked_sub` yields `None` at zero, which leaves the counter untouched; the
    // resulting `Err` only reports that nothing was stored, so it is ignored.
    let _ = PAGES_ON_DISK.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |pages| {
        pages.checked_sub(1)
    });
}
#[inline]
pub fn pages_on_disk() -> usize {
PAGES_ON_DISK.load(Ordering::Relaxed)
}
/// Decides whether a page body of `html_len` bytes should be spooled to disk rather than
/// kept in memory.
///
/// Returns `true` when any of the following holds:
/// * the process memory state is `2` or higher;
/// * the tracked in-memory total plus `html_len` would exceed the effective budget for
///   the current memory state;
/// * the memory state is at least `1` and `html_len` exceeds the effective per-page
///   threshold for that state.
pub fn should_spool(html_len: usize) -> bool {
    let pressure = crate::utils::detect_system::get_process_memory_state_sync();
    if pressure >= 2 {
        return true;
    }

    let budget = effective_budget(pressure);
    let projected_total = total_bytes_in_memory().saturating_add(html_len);

    // The per-page check only applies (and is only evaluated) when the budget check
    // passes and the memory state is at least 1.
    projected_total > budget
        || (pressure >= 1 && html_len > effective_per_page_threshold(pressure))
}
/// Returns the process-wide directory that spool files are written into, creating it on
/// the first call and returning the same path afterwards.
///
/// If the `SPIDER_HTML_SPOOL_DIR` environment variable is set, that directory is created
/// (best effort) and a `spider_html_*` temporary directory is made inside it. If that
/// fails, a `spider_html_fallback_*` temporary directory in the system default location
/// is used instead. If the variable is unset, a `spider_html_*` temporary directory in
/// the system default location is used.
///
/// The returned path is always the temporary directory that was actually created. It is
/// never the custom parent directory itself, so spool files do not land in a location
/// that just failed, and `cleanup_spool_dir` cannot remove a user-supplied directory.
///
/// # Panics
///
/// Panics if a temporary directory cannot be created in the system default location.
pub fn spool_dir() -> &'static Path {
    let handle = SPOOL_DIR.get_or_init(|| {
        let custom_parent = std::env::var("SPIDER_HTML_SPOOL_DIR")
            .ok()
            .map(PathBuf::from);

        let td = match custom_parent {
            Some(parent) => {
                // Best effort: if this fails, `tempdir_in` below fails too and the
                // fallback takes over.
                let _ = std::fs::create_dir_all(&parent);
                tempfile::Builder::new()
                    .prefix("spider_html_")
                    .tempdir_in(&parent)
                    .unwrap_or_else(|_| {
                        tempfile::Builder::new()
                            .prefix("spider_html_fallback_")
                            .tempdir()
                            .expect("failed to create temp dir")
                    })
            }
            None => tempfile::Builder::new()
                .prefix("spider_html_")
                .tempdir()
                .expect("failed to create temp dir for HTML spool"),
        };

        // Always publish the directory we own, including on the fallback path.
        let path = td.path().to_path_buf();
        SpoolDirHandle { _dir: td, path }
    });
    &handle.path
}
/// Returns a fresh spool file path of the form `<spool_dir>/<counter>.sphtml`.
///
/// Each call takes the next value of the global spool file counter. No file is created.
pub fn next_spool_path() -> PathBuf {
    let sequence = SPOOL_FILE_COUNTER.fetch_add(1, Ordering::Relaxed);
    let mut target = spool_dir().to_path_buf();
    target.push(format!("{sequence}.sphtml"));
    target
}
/// Writes `data` to `path`, creating the file if needed and replacing any existing
/// contents.
///
/// # Errors
///
/// Returns the I/O error from creating the file or writing to it.
pub fn spool_write(path: &Path, data: &[u8]) -> std::io::Result<()> {
    use std::io::Write;

    let mut file = std::fs::File::create(path)?;
    file.write_all(data)
}
/// Reads the entire file at `path` into a byte vector.
///
/// # Errors
///
/// Returns the I/O error from opening or reading the file (for example when it does not
/// exist).
pub fn spool_read(path: &Path) -> std::io::Result<Vec<u8>> {
    let contents = std::fs::read(path)?;
    Ok(contents)
}
/// Reads the entire file at `path` and returns its contents as `bytes::Bytes`.
///
/// # Errors
///
/// Returns the I/O error from opening or reading the file.
pub fn spool_read_bytes(path: &Path) -> std::io::Result<bytes::Bytes> {
    let raw = std::fs::read(path)?;
    Ok(bytes::Bytes::from(raw))
}
/// Removes the file at `path` on a best-effort basis. Any error, including the file not
/// existing, is ignored.
pub fn spool_delete(path: &Path) {
    std::fs::remove_file(path).ok();
}
/// Recursively removes the spool directory if one has been initialized.
///
/// Does nothing when `spool_dir` has never been called. Removal errors are ignored. The
/// cached spool directory path is not reset by this call.
pub fn cleanup_spool_dir() {
    let Some(handle) = SPOOL_DIR.get() else {
        return;
    };
    let _ = std::fs::remove_dir_all(&handle.path);
}
/// Streams the file at `path` to `cb` in chunks of at most `chunk_size` bytes.
///
/// `cb` is called once per non-empty read with the bytes just read. A chunk may be
/// shorter than `chunk_size`. Returning `false` from `cb` stops the stream early.
///
/// Returns the total number of bytes handed to `cb`, including the chunk on which `cb`
/// returned `false`.
///
/// # Errors
///
/// * `ErrorKind::InvalidInput` if `chunk_size` is zero. A zero-length buffer makes every
///   read return `0`, which would be indistinguishable from an empty file.
/// * Any I/O error from opening or reading the file. Reads that fail with
///   `ErrorKind::Interrupted` are retried rather than reported.
pub fn spool_stream_chunks<F>(path: &Path, chunk_size: usize, mut cb: F) -> std::io::Result<usize>
where
    F: FnMut(&[u8]) -> bool,
{
    use std::io::{Error, ErrorKind, Read};

    if chunk_size == 0 {
        return Err(Error::new(
            ErrorKind::InvalidInput,
            "chunk_size must be greater than zero",
        ));
    }

    let mut file = std::fs::File::open(path)?;
    let mut buf = vec![0u8; chunk_size];
    let mut total = 0usize;
    loop {
        let n = match file.read(&mut buf) {
            // End of file.
            Ok(0) => break,
            Ok(n) => n,
            // A signal interrupted the call before any data was read; try again.
            Err(e) if e.kind() == ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        total = total.saturating_add(n);
        if !cb(&buf[..n]) {
            break;
        }
    }
    Ok(total)
}
#[cfg(test)]
pub(crate) mod tests {
    use super::*;

    /// Test-only accessor for the private `base_per_page_threshold`.
    pub fn base_per_page_helper() -> usize {
        base_per_page_threshold()
    }

    /// Test-only accessor for the private `base_memory_budget`.
    pub fn base_budget_helper() -> usize {
        base_memory_budget()
    }

    /// Creates `dir_name` under the system temp directory (best effort) and returns that
    /// directory together with the path of `file_name` inside it.
    fn scratch(dir_name: &str, file_name: &str) -> (std::path::PathBuf, std::path::PathBuf) {
        let dir = std::env::temp_dir().join(dir_name);
        let _ = std::fs::create_dir_all(&dir);
        let file = dir.join(file_name);
        (dir, file)
    }

    #[test]
    fn test_byte_accounting_saturating() {
        let start = total_bytes_in_memory();

        track_bytes_add(1000);
        assert_eq!(total_bytes_in_memory(), start + 1000);

        track_bytes_sub(600);
        assert_eq!(total_bytes_in_memory(), start + 400);

        track_bytes_sub(400);
        assert_eq!(total_bytes_in_memory(), start);

        // Subtracting more than is tracked must clamp at zero rather than wrap.
        let snapshot = total_bytes_in_memory();
        track_bytes_sub(snapshot + 1);
        assert_eq!(total_bytes_in_memory(), 0);

        // Put back what was there before the saturation check.
        track_bytes_add(snapshot);
    }

    #[test]
    fn test_page_disk_counter() {
        let start = pages_on_disk();

        track_page_spooled();
        track_page_spooled();
        assert_eq!(pages_on_disk(), start + 2);

        track_page_unspooled();
        assert_eq!(pages_on_disk(), start + 1);

        track_page_unspooled();
        assert_eq!(pages_on_disk(), start);
    }

    #[test]
    fn test_adaptive_thresholds() {
        let per_page = base_per_page_threshold();
        let budget = base_memory_budget();

        // (memory state, expected per-page threshold, expected budget)
        let expectations = [
            (0i8, per_page, budget),
            (1, per_page / 2, budget * 3 / 4),
            (2, 0, 0),
            (3, 0, 0),
        ];
        for (state, want_per_page, want_budget) in expectations {
            assert_eq!(effective_per_page_threshold(state), want_per_page);
            assert_eq!(effective_budget(state), want_budget);
        }
    }

    #[test]
    fn test_spool_write_read_delete() {
        let (dir, path) = scratch("spider_spool_test_rw", "test.sphtml");
        let payload = b"<html><body>hello</body></html>";

        spool_write(&path, payload).unwrap();

        let as_vec = spool_read(&path).unwrap();
        assert_eq!(&as_vec, payload);

        let as_bytes = spool_read_bytes(&path).unwrap();
        assert_eq!(&as_bytes[..], payload);

        spool_delete(&path);
        assert!(!path.exists());

        // Deleting an already-removed file must be harmless.
        spool_delete(&path);
        let _ = std::fs::remove_dir_all(&dir);
    }

    #[test]
    fn test_spool_read_nonexistent() {
        let missing = std::env::temp_dir().join("spider_spool_does_not_exist.sphtml");
        assert!(spool_read(&missing).is_err());
        assert!(spool_read_bytes(&missing).is_err());
    }

    #[test]
    fn test_spool_stream_chunks() {
        let (dir, path) = scratch("spider_spool_stream_test2", "stream.sphtml");
        let payload = b"abcdefghijklmnopqrstuvwxyz";
        spool_write(&path, payload).unwrap();

        let mut gathered = Vec::new();
        let streamed = spool_stream_chunks(&path, 10, |piece| {
            gathered.extend_from_slice(piece);
            true
        })
        .unwrap();

        assert_eq!(gathered, payload);
        assert_eq!(streamed, payload.len());

        spool_delete(&path);
        let _ = std::fs::remove_dir_all(&dir);
    }

    #[test]
    fn test_spool_stream_early_stop() {
        let (dir, path) = scratch("spider_spool_stream_stop", "stop.sphtml");
        let payload = vec![0u8; 100];
        spool_write(&path, &payload).unwrap();

        let mut calls = 0usize;
        spool_stream_chunks(&path, 10, |_| {
            calls += 1;
            calls < 3
        })
        .unwrap();
        assert_eq!(calls, 3);

        spool_delete(&path);
        let _ = std::fs::remove_dir_all(&dir);
    }

    #[test]
    fn test_spool_stream_nonexistent() {
        let missing = std::env::temp_dir().join("spider_spool_no_exist.sphtml");
        let outcome = spool_stream_chunks(&missing, 10, |_| true);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_next_spool_path_unique() {
        let first = next_spool_path();
        let second = next_spool_path();
        let third = next_spool_path();

        assert_ne!(first, second);
        assert_ne!(second, third);
        assert_eq!(first.extension().unwrap(), "sphtml");
    }

    #[test]
    fn test_spool_dir_is_stable() {
        let earlier = spool_dir();
        let later = spool_dir();
        assert_eq!(earlier, later);
    }

    #[test]
    fn test_spool_empty_data() {
        let path = next_spool_path();
        spool_write(&path, b"").unwrap();

        let contents = spool_read(&path).unwrap();
        assert!(contents.is_empty());

        let mut chunks = 0;
        spool_stream_chunks(&path, 10, |_| {
            chunks += 1;
            true
        })
        .unwrap();
        assert_eq!(chunks, 0, "empty file should produce zero chunks");

        spool_delete(&path);
    }

    #[test]
    fn test_spool_large_data_stream() {
        let size = 1024 * 1024;
        // Repeating 0..=255 pattern so that misplaced chunks are detectable.
        let payload: Vec<u8> = (0..size).map(|offset| (offset % 256) as u8).collect();
        let path = next_spool_path();
        spool_write(&path, &payload).unwrap();

        let mut gathered = Vec::with_capacity(size);
        let streamed = spool_stream_chunks(&path, 65536, |piece| {
            gathered.extend_from_slice(piece);
            true
        })
        .unwrap();

        assert_eq!(streamed, size);
        assert_eq!(gathered, payload);

        spool_delete(&path);
    }
}