use crate::ExtractorRegistry;
use crate::types::{ExtractContext, ExtractLimits, ExtractWarning};
use orbok_core::{ErrorCategory, OrbokError, SourceId};
use orbok_fs::ValidatedPath;
use std::fs;
use std::path::Path;
/// Builds a [`ValidatedPath`] for `path` directly from its fields: a
/// [`SourceId`] obtained from `SourceId::generate()` and the canonicalized
/// form of `path`.
///
/// # Panics
///
/// Panics if `fs::canonicalize` fails for `path` (for example, when the path
/// does not exist).
fn validated(path: &Path) -> ValidatedPath {
    let source_id = SourceId::generate();
    let canonical = fs::canonicalize(path).unwrap();
    ValidatedPath {
        source_id,
        canonical,
    }
}
/// Wraps the given `limits` in an [`ExtractContext`] so tests can pass custom
/// limits to `extract_with_context`.
fn ctx_with_limits(limits: ExtractLimits) -> ExtractContext {
    ExtractContext { limits }
}
/// A text file whose size exceeds `max_file_bytes` must be rejected with an
/// extraction error categorized as `ErrorCategory::FileTooLarge`.
#[test]
fn text_file_over_size_limit_returns_too_large() {
    let tmp = tempfile::tempdir().unwrap();
    let path = tmp.path().join("big.txt");
    // 13 bytes on disk, checked against a 5-byte limit below.
    fs::write(&path, "Hello world.\n").unwrap();

    let ctx = ctx_with_limits(ExtractLimits {
        max_file_bytes: 5,
        ..ExtractLimits::default()
    });

    let outcome = ExtractorRegistry::default().extract_with_context(&validated(&path), &ctx);
    if let Err(OrbokError::Extraction { category, .. }) = &outcome {
        assert_eq!(*category, ErrorCategory::FileTooLarge);
    } else {
        panic!("expected FileTooLarge, got {outcome:?}");
    }
}
/// With `max_segments` set to 5, a 50-paragraph Markdown file must yield at
/// most 5 segments and report an `ExtractWarning::SizeLimitReached` warning.
#[test]
fn markdown_segment_limit_enforced_with_warning() {
    let tmp = tempfile::tempdir().unwrap();
    let path = tmp.path().join("many_paras.md");
    // Fifty short paragraphs, each followed by a blank line.
    let body: String = (0..50).map(|i| format!("Paragraph {i}.\n\n")).collect();
    fs::write(&path, &body).unwrap();

    let ctx = ctx_with_limits(ExtractLimits {
        max_segments: 5,
        ..ExtractLimits::default()
    });

    let output = ExtractorRegistry::default()
        .extract_with_context(&validated(&path), &ctx)
        .unwrap();

    let segment_count = output.segments.len();
    assert!(
        segment_count <= 5,
        "segments {} must not exceed limit 5",
        segment_count
    );

    let limit_warned = output
        .warnings
        .iter()
        .any(|w| matches!(w, ExtractWarning::SizeLimitReached { .. }));
    assert!(limit_warned, "SizeLimitReached warning must be emitted");
}
/// With `max_extracted_chars` set to 100, extracting a 10,000-character text
/// file must report a `char_count` of at most 100 and emit an
/// `ExtractWarning::SizeLimitReached` warning.
#[test]
fn char_limit_truncates_output_with_warning() {
    let tmp = tempfile::tempdir().unwrap();
    let path = tmp.path().join("long.txt");
    let body = "a".repeat(10_000);
    fs::write(&path, &body).unwrap();

    let ctx = ctx_with_limits(ExtractLimits {
        max_extracted_chars: 100,
        ..ExtractLimits::default()
    });

    let output = ExtractorRegistry::default()
        .extract_with_context(&validated(&path), &ctx)
        .unwrap();

    let within_limit = output.char_count <= 100;
    assert!(
        within_limit,
        "char_count {} must not exceed limit 100",
        output.char_count
    );

    let limit_warned = output
        .warnings
        .iter()
        .any(|w| matches!(w, ExtractWarning::SizeLimitReached { .. }));
    assert!(
        limit_warned,
        "SizeLimitReached warning must be emitted on char truncation"
    );
}
/// An HTML file larger than `max_html_bytes` must be rejected with an
/// extraction error categorized as `ErrorCategory::FileTooLarge`.
#[test]
fn html_byte_limit_returns_too_large() {
    let tmp = tempfile::tempdir().unwrap();
    let path = tmp.path().join("big.html");
    // 19 bytes of markup, checked against a 3-byte HTML limit below.
    fs::write(&path, "<p>Hello world.</p>").unwrap();

    let ctx = ctx_with_limits(ExtractLimits {
        max_html_bytes: 3,
        ..ExtractLimits::default()
    });

    let outcome = ExtractorRegistry::default().extract_with_context(&validated(&path), &ctx);
    if let Err(OrbokError::Extraction { category, .. }) = &outcome {
        assert_eq!(*category, ErrorCategory::FileTooLarge);
    } else {
        panic!("expected FileTooLarge, got {outcome:?}");
    }
}
/// Extracting a `.docx` file with `max_zip_entry_bytes` set to 5 must return
/// an error.
///
/// NOTE(review): the payload is the ZIP local-header signature followed by
/// filler text, so it is presumably not a parseable archive. Because the test
/// only asserts `is_err()`, it may pass on a parse failure without the
/// zip-entry limit ever being exercised — confirm, and consider a real DOCX
/// fixture plus a check on the error category.
#[test]
fn docx_zip_entry_limit_enforced() {
    let tmp = tempfile::tempdir().unwrap();
    let path = tmp.path().join("big.docx");
    let payload = b"PK\x03\x04fake docx data".repeat(10);
    fs::write(&path, payload.as_slice()).unwrap();

    let ctx = ctx_with_limits(ExtractLimits {
        max_zip_entry_bytes: 5,
        ..ExtractLimits::default()
    });

    let outcome = ExtractorRegistry::default().extract_with_context(&validated(&path), &ctx);
    let failed = outcome.is_err();
    assert!(failed, "oversized/malformed DOCX must return an error");
}
/// Extracting a small text file with the default `ExtractContext` must
/// succeed and produce an empty warnings list.
#[test]
fn clean_extraction_has_no_warnings() {
    let tmp = tempfile::tempdir().unwrap();
    let path = tmp.path().join("clean.txt");
    fs::write(&path, "Hello world.\n").unwrap();

    let registry = ExtractorRegistry::default();
    let target = validated(&path);
    let ctx = ExtractContext::default();
    let output = registry.extract_with_context(&target, &ctx).unwrap();

    let no_warnings = output.warnings.is_empty();
    assert!(
        no_warnings,
        "clean file must produce no warnings, got {:?}",
        output.warnings
    );
}