use crate::types::{DocumentExtractor, ExtractContext, ExtractedChunk, LocationKind};
use crate::{ExtractorRegistry, chunk};
use orbok_core::{ErrorCategory, OrbokError, SourceId};
use orbok_fs::ValidatedPath;
use std::fs;
use std::path::Path;
/// Test helper: wraps `path` in a [`ValidatedPath`] carrying a freshly
/// generated source id and the canonicalized form of the path.
///
/// # Panics
/// Panics if `fs::canonicalize` fails for `path`.
fn validated(path: &Path) -> ValidatedPath {
    // Generate the id before touching the filesystem, as the struct literal did.
    let source_id = SourceId::generate();
    let canonical = fs::canonicalize(path).unwrap();
    ValidatedPath {
        source_id,
        canonical,
    }
}
/// Test double whose context-aware extraction always panics.
struct PanickingExtractor;

impl DocumentExtractor for PanickingExtractor {
    /// Identifier reported by this extractor.
    fn name(&self) -> &'static str {
        "panic-test"
    }

    /// Version string reported by this extractor.
    fn version(&self) -> &'static str {
        "v0"
    }

    /// The only extension this extractor claims.
    fn supported_extensions(&self) -> &'static [&'static str] {
        &["panic_test"]
    }

    /// Unconditionally panics; both arguments are ignored.
    fn extract_with_context(
        &self,
        _path: &ValidatedPath,
        _context: &ExtractContext,
    ) -> orbok_core::OrbokResult<crate::types::ExtractOutput> {
        panic!("intentional test panic in extractor");
    }

    /// Delegates to `extract_with_context` using a default context, so it
    /// panics as well.
    fn extract(
        &self,
        path: &ValidatedPath,
    ) -> orbok_core::OrbokResult<crate::types::ExtractOutput> {
        let default_context = ExtractContext::default();
        self.extract_with_context(path, &default_context)
    }
}
/// Calling the panicking extractor directly unwinds, while going through
/// the registry's `extract_safely` yields an `Extraction` error whose
/// category is `ParserPanic`.
#[test]
fn panicking_extractor_returns_parser_panic_error() {
    let tmp = tempfile::tempdir().unwrap();
    let target = tmp.path().join("test.panic_test");
    fs::write(&target, "content").unwrap();
    let vp = validated(&target);
    let ctx = ExtractContext::default();

    // Direct call, wrapped so the panic is captured instead of aborting the test.
    let raw_call =
        std::panic::AssertUnwindSafe(|| PanickingExtractor.extract_with_context(&vp, &ctx));
    let raw_outcome = std::panic::catch_unwind(raw_call);

    // Same extractor, but invoked through the registry's safe entry point.
    let registry =
        crate::registry::ExtractorRegistry::new_with(vec![Box::new(PanickingExtractor)]);
    let guarded = registry.extract_safely(&vp, &ctx);

    assert!(raw_outcome.is_err(), "raw call panics");
    let category = match guarded {
        Err(OrbokError::Extraction { category, .. }) => category,
        other => panic!("expected ParserPanic, got {other:?}"),
    };
    assert_eq!(category, ErrorCategory::ParserPanic);
}
/// Extracting from a path that does not exist on disk is reported as an
/// `Extraction` error with category `SourceMissing`.
#[test]
fn missing_file_returns_source_missing() {
    // Built by hand: the `validated` helper canonicalizes and would panic here.
    let source_id = SourceId::generate();
    let canonical = std::path::PathBuf::from("/nonexistent/path/to/file.txt");
    let vp = ValidatedPath {
        source_id,
        canonical,
    };

    let registry = ExtractorRegistry::default();
    let outcome = registry.extract_with_context(&vp, &ExtractContext::default());

    let category = match outcome {
        Err(OrbokError::Extraction { category, .. }) => category,
        other => panic!("expected SourceMissing, got {other:?}"),
    };
    assert_eq!(category, ErrorCategory::SourceMissing);
}
/// A `.txt` file whose bytes are not valid UTF-8 is reported as an
/// `Extraction` error with category `EncodingError`.
#[test]
fn invalid_utf8_returns_encoding_error() {
    let tmp = tempfile::tempdir().unwrap();
    let target = tmp.path().join("bad.txt");
    fs::write(&target, b"\xFF\xFE invalid bytes").unwrap();

    let registry = ExtractorRegistry::default();
    let vp = validated(&target);
    let outcome = registry.extract_with_context(&vp, &ExtractContext::default());

    let category = match outcome {
        Err(OrbokError::Extraction { category, .. }) => category,
        other => panic!("expected EncodingError, got {other:?}"),
    };
    assert_eq!(category, ErrorCategory::EncodingError);
}
/// A file with the extension `xyz_unknown` is rejected by the default
/// registry with an `Extraction` error of category `UnsupportedType`.
#[test]
fn unsupported_extension_returns_unsupported_type() {
    let tmp = tempfile::tempdir().unwrap();
    let target = tmp.path().join("file.xyz_unknown");
    fs::write(&target, "content").unwrap();

    let registry = ExtractorRegistry::default();
    let vp = validated(&target);
    let outcome = registry.extract_with_context(&vp, &ExtractContext::default());

    let category = match outcome {
        Err(OrbokError::Extraction { category, .. }) => category,
        other => panic!("expected UnsupportedType, got {other:?}"),
    };
    assert_eq!(category, ErrorCategory::UnsupportedType);
}
/// Every segment extracted from a plain-text file must carry
/// `LocationKind::Lines`, and at least one segment must be produced.
#[test]
fn plain_text_segments_use_lines_location_kind() {
    let dir = tempfile::tempdir().unwrap();
    let file = dir.path().join("test.txt");
    fs::write(&file, "Para one.\n\nPara two.\n").unwrap();
    let output = ExtractorRegistry::default()
        .extract_with_context(&validated(&file), &ExtractContext::default())
        .unwrap();

    // Count while iterating so an empty extraction cannot pass vacuously.
    let mut checked = 0usize;
    for seg in &output.segments {
        assert_eq!(
            seg.location_kind,
            LocationKind::Lines,
            "plain text must use Lines, got {:?}",
            seg.location_kind
        );
        checked += 1;
    }
    assert!(checked > 0, "plain text extraction produced no segments");
}
/// Every segment extracted from a markdown file must carry
/// `LocationKind::Lines`, and at least one segment must be produced.
#[test]
fn markdown_segments_use_lines_location_kind() {
    let dir = tempfile::tempdir().unwrap();
    let file = dir.path().join("test.md");
    fs::write(&file, "# Heading\n\nContent.\n").unwrap();
    let output = ExtractorRegistry::default()
        .extract_with_context(&validated(&file), &ExtractContext::default())
        .unwrap();

    // Count while iterating so an empty extraction cannot pass vacuously.
    let mut checked = 0usize;
    for seg in &output.segments {
        assert_eq!(
            seg.location_kind,
            LocationKind::Lines,
            "markdown must use Lines, got {:?}",
            seg.location_kind
        );
        checked += 1;
    }
    assert!(checked > 0, "markdown extraction produced no segments");
}
/// Every segment extracted from an HTML file must carry
/// `LocationKind::Blocks`, and at least one segment must be produced.
#[test]
fn html_segments_use_blocks_location_kind() {
    let dir = tempfile::tempdir().unwrap();
    let file = dir.path().join("test.html");
    fs::write(&file, "<p>Hello world.</p><p>Second paragraph.</p>").unwrap();
    let output = ExtractorRegistry::default()
        .extract_with_context(&validated(&file), &ExtractContext::default())
        .unwrap();

    // Count while iterating so an empty extraction cannot pass vacuously.
    let mut checked = 0usize;
    for seg in &output.segments {
        assert_eq!(
            seg.location_kind,
            LocationKind::Blocks,
            "HTML must use Blocks, got {:?}",
            seg.location_kind
        );
        checked += 1;
    }
    assert!(checked > 0, "HTML extraction produced no segments");
}
/// The first chunk produced from a markdown extraction must carry
/// `LocationKind::Lines`.
#[test]
fn chunker_propagates_location_kind() {
    let dir = tempfile::tempdir().unwrap();
    let file = dir.path().join("test.md");
    fs::write(&file, "# Section\n\nContent paragraph.\n").unwrap();
    let output = ExtractorRegistry::default()
        .extract_with_context(&validated(&file), &ExtractContext::default())
        .unwrap();
    let chunks: Vec<ExtractedChunk> = chunk(&output, "test.md");

    // Checked access: an empty result fails with a descriptive message rather
    // than an index-out-of-bounds panic.
    let first = chunks
        .first()
        .expect("chunker produced no chunks for the markdown input");
    assert_eq!(first.location_kind, LocationKind::Lines);
}
/// `chunk` yields a non-empty `Vec<ExtractedChunk>` for a one-line text
/// file, and the first chunk's `chunk_kind` equals `"document"`.
#[test]
fn chunker_returns_extracted_chunk_not_chunk_spec() {
    let tmp = tempfile::tempdir().unwrap();
    let target = tmp.path().join("test.txt");
    fs::write(&target, "Hello.\n").unwrap();

    let registry = ExtractorRegistry::default();
    let vp = validated(&target);
    let extracted = registry
        .extract_with_context(&vp, &ExtractContext::default())
        .unwrap();

    // The explicit annotation pins the chunker's return type at compile time.
    let produced: Vec<ExtractedChunk> = chunk(&extracted, "test.txt");
    assert!(!produced.is_empty());
    let head = &produced[0];
    assert_eq!(head.chunk_kind, "document");
}