use crate::normalize::normalize_document;
use crate::types::{LocationQuality, SegmentKind};
use crate::{DocumentExtractor, ExtractorRegistry};
use orbok_core::{ErrorCategory, OrbokError, SourceId};
use orbok_fs::ValidatedPath;
use std::fs;
use std::path::Path;
fn validated(path: &Path) -> ValidatedPath {
ValidatedPath {
source_id: SourceId::generate(),
canonical: fs::canonicalize(path).unwrap(),
}
}
#[test]
fn norm_v1_rules() {
let input = "\u{FEFF}line one \r\nline\u{0007} two\rline three\t.\n";
let normalized = normalize_document(input);
assert_eq!(normalized, "line one\nline two\nline three\t.\n");
}
#[test]
fn norm_v1_preserves_japanese() {
let input = "日本語のテキスト。\n改行も保持される。\n";
assert_eq!(normalize_document(input), input);
}
#[test]
fn plain_text_paragraph_lines_exact() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("notes.txt");
fs::write(&file, "para one line 1\npara one line 2\n\npara two\n").unwrap();
let out = ExtractorRegistry::default()
.extract(&validated(&file))
.unwrap();
assert_eq!(out.extractor_name, "plain_text");
assert_eq!(out.normalization_version, "norm-v1");
assert_eq!(out.segments.len(), 2);
assert_eq!(out.segments[0].line_start, 1);
assert_eq!(out.segments[0].line_end, 2);
assert_eq!(out.segments[1].line_start, 4);
assert_eq!(out.segments[1].line_end, 4);
assert!(
out.segments
.iter()
.all(|s| s.location_quality == LocationQuality::Exact)
);
}
#[test]
fn markdown_structure_and_heading_paths() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("guide.md");
fs::write(
&file,
"# Guide\n\n## Install\n\nRun the installer.\n\n```sh\ncargo install orbok\n```\n\n## Use\n\nOpen the app.\n",
)
.unwrap();
let out = ExtractorRegistry::default()
.extract(&validated(&file))
.unwrap();
assert_eq!(out.extractor_name, "markdown");
let headings: Vec<_> = out
.segments
.iter()
.filter(|s| s.kind == SegmentKind::Heading)
.collect();
assert_eq!(headings.len(), 3);
let install_para = out
.segments
.iter()
.find(|s| s.text == "Run the installer.")
.unwrap();
assert_eq!(
install_para.heading_path.as_deref(),
Some("Guide > Install")
);
assert_eq!(install_para.line_start, 5);
let code = out
.segments
.iter()
.find(|s| s.kind == SegmentKind::CodeBlock)
.unwrap();
assert_eq!(code.text, "cargo install orbok");
assert_eq!(code.line_start, 7);
assert_eq!(code.line_end, 9);
let use_para = out
.segments
.iter()
.find(|s| s.text == "Open the app.")
.unwrap();
assert_eq!(use_para.heading_path.as_deref(), Some("Guide > Use"));
}
#[test]
fn empty_file_yields_no_segments() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("empty.txt");
fs::write(&file, "").unwrap();
let out = ExtractorRegistry::default()
.extract(&validated(&file))
.unwrap();
assert!(out.segments.is_empty());
assert_eq!(out.char_count, 0);
}
#[test]
fn invalid_utf8_is_encoding_error() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("bad.txt");
fs::write(&file, [0xFFu8, 0xFE, 0x00, 0x41]).unwrap();
let err = ExtractorRegistry::default()
.extract(&validated(&file))
.unwrap_err();
match err {
OrbokError::Extraction { category, .. } => {
assert_eq!(category, ErrorCategory::EncodingError)
}
other => panic!("unexpected error {other:?}"),
}
}
#[test]
fn unknown_extension_is_unsupported() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("image.xyz");
fs::write(&file, "binaryish").unwrap();
let err = ExtractorRegistry::default()
.extract(&validated(&file))
.unwrap_err();
match err {
OrbokError::Extraction { category, .. } => {
assert_eq!(category, ErrorCategory::UnsupportedType)
}
other => panic!("unexpected error {other:?}"),
}
}
#[test]
fn registry_selection_by_extension() {
let registry = ExtractorRegistry::default();
assert_eq!(registry.select("md").unwrap().name(), "markdown");
assert_eq!(registry.select("rs").unwrap().name(), "plain_text");
assert!(registry.select("xyz").is_none());
}
#[test]
fn unclosed_fence_is_robust() {
let dir = tempfile::tempdir().unwrap();
let file = dir.path().join("broken.md");
fs::write(&file, "# T\n\n```\nnever closed\n").unwrap();
let out = ExtractorRegistry::default()
.extract(&validated(&file))
.unwrap();
let code = out
.segments
.iter()
.find(|s| s.kind == SegmentKind::CodeBlock)
.unwrap();
assert_eq!(code.text, "never closed");
}
use crate::chunker::chunk;
use crate::types::{ExtractOutput, ExtractedSegment};
fn extract_str(text: &str) -> ExtractOutput {
let segments = text
.split("\n\n")
.filter(|s| !s.trim().is_empty())
.enumerate()
.map(|(i, para)| ExtractedSegment {
kind: SegmentKind::Paragraph,
text: para.trim().to_string(),
line_start: (i as u32 * 2) + 1,
line_end: (i as u32 * 2) + 2,
heading_path: None,
location_quality: LocationQuality::Exact,
})
.collect();
ExtractOutput {
extractor_name: "test".into(),
extractor_version: "v1".into(),
normalization_version: "norm-v1".into(),
segments,
char_count: text.len() as u64,
}
}
#[test]
fn short_text_becomes_one_document_chunk() {
let output = extract_str("Hello world.");
let specs = chunk(&output, "doc.txt");
assert!(!specs.is_empty());
assert_eq!(specs[0].chunk_kind, "document");
assert!(specs.len() >= 1);
}
#[test]
fn long_text_becomes_multiple_chunks() {
let long_para = "word ".repeat(500); let output = extract_str(&long_para);
let specs = chunk(&output, "long.txt");
assert!(
specs.len() > 1,
"long text must produce multiple chunks, got {}",
specs.len()
);
}
#[test]
fn markdown_headings_create_section_chunks() {
use crate::types::ExtractedSegment;
let segments = vec![
ExtractedSegment {
kind: SegmentKind::Heading,
text: "Authentication".into(),
line_start: 1,
line_end: 1,
heading_path: Some("Authentication".into()),
location_quality: LocationQuality::Exact,
},
ExtractedSegment {
kind: SegmentKind::Paragraph,
text: "Tokens expire after 24 hours.".into(),
line_start: 3,
line_end: 4,
heading_path: Some("Authentication".into()),
location_quality: LocationQuality::Exact,
},
ExtractedSegment {
kind: SegmentKind::Heading,
text: "Storage".into(),
line_start: 6,
line_end: 6,
heading_path: Some("Storage".into()),
location_quality: LocationQuality::Exact,
},
ExtractedSegment {
kind: SegmentKind::Paragraph,
text: "Data is stored locally.".into(),
line_start: 8,
line_end: 9,
heading_path: Some("Storage".into()),
location_quality: LocationQuality::Exact,
},
];
let output = ExtractOutput {
extractor_name: "markdown".into(),
extractor_version: "v1".into(),
normalization_version: "norm-v1".into(),
segments,
char_count: 80,
};
let specs = chunk(&output, "guide.md");
assert!(
specs.len() >= 3,
"expected parent + 2 section chunks, got {}",
specs.len()
);
let section_kinds: Vec<&str> = specs[1..].iter().map(|s| s.chunk_kind).collect();
assert!(
section_kinds.contains(&"section"),
"expected section chunks"
);
}
#[test]
fn chunk_hash_stable_for_identical_text() {
let output = extract_str("identical content");
let specs1 = chunk(&output, "f.txt");
let specs2 = chunk(&output, "f.txt");
assert_eq!(specs1[0].normalized_text, specs2[0].normalized_text);
}
#[test]
fn fallback_chunks_have_approximate_quality() {
let long_para = "word ".repeat(400);
let output = extract_str(&long_para);
let specs = chunk(&output, "long.txt");
for spec in specs.iter().filter(|s| s.chunk_kind == "fallback") {
assert_eq!(
spec.location_quality, "approximate",
"fallback chunk must have approximate quality"
);
}
}
#[test]
fn children_point_to_parent() {
let output = extract_str("Para one.\n\nPara two.\n\nPara three.");
let specs = chunk(&output, "multi.txt");
for spec in specs.iter().skip(1) {
assert_eq!(
spec.parent_idx,
Some(0),
"child chunk {} must point to parent",
spec.chunk_ordinal
);
}
}
mod v07_features;