use crate::{ChunkAndIndexWorker, ExtractionWorker, run_pending};
use orbok_cache::CacheService;
use orbok_core::{
FileStatus, HiddenFilePolicy, IndexMode, JobType, PersistenceMode, SourceType, SymlinkPolicy,
};
use orbok_db::Catalog;
use orbok_db::repo::{
FileRepository, IndexJobRepository, NewFile, NewSource, ObservedMetadata, SourceRepository,
};
use orbok_extract::{
ExtractorRegistry,
types::{DocumentExtractor, LocationQuality, SegmentKind},
};
use orbok_fs::ValidatedPath;
use orbok_search::{HybridSearchService, SearchMode};
use std::fs;
use std::path::PathBuf;
/// Opens the catalog stored at `<root>/catalog.sqlite3` and creates a cache
/// service rooted at `root`, returning the pair.
///
/// Panics if the catalog cannot be opened.
fn catalog_in(root: &std::path::Path) -> (Catalog, CacheService) {
    let db_path = root.join("catalog.sqlite3");
    let catalog = Catalog::open(db_path).unwrap();
    let cache = CacheService::new(root);
    (catalog, cache)
}
/// Wraps `path` in a [`ValidatedPath`] owned by the fixed test source id `s1`.
///
/// The path is canonicalized first, so it must exist; panics otherwise.
fn validated(path: &std::path::Path) -> ValidatedPath {
    let source_id = orbok_core::SourceId::from_string(String::from("s1"));
    let canonical = fs::canonicalize(path).unwrap();
    ValidatedPath {
        source_id,
        canonical,
    }
}
/// Builds a minimal DOCX-shaped ZIP archive in memory and returns its bytes.
///
/// The archive holds two entries: a stub `[Content_Types].xml` and a
/// `word/document.xml` with two paragraphs. The first paragraph carries
/// `content`; the second is the fixed text "Second paragraph here.".
///
/// `content` is treated as plain text: `&`, `<` and `>` are escaped so that
/// any input yields well-formed XML instead of being parsed as markup.
///
/// Panics if writing to the in-memory archive fails.
fn minimal_docx(content: &str) -> Vec<u8> {
    use std::io::Write;

    // Escape the characters that are not allowed verbatim in XML character data.
    let mut escaped = String::with_capacity(content.len());
    for ch in content.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }

    let xml = format!(
        r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>{}</w:t></w:r></w:p>
<w:p><w:r><w:t>Second paragraph here.</w:t></w:r></w:p>
</w:body></w:document>"#,
        escaped
    );

    let mut buf = std::io::Cursor::new(Vec::new());
    {
        // Scoped so the writer's mutable borrow of `buf` ends before `into_inner`.
        let mut zip = zip::ZipWriter::new(&mut buf);
        let opts: zip::write::SimpleFileOptions = zip::write::SimpleFileOptions::default();
        zip.start_file("[Content_Types].xml", opts).unwrap();
        zip.write_all(b"<Types/>").unwrap();
        zip.start_file("word/document.xml", opts).unwrap();
        zip.write_all(xml.as_bytes()).unwrap();
        zip.finish().unwrap();
    }
    buf.into_inner()
}
/// A DOCX file on disk must be handled by the `docx` extractor and yield
/// non-empty `Paragraph` segments with `Approximate` location quality whose
/// combined text contains the document content.
#[test]
fn docx_extractor_produces_paragraph_segments() {
    let docx_bytes = minimal_docx("Authentication tokens expire after 24 hours.");
    let dir = tempfile::tempdir().unwrap();
    let path = dir.path().join("test.docx");
    fs::write(&path, &docx_bytes).unwrap();
    let vp = validated(&path);
    let out = orbok_extract::registry::ExtractorRegistry::default()
        .extract(&vp)
        .unwrap();
    assert_eq!(out.extractor_name, "docx");
    assert!(!out.segments.is_empty(), "DOCX must produce segments");
    for seg in &out.segments {
        assert_eq!(
            seg.location_quality,
            LocationQuality::Approximate,
            "DOCX segments must be Approximate"
        );
        assert_eq!(seg.kind, SegmentKind::Paragraph);
    }
    let combined: String = out
        .segments
        .iter()
        .map(|s| s.text.as_str())
        .collect::<Vec<_>>()
        .join(" ");
    // Both words come from the same text run of the fixture, so a correct
    // extraction must surface both; requiring only one would let a
    // partially-dropped run pass.
    assert!(
        combined.contains("Authentication") && combined.contains("tokens"),
        "extracted text should contain document content: {combined}"
    );
}
/// Extracting from a path that does not exist must return an error rather
/// than panic or succeed.
#[test]
fn docx_extractor_missing_file_returns_error() {
    use orbok_extract::docx::DocxExtractor;
    // A file name inside a freshly created temp directory is guaranteed not to
    // exist, independent of what happens to be present on the host filesystem.
    let dir = tempfile::tempdir().unwrap();
    let missing: PathBuf = fs::canonicalize(dir.path()).unwrap().join("missing.docx");
    let vp = ValidatedPath {
        source_id: orbok_core::SourceId::from_string("s1".to_string()),
        canonical: missing,
    };
    assert!(DocxExtractor.extract(&vp).is_err());
}
/// The default registry must resolve the `docx` extension to the extractor
/// named `docx`.
#[test]
fn docx_registered_in_registry() {
    let registry = ExtractorRegistry::default();
    let extractor = registry.select("docx").unwrap();
    assert_eq!(extractor.name(), "docx");
}
/// HTML fixture shared by the HTML extractor tests in this file.
///
/// It contains a `<title>` and `<style>` in the head, `<h1>`/`<h2>` headings,
/// two paragraphs (one with inline `<strong>` markup), a `<script>` element
/// and an unordered list.
const SAMPLE_HTML: &str = r#"<!DOCTYPE html>
<html>
<head><title>Test Page</title><style>body{color:red}</style></head>
<body>
<h1>Authentication Guide</h1>
<p>Tokens expire after <strong>24 hours</strong>. Error code ERR-4042 fires on expiry.</p>
<h2>Token Rotation</h2>
<p>The client_secret must be rotated every 90 days.</p>
<script>alert('ignored')</script>
<ul><li>Step one</li><li>Step two</li></ul>
</body>
</html>"#;
/// The HTML extractor must keep visible text while dropping the contents of
/// `<script>` and `<style>` elements.
#[test]
fn html_extractor_strips_tags_preserves_text() {
    use orbok_extract::html::HtmlExtractor;

    let tmp = tempfile::tempdir().unwrap();
    let html_path = tmp.path().join("test.html");
    fs::write(&html_path, SAMPLE_HTML).unwrap();

    let vp = validated(&html_path);
    let out = HtmlExtractor.extract(&vp).unwrap();
    assert_eq!(out.extractor_name, "html");
    assert!(!out.segments.is_empty());

    // Concatenate all segment texts, separated by single spaces.
    let mut combined = String::new();
    for (idx, seg) in out.segments.iter().enumerate() {
        if idx > 0 {
            combined.push(' ');
        }
        combined.push_str(seg.text.as_str());
    }

    let has_body_text = combined.contains("ERR-4042") || combined.contains("Tokens");
    assert!(has_body_text, "HTML text should be extracted: {combined}");

    let leaked_script = combined.contains("alert");
    assert!(!leaked_script, "script content must be stripped");

    let leaked_style = combined.contains("body{color");
    assert!(!leaked_style, "style content must be stripped");
}
/// The HTML extractor must emit at least one `Heading` segment for a document
/// that contains heading elements.
#[test]
fn html_extractor_tracks_heading_path() {
    use orbok_extract::html::HtmlExtractor;

    let tmp = tempfile::tempdir().unwrap();
    let html_path = tmp.path().join("guide.html");
    fs::write(&html_path, SAMPLE_HTML).unwrap();

    let vp = validated(&html_path);
    let out = HtmlExtractor.extract(&vp).unwrap();

    let has_heading = out
        .segments
        .iter()
        .any(|seg| seg.kind == SegmentKind::Heading);
    assert!(
        has_heading,
        "HTML extractor should produce heading segments"
    );
}
/// No segment produced from HTML may report `Exact` location quality.
#[test]
fn html_location_quality_is_approximate() {
    use orbok_extract::html::HtmlExtractor;

    let tmp = tempfile::tempdir().unwrap();
    let html_path = tmp.path().join("test.html");
    fs::write(&html_path, SAMPLE_HTML).unwrap();

    let vp = validated(&html_path);
    let out = HtmlExtractor.extract(&vp).unwrap();

    out.segments.iter().for_each(|seg| {
        assert_ne!(
            seg.location_quality,
            LocationQuality::Exact,
            "HTML must not claim Exact location quality"
        );
    });
}
/// Both the `html` and `htm` extensions must resolve to the extractor named
/// `html` in the default registry.
#[test]
fn html_registered_in_registry() {
    let registry = ExtractorRegistry::default();
    for extension in ["html", "htm"] {
        let extractor = registry.select(extension).unwrap();
        assert_eq!(extractor.name(), "html");
    }
}
/// End-to-end check: write fixture documents, register their directory as a
/// source, scan it, drain the queued index jobs, then run keyword searches
/// against the result.
#[test]
fn e2e_full_pipeline_write_scan_index_search() {
    let dir = tempfile::tempdir().unwrap();
    let (catalog, cache) = catalog_in(dir.path());

    // Keep the indexed documents in their own subdirectory so the scanned
    // source root does not also contain the catalog database and cache files
    // that `catalog_in` places directly under the temp directory.
    let docs = dir.path().join("docs");
    fs::create_dir(&docs).unwrap();
    let root = fs::canonicalize(&docs).unwrap();

    fs::write(
        docs.join("auth.md"),
        "# Authentication\n\nRefresh tokens expire after 24 hours.\nError code ERR-4042 on missing token.\n",
    )
    .unwrap();
    fs::write(
        docs.join("storage.md"),
        "# Storage\n\nOrbok stores derived indexes not source copies.\ncleanup removes snippet cache.\n",
    )
    .unwrap();
    fs::write(
        docs.join("search.md"),
        "# Search\n\nHybrid search combines FTS5 keyword and vector embeddings via RRF.\n",
    )
    .unwrap();
    fs::write(
        docs.join("config.html"),
        "<h1>Configuration</h1><p>Set client_secret in environment variables.</p>",
    )
    .unwrap();

    let src = SourceRepository::new(&catalog)
        .insert(NewSource {
            source_type: SourceType::Directory,
            persistence_mode: PersistenceMode::Persistent,
            display_name: Some("e2e-test".into()),
            original_path: root.to_string_lossy().into(),
            canonical_path: root.to_string_lossy().into(),
            index_mode: IndexMode::Balanced,
            include_patterns: vec![],
            exclude_patterns: vec![],
            hidden_file_policy: HiddenFilePolicy::Exclude,
            symlink_policy: SymlinkPolicy::Ignore,
            max_file_size_bytes: None,
        })
        .unwrap();

    // Scan the source and have the scanner enqueue index jobs.
    {
        use orbok_fs::{ScanRequest, Scanner};
        use std::sync::atomic::AtomicBool;
        Scanner::new(&catalog)
            .scan(
                &ScanRequest {
                    source_id: src.source_id.clone(),
                    force_hash: false,
                    enqueue_index_jobs: true,
                },
                &AtomicBool::new(false),
            )
            .unwrap();
    }
    let pending = IndexJobRepository::new(&catalog).list_queued(100).unwrap();
    assert!(!pending.is_empty(), "scanner must enqueue jobs");

    // Drain the queue through the extraction and chunk/index workers.
    let extract_w = ExtractionWorker::new(&catalog, &cache);
    let chunk_w = ChunkAndIndexWorker::new(&catalog, &cache);
    run_pending(&catalog, &extract_w, &chunk_w, None, 200).unwrap();
    let remaining = IndexJobRepository::new(&catalog).list_queued(100).unwrap();
    assert!(
        remaining.is_empty(),
        "{} jobs still queued after pipeline",
        remaining.len()
    );

    let search = HybridSearchService::keyword_only(&catalog);

    let results = search.search("ERR-4042", SearchMode::Exact, 10).unwrap();
    assert!(!results.is_empty(), "ERR-4042 must be found");
    assert!(
        results[0].display_path.contains("auth"),
        "top result for ERR-4042 must be auth.md, got: {}",
        results[0].display_path
    );

    let results2 = search
        .search("snippet cache cleanup", SearchMode::Auto, 10)
        .unwrap();
    assert!(
        !results2.is_empty(),
        "cache cleanup query must return results"
    );

    let results3 = search
        .search("client_secret", SearchMode::Exact, 10)
        .unwrap();
    assert!(
        !results3.is_empty(),
        "HTML content must be indexed and searchable"
    );
}
/// Every documented file extension must resolve to some extractor in the
/// default registry.
#[test]
fn all_documented_file_types_have_extractor() {
    const DOCUMENTED_EXTENSIONS: [&str; 15] = [
        "txt", "md", "html", "htm", "pdf", "docx", "rs", "py", "js", "ts", "go", "sql", "toml",
        "yaml", "json",
    ];
    let registry = ExtractorRegistry::default();
    DOCUMENTED_EXTENSIONS.iter().for_each(|ext| {
        assert!(
            registry.select(ext).is_some(),
            "documented extension '.{ext}' has no registered extractor"
        );
    });
}
/// The default plugin registry must list at least five plugins, each with a
/// non-empty privacy note and license.
#[test]
fn plugin_registry_all_extractors_have_privacy_notes() {
    use orbok_extract::PluginRegistry;

    let registry = PluginRegistry::default();
    let plugin_count = registry.len();
    assert!(
        plugin_count >= 5,
        "expect markdown, docx, html, plain-text, pdf"
    );

    for manifest in registry.manifests() {
        let has_privacy_note = !manifest.privacy_note.is_empty();
        assert!(
            has_privacy_note,
            "plugin {} missing privacy_note",
            manifest.plugin_id
        );
        let has_license = !manifest.license.is_empty();
        assert!(
            has_license,
            "plugin {} missing license",
            manifest.plugin_id
        );
    }
}
/// Startup recovery on a brand-new catalog must report nothing to reset,
/// nothing pending and no cache rebuild.
#[test]
fn startup_recovery_clean_on_fresh_catalog() {
    use crate::run_startup_recovery;

    let tmp = tempfile::tempdir().unwrap();
    // Only the catalog is needed; the cache half of the pair is dropped at the
    // end of this statement.
    let catalog = catalog_in(tmp.path()).0;
    let cache_db = tmp.path().join("orbok-cache.sqlite3");

    let report = run_startup_recovery(&catalog, &cache_db).unwrap();

    assert_eq!(report.jobs_reset, 0);
    assert_eq!(report.jobs_pending, 0);
    assert!(!report.cache_rebuilt);
}
/// After scanning a one-file source and draining the queue, no row in
/// `index_jobs` may still have status `running`.
#[test]
fn pipeline_leaves_no_running_jobs_after_completion() {
    let dir = tempfile::tempdir().unwrap();
    let (catalog, cache) = catalog_in(dir.path());

    // Keep the indexed document in its own subdirectory so the scanned source
    // root does not also contain the catalog database and cache files that
    // `catalog_in` places directly under the temp directory.
    let docs = dir.path().join("docs");
    fs::create_dir(&docs).unwrap();
    fs::write(docs.join("note.md"), "# Note\nSome content.\n").unwrap();
    let root = fs::canonicalize(&docs)
        .unwrap()
        .to_string_lossy()
        .to_string();

    let src = SourceRepository::new(&catalog)
        .insert(NewSource {
            source_type: SourceType::Directory,
            persistence_mode: PersistenceMode::Persistent,
            display_name: None,
            original_path: root.clone(),
            canonical_path: root,
            index_mode: IndexMode::Balanced,
            include_patterns: vec![],
            exclude_patterns: vec![],
            hidden_file_policy: HiddenFilePolicy::Exclude,
            symlink_policy: SymlinkPolicy::Ignore,
            max_file_size_bytes: None,
        })
        .unwrap();

    // Scan the source and have the scanner enqueue index jobs.
    {
        use orbok_fs::{ScanRequest, Scanner};
        use std::sync::atomic::AtomicBool;
        Scanner::new(&catalog)
            .scan(
                &ScanRequest {
                    source_id: src.source_id.clone(),
                    force_hash: false,
                    enqueue_index_jobs: true,
                },
                &AtomicBool::new(false),
            )
            .unwrap();
    }

    run_pending(
        &catalog,
        &ExtractionWorker::new(&catalog, &cache),
        &ChunkAndIndexWorker::new(&catalog, &cache),
        None,
        50,
    )
    .unwrap();

    // Query the job table directly for rows left in the `running` state.
    let conn = catalog.lock();
    let running: i64 = conn
        .query_row(
            "SELECT COUNT(*) FROM index_jobs WHERE status='running'",
            [],
            |r| r.get(0),
        )
        .unwrap();
    assert_eq!(
        running, 0,
        "no jobs should remain in running state after pipeline"
    );
}