use orbok_db::Catalog;
use orbok_db::repo::{ModelRepository, ModelRole, ModelStatus, NewModel};
use orbok_db::repo::verify_model_sha256;
use orbok_embed::{RECOMMENDED_MODEL_DIMENSION, create_embedding_model, recommended_config};
use crate::{ExtractorRegistry};
use crate::types::{DocumentExtractor, LocationQuality};
use orbok_fs::ValidatedPath;
use orbok_models::{EmbeddingModel, EmbeddingModelConfig, InferenceBackend, MockEmbeddingModel};
use std::fs;
use std::path::PathBuf;
/// The mock inference backend must build and embed with an empty weights path
/// and no tokenizer, yielding one 8-element vector for a single input string.
#[test]
fn mock_backend_embeds_without_model_files() {
    let mock_cfg = EmbeddingModelConfig {
        backend: InferenceBackend::Mock,
        model_name: "mock".into(),
        model_version: "v1".into(),
        dimension: 8,
        max_seq_len: 512,
        // No on-disk artifacts are referenced by this configuration.
        weights_path: String::new(),
        tokenizer_path: None,
    };

    let embedder = create_embedding_model(&mock_cfg).unwrap();
    let vectors = embedder.embed_batch(&["hello"]).unwrap();

    // One input string in, one vector out, sized to the configured dimension.
    assert_eq!(vectors.len(), 1);
    assert_eq!(vectors[0].len(), 8);
}
/// When the crate is built without the `tract` feature, requesting the ONNX
/// runtime backend must fail, and the error text must mention either "tract"
/// or "compiled".
#[cfg(not(feature = "tract"))]
#[test]
fn onnx_backend_returns_feature_error_when_not_compiled() {
    let onnx_cfg = EmbeddingModelConfig {
        backend: InferenceBackend::OnnxRuntime,
        model_name: "test".into(),
        model_version: "v1".into(),
        dimension: 384,
        max_seq_len: 512,
        weights_path: "/nonexistent.onnx".into(),
        tokenizer_path: None,
    };

    // Success is the failure case here: reduce the result to its error text.
    let message = match create_embedding_model(&onnx_cfg) {
        Ok(_) => panic!("should fail without tract feature"),
        Err(err) => err.to_string(),
    };

    let names_feature = ["tract", "compiled"]
        .iter()
        .any(|needle| message.contains(*needle));
    assert!(names_feature, "error should name the feature flag: {message}");
}
/// The configuration returned by `recommended_config` must use the
/// recommended embedding dimension (384) and a model name containing
/// "multilingual".
#[test]
fn recommended_config_meets_rfc021_spec() {
    let cfg = recommended_config("/path/to/model.onnx");

    assert_eq!(cfg.dimension, RECOMMENDED_MODEL_DIMENSION);
    assert_eq!(cfg.dimension, 384, "multilingual-e5-small is 384-dim");

    // The exact name "multilingual-e5-small" already contains "multilingual",
    // so a single substring check covers both accepted spellings.
    assert!(
        cfg.model_name.contains("multilingual"),
        "recommended model name should contain \"multilingual\": {}",
        cfg.model_name
    );
}
/// Pure arithmetic check: at 4 bytes per dimension a 384-dim vector takes half
/// the space of a 768-dim one, and 100 000 such chunks stay below 200 MiB.
#[test]
fn storage_impact_384_dim_is_half_of_768() {
    // Presumably 4-byte (f32) components; only the factor 4 matters here.
    const BYTES_PER_DIM: u64 = 4;
    const CHUNK_COUNT: u64 = 100_000;
    const BUDGET_BYTES: u64 = 200 * 1024 * 1024;

    let small_vector_bytes = 384 * BYTES_PER_DIM;
    let large_vector_bytes = 768 * BYTES_PER_DIM;
    assert_eq!(small_vector_bytes * 2, large_vector_bytes);

    let total_small_bytes = CHUNK_COUNT * small_vector_bytes;
    assert!(
        total_small_bytes < BUDGET_BYTES,
        "384-dim storage for 100k chunks should be < 200 MB"
    );
}
/// Pins the recommended model constants exported by `orbok_embed`: the
/// Hugging Face id and the embedding dimension.
#[test]
fn multilingual_e5_small_is_the_recommendation() {
    let hf_id = orbok_embed::RECOMMENDED_HF_MODEL_ID;
    let dimension = orbok_embed::RECOMMENDED_MODEL_DIMENSION;

    assert_eq!(hf_id, "intfloat/multilingual-e5-small");
    assert_eq!(dimension, 384);
}
/// Hand-written, single-page PDF 1.4 document used as the fixture for the PDF
/// extractor tests in this file.
///
/// Object layout: catalog (1) -> pages (2) -> one page (3) whose content
/// stream (4) draws the text "Hello PDF world" with a Helvetica Type1 font (5).
///
/// NOTE(review): the declared stream `/Length 44` and the xref byte offsets
/// look hand-authored and may not match the literal's actual byte layout (the
/// visible stream line is longer than 44 bytes) — confirm before relying on
/// strict parsing. The tests that consume this fixture tolerate an extraction
/// error.
const MINIMAL_PDF: &[u8] = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792]
/Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>
endobj
4 0 obj
<< /Length 44 >>
stream
BT /F1 12 Tf 100 700 Td (Hello PDF world) Tj ET
endstream
endobj
5 0 obj
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000266 00000 n
0000000360 00000 n
trailer
<< /Size 6 /Root 1 0 R >>
startxref
441
%%EOF";
/// Runs `PdfExtractor` over the `MINIMAL_PDF` fixture written to a temp file.
///
/// On success the output must report the extractor name `"pdf-lopdf"` and
/// every segment must carry `LocationQuality::PageOnly`.
///
/// An extraction error is deliberately tolerated (the test still passes), but
/// it is written to stderr so a vacuous pass is visible in the captured test
/// output instead of being discarded without a trace.
#[test]
fn pdf_extractor_extracts_text_from_valid_pdf() {
    use crate::pdf::PdfExtractor;

    let dir = tempfile::tempdir().unwrap();
    let pdf_path = dir.path().join("test.pdf");
    fs::write(&pdf_path, MINIMAL_PDF).unwrap();
    let canonical = fs::canonicalize(&pdf_path).unwrap();
    let vp = ValidatedPath {
        source_id: orbok_core::SourceId::from_string("s1".to_string()),
        canonical,
    };

    match PdfExtractor.extract(&vp) {
        Ok(output) => {
            assert_eq!(output.extractor_name, "pdf-lopdf");
            for seg in &output.segments {
                assert_eq!(
                    seg.location_quality,
                    LocationQuality::PageOnly,
                    "PDF segments must use PageOnly quality"
                );
            }
        }
        Err(e) => {
            // Best-effort: the fixture may not parse, so do not fail the test,
            // but make the skipped assertions visible.
            eprintln!(
                "pdf_extractor_extracts_text_from_valid_pdf: extraction failed, \
                 output assertions skipped: {e}"
            );
        }
    }
}
/// Extracting from a path that does not exist must yield an `Err`, not a panic
/// or an empty success.
#[test]
fn pdf_extractor_missing_file_returns_typed_error() {
    use crate::pdf::PdfExtractor;

    let missing = ValidatedPath {
        source_id: orbok_core::SourceId::from_string("s1".to_string()),
        canonical: PathBuf::from("/nonexistent/file.pdf"),
    };

    assert!(
        PdfExtractor.extract(&missing).is_err(),
        "missing file must return error"
    );
}
/// The default `ExtractorRegistry` must not reject a `.pdf` file with an error
/// whose text contains "unsupported". Any other outcome (success or a
/// different error) is accepted.
#[test]
fn pdf_extractor_registered_in_registry() {
    let registry = ExtractorRegistry::default();

    let tmp = tempfile::tempdir().unwrap();
    let fixture = tmp.path().join("test.pdf");
    fs::write(&fixture, MINIMAL_PDF).unwrap();
    let canonical = fs::canonicalize(&fixture).unwrap();
    let vp = ValidatedPath {
        source_id: orbok_core::SourceId::from_string("s1".to_string()),
        canonical,
    };

    if let Err(e) = registry.extract(&vp) {
        // Only an "unsupported" error counts as a failure.
        if e.to_string().contains("unsupported") {
            panic!("PDF must not be unsupported: {e}");
        }
    }
}
/// If the PDF fixture extracts successfully, none of its segments may claim
/// `LocationQuality::Exact`. An extraction error ends the test without
/// further checks.
#[test]
fn pdf_location_quality_is_page_only() {
    use crate::pdf::PdfExtractor;

    let tmp = tempfile::tempdir().unwrap();
    let fixture = tmp.path().join("test.pdf");
    fs::write(&fixture, MINIMAL_PDF).unwrap();
    let vp = ValidatedPath {
        source_id: orbok_core::SourceId::from_string("s1".to_string()),
        canonical: fs::canonicalize(&fixture).unwrap(),
    };

    let extracted = match PdfExtractor.extract(&vp) {
        Ok(extracted) => extracted,
        // Nothing to inspect when extraction fails.
        Err(_) => return,
    };

    for segment in &extracted.segments {
        assert_ne!(
            segment.location_quality,
            LocationQuality::Exact,
            "PDF segments must never claim Exact location quality"
        );
    }
}
/// `verify_model_sha256` must return `Ok(true)` when given the lowercase-hex
/// SHA-256 of the file's actual contents.
#[test]
fn model_integrity_check_passes_correct_hash() {
    use sha2::Digest;

    let payload = vec![0xABu8; 1024];
    // Digest computed from the in-memory bytes, independent of the file write.
    let expected_hash = format!("{:x}", sha2::Sha256::digest(&payload));

    let tmp = tempfile::tempdir().unwrap();
    let model_file = tmp.path().join("model.bin");
    fs::write(&model_file, &payload).unwrap();

    let model_path = model_file.to_string_lossy();
    let verified = verify_model_sha256(&model_path, &expected_hash).unwrap();
    assert!(verified, "correct hash should pass verification");
}
/// `verify_model_sha256` must return `Ok(false)` (not an error) when the file
/// exists but the expected hash does not match its contents.
#[test]
fn model_integrity_check_fails_wrong_hash() {
    let tmp = tempfile::tempdir().unwrap();
    let model_file = tmp.path().join("model.bin");
    fs::write(&model_file, [0u8; 512]).unwrap();

    // 64 hex characters, but not the digest of 512 zero bytes.
    let bogus_hash = "a".repeat(64);
    let model_path = model_file.to_string_lossy();

    let verified = verify_model_sha256(&model_path, &bogus_hash).unwrap();
    assert!(!verified, "wrong hash should fail verification");
}
/// Verifying a path that does not exist must produce an `Err` rather than a
/// boolean verdict.
#[test]
fn model_integrity_missing_file_returns_error() {
    let any_hash = "a".repeat(64);
    let outcome = verify_model_sha256("/nonexistent/model.onnx", &any_hash);
    assert!(outcome.is_err(), "missing file must return error");
}
/// A model file placed on disk by hand can be registered through
/// `ModelRepository::locate`; the resulting record must be `Available`, report
/// a non-zero size, and keep the supplied dimension.
#[test]
fn manual_model_placement_supported_via_locate() {
    let tmp = tempfile::tempdir().unwrap();
    let model_file = tmp.path().join("model.onnx");
    fs::write(&model_file, [0u8; 256]).unwrap();
    let model_path = model_file.to_string_lossy();

    let catalog = Catalog::open_in_memory().unwrap();
    let located = ModelRepository::new(&catalog)
        .locate(
            &model_path,
            ModelRole::Embedding,
            "multilingual-e5-small",
            "v1",
            Some(384),
        )
        .unwrap();

    assert_eq!(located.status, ModelStatus::Available);
    assert!(located.size_bytes.unwrap() > 0);
    assert_eq!(located.dimension, Some(384));
}
/// A model inserted with a license summary must return a record whose
/// `license_summary` still contains that text.
#[test]
fn model_registry_stores_license_summary() {
    let new_model = NewModel {
        role: ModelRole::Embedding,
        model_name: "multilingual-e5-small".into(),
        model_version: "v1".into(),
        local_path: None,
        license_summary: Some(
            "Apache 2.0 — https://huggingface.co/intfloat/multilingual-e5-small".into(),
        ),
        size_bytes: Some(118 * 1024 * 1024),
        backend: Some("onnx".into()),
        dimension: Some(384),
        status: ModelStatus::Missing,
    };

    let catalog = Catalog::open_in_memory().unwrap();
    let stored = ModelRepository::new(&catalog).insert(new_model).unwrap();

    let license = stored.license_summary.unwrap();
    assert!(license.contains("Apache 2.0"));
}