#![cfg(feature = "documents")]
use std::path::Path;
use anyhow::Context as _;
use kreuzberg::core::mime;
use kreuzberg::embeddings::{EMBEDDING_PRESETS, EmbeddingPreset};
use crate::config::{DocumentsConfig, LlmConfig};
use crate::extract::doc::{DocConfig, FileMapDoc, extract_doc};
use crate::hashing::{self, Hash};
use crate::lance::DocumentRow;
use crate::store::Store;
/// Result of extracting one document, held until it is flushed to the vector store.
///
/// Produced by `extract_and_persist_doc` and consumed by `flush_document_batches`.
#[derive(Debug, Clone)]
pub(crate) struct PendingDocBatch {
/// Relative path of the source document; passed to `replace_document` on flush.
pub rel_path: String,
/// Number of chunks the extraction produced for this document.
pub chunk_count: usize,
/// Embedding dimension reported by the extraction. `flush_document_batches`
/// only considers values greater than zero when picking the dimension to open
/// the store with.
pub embedding_dim: u16,
/// One row per chunk. Left empty when the extraction reported a zero embedding
/// dimension or no chunks; batches with no rows are skipped on flush.
pub rows: Vec<DocumentRow>,
}
/// Looks up the embedding dimension declared by the kreuzberg preset called `name`.
///
/// # Errors
///
/// Fails when no entry in `EMBEDDING_PRESETS` carries that name, or when the
/// preset's declared dimension does not fit in a `u16`.
pub(crate) fn preset_dim(name: &str) -> anyhow::Result<u16> {
    let matching = EMBEDDING_PRESETS
        .iter()
        .find(|candidate| candidate.name == name);
    let preset: &EmbeddingPreset =
        matching.with_context(|| format!("unknown kreuzberg embedding preset: {name}"))?;

    // Narrow explicitly so an oversized dimension surfaces as an error instead of truncating.
    let declared = preset.dimensions;
    u16::try_from(declared)
        .with_context(|| format!("preset {name} dimensions {declared} exceeds u16"))
}
pub(crate) fn doc_config_from(cfg: &DocumentsConfig, llm: &LlmConfig) -> DocConfig {
DocConfig {
max_characters: cfg.max_characters,
overlap: cfg.overlap,
embedding_preset: Some(cfg.embedding_preset.clone()),
embed: cfg.embed,
language: cfg.language.clone(),
keywords: cfg.keywords.clone(),
ner: cfg.ner.clone(),
summarization: cfg.summarization.clone(),
llm: llm.clone(),
}
}
/// Decides whether the file at `abs` should go through document extraction.
///
/// Returns the detected MIME type when extraction should proceed, and `None` when:
/// - document extraction is disabled in `cfg`,
/// - MIME detection fails (the error is discarded), or
/// - the allowlist is non-empty and no entry matches the detected type.
///
/// An empty allowlist admits every detected type.
pub(crate) fn should_extract_document(abs: &Path, cfg: &DocumentsConfig) -> Option<String> {
    if !cfg.enabled {
        return None;
    }

    // NOTE(review): the meaning of the `false` flag is defined by kreuzberg's
    // `detect_mime_type`; confirm against its documentation before changing it.
    let detected = mime::detect_mime_type(abs, false).ok()?;

    let permitted = cfg.mime_allowlist.is_empty()
        || cfg
            .mime_allowlist
            .iter()
            .any(|candidate| matches_mime(candidate, &detected));

    permitted.then_some(detected)
}
/// Reports whether the allowlist `entry` admits the detected `mime_type`.
///
/// Accepted entry forms:
/// - an exact media type, e.g. `application/pdf`;
/// - a type family written as `image/` or `image/*`, admitting every subtype;
/// - `*/*`, admitting everything.
///
/// Comparison ignores ASCII case because media type names are case-insensitive.
/// Any other entry shape (including the empty string and a bare `*`) matches
/// only by exact equality.
fn matches_mime(entry: &str, mime_type: &str) -> bool {
    if entry == "*/*" || entry.eq_ignore_ascii_case(mime_type) {
        return true;
    }

    // `image/*` and `image/` both denote the whole `image/` family.
    let family = entry.strip_suffix('*').unwrap_or(entry);
    if !family.ends_with('/') {
        return false;
    }

    // Compare on bytes: no temporary string is built, and slicing cannot panic
    // on a non-ASCII character boundary or a too-short `mime_type`.
    mime_type
        .as_bytes()
        .get(..family.len())
        .is_some_and(|head| head.eq_ignore_ascii_case(family.as_bytes()))
}
/// Extracts the document at `abs`, stores the extraction result, and prepares its
/// chunk rows for a later vector-store flush.
///
/// Steps, in order:
/// 1. Run `extract_doc` on `abs` with the given `mime_type` and a config derived
///    from `cfg` and `llm`.
/// 2. Hash `bytes` and write the extracted document to `store` under that hash.
/// 3. Build one [`DocumentRow`] per chunk, tagged with `scope` and `rel`.
///
/// When the extraction reports an embedding dimension of zero, or yields no
/// chunks, the returned batch carries no rows (the blob is still written).
///
/// The current implementation always returns `Ok(Some(..))` on success.
///
/// # Errors
///
/// Propagates extraction and blob-write failures, each annotated with `rel`.
// Takes eight parameters, all supplied independently by the caller, which is
// why the argument-count lint is silenced here.
#[allow(clippy::too_many_arguments)]
pub(crate) fn extract_and_persist_doc(
    store: &Store,
    rel: &str,
    abs: &Path,
    bytes: &[u8],
    mime_type: &str,
    cfg: &DocumentsConfig,
    llm: &LlmConfig,
    scope: &str,
) -> Result<Option<PendingDocBatch>, anyhow::Error> {
    let extraction_cfg = doc_config_from(cfg, llm);
    let extracted: FileMapDoc = extract_doc(abs, Some(mime_type), &extraction_cfg)
        .with_context(|| format!("extract document {rel}"))?;

    let content_hash: Hash = hashing::hash_bytes(bytes);
    store
        .write_doc(&content_hash, &extracted)
        .with_context(|| format!("write doc blob for {rel}"))?;

    let skip_rows = extracted.embedding_dim == 0 || extracted.chunks.is_empty();
    let rows: Vec<DocumentRow> = if skip_rows {
        Vec::new()
    } else {
        let mut built = Vec::with_capacity(extracted.chunks.len());
        for (position, chunk) in extracted.chunks.iter().enumerate() {
            built.push(DocumentRow {
                scope: scope.to_string(),
                path: rel.to_string(),
                // Saturates at `u32::MAX` if the position does not fit in a `u32`.
                chunk_idx: u32::try_from(position).unwrap_or(u32::MAX),
                mime_type: extracted.mime_type.clone(),
                text: chunk.text.clone(),
                byte_start: chunk.byte_start,
                byte_end: chunk.byte_end,
                embedding: chunk.embedding.clone(),
            });
        }
        built
    };

    // The chunk count is reported even when no rows are produced.
    Ok(Some(PendingDocBatch {
        rel_path: rel.to_string(),
        chunk_count: extracted.chunks.len(),
        embedding_dim: extracted.embedding_dim,
        rows,
    }))
}
pub(crate) fn flush_document_batches(
store: &mut Store,
scope: &str,
batches: Vec<PendingDocBatch>,
embedding_model: &str,
) -> usize {
let mut inserted = 0usize;
let Some(dim) = batches
.iter()
.find(|b| b.embedding_dim > 0)
.map(|b| b.embedding_dim)
else {
return 0;
};
match preset_dim(embedding_model) {
Ok(expected) if expected != dim => {
tracing::error!(
preset = %embedding_model,
expected,
actual = dim,
"preset/runtime dim mismatch — refusing to write document batch"
);
return 0;
}
Ok(_) => {}
Err(error) => {
tracing::error!(
?error,
preset = %embedding_model,
"unknown embedding preset — refusing to write document batch"
);
return 0;
}
}
let lance = match store.lance_or_open(dim, embedding_model) {
Ok(s) => s.clone(),
Err(error) => {
tracing::error!(?error, "open LanceStore for document batch failed");
return 0;
}
};
for batch in batches {
if batch.rows.is_empty() {
continue;
}
match lance.replace_document(scope, &batch.rel_path, batch.rows) {
Ok(()) => inserted += 1,
Err(error) => {
tracing::warn!(
rel = %batch.rel_path,
?error,
"lance replace_document failed; document search may be incomplete"
);
}
}
}
inserted
}
// Unit tests for the helpers in this module that can run without a store:
// preset lookup, MIME allowlist matching, and config mapping.
#[cfg(test)]
mod tests {
use super::*;
// The "balanced" preset is expected to declare 768 dimensions.
#[test]
fn preset_dim_for_balanced_returns_768() {
let dim = preset_dim("balanced").expect("balanced preset");
assert_eq!(dim, 768);
}
// An unknown preset name must produce an error that names the preset.
#[test]
fn preset_dim_for_unknown_errors() {
let err = preset_dim("does-not-exist").expect_err("unknown preset");
let msg = err.to_string();
assert!(
msg.contains("does-not-exist"),
"error should name the preset; got: {msg}"
);
}
// Allowlist entries match either exactly or as a `type/` prefix.
#[test]
fn matches_mime_exact_and_prefix() {
assert!(matches_mime("application/pdf", "application/pdf"));
assert!(matches_mime("image/", "image/png"));
assert!(matches_mime("image/", "image/jpeg"));
assert!(!matches_mime("image/", "video/mp4"));
assert!(!matches_mime("application/pdf", "application/json"));
}
// Language-detection settings must be carried over into the extractor config.
#[test]
fn doc_config_from_propagates_language_settings() {
use crate::config::DocLanguageConfig;
let cfg = DocumentsConfig {
language: DocLanguageConfig {
auto_detect: true,
min_confidence: 0.5,
detect_multiple: true,
..Default::default()
},
..Default::default()
};
let doc_cfg = doc_config_from(&cfg, &LlmConfig::default());
assert!(doc_cfg.language.auto_detect);
assert_eq!(doc_cfg.language.min_confidence, 0.5);
assert!(doc_cfg.language.detect_multiple);
}
// Summarization settings and the LLM config must both reach the extractor config.
#[test]
fn doc_config_from_propagates_summarization_and_llm() {
use crate::config::{SummarizationConfig, SummarizationStrategy};
let cfg = DocumentsConfig {
summarization: SummarizationConfig {
enabled: true,
strategy: SummarizationStrategy::Abstractive,
max_tokens: Some(150),
},
..Default::default()
};
let llm = LlmConfig {
model: "openai/gpt-4o".to_string(),
..Default::default()
};
let doc_cfg = doc_config_from(&cfg, &llm);
assert!(doc_cfg.summarization.enabled);
assert_eq!(doc_cfg.summarization.max_tokens, Some(150));
assert_eq!(doc_cfg.llm.model, "openai/gpt-4o");
}
// With extraction disabled, the function returns `None` before MIME detection
// is attempted, so the path does not need to exist.
#[test]
fn should_extract_document_respects_disabled_flag() {
let cfg = DocumentsConfig {
enabled: false,
..Default::default()
};
let out = should_extract_document(Path::new("dummy.pdf"), &cfg);
assert!(out.is_none());
}
}