use std::path::Path;
use kreuzberg::LanguageDetectionConfig;
use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::config::processing::{ChunkingConfig, EmbeddingConfig};
use kreuzberg::core::extractor::extract_file_sync;
use serde::{Deserialize, Serialize};
use super::{ExtractError, SCHEMA_VER};
use crate::config::{
DocLanguageConfig, KeywordAlgorithm, KeywordsConfig, LlmConfig, NerBackend, NerConfig,
SummarizationConfig, SummarizationStrategy,
};
/// Serializable record describing one extracted document.
///
/// Instances are assembled by `extract_doc` from a kreuzberg extraction result.
/// Fields carrying `skip_serializing_if` are left out of the serialized form
/// when empty/absent and fall back to their default when missing on input.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FileMapDoc {
/// Schema version of this record; `extract_doc` stores `SCHEMA_VER` here.
pub schema_ver: u16,
/// MIME type reported by the extraction result.
pub mime_type: String,
/// Extracted text content of the document.
pub content: String,
/// Document metadata flattened into string key/value pairs
/// (see `metadata_pairs` for how values are rendered).
pub metadata: Vec<(String, String)>,
/// Languages reported by the extraction; empty when none were reported.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub detected_languages: Vec<String>,
/// Chunks of the document, in the order the extraction returned them.
pub chunks: Vec<DocChunk>,
/// Name recorded for the embedding model: the configured preset (or
/// `"default"`) when at least one chunk has an embedding, otherwise empty.
pub embedding_model: String,
/// Length of the first non-empty chunk embedding, capped at `u16::MAX`;
/// `0` when no chunk carries an embedding.
pub embedding_dim: u16,
/// Extracted keywords; empty when none were produced.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub keywords: Vec<DocKeyword>,
/// Recognized entities; empty when none were produced.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub entities: Vec<DocEntity>,
/// Document summary, if the extraction produced one.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub summary: Option<DocSummary>,
}
/// One keyword extracted from a document.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DocKeyword {
/// Keyword text as returned by the extraction.
pub text: String,
/// Score attached to the keyword by the extraction
/// (scale depends on the algorithm — NOTE(review): confirm against kreuzberg).
pub score: f32,
/// Algorithm label; `extract_doc` fills this via `keyword_algorithm_str`
/// (`"yake"` or `"rake"`).
pub algorithm: String,
}
/// One named entity recognized in a document.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DocEntity {
/// Category label; `extract_doc` fills this via `entity_category_str`.
pub category: String,
/// Entity text as returned by the extraction.
pub text: String,
/// Start offset copied from the extraction result
/// (unit — bytes vs. characters — is not visible here; TODO confirm).
pub start: u32,
/// End offset copied from the extraction result (same unit as `start`).
pub end: u32,
/// Optional confidence value; omitted from the serialized form when absent.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub confidence: Option<f32>,
}
/// Summary produced for a document.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DocSummary {
/// Summary text.
pub text: String,
/// Strategy that produced the summary, stored as the string form of the
/// strategy value returned by the extraction.
pub strategy: String,
/// Optional token count reported by the extraction; omitted from the
/// serialized form when absent.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub token_count: Option<u32>,
}
/// One chunk of a document together with its optional embedding vector.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct DocChunk {
/// Start byte offset taken from the chunk metadata; `extract_doc` caps it
/// at `u32::MAX` when the source value does not fit.
pub byte_start: u32,
/// End byte offset taken from the chunk metadata; capped the same way as
/// `byte_start`.
pub byte_end: u32,
/// Chunk text.
pub text: String,
/// Embedding vector for the chunk; empty (and omitted from the serialized
/// form) when the chunk has no embedding.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub embedding: Vec<f32>,
}
/// Settings controlling how `extract_doc` runs the kreuzberg extraction.
#[derive(Debug, Clone)]
pub struct DocConfig {
/// Forwarded to the kreuzberg chunking config as `max_characters`.
pub max_characters: usize,
/// Forwarded to the kreuzberg chunking config as `overlap`.
pub overlap: usize,
/// Forwarded to the chunking config as `preset`; also recorded as the
/// document's `embedding_model` when embeddings were produced.
pub embedding_preset: Option<String>,
/// When `true`, a default kreuzberg embedding config is attached to the
/// chunking config; when `false`, none is attached.
pub embed: bool,
/// Language-detection settings; detection is requested only when
/// `auto_detect` is set.
pub language: DocLanguageConfig,
/// Keyword-extraction settings; used only when `enabled` is set.
pub keywords: KeywordsConfig,
/// Named-entity-recognition settings; used only when `enabled` is set.
pub ner: NerConfig,
/// Summarization settings; used only when `enabled` is set.
pub summarization: SummarizationConfig,
/// LLM settings consulted for abstractive summarization and the LLM NER
/// backend.
pub llm: LlmConfig,
}
impl Default for DocConfig {
fn default() -> Self {
Self {
max_characters: 1000,
overlap: 200,
embedding_preset: Some("balanced".to_string()),
embed: true,
language: DocLanguageConfig::default(),
keywords: KeywordsConfig::default(),
ner: NerConfig::default(),
summarization: SummarizationConfig::default(),
llm: LlmConfig::default(),
}
}
}
impl DocConfig {
    /// Builds the kreuzberg [`ExtractionConfig`] corresponding to this configuration.
    ///
    /// Chunking is always requested with `max_characters`, `overlap` and the
    /// embedding preset; an [`EmbeddingConfig`] is attached only when `embed`
    /// is set. Language detection, keywords, NER and summarization are each
    /// included only when switched on in their own sub-config. All remaining
    /// extraction settings keep their kreuzberg defaults.
    fn to_kreuzberg(&self) -> ExtractionConfig {
        let chunking = ChunkingConfig {
            max_characters: self.max_characters,
            overlap: self.overlap,
            embedding: self.embed.then(EmbeddingConfig::default),
            preset: self.embedding_preset.clone(),
            ..Default::default()
        };
        let language_detection = self.language.auto_detect.then(|| LanguageDetectionConfig {
            enabled: true,
            min_confidence: self.language.min_confidence,
            detect_multiple: self.language.detect_multiple,
        });
        ExtractionConfig {
            chunking: Some(chunking),
            language_detection,
            keywords: self.kreuzberg_keywords(),
            ner: self.kreuzberg_ner(),
            summarization: self.kreuzberg_summarization(),
            ..Default::default()
        }
    }

    /// Translates the summarization settings, or returns `None` when
    /// summarization is disabled.
    ///
    /// The abstractive strategy needs an LLM config; when `self.llm` does not
    /// yield one, a warning is logged and the extractive strategy is used
    /// instead.
    fn kreuzberg_summarization(&self) -> Option<kreuzberg::SummarizationConfig> {
        if !self.summarization.enabled {
            return None;
        }
        let (strategy, llm) = match self.summarization.strategy {
            SummarizationStrategy::Extractive => (kreuzberg::SummaryStrategy::Extractive, None),
            SummarizationStrategy::Abstractive => match self.llm.to_kreuzberg() {
                Some(llm) => (kreuzberg::SummaryStrategy::Abstractive, Some(llm)),
                None => {
                    tracing::warn!(
                        "summarization.strategy = abstractive but llm.model unset; falling back to extractive"
                    );
                    (kreuzberg::SummaryStrategy::Extractive, None)
                }
            },
        };
        Some(kreuzberg::SummarizationConfig {
            strategy,
            max_tokens: self.summarization.max_tokens,
            llm,
        })
    }

    /// Translates the keyword-extraction settings, or returns `None` when
    /// keyword extraction is disabled.
    ///
    /// `ngram_range` is used only when it holds exactly two values; otherwise
    /// `(1, 3)` is used. A non-empty list of the wrong length logs a warning
    /// so the misconfiguration is visible; an empty list stays silent.
    /// `yake_params` / `rake_params` are deserialized from their JSON form;
    /// values that fail to deserialize are dropped with a warning. A further
    /// warning is logged when params are supplied for an algorithm other than
    /// the selected one.
    fn kreuzberg_keywords(&self) -> Option<kreuzberg::KeywordConfig> {
        let cfg = &self.keywords;
        if !cfg.enabled {
            return None;
        }
        let ngram_range = match cfg.ngram_range.len() {
            2 => (cfg.ngram_range[0], cfg.ngram_range[1]),
            len => {
                // Malformed (non-empty, wrong length) ranges used to be
                // replaced without any trace; surface them like the other
                // invalid keyword settings below.
                if len != 0 {
                    tracing::warn!(
                        entries = len,
                        "keywords.ngram_range must contain exactly 2 values; using default (1, 3)"
                    );
                }
                (1, 3)
            }
        };
        let algorithm = match cfg.algorithm {
            KeywordAlgorithm::Yake => kreuzberg::KeywordAlgorithm::Yake,
            KeywordAlgorithm::Rake => kreuzberg::KeywordAlgorithm::Rake,
        };
        let mut yake_params = None;
        if let Some(raw) = cfg.yake_params.as_ref() {
            match serde_json::from_value::<kreuzberg::keywords::YakeParams>(raw.clone()) {
                Ok(parsed) => yake_params = Some(parsed),
                Err(e) => {
                    tracing::warn!(error = %e, "invalid yake_params; using kreuzberg defaults")
                }
            }
        }
        let mut rake_params = None;
        if let Some(raw) = cfg.rake_params.as_ref() {
            match serde_json::from_value::<kreuzberg::keywords::RakeParams>(raw.clone()) {
                Ok(parsed) => rake_params = Some(parsed),
                Err(e) => {
                    tracing::warn!(error = %e, "invalid rake_params; using kreuzberg defaults")
                }
            }
        }
        if cfg.yake_params.is_some() && cfg.algorithm != KeywordAlgorithm::Yake {
            tracing::warn!(
                algorithm = ?cfg.algorithm,
                "yake_params set but algorithm is not Yake; params ignored"
            );
        }
        if cfg.rake_params.is_some() && cfg.algorithm != KeywordAlgorithm::Rake {
            tracing::warn!(
                algorithm = ?cfg.algorithm,
                "rake_params set but algorithm is not Rake; params ignored"
            );
        }
        Some(kreuzberg::KeywordConfig {
            algorithm,
            max_keywords: cfg.max_keywords,
            min_score: cfg.min_score,
            ngram_range,
            language: None,
            yake_params,
            rake_params,
        })
    }

    /// Translates the NER settings, or returns `None` when NER is disabled.
    ///
    /// An LLM config is attached only for the LLM backend. When that backend
    /// is selected but `self.llm` yields no config, a warning is logged and
    /// the backend is still passed through unchanged.
    fn kreuzberg_ner(&self) -> Option<kreuzberg::core::config::ner::NerConfig> {
        if !self.ner.enabled {
            return None;
        }
        let (backend, llm) = match self.ner.backend {
            NerBackend::Onnx => (kreuzberg::core::config::ner::NerBackendKind::Onnx, None),
            NerBackend::Llm => {
                let llm = self.llm.to_kreuzberg();
                if llm.is_none() {
                    tracing::warn!(
                        "ner.backend = llm but llm.model is unset; NER will fall back to ONNX inside kreuzberg"
                    );
                }
                (kreuzberg::core::config::ner::NerBackendKind::Llm, llm)
            }
        };
        let categories = self
            .ner
            .categories
            .iter()
            .map(|name| kreuzberg::types::entity::EntityCategory::from(name.clone()))
            .collect();
        Some(kreuzberg::core::config::ner::NerConfig {
            backend,
            categories,
            model: self.ner.model.clone(),
            llm,
            custom_labels: self.ner.custom_labels.clone(),
        })
    }
}
/// Extracts the document at `path` and converts the result into a [`FileMapDoc`].
///
/// `mime_type` is passed straight through to the kreuzberg extractor, and
/// `config` is translated into the kreuzberg extraction config first.
///
/// Chunk byte offsets and the embedding dimension are capped at the maximum
/// of their target integer type when the source value does not fit. The
/// embedding dimension is the length of the first non-empty chunk embedding
/// (`0` if there is none). In that case `embedding_model` is the configured
/// preset, or `"default"` when no preset is set; otherwise it is empty.
///
/// # Errors
///
/// Returns [`ExtractError::Document`] carrying the extractor's error message
/// when the extraction fails.
pub fn extract_doc(
    path: &Path,
    mime_type: Option<&str>,
    config: &DocConfig,
) -> Result<FileMapDoc, ExtractError> {
    let result = match extract_file_sync(path, mime_type, &config.to_kreuzberg()) {
        Ok(extracted) => extracted,
        Err(err) => return Err(ExtractError::Document(err.to_string())),
    };

    // The closure records the first non-zero embedding length it encounters.
    let mut embedding_dim: u16 = 0;
    let chunks: Vec<DocChunk> = result
        .chunks
        .into_iter()
        .flatten()
        .map(|chunk| {
            let vector = chunk.embedding.unwrap_or_default();
            if embedding_dim == 0 && !vector.is_empty() {
                embedding_dim = u16::try_from(vector.len()).unwrap_or(u16::MAX);
            }
            DocChunk {
                byte_start: u32::try_from(chunk.metadata.byte_start).unwrap_or(u32::MAX),
                byte_end: u32::try_from(chunk.metadata.byte_end).unwrap_or(u32::MAX),
                text: chunk.content,
                embedding: vector,
            }
        })
        .collect();

    let embedding_model = match (embedding_dim, config.embedding_preset.as_deref()) {
        (0, _) => String::new(),
        (_, Some(preset)) => preset.to_string(),
        (_, None) => "default".to_string(),
    };

    let keywords: Vec<DocKeyword> = result
        .extracted_keywords
        .into_iter()
        .flatten()
        .map(|kw| DocKeyword {
            text: kw.text,
            score: kw.score,
            algorithm: keyword_algorithm_str(&kw.algorithm).to_string(),
        })
        .collect();

    let entities: Vec<DocEntity> = result
        .entities
        .into_iter()
        .flatten()
        .map(|entity| DocEntity {
            category: entity_category_str(&entity.category),
            text: entity.text,
            start: entity.start,
            end: entity.end,
            confidence: entity.confidence,
        })
        .collect();

    let summary = match result.summary {
        Some(s) => Some(DocSummary {
            text: s.text,
            strategy: s.strategy.to_string(),
            token_count: s.token_count,
        }),
        None => None,
    };

    Ok(FileMapDoc {
        schema_ver: SCHEMA_VER,
        mime_type: result.mime_type.into_owned(),
        content: result.content,
        metadata: metadata_pairs(&result.metadata),
        detected_languages: result.detected_languages.unwrap_or_default(),
        chunks,
        embedding_model,
        embedding_dim,
        keywords,
        entities,
        summary,
    })
}
/// Returns the lowercase label stored in `DocKeyword::algorithm` for `alg`.
fn keyword_algorithm_str(alg: &kreuzberg::KeywordAlgorithm) -> &'static str {
    let label = match alg {
        kreuzberg::KeywordAlgorithm::Rake => "rake",
        kreuzberg::KeywordAlgorithm::Yake => "yake",
    };
    label
}
/// Returns the label stored in `DocEntity::category` for `category`.
///
/// Built-in categories map to a fixed lowercase name; a custom category
/// yields a copy of its own string.
fn entity_category_str(category: &kreuzberg::types::entity::EntityCategory) -> String {
    use kreuzberg::types::entity::EntityCategory::*;
    let builtin: &str = match category {
        // Custom categories carry their own label, so hand it back directly.
        Custom(label) => return label.clone(),
        Person => "person",
        Organization => "organization",
        Location => "location",
        Date => "date",
        Time => "time",
        Money => "money",
        Percent => "percent",
        Email => "email",
        Phone => "phone",
        Url => "url",
    };
    builtin.to_string()
}
/// Flattens extraction metadata into string key/value pairs.
///
/// The metadata is serialized to JSON first. If that fails, or the result is
/// not a JSON object, an empty list is returned. Within the object, `null`
/// entries are skipped, string values are kept verbatim, and every other
/// value is rendered as its JSON text.
fn metadata_pairs(metadata: &kreuzberg::types::Metadata) -> Vec<(String, String)> {
    let fields = match serde_json::to_value(metadata) {
        Ok(serde_json::Value::Object(map)) => map,
        _ => return Vec::new(),
    };
    let mut pairs = Vec::new();
    for (key, value) in fields {
        let rendered = match value {
            serde_json::Value::Null => continue,
            serde_json::Value::String(text) => text,
            other => other.to_string(),
        };
        pairs.push((key, rendered));
    }
    pairs
}