use std::fmt;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
// Document-handling configuration: a handful of scalar options plus one nested
// config per sub-feature (language, reranker, keywords, NER, summarization,
// OCR, output).
// Every field carries a serde default, so any subset of keys may be omitted;
// keys not listed here are rejected (`deny_unknown_fields`).
// NOTE: plain `//` comments are deliberate here; `///` text would be copied
// into the schema produced by the `JsonSchema` derive.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct DocumentsConfig {
// Defaults to `true` when omitted.
#[serde(default = "DocumentsConfig::default_enabled")]
pub enabled: bool,
// Defaults to an empty list. The schema constraint applies to each entry
// (minimum length 1), not to the list itself.
// NOTE(review): presumably an empty list means "no MIME restriction"; confirm
// against the code that consumes this field.
#[serde(default)]
#[schemars(inner(length(min = 1)))]
pub mime_allowlist: Vec<String>,
// Defaults to 1000; the schema declares a minimum of 64.
// NOTE(review): presumably a per-chunk character budget; confirm.
#[serde(default = "DocumentsConfig::default_max_characters")]
#[schemars(range(min = 64))]
pub max_characters: usize,
// Defaults to 200.
// NOTE(review): nothing in this file checks `overlap` against
// `max_characters`; confirm whether the consumer validates that relation.
#[serde(default = "DocumentsConfig::default_overlap")]
#[schemars(range(min = 0))]
pub overlap: usize,
// Defaults to "balanced". Free-form string; valid names are not defined here.
#[serde(default = "DocumentsConfig::default_embedding_preset")]
pub embedding_preset: String,
// Defaults to `true` when omitted.
#[serde(default = "DocumentsConfig::default_embed")]
pub embed: bool,
// Each nested section below falls back to its own type's `Default` when the
// key is omitted.
#[serde(default)]
pub language: DocLanguageConfig,
#[serde(default)]
pub reranker: RerankerConfig,
#[serde(default)]
pub keywords: KeywordsConfig,
#[serde(default)]
pub ner: NerConfig,
#[serde(default)]
pub summarization: SummarizationConfig,
#[serde(default)]
pub ocr: OcrConfig,
#[serde(default)]
pub output: OutputConfig,
}
/// Fallback values named by the `#[serde(default = "...")]` attributes on
/// [`DocumentsConfig`] and reused by its `Default` impl.
impl DocumentsConfig {
    const DEFAULT_ENABLED: bool = true;
    const DEFAULT_MAX_CHARACTERS: usize = 1000;
    const DEFAULT_OVERLAP: usize = 200;
    const DEFAULT_EMBEDDING_PRESET: &'static str = "balanced";
    const DEFAULT_EMBED: bool = true;

    /// Fallback for `enabled`.
    fn default_enabled() -> bool {
        Self::DEFAULT_ENABLED
    }

    /// Fallback for `max_characters`.
    fn default_max_characters() -> usize {
        Self::DEFAULT_MAX_CHARACTERS
    }

    /// Fallback for `overlap`.
    fn default_overlap() -> usize {
        Self::DEFAULT_OVERLAP
    }

    /// Fallback for `embedding_preset`, returned as an owned string.
    fn default_embedding_preset() -> String {
        String::from(Self::DEFAULT_EMBEDDING_PRESET)
    }

    /// Fallback for `embed`.
    fn default_embed() -> bool {
        Self::DEFAULT_EMBED
    }
}
impl Default for DocumentsConfig {
fn default() -> Self {
Self {
enabled: Self::default_enabled(),
mime_allowlist: Vec::new(),
max_characters: Self::default_max_characters(),
overlap: Self::default_overlap(),
embedding_preset: Self::default_embedding_preset(),
embed: Self::default_embed(),
language: DocLanguageConfig::default(),
reranker: RerankerConfig::default(),
keywords: KeywordsConfig::default(),
ner: NerConfig::default(),
summarization: SummarizationConfig::default(),
ocr: OcrConfig::default(),
output: OutputConfig::default(),
}
}
}
// Language-detection options. All keys are optional; unknown keys are rejected.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct DocLanguageConfig {
// Defaults to `true` when omitted.
#[serde(default = "DocLanguageConfig::default_auto_detect")]
pub auto_detect: bool,
// Defaults to 0.8; the schema declares the range 0.0..=1.0.
#[serde(default = "DocLanguageConfig::default_min_confidence")]
#[schemars(range(min = 0.0, max = 1.0))]
pub min_confidence: f64,
// Defaults to `false` when omitted.
#[serde(default)]
pub detect_multiple: bool,
// Defaults to an empty list.
// NOTE(review): the expected language-code format is not defined in this file.
#[serde(default)]
pub preferred_languages: Vec<String>,
}
/// Fallback values named by the `#[serde(default = "...")]` attributes on
/// [`DocLanguageConfig`].
impl DocLanguageConfig {
    const DEFAULT_AUTO_DETECT: bool = true;
    const DEFAULT_MIN_CONFIDENCE: f64 = 0.8;

    /// Fallback for `auto_detect`.
    fn default_auto_detect() -> bool {
        Self::DEFAULT_AUTO_DETECT
    }

    /// Fallback for `min_confidence`.
    fn default_min_confidence() -> f64 {
        Self::DEFAULT_MIN_CONFIDENCE
    }
}
/// Mirrors the serde defaults: detection on, confidence from the helper,
/// single-language detection, no preferred languages.
impl Default for DocLanguageConfig {
    fn default() -> Self {
        let auto_detect = Self::default_auto_detect();
        let min_confidence = Self::default_min_confidence();

        Self {
            auto_detect,
            min_confidence,
            detect_multiple: bool::default(),
            preferred_languages: Vec::default(),
        }
    }
}
// Reranker options. All keys are optional; unknown keys are rejected.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct RerankerConfig {
// Defaults to `false` when omitted.
#[serde(default)]
pub enabled: bool,
// Defaults to "bge-reranker-base". Free-form string; valid names are not
// defined in this file.
#[serde(default = "RerankerConfig::default_preset")]
pub preset: String,
// Defaults to 10. No schema bound is declared for this value.
#[serde(default = "RerankerConfig::default_top_k")]
pub top_k: usize,
}
/// Fallback values named by the `#[serde(default = "...")]` attributes on
/// [`RerankerConfig`].
impl RerankerConfig {
    const DEFAULT_PRESET: &'static str = "bge-reranker-base";
    const DEFAULT_TOP_K: usize = 10;

    /// Fallback for `preset`, returned as an owned string.
    fn default_preset() -> String {
        String::from(Self::DEFAULT_PRESET)
    }

    /// Fallback for `top_k`.
    fn default_top_k() -> usize {
        Self::DEFAULT_TOP_K
    }
}
/// Mirrors the serde defaults: disabled, with the helper-provided preset and
/// `top_k`.
impl Default for RerankerConfig {
    fn default() -> Self {
        let preset = Self::default_preset();
        let top_k = Self::default_top_k();

        Self {
            enabled: bool::default(),
            preset,
            top_k,
        }
    }
}
// Keyword-extraction options. All keys are optional; unknown keys are rejected.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct KeywordsConfig {
// Defaults to `false` when omitted.
#[serde(default)]
pub enabled: bool,
// Defaults to `KeywordAlgorithm`'s default variant.
#[serde(default)]
pub algorithm: KeywordAlgorithm,
// Defaults to 10.
#[serde(default = "KeywordsConfig::default_max_keywords")]
pub max_keywords: usize,
// Defaults to 0.0; the schema declares a minimum of 0.0.
#[serde(default)]
#[schemars(range(min = 0.0))]
pub min_score: f32,
// Defaults to `[1, 3]`; the schema requires exactly two entries.
// NOTE(review): presumably `[min_n, max_n]`; confirm against the consumer.
#[serde(default = "KeywordsConfig::default_ngram_range")]
#[schemars(length(min = 2, max = 2))]
pub ngram_range: Vec<usize>,
// Optional free-form JSON; `None` when omitted. Its shape is not defined here.
#[serde(default)]
pub yake_params: Option<serde_json::Value>,
// Optional free-form JSON; `None` when omitted. Its shape is not defined here.
#[serde(default)]
pub rake_params: Option<serde_json::Value>,
}
/// Fallback values named by the `#[serde(default = "...")]` attributes on
/// [`KeywordsConfig`].
impl KeywordsConfig {
    const DEFAULT_MAX_KEYWORDS: usize = 10;
    const DEFAULT_NGRAM_RANGE: [usize; 2] = [1, 3];

    /// Fallback for `max_keywords`.
    fn default_max_keywords() -> usize {
        Self::DEFAULT_MAX_KEYWORDS
    }

    /// Fallback for `ngram_range`, returned as a fresh two-element vector.
    fn default_ngram_range() -> Vec<usize> {
        Self::DEFAULT_NGRAM_RANGE.to_vec()
    }
}
impl Default for KeywordsConfig {
fn default() -> Self {
Self {
enabled: false,
algorithm: KeywordAlgorithm::default(),
max_keywords: Self::default_max_keywords(),
min_score: 0.0,
ngram_range: Self::default_ngram_range(),
yake_params: None,
rake_params: None,
}
}
}
// Keyword-extraction algorithm selector. Variant names are (de)serialized in
// lowercase ("yake", "rake"); `Yake` is the default.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Default)]
#[serde(rename_all = "lowercase")]
pub enum KeywordAlgorithm {
#[default]
Yake,
Rake,
}
// Named-entity-recognition options. All keys are optional; unknown keys are
// rejected. The derived `Default` yields: disabled, default backend, no model,
// and empty `categories` / `custom_labels`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct NerConfig {
// Defaults to `false` when omitted.
#[serde(default)]
pub enabled: bool,
// Defaults to `NerBackend`'s default variant.
#[serde(default)]
pub backend: NerBackend,
// `None` when omitted.
// NOTE(review): how a missing model is resolved is not visible in this file.
#[serde(default)]
pub model: Option<String>,
// Defaults to an empty list.
#[serde(default)]
pub categories: Vec<String>,
// Defaults to an empty list.
#[serde(default)]
pub custom_labels: Vec<String>,
}
// NER backend selector. Variant names are (de)serialized in lowercase
// ("onnx", "llm"); `Onnx` is the default.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Default)]
#[serde(rename_all = "lowercase")]
pub enum NerBackend {
#[default]
Onnx,
Llm,
}
// Summarization options. All keys are optional; unknown keys are rejected.
// The derived `Default` yields: disabled, default strategy, no token limit.
#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct SummarizationConfig {
// Defaults to `false` when omitted.
#[serde(default)]
pub enabled: bool,
// Defaults to `SummarizationStrategy`'s default variant.
#[serde(default)]
pub strategy: SummarizationStrategy,
// `None` when omitted, and left out of serialized output while `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub max_tokens: Option<u32>,
}
// Summarization strategy selector. Variant names are (de)serialized in
// lowercase ("extractive", "abstractive"); `Extractive` is the default.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Default)]
#[serde(rename_all = "lowercase")]
pub enum SummarizationStrategy {
#[default]
Extractive,
Abstractive,
}
// OCR options. All keys are optional; unknown keys are rejected.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct OcrConfig {
// Defaults to `OcrBackend`'s default variant.
#[serde(default)]
pub backend: OcrBackend,
// Defaults to `["eng"]`.
// NOTE(review): the code format each backend expects is not defined here.
#[serde(default = "OcrConfig::default_languages")]
pub languages: Vec<String>,
}
/// Fallback value named by the `#[serde(default = "...")]` attribute on
/// [`OcrConfig`].
impl OcrConfig {
    const DEFAULT_LANGUAGES: [&'static str; 1] = ["eng"];

    /// Fallback for `languages`: a one-element list of owned strings.
    fn default_languages() -> Vec<String> {
        Self::DEFAULT_LANGUAGES
            .iter()
            .map(|code| code.to_string())
            .collect()
    }
}
impl Default for OcrConfig {
fn default() -> Self {
Self {
backend: OcrBackend::default(),
languages: Self::default_languages(),
}
}
}
// OCR backend selector. Variant names are (de)serialized in lowercase
// ("tesseract", "paddle", "vlm"); `Tesseract` is the default.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Default)]
#[serde(rename_all = "lowercase")]
pub enum OcrBackend {
#[default]
Tesseract,
Paddle,
Vlm,
}
// Output options. Unknown keys are rejected; the derived `Default` selects the
// default `OutputFormat`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct OutputConfig {
// Defaults to `OutputFormat`'s default variant.
#[serde(default)]
pub format: OutputFormat,
}
// Output format selector. Variant names are (de)serialized in lowercase
// ("json", "toon"); `Json` is the default.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema, Default)]
#[serde(rename_all = "lowercase")]
pub enum OutputFormat {
#[default]
Json,
Toon,
}
// LLM connection settings. All keys are optional; unknown keys are rejected.
// The derived `Default` yields an empty `model`, an unset API key and `None`
// for every optional field.
// NOTE: plain `//` comments are deliberate here; `///` text would be copied
// into the schema produced by the `JsonSchema` derive.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, Default)]
#[serde(deny_unknown_fields)]
pub struct LlmConfig {
// Defaults to the empty string. `to_kreuzberg` (feature "intelligence")
// returns `None` while this is empty.
#[serde(default)]
pub model: String,
// Defaults to `ApiKey::Unset`.
#[serde(default)]
pub api_key: ApiKey,
// `None` when omitted.
#[serde(default)]
pub base_url: Option<String>,
// `None` when omitted. No schema range is declared for this value.
#[serde(default)]
pub temperature: Option<f64>,
// `None` when omitted.
#[serde(default)]
pub timeout_secs: Option<u64>,
// `None` when omitted.
#[serde(default)]
pub max_retries: Option<u32>,
// `None` when omitted.
#[serde(default)]
pub max_tokens: Option<u64>,
}
#[cfg(feature = "intelligence")]
impl LlmConfig {
    /// Converts this section into a `kreuzberg::LlmConfig`.
    ///
    /// Returns `None` when `model` is empty. Otherwise every field is copied
    /// across, and the API key is resolved (literal value or environment
    /// lookup) and passed on as a plain `String`, or `None` if it does not
    /// resolve.
    pub fn to_kreuzberg(&self) -> Option<kreuzberg::LlmConfig> {
        let has_model = !self.model.is_empty();
        has_model.then(|| {
            // The target struct takes the key in the clear, so the secret is
            // exposed exactly once, here.
            let api_key = self
                .api_key
                .resolve()
                .map(|secret| secret.expose().to_string());

            kreuzberg::LlmConfig {
                model: self.model.clone(),
                api_key,
                base_url: self.base_url.clone(),
                timeout_secs: self.timeout_secs,
                max_retries: self.max_retries,
                temperature: self.temperature,
                max_tokens: self.max_tokens,
            }
        })
    }
}
#[derive(Debug, Clone, Deserialize, JsonSchema, Default)]
#[serde(untagged)]
pub enum ApiKey {
Literal(String),
Env { env: String },
#[default]
Unset,
}
/// Structural equality: two keys are equal only when they are the same variant
/// and carry the same string (literal value or environment-variable name).
impl PartialEq for ApiKey {
    fn eq(&self, other: &Self) -> bool {
        match self {
            Self::Literal(lhs) => matches!(other, Self::Literal(rhs) if lhs == rhs),
            Self::Env { env: lhs } => matches!(other, Self::Env { env: rhs } if lhs == rhs),
            Self::Unset => matches!(other, Self::Unset),
        }
    }
}
/// Redacting serializer.
///
/// * `Literal` is written as the string `"<redacted>"`, never the key itself.
/// * `Env` is written as a one-field struct holding only the variable name.
/// * `Unset` is written as a "none" value.
///
/// The output is therefore not round-trippable for literal keys: reading
/// `"<redacted>"` back yields a `Literal` holding that placeholder text.
impl serde::Serialize for ApiKey {
    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        use serde::ser::SerializeStruct;

        let env_name = match self {
            Self::Literal(_) => return serializer.serialize_str("<redacted>"),
            Self::Unset => return serializer.serialize_none(),
            Self::Env { env } => env,
        };

        let mut env_ref = serializer.serialize_struct("EnvRef", 1)?;
        env_ref.serialize_field("env", env_name)?;
        env_ref.end()
    }
}
impl ApiKey {
    /// Produces the actual key material, if any.
    ///
    /// * `Literal`: the stored string.
    /// * `Env`: the current value of the named environment variable; a
    ///   variable that cannot be read yields `None`.
    /// * `Unset`: always `None`.
    ///
    /// An empty string from either source is treated as "no key" and also
    /// yields `None`.
    pub fn resolve(&self) -> Option<SecretString> {
        let raw = match self {
            Self::Literal(text) => text.clone(),
            Self::Env { env } => std::env::var(env).ok()?,
            Self::Unset => return None,
        };

        if raw.is_empty() {
            None
        } else {
            Some(SecretString(raw))
        }
    }
}
/// Owned string holding secret material.
///
/// The inner field is private; the `Debug` and `Display` impls in this file
/// print a fixed placeholder rather than the value.
///
/// NOTE(review): the derived, `transparent` `Serialize` writes the inner
/// string as-is, so serializing a `SecretString` emits the secret in the
/// clear, unlike `Debug`/`Display`. Confirm that this is intended.
#[derive(Clone, Serialize, Deserialize)]
#[serde(transparent)]
pub struct SecretString(String);
impl SecretString {
pub fn new(value: String) -> Self {
Self(value)
}
pub fn expose(&self) -> &str {
&self.0
}
}
/// `Debug` output is always the quoted placeholder `"<redacted>"`; the wrapped
/// value is never read.
impl fmt::Debug for SecretString {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "\"<redacted>\"")
    }
}
/// `Display` output is always the bare placeholder `<redacted>`; the wrapped
/// value is never read.
impl fmt::Display for SecretString {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "<redacted>")
    }
}
// Unit tests for `SecretString` redaction and for `ApiKey` resolution,
// deserialization and redacting serialization.
#[cfg(test)]
mod tests {
use super::*;
// Debug and Display print the placeholder; only `expose` returns the value.
#[test]
fn secret_string_redacts_in_debug() {
let s = SecretString::new("hunter2".to_string());
assert_eq!(format!("{s:?}"), "\"<redacted>\"");
assert_eq!(format!("{s}"), "<redacted>");
assert_eq!(s.expose(), "hunter2");
}
#[test]
fn api_key_unset_resolves_to_none() {
assert!(ApiKey::Unset.resolve().is_none());
}
#[test]
fn api_key_literal_resolves_to_value() {
let k = ApiKey::Literal("sk-test".to_string());
let resolved = k.resolve().expect("literal resolves");
assert_eq!(resolved.expose(), "sk-test");
}
// An empty literal is treated the same as no key at all.
#[test]
fn api_key_literal_empty_resolves_to_none() {
assert!(ApiKey::Literal(String::new()).resolve().is_none());
}
// Sets a variable, resolves through it, then removes it again. The variable
// name is used by no other test in this module.
#[test]
fn api_key_env_reads_environment() {
// SAFETY: NOTE(review): assumes nothing outside Rust's `std::env` API reads or
// writes the process environment while the tests run — TODO confirm.
unsafe {
std::env::set_var("BASEMIND_TEST_API_KEY_PRESENT", "value-123");
}
let k = ApiKey::Env {
env: "BASEMIND_TEST_API_KEY_PRESENT".to_string(),
};
let resolved = k.resolve().expect("env resolves");
assert_eq!(resolved.expose(), "value-123");
// SAFETY: same assumption as for the `set_var` call above.
// Cleanup only runs when the assertions above pass; a failing run leaves the
// variable set for the rest of the test process.
unsafe {
std::env::remove_var("BASEMIND_TEST_API_KEY_PRESENT");
}
}
#[test]
fn api_key_env_missing_resolves_to_none() {
// Remove the variable first so the test does not depend on the caller's
// environment.
// SAFETY: NOTE(review): assumes nothing outside Rust's `std::env` API reads or
// writes the process environment while the tests run — TODO confirm.
unsafe {
std::env::remove_var("BASEMIND_TEST_API_KEY_MISSING");
}
let k = ApiKey::Env {
env: "BASEMIND_TEST_API_KEY_MISSING".to_string(),
};
assert!(k.resolve().is_none());
}
// A bare string selects the `Literal` variant of the untagged enum.
#[test]
fn api_key_deserialises_literal_string() {
let k: ApiKey = serde_json::from_str("\"sk-test\"").expect("parse");
match k {
ApiKey::Literal(s) => assert_eq!(s, "sk-test"),
other => panic!("expected Literal, got {other:?}"),
}
}
// A map with an `env` key selects the `Env` variant of the untagged enum.
#[test]
fn api_key_deserialises_env_table() {
let k: ApiKey = serde_json::from_str(r#"{"env":"OPENAI_API_KEY"}"#).expect("parse");
match k {
ApiKey::Env { env } => assert_eq!(env, "OPENAI_API_KEY"),
other => panic!("expected Env, got {other:?}"),
}
}
// Serialization must replace a literal key with the placeholder.
#[test]
fn api_key_literal_never_serializes_cleartext() {
let key = ApiKey::Literal("sk-supersecret".to_string());
let json = serde_json::to_string(&key).expect("serialize");
assert!(
!json.contains("sk-supersecret"),
"raw secret leaked: {json}"
);
assert!(
json.contains("<redacted>"),
"redaction marker missing: {json}"
);
}
// The variable name is not secret, so it is serialized as-is.
#[test]
fn api_key_env_serializes_env_name_only() {
let key = ApiKey::Env {
env: "OPENAI_API_KEY".to_string(),
};
let json = serde_json::to_string(&key).expect("serialize");
assert!(json.contains("OPENAI_API_KEY"), "env name missing: {json}");
assert!(json.contains("\"env\""), "env field missing: {json}");
}
#[test]
fn api_key_unset_serializes_to_null() {
let key = ApiKey::Unset;
let json = serde_json::to_string(&key).expect("serialize");
assert_eq!(json, "null");
}
}