use crate::{ExtractionConfig, ExtractionResult as KreuzbergResult};
/// Overlays the top-level keys of `override_json` onto `base`.
///
/// `base` is first serialized to a JSON value. When both that value and `override_json`
/// are JSON objects, every key of `override_json` is inserted into the serialized base,
/// replacing any existing entry wholesale (values are not merged recursively). An
/// `override_json` that is not a JSON object leaves the serialized base untouched. The
/// resulting JSON is then deserialized back into an [`ExtractionConfig`].
///
/// # Errors
///
/// Returns a descriptive message when `base` cannot be serialized to JSON, or when the
/// merged JSON cannot be deserialized into an [`ExtractionConfig`] (for example because
/// an override value has the wrong type for its field).
fn merge_configs(base: &ExtractionConfig, override_json: serde_json::Value) -> Result<ExtractionConfig, String> {
    let mut merged = match serde_json::to_value(base) {
        Ok(value) => value,
        Err(e) => return Err(format!("Failed to serialize base config to JSON: {}", e)),
    };

    // Both sides must be JSON objects for an overlay to make sense; otherwise the base
    // passes through unchanged.
    if let (serde_json::Value::Object(overrides), Some(target)) = (override_json, merged.as_object_mut()) {
        target.extend(overrides);
    }

    serde_json::from_value::<ExtractionConfig>(merged)
        .map_err(|e| format!("Failed to deserialize merged config: {}", e))
}
/// Produces the extraction configuration to use, given optional JSON overrides.
///
/// With `config_json` present, the overrides are merged on top of `default_config` via
/// `merge_configs`. With `None`, a clone of `default_config` is returned as-is.
///
/// # Errors
///
/// Propagates the error message from `merge_configs` when the merge fails.
pub(super) fn build_config(
    default_config: &ExtractionConfig,
    config_json: Option<serde_json::Value>,
) -> Result<ExtractionConfig, String> {
    match config_json {
        Some(overrides) => merge_configs(default_config, overrides),
        None => Ok(default_config.clone()),
    }
}
/// Renders an extraction result as pretty-printed JSON.
///
/// Serialization failures are not reported to the caller: an empty string is returned
/// instead.
pub(super) fn format_extraction_result(result: &KreuzbergResult) -> String {
    match serde_json::to_string_pretty(result) {
        Ok(json) => json,
        // Best-effort formatting: fall back to an empty string rather than surfacing the error.
        Err(_) => String::new(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::config::formats::OutputFormat;
    use std::borrow::Cow;

    /// Builds a result carrying only `content` and `mime_type`: default metadata, no tables,
    /// no processing warnings, and every optional field set to `None`.
    fn bare_result(content: &str, mime_type: &'static str) -> KreuzbergResult {
        KreuzbergResult {
            content: content.to_string(),
            mime_type: Cow::Borrowed(mime_type),
            metadata: crate::Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
            pages: None,
            elements: None,
            ocr_elements: None,
            djot_content: None,
            document: None,
            #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
            extracted_keywords: None,
            quality_score: None,
            processing_warnings: Vec::new(),
            annotations: None,
        }
    }

    /// Formats `result` and parses the output back into a JSON value, panicking if the
    /// formatted text is not valid JSON.
    fn format_and_parse(result: &KreuzbergResult) -> serde_json::Value {
        let formatted = format_extraction_result(result);
        serde_json::from_str(&formatted).expect("Should be valid JSON")
    }

    // ---- build_config -------------------------------------------------------------------

    #[test]
    fn test_build_config_with_no_config() {
        let base = ExtractionConfig::default();

        let built = build_config(&base, None).unwrap();

        assert_eq!(built.use_cache, base.use_cache);
    }

    #[test]
    fn test_build_config_with_config_json() {
        let base = ExtractionConfig::default();
        let overrides = serde_json::json!({ "use_cache": false });

        let built = build_config(&base, Some(overrides)).unwrap();

        assert!(!built.use_cache);
    }

    #[test]
    fn test_build_config_with_invalid_config_json() {
        let base = ExtractionConfig::default();
        // A string where a boolean is expected must be rejected at deserialization time.
        let overrides = serde_json::json!({ "use_cache": "not_a_boolean" });

        let outcome = build_config(&base, Some(overrides));

        assert!(outcome.is_err());
        let message = outcome.unwrap_err();
        assert!(message.contains("Failed to deserialize"));
    }

    #[test]
    fn test_build_config_preserves_default_config_settings() {
        let base = ExtractionConfig {
            use_cache: false,
            ..Default::default()
        };

        let built = build_config(&base, None).unwrap();

        assert!(!built.use_cache);
    }

    #[test]
    fn test_build_config_overrides_default_settings() {
        let base = ExtractionConfig {
            use_cache: true,
            ..Default::default()
        };
        let overrides = serde_json::json!({ "use_cache": false });

        let built = build_config(&base, Some(overrides)).unwrap();

        assert!(!built.use_cache);
    }

    #[test]
    fn test_build_config_merges_partial_config() {
        let base = ExtractionConfig {
            use_cache: false,
            enable_quality_processing: true,
            force_ocr: false,
            ..Default::default()
        };
        let overrides = serde_json::json!({ "force_ocr": true });

        let built = build_config(&base, Some(overrides)).unwrap();

        assert!(!built.use_cache, "use_cache should be preserved from default config");
        assert!(
            built.enable_quality_processing,
            "enable_quality_processing should be preserved"
        );
        assert!(built.force_ocr, "force_ocr should be overridden to true");
    }

    #[test]
    fn test_build_config_merges_nested_config() {
        let base = ExtractionConfig {
            use_cache: true,
            ..Default::default()
        };
        let overrides = serde_json::json!({ "output_format": "markdown" });

        let built = build_config(&base, Some(overrides)).unwrap();

        assert!(built.use_cache, "use_cache should be preserved from default config");
        assert_eq!(
            built.output_format,
            OutputFormat::Markdown,
            "output_format should be overridden to markdown"
        );
    }

    #[test]
    fn test_build_config_merges_with_custom_defaults() {
        let base = ExtractionConfig {
            use_cache: false,
            enable_quality_processing: true,
            force_ocr: false,
            ..Default::default()
        };
        let overrides = serde_json::json!({ "force_ocr": true });

        let built = build_config(&base, Some(overrides)).unwrap();

        assert!(built.force_ocr, "force_ocr should be overridden to true");
        assert!(
            !built.use_cache,
            "use_cache should be preserved from default config (false)"
        );
        assert!(
            built.enable_quality_processing,
            "enable_quality_processing should be preserved (true)"
        );
    }

    #[test]
    fn test_build_config_merges_multiple_fields() {
        let base = ExtractionConfig {
            use_cache: true,
            enable_quality_processing: false,
            force_ocr: true,
            ..Default::default()
        };
        let overrides = serde_json::json!({
            "use_cache": false,
            "output_format": "markdown"
        });

        let built = build_config(&base, Some(overrides)).unwrap();

        assert!(!built.use_cache, "use_cache should be overridden to false");
        assert_eq!(
            built.output_format,
            OutputFormat::Markdown,
            "output_format should be overridden to markdown"
        );
        assert!(
            built.force_ocr,
            "force_ocr should be preserved from default config (true)"
        );
        assert!(
            !built.enable_quality_processing,
            "enable_quality_processing should be preserved (false)"
        );
    }

    #[test]
    fn test_build_config_boolean_override_to_default_value() {
        let base = ExtractionConfig {
            use_cache: false,
            ..Default::default()
        };
        let overrides = serde_json::json!({ "use_cache": true });

        let built = build_config(&base, Some(overrides)).unwrap();

        assert!(
            built.use_cache,
            "Should use explicit override even if it matches default"
        );
    }

    // ---- format_extraction_result -------------------------------------------------------

    #[test]
    fn test_format_extraction_result_is_valid_json() {
        let result = bare_result("Sample extracted text", "text/plain");

        let json = format_and_parse(&result);

        assert_eq!(json["content"], "Sample extracted text");
        assert_eq!(json["mime_type"], "text/plain");
        assert!(json["metadata"].is_object());
    }

    #[test]
    fn test_format_extraction_result_includes_tables() {
        let table = crate::Table {
            cells: vec![
                vec!["Col1".to_string(), "Col2".to_string()],
                vec!["A".to_string(), "B".to_string()],
            ],
            page_number: 1,
            markdown: "| Col1 | Col2 |\n|------|------|\n| A | B |".to_string(),
            bounding_box: None,
        };
        let result = KreuzbergResult {
            tables: vec![table],
            ..bare_result("Document with tables", "application/pdf")
        };

        let json = format_and_parse(&result);

        assert_eq!(json["tables"].as_array().unwrap().len(), 1);
        assert_eq!(json["tables"][0]["page_number"], 1);
    }

    #[test]
    fn test_format_extraction_result_includes_chunks_when_present() {
        let chunk = crate::Chunk {
            content: "Chunk 1".to_string(),
            embedding: None,
            metadata: crate::ChunkMetadata {
                byte_start: 0,
                byte_end: 7,
                token_count: None,
                chunk_index: 0,
                total_chunks: 1,
                first_page: None,
                last_page: None,
                heading_context: None,
            },
        };
        let result = KreuzbergResult {
            chunks: Some(vec![chunk]),
            ..bare_result("Chunked text", "text/plain")
        };

        let json = format_and_parse(&result);

        assert_eq!(json["chunks"].as_array().unwrap().len(), 1);
        assert_eq!(json["chunks"][0]["content"], "Chunk 1");
    }

    #[test]
    fn test_format_extraction_result_omits_none_fields() {
        let result = bare_result("Simple text", "text/plain");

        let json = format_and_parse(&result);

        // Optional fields left as `None` must not appear as keys in the output.
        for absent_key in ["chunks", "images", "detected_languages"] {
            assert!(json.get(absent_key).is_none());
        }
    }
}