use crate::core::config::llm::StructuredExtractionConfig;
use serde_json::Value;
/// Prepares a JSON schema for the target provider.
///
/// Models routed through the `openai/` prefix accept `additionalProperties`
/// in structured-output schemas; other providers reject the keyword, so it
/// is removed recursively for them.
fn sanitize_schema_for_provider(schema: &Value, model: &str) -> Value {
    if model.starts_with("openai/") {
        schema.clone()
    } else {
        strip_additional_properties(schema)
    }
}
/// Recursively removes every `additionalProperties` key from a JSON schema.
///
/// Objects are rebuilt without the keyword, arrays are mapped element-wise,
/// and scalar values are returned unchanged.
fn strip_additional_properties(schema: &Value) -> Value {
    match schema {
        Value::Object(map) => {
            let cleaned: serde_json::Map<String, Value> = map
                .iter()
                .filter(|(key, _)| key.as_str() != "additionalProperties")
                .map(|(key, value)| (key.clone(), strip_additional_properties(value)))
                .collect();
            Value::Object(cleaned)
        }
        Value::Array(items) => {
            Value::Array(items.iter().map(strip_additional_properties).collect())
        }
        scalar => scalar.clone(),
    }
}
/// Strips a Markdown code fence (```` ```json … ``` ```` or ```` ``` … ``` ````)
/// from an LLM response.
///
/// Unlike a strict prefix+suffix match, each fence half is removed
/// independently, so a response whose closing fence was cut off (e.g.
/// truncated by `max_tokens`) still has its opening fence removed. Valid
/// JSON never ends in backticks, so stripping a trailing ``` is safe.
fn strip_code_fences(text: &str) -> &str {
    let trimmed = text.trim();
    let body = trimmed
        .strip_prefix("```json")
        .or_else(|| trimmed.strip_prefix("```"))
        .unwrap_or(trimmed);
    body.strip_suffix("```").unwrap_or(body).trim()
}

/// Truncates `text` to at most `max_bytes` bytes without splitting a UTF-8
/// character. Stable replacement for the unstable `str::floor_char_boundary`.
fn truncate_on_char_boundary(text: &str, max_bytes: usize) -> &str {
    let mut end = text.len().min(max_bytes);
    // Walk back to the nearest char boundary; `is_char_boundary(0)` is
    // always true, so this terminates.
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    &text[..end]
}

/// Extracts structured data from `content` by prompting an LLM with a JSON
/// schema and requesting schema-constrained output.
///
/// Renders the prompt template (custom `config.prompt` or the built-in
/// default), sends a chat completion with `response_format = JsonSchema`,
/// strips any Markdown code fences from the reply, and parses it as JSON.
/// Returns the parsed value together with token-usage info when available.
///
/// # Errors
///
/// Returns a validation error if the schema cannot be serialized for the
/// prompt, and a parsing error if the request fails, the response carries
/// no content, or the content is not valid JSON.
pub async fn extract_structured(
    content: &str,
    config: &StructuredExtractionConfig,
) -> crate::Result<(serde_json::Value, Option<crate::types::LlmUsage>)> {
    use liter_llm::LlmClient;
    let client = super::client::create_client(&config.llm)?;
    let template = config
        .prompt
        .as_deref()
        .unwrap_or(super::prompts::STRUCTURED_EXTRACTION_TEMPLATE);
    let schema_json = serde_json::to_string_pretty(&config.schema)
        .map_err(|e| crate::KreuzbergError::validation(format!("Failed to serialize schema for prompt: {e}")))?;
    let ctx = minijinja::context! {
        content => content,
        schema => schema_json,
        schema_name => &config.schema_name,
        schema_description => config.schema_description.as_deref().unwrap_or(""),
    };
    let prompt = super::prompts::render_template(template, &ctx)?;
    // Some providers reject `additionalProperties` in strict schemas.
    let sanitized_schema = sanitize_schema_for_provider(&config.schema, &config.llm.model);
    let mut request = liter_llm::ChatCompletionRequest::default();
    request.model = config.llm.model.clone();
    request.messages = vec![liter_llm::Message::User(liter_llm::UserMessage {
        content: liter_llm::UserContent::Text(prompt),
        name: None,
    })];
    request.temperature = config.llm.temperature;
    request.max_tokens = config.llm.max_tokens;
    request.response_format = Some(liter_llm::ResponseFormat::JsonSchema {
        json_schema: liter_llm::JsonSchemaFormat {
            name: config.schema_name.clone(),
            description: config.schema_description.clone(),
            schema: sanitized_schema,
            strict: Some(config.strict),
        },
    });
    let response = client
        .chat(request)
        .await
        .map_err(|e| crate::KreuzbergError::parsing(format!("LLM structured extraction request failed: {e}")))?;
    let usage = super::usage::extract_usage_from_chat(&response, "structured_extraction");
    let text = response
        .choices
        .first()
        .and_then(|c| c.message.content.as_deref())
        .ok_or_else(|| {
            crate::KreuzbergError::parsing(format!(
                "LLM structured extraction returned no content (model={}, {} choices)",
                config.llm.model,
                response.choices.len()
            ))
        })?;
    let cleaned = strip_code_fences(text);
    let value = serde_json::from_str(cleaned).map_err(|e| {
        crate::KreuzbergError::parsing(format!(
            "LLM structured extraction returned invalid JSON (model={}): {e}\nRaw response: {}",
            config.llm.model,
            // Cap the error payload at ~200 bytes, respecting UTF-8 boundaries.
            truncate_on_char_boundary(text, 200)
        ))
    })?;
    Ok((value, usage))
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Non-OpenAI models must have `additionalProperties` removed at every
    /// nesting level while the rest of the schema survives intact.
    #[test]
    fn test_sanitize_schema_strips_for_non_openai() {
        let input = json!({
            "type": "object",
            "properties": {
                "name": { "type": "string" },
                "nested": {
                    "type": "object",
                    "properties": { "x": { "type": "integer" } },
                    "additionalProperties": false
                }
            },
            "required": ["name"],
            "additionalProperties": false
        });
        let result = sanitize_schema_for_provider(&input, "gemini/gemini-2.5-flash");
        assert_eq!(result["type"], "object");
        assert_eq!(result["required"], json!(["name"]));
        assert!(result.get("additionalProperties").is_none());
        assert!(result["properties"]["nested"].get("additionalProperties").is_none());
    }

    /// OpenAI-prefixed models keep the schema unchanged.
    #[test]
    fn test_sanitize_schema_preserves_for_openai() {
        let input = json!({
            "type": "object",
            "properties": { "a": { "type": "string" } },
            "required": ["a"],
            "additionalProperties": false
        });
        assert_eq!(sanitize_schema_for_provider(&input, "openai/gpt-4o"), input);
    }
}