use std::collections::BTreeMap;
use serde::{Deserialize, Serialize};
/// Ingestion configuration payload, represented as an untyped JSON value.
///
/// This alias imposes no schema; any JSON value is accepted at this layer.
pub type IngestionConfigDto = serde_json::Value;
/// A single "pending review" entry, represented as an untyped JSON value.
///
/// This alias imposes no schema; any JSON value is accepted at this layer.
pub type IngestionPendingReviewDto = serde_json::Value;
/// Ingestion statistics, carried as a free-form map.
///
/// The struct has a single field, `extra`, holding string-keyed JSON values.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct IngestionStatsDto {
    /// Free-form entries, ordered by key. A missing `extra` key deserializes to an empty map.
    ///
    /// NOTE(review): this is a nested `extra` object, not `#[serde(flatten)]`, so sibling
    /// keys in the incoming object are not collected here — confirm that is intended.
    #[serde(default)]
    pub extra: BTreeMap<String, serde_json::Value>,
}
/// Request body for ingesting a single markdown document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IngestMarkdownRequest {
    /// Document text (presumably markdown, per the type name; not validated here).
    pub content: String,
    /// Identifier of the owner the content is ingested for.
    pub owner_id: String,
    /// Optional ingestion configuration. Omitted from the serialized output when `None`,
    /// and read as `None` when the key is absent.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub config: Option<IngestionConfigDto>,
}
/// Response body for a markdown ingestion call.
///
/// When deserializing, `success` must be present; every other field falls back to its
/// default when the key is missing.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct IngestMarkdownResponse {
    /// Success flag reported for the call.
    pub success: bool,
    /// Optional session identifier. Omitted from the serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub session_id: Option<String>,
    /// Ingestion statistics; defaults to an empty [`IngestionStatsDto`].
    #[serde(default)]
    pub stats: IngestionStatsDto,
    /// Pending-review entries; defaults to an empty list.
    #[serde(default)]
    pub pending_review: Vec<IngestionPendingReviewDto>,
}
/// One document inside a markdown batch: an identifier paired with its text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MarkdownDocumentDto {
    /// Identifier for this document (presumably caller-supplied; uniqueness is not checked here).
    pub document_id: String,
    /// Document text.
    pub content: String,
}
/// Request body for ingesting several markdown documents in one call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IngestMarkdownBatchRequest {
    /// Documents to ingest; the key is required when deserializing.
    pub documents: Vec<MarkdownDocumentDto>,
    /// Owner identifier shared by every document in the batch.
    pub owner_id: String,
    /// Optional ingestion configuration. Omitted from the serialized output when `None`,
    /// and read as `None` when the key is absent.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub config: Option<IngestionConfigDto>,
}
/// Where a document's content comes from.
///
/// Uses serde's default externally tagged representation with snake_case variant
/// names, i.e. the tags are `base64` and `url`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DocumentSource {
    /// Inline document content plus its file name.
    Base64 {
        /// Document payload (presumably base64 text, per the variant name; not decoded
        /// or validated by this type).
        data: String,
        /// File name accompanying the payload.
        filename: String,
    },
    /// A single string locating the document (presumably a URL; not validated by this type).
    Url(String),
}
/// Kind of document being ingested.
///
/// Variants are serialized as their lowercase names (`pdf`, `docx`, `pptx`, `xlsx`,
/// `html`, `image`, `markdown`, `unknown`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum DocumentType {
    /// Serialized as `pdf`.
    Pdf,
    /// Serialized as `docx`.
    Docx,
    /// Serialized as `pptx`.
    Pptx,
    /// Serialized as `xlsx`.
    Xlsx,
    /// Serialized as `html`.
    Html,
    /// Serialized as `image`.
    Image,
    /// Serialized as `markdown`.
    Markdown,
    /// Serialized as `unknown`.
    Unknown,
}
/// Parser selection for document ingestion.
///
/// Variants are serialized as their lowercase names with no separators:
/// `docling`, `dotsocr`, `olmocr`, `auto`. [`DocumentParser::Auto`] is the default.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum DocumentParser {
    /// Serialized as `docling`.
    Docling,
    /// Serialized as `dotsocr` (not `dots_ocr`).
    DotsOcr,
    /// Serialized as `olmocr` (not `olm_ocr`).
    OlmOcr,
    /// Serialized as `auto`; the value produced by `Default`.
    #[default]
    Auto,
}
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct OcrConfigDto {
#[serde(default)]
pub parser: DocumentParser,
#[serde(default = "default_true")]
pub enable_fallback: bool,
#[serde(default)]
pub use_llm_enhancement: bool,
#[serde(default)]
pub force_ocr: bool,
#[serde(default)]
pub languages: Vec<String>,
#[serde(default)]
pub extract_images: bool,
#[serde(default = "default_true")]
pub extract_tables: bool,
}
/// Serde default provider for boolean fields that should be `true` when absent.
///
/// Referenced by path from `#[serde(default = "default_true")]` attributes.
fn default_true() -> bool {
    true
}
/// Request body for ingesting a single document.
///
/// `document` and `owner_id` are required when deserializing. The optional fields are
/// omitted from the serialized output when `None` and read as `None` when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IngestDocumentRequest {
    /// Where the document content comes from.
    pub document: DocumentSource,
    /// Optional explicit document type.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub document_type: Option<DocumentType>,
    /// Identifier of the owner the document is ingested for.
    pub owner_id: String,
    /// Optional OCR / parsing options.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ocr_config: Option<OcrConfigDto>,
    /// Optional ingestion configuration (untyped JSON).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ingestion_config: Option<IngestionConfigDto>,
}
/// Metadata of a parsed document, carried as a free-form map.
///
/// The struct has a single field, `extra`, holding string-keyed JSON values.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ParsedDocumentMetadataDto {
    /// Free-form entries, ordered by key. A missing `extra` key deserializes to an empty map.
    ///
    /// NOTE(review): this is a nested `extra` object, not `#[serde(flatten)]`, so sibling
    /// keys in the incoming object are not collected here — confirm that is intended.
    #[serde(default)]
    pub extra: BTreeMap<String, serde_json::Value>,
}
/// Extraction report for a document, carried as a free-form map.
///
/// The struct has a single field, `extra`, holding string-keyed JSON values.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DocumentExtractionReportDto {
    /// Free-form entries, ordered by key. A missing `extra` key deserializes to an empty map.
    ///
    /// NOTE(review): this is a nested `extra` object, not `#[serde(flatten)]`, so sibling
    /// keys in the incoming object are not collected here — confirm that is intended.
    #[serde(default)]
    pub extra: BTreeMap<String, serde_json::Value>,
}
/// A table extracted from a document, carried as a free-form map.
///
/// The struct has a single field, `extra`, holding string-keyed JSON values.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ExtractedTableDto {
    /// Free-form entries, ordered by key. A missing `extra` key deserializes to an empty map.
    ///
    /// NOTE(review): this is a nested `extra` object, not `#[serde(flatten)]`, so sibling
    /// keys in the incoming object are not collected here — confirm that is intended.
    #[serde(default)]
    pub extra: BTreeMap<String, serde_json::Value>,
}
/// Counters describing one document parse.
///
/// When deserializing, every field except `warnings` must be present.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DocumentParseStatsDto {
    /// Parse duration (presumably in milliseconds, per the field name).
    pub parse_time_ms: u64,
    /// Name of the parser that handled the document, as a free-form string.
    pub parser_used: String,
    /// Number of pages processed.
    pub pages_processed: u64,
    /// Number of tables extracted.
    pub tables_extracted: u64,
    /// Number of images extracted.
    pub images_extracted: u64,
    /// Warning messages; defaults to an empty list when the key is absent.
    #[serde(default)]
    pub warnings: Vec<String>,
}
/// Response body for a single-document ingestion call.
///
/// When deserializing, `success`, `parsed_metadata`, `extraction_report` and `parse_stats`
/// carry no serde default and must therefore be present; `session_id` and `tables` fall
/// back to `None` / an empty list.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct IngestDocumentResponse {
    /// Success flag reported for the call.
    pub success: bool,
    /// Optional session identifier. Omitted from the serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub session_id: Option<String>,
    /// Metadata of the parsed document (required key).
    pub parsed_metadata: ParsedDocumentMetadataDto,
    /// Extraction report (required key).
    pub extraction_report: DocumentExtractionReportDto,
    /// Extracted tables; defaults to an empty list.
    #[serde(default)]
    pub tables: Vec<ExtractedTableDto>,
    /// Parse counters (required key).
    pub parse_stats: DocumentParseStatsDto,
}
/// One entry of a document batch: an identifier, a source, and an optional type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentBatchItem {
    /// Identifier for this document (presumably caller-supplied; uniqueness is not checked here).
    pub document_id: String,
    /// Where the document content comes from.
    pub document: DocumentSource,
    /// Optional explicit document type. Omitted from the serialized output when `None`,
    /// and read as `None` when the key is absent.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub document_type: Option<DocumentType>,
}
/// Request body for ingesting several documents in one call.
///
/// `documents` and `owner_id` are required when deserializing. The optional fields are
/// omitted from the serialized output when `None` and read as `None` when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IngestDocumentBatchRequest {
    /// Documents to ingest.
    pub documents: Vec<DocumentBatchItem>,
    /// Owner identifier shared by every document in the batch.
    pub owner_id: String,
    /// Optional OCR / parsing options, given once for the whole request.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ocr_config: Option<OcrConfigDto>,
    /// Optional ingestion configuration (untyped JSON), given once for the whole request.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ingestion_config: Option<IngestionConfigDto>,
}