use serde::{Deserialize, Serialize};
use chrono::{DateTime, Utc};
/// One chunk of a processed document, bundling its content with position and metadata.
///
/// Serializable and deserializable through serde via the derived impls.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentChunk {
/// Identifier of this chunk.
/// NOTE(review): how ids are generated and their uniqueness scope are not visible here.
pub id: String,
/// Content of the chunk, held as an owned string.
pub content: String,
/// Index of this chunk.
/// NOTE(review): presumably its ordinal within the parent document; base (0 or 1) to be confirmed.
pub chunk_index: usize,
/// Location information for the chunk (see `ChunkPosition`).
pub position: ChunkPosition,
/// Descriptive metadata for the chunk (see `ChunkMetadata`).
pub metadata: ChunkMetadata,
}
/// Location of a chunk; every field is optional and may be absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkPosition {
/// Page number, when known.
/// NOTE(review): whether pages are 0- or 1-based is not established here.
pub page: Option<u32>,
/// Line number, when known.
/// NOTE(review): whether lines are 0- or 1-based is not established here.
pub line: Option<u32>,
/// Start offset of the chunk, when known.
/// NOTE(review): unit (bytes vs. characters) is not defined in this file — confirm with producers.
pub start_offset: Option<usize>,
/// End offset of the chunk, when known.
/// NOTE(review): unit and inclusive/exclusive convention are not defined in this file — confirm.
pub end_offset: Option<usize>,
}
/// Descriptive metadata attached to a single chunk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkMetadata {
/// Size of the chunk.
/// NOTE(review): unit (bytes vs. characters) is not defined in this file — confirm.
pub size: usize,
/// Language of the chunk, if one was recorded.
/// NOTE(review): the encoding of the value (e.g. ISO code vs. name) is not visible here.
pub language: Option<String>,
/// Optional confidence score.
/// NOTE(review): what the score measures and its valid range are not visible here.
pub confidence: Option<f32>,
/// Optional free-form JSON payload for data specific to the source format.
pub format_specific: Option<serde_json::Value>,
}
/// Top-level result container: document-level metadata, the chunks, and
/// information about the processing run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UniversalOutput {
/// Metadata describing the source document.
pub document_metadata: DocumentMetadata,
/// The chunks of the document.
/// NOTE(review): presumably ordered by `chunk_index`; ordering is not enforced by this type.
pub chunks: Vec<DocumentChunk>,
/// Details about the processing run (see `ProcessingInfo`).
pub processing_info: ProcessingInfo,
}
/// Metadata describing a source document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMetadata {
/// Name of the file.
pub filename: String,
/// Path of the file.
/// NOTE(review): absolute vs. relative is not established here.
pub filepath: String,
/// Detected or declared format of the document.
pub document_type: DocumentType,
/// Size of the file.
/// NOTE(review): presumably in bytes — confirm with the code that fills this in.
pub file_size: u64,
/// Creation timestamp in UTC, if available.
pub created_at: Option<DateTime<Utc>>,
/// Last-modification timestamp in UTC, if available.
pub modified_at: Option<DateTime<Utc>>,
/// Document title, if available.
pub title: Option<String>,
/// Document author, if available.
pub author: Option<String>,
/// Free-form JSON payload for metadata specific to the document format.
/// Unlike `ChunkMetadata::format_specific` this is not wrapped in `Option`.
pub format_metadata: serde_json::Value,
}
/// The document formats this module knows about.
///
/// No serde rename attribute is applied, so variants are serialized under
/// their Rust names exactly as written below.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum DocumentType {
/// PDF document (matched by the `pdf` extension in `from_extension`).
PDF,
/// Plain-text document (matched by the `txt` extension).
TXT,
/// JSON document (matched by the `json` extension).
JSON,
/// CSV document (matched by the `csv` extension).
CSV,
/// DOCX document (matched by the `docx` extension).
DOCX,
}
/// Information recorded about a processing run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessingInfo {
/// Name of the processor.
pub processor: String,
/// Version string of the processor.
pub processor_version: String,
/// UTC timestamp associated with the processing run.
/// NOTE(review): whether this marks the start or the end of processing is not visible here.
pub processed_at: DateTime<Utc>,
/// Processing duration; the field name indicates milliseconds.
pub processing_time_ms: u64,
/// Total number of chunks.
/// NOTE(review): presumably equals `UniversalOutput::chunks.len()`; not enforced by this type.
pub total_chunks: usize,
/// Total content size.
/// NOTE(review): unit (bytes vs. characters) is not defined in this file — confirm.
pub total_content_size: usize,
/// The parameters the run was configured with.
pub processing_params: ProcessingParams,
}
/// Tunable parameters for document processing.
///
/// A `Default` impl and `with_*` builder methods are provided elsewhere in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessingParams {
/// Maximum size of a chunk (default: 1000).
/// NOTE(review): unit (bytes vs. characters) is not defined in this file — confirm.
pub max_chunk_size: usize,
/// Overlap between chunks (default: 100).
/// NOTE(review): presumably in the same unit as `max_chunk_size`; nothing here
/// enforces that it is smaller than `max_chunk_size`.
pub chunk_overlap: usize,
/// Whether text cleaning is enabled (default: `true`).
pub text_cleaning: bool,
/// Whether language detection is enabled (default: `false`).
pub language_detection: bool,
/// Free-form JSON payload for format-specific options (default: JSON `null`).
pub format_specific: serde_json::Value,
}
impl Default for ProcessingParams {
fn default() -> Self {
Self {
max_chunk_size: 1000,
chunk_overlap: 100,
text_cleaning: true,
language_detection: false,
format_specific: serde_json::Value::Null,
}
}
}
impl ProcessingParams {
    /// Consumes `self` and returns it with `max_chunk_size` set to `size`.
    /// All other fields are carried over unchanged.
    pub fn with_chunk_size(self, size: usize) -> Self {
        Self {
            max_chunk_size: size,
            ..self
        }
    }

    /// Consumes `self` and returns it with `chunk_overlap` set to `overlap`.
    /// No check is made against `max_chunk_size`.
    pub fn with_chunk_overlap(self, overlap: usize) -> Self {
        Self {
            chunk_overlap: overlap,
            ..self
        }
    }

    /// Consumes `self` and returns it with `text_cleaning` set to `enabled`.
    pub fn with_text_cleaning(self, enabled: bool) -> Self {
        Self {
            text_cleaning: enabled,
            ..self
        }
    }

    /// Consumes `self` and returns it with `language_detection` set to `enabled`.
    pub fn with_language_detection(self, enabled: bool) -> Self {
        Self {
            language_detection: enabled,
            ..self
        }
    }

    /// Consumes `self` and returns it with `format_specific` replaced by `metadata`.
    pub fn with_format_specific(self, metadata: serde_json::Value) -> Self {
        Self {
            format_specific: metadata,
            ..self
        }
    }
}
impl DocumentType {
pub fn from_extension(ext: &str) -> Option<Self> {
match ext.to_lowercase().as_str() {
"pdf" => Some(DocumentType::PDF),
"txt" => Some(DocumentType::TXT),
"json" => Some(DocumentType::JSON),
"csv" => Some(DocumentType::CSV),
"docx" => Some(DocumentType::DOCX),
_ => None,
}
}
pub fn to_string(&self) -> &'static str {
match self {
DocumentType::PDF => "PDF",
DocumentType::TXT => "TXT",
DocumentType::JSON => "JSON",
DocumentType::CSV => "CSV",
DocumentType::DOCX => "DOCX",
}
}
}