use crate::Vector;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

10#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
12pub enum DocumentFormat {
13 Pdf,
14 Html,
15 Xml,
16 Markdown,
17 PlainText,
18 Docx,
19 Pptx,
20 Xlsx,
21 Rtf,
22 Epub,
23 Json,
24 Csv,
25 Image,
27 Audio,
28 Video,
29 Unknown,
30}
31
32#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct ContentExtractionConfig {
35 pub extract_text: bool,
37 pub extract_metadata: bool,
39 pub extract_images: bool,
41 pub extract_tables: bool,
43 pub extract_links: bool,
45 pub max_content_length: usize,
47 pub preserve_structure: bool,
49 pub extract_page_info: bool,
51 pub detect_language: bool,
53 pub chunking_strategy: ChunkingStrategy,
55 pub extract_multimedia_features: bool,
57 pub generate_image_embeddings: bool,
59 pub extract_audio_features: bool,
61 pub extract_video_features: bool,
63 pub max_image_resolution: Option<(u32, u32)>,
65}
66
67impl Default for ContentExtractionConfig {
68 fn default() -> Self {
69 Self {
70 extract_text: true,
71 extract_metadata: true,
72 extract_images: false,
73 extract_tables: true,
74 extract_links: true,
75 max_content_length: 1_000_000, preserve_structure: true,
77 extract_page_info: true,
78 detect_language: true,
79 chunking_strategy: ChunkingStrategy::Paragraph,
80 extract_multimedia_features: false,
81 generate_image_embeddings: false,
82 extract_audio_features: false,
83 extract_video_features: false,
84 max_image_resolution: Some((1920, 1080)), }
86 }
87}
88
89#[derive(Debug, Clone, Serialize, Deserialize)]
91pub enum ChunkingStrategy {
92 Paragraph,
94 Sentence,
96 FixedTokens(usize),
98 Semantic,
100 Page,
102 Custom(String),
104}
105
106#[derive(Debug, Clone, Serialize, Deserialize)]
108pub struct ExtractedContent {
109 pub format: DocumentFormat,
111 pub text: String,
113 pub metadata: HashMap<String, String>,
115 pub images: Vec<ExtractedImage>,
117 pub tables: Vec<ExtractedTable>,
119 pub links: Vec<ExtractedLink>,
121 pub structure: DocumentStructure,
123 pub chunks: Vec<ContentChunk>,
125 pub language: Option<String>,
127 pub processing_stats: ProcessingStats,
129 pub audio_content: Vec<ExtractedAudio>,
131 pub video_content: Vec<ExtractedVideo>,
133 pub cross_modal_embeddings: Vec<CrossModalEmbedding>,
135}
136
137#[derive(Debug, Clone, Serialize, Deserialize)]
139pub struct ExtractedImage {
140 pub data: String,
142 pub format: String,
144 pub width: u32,
146 pub height: u32,
148 pub alt_text: Option<String>,
150 pub caption: Option<String>,
152 pub location: ContentLocation,
154 pub visual_features: Option<ImageFeatures>,
156 pub embedding: Option<Vector>,
158 pub detected_objects: Vec<DetectedObject>,
160 pub classification_labels: Vec<ClassificationLabel>,
162}
163
164#[derive(Debug, Clone, Serialize, Deserialize)]
166pub struct ImageFeatures {
167 pub color_histogram: Option<Vec<f32>>,
169 pub texture_features: Option<Vec<f32>>,
171 pub edge_features: Option<Vec<f32>>,
173 pub sift_features: Option<Vec<f32>>,
175 pub cnn_features: Option<Vec<f32>>,
177 pub dominant_colors: Vec<(u8, u8, u8)>, pub complexity_metrics: ImageComplexityMetrics,
181}
182
183#[derive(Debug, Clone, Serialize, Deserialize)]
185pub struct DetectedObject {
186 pub label: String,
188 pub confidence: f32,
190 pub bbox: (u32, u32, u32, u32),
192}
193
194#[derive(Debug, Clone, Serialize, Deserialize)]
196pub struct ClassificationLabel {
197 pub label: String,
199 pub confidence: f32,
201}
202
203#[derive(Debug, Clone, Serialize, Deserialize)]
205pub struct ImageComplexityMetrics {
206 pub edge_density: f32,
208 pub color_diversity: f32,
210 pub texture_complexity: f32,
212 pub entropy: f32,
214}
215
216#[derive(Debug, Clone, Serialize, Deserialize)]
218pub struct ExtractedAudio {
219 pub data: String,
221 pub format: String,
223 pub duration: f32,
225 pub sample_rate: u32,
227 pub channels: u16,
229 pub audio_features: Option<AudioFeatures>,
231 pub embedding: Option<Vector>,
233 pub transcription: Option<String>,
235 pub music_analysis: Option<MusicAnalysis>,
237 pub speech_analysis: Option<SpeechAnalysis>,
239}
240
241#[derive(Debug, Clone, Serialize, Deserialize)]
243pub struct AudioFeatures {
244 pub mfcc: Option<Vec<f32>>,
246 pub spectral_features: Option<Vec<f32>>,
248 pub rhythm_features: Option<Vec<f32>>,
250 pub harmonic_features: Option<Vec<f32>>,
252 pub zero_crossing_rate: f32,
254 pub energy_metrics: AudioEnergyMetrics,
256}
257
258#[derive(Debug, Clone, Serialize, Deserialize)]
260pub struct AudioEnergyMetrics {
261 pub rms_energy: f32,
263 pub peak_amplitude: f32,
265 pub average_loudness: f32,
267 pub dynamic_range: f32,
269}
270
271#[derive(Debug, Clone, Serialize, Deserialize)]
273pub struct MusicAnalysis {
274 pub tempo: Option<f32>,
276 pub key: Option<String>,
278 pub time_signature: Option<String>,
280 pub genre: Option<String>,
282 pub valence: Option<f32>,
284 pub energy: Option<f32>,
286}
287
288#[derive(Debug, Clone, Serialize, Deserialize)]
290pub struct SpeechAnalysis {
291 pub language: Option<String>,
293 pub speaker_gender: Option<String>,
295 pub emotion: Option<String>,
297 pub speech_rate: Option<f32>,
299 pub pitch_stats: Option<PitchStatistics>,
301}
302
303#[derive(Debug, Clone, Serialize, Deserialize)]
305pub struct PitchStatistics {
306 pub mean_pitch: f32,
308 pub pitch_std: f32,
310 pub pitch_range: f32,
312}
313
314#[derive(Debug, Clone, Serialize, Deserialize)]
316pub struct ExtractedVideo {
317 pub data: String,
319 pub format: String,
321 pub duration: f32,
323 pub frame_rate: f32,
325 pub resolution: (u32, u32),
327 pub keyframes: Vec<VideoKeyframe>,
329 pub embedding: Option<Vector>,
331 pub audio_analysis: Option<ExtractedAudio>,
333 pub video_analysis: Option<VideoAnalysis>,
335}
336
337#[derive(Debug, Clone, Serialize, Deserialize)]
339pub struct VideoKeyframe {
340 pub timestamp: f32,
342 pub image: ExtractedImage,
344 pub scene_change_score: f32,
346}
347
/// One entry of a video colour timeline: an `f32` paired with a list of
/// byte-triple colours.
///
/// NOTE(review): the `f32` is presumably a timestamp and the triples RGB —
/// confirm against the producer.
pub type ColorTimelineEntry = (f32, Vec<(u8, u8, u8)>);

351#[derive(Debug, Clone, Serialize, Deserialize)]
353pub struct VideoAnalysis {
354 pub scenes: Vec<VideoScene>,
356 pub motion_analysis: Option<MotionAnalysis>,
358 pub activity_level: f32,
360 pub color_timeline: Vec<ColorTimelineEntry>,
362}
363
364#[derive(Debug, Clone, Serialize, Deserialize)]
366pub struct VideoScene {
367 pub start_time: f32,
369 pub end_time: f32,
371 pub description: Option<String>,
373 pub representative_frame: Option<ExtractedImage>,
375}
376
377#[derive(Debug, Clone, Serialize, Deserialize)]
379pub struct MotionAnalysis {
380 pub average_motion: f32,
382 pub motion_variance: f32,
384 pub camera_motion: Option<String>,
386 pub object_motion: Vec<ObjectMotion>,
388}
389
390#[derive(Debug, Clone, Serialize, Deserialize)]
392pub struct ObjectMotion {
393 pub object_id: String,
395 pub trajectory: Vec<(f32, f32, f32)>,
397 pub speed: f32,
399}
400
401#[derive(Debug, Clone, Serialize, Deserialize)]
403pub struct ExtractedTable {
404 pub headers: Vec<String>,
406 pub rows: Vec<Vec<String>>,
408 pub caption: Option<String>,
410 pub location: ContentLocation,
412}
413
414#[derive(Debug, Clone, Serialize, Deserialize)]
416pub struct ExtractedLink {
417 pub url: String,
419 pub text: String,
421 pub title: Option<String>,
423 pub location: ContentLocation,
425}
426
427#[derive(Debug, Clone, Serialize, Deserialize)]
429pub struct DocumentStructure {
430 pub title: Option<String>,
432 pub headings: Vec<Heading>,
434 pub page_count: usize,
436 pub section_count: usize,
438 pub table_of_contents: Vec<TocEntry>,
440}
441
442#[derive(Debug, Clone, Serialize, Deserialize)]
444pub struct Heading {
445 pub level: usize,
447 pub text: String,
449 pub location: ContentLocation,
451}
452
453#[derive(Debug, Clone, Serialize, Deserialize)]
455pub struct TocEntry {
456 pub title: String,
458 pub level: usize,
460 pub page: Option<usize>,
462 pub location: ContentLocation,
464}
465
466#[derive(Debug, Clone, Serialize, Deserialize)]
468pub struct ContentChunk {
469 pub text: String,
471 pub chunk_type: ChunkType,
473 pub location: ContentLocation,
475 pub metadata: HashMap<String, String>,
477 pub embedding: Option<Vector>,
479}
480
481#[derive(Debug, Clone, Serialize, Deserialize)]
483pub enum ChunkType {
484 Paragraph,
485 Heading,
486 Table,
487 List,
488 Quote,
489 Code,
490 Caption,
491 Footnote,
492 Header,
493 Footer,
494}
495
496#[derive(Debug, Clone, Serialize, Deserialize)]
498pub struct ContentLocation {
499 pub page: Option<usize>,
501 pub section: Option<usize>,
503 pub char_offset: Option<usize>,
505 pub line: Option<usize>,
507 pub column: Option<usize>,
509}
510
511#[derive(Debug, Clone, Serialize, Deserialize)]
513pub struct CrossModalEmbedding {
514 pub embedding: Vector,
516 pub modalities: Vec<Modality>,
518 pub fusion_strategy: FusionStrategy,
520 pub confidence: f32,
522 pub content_ids: Vec<String>,
524}
525
526#[derive(Debug, Clone, Serialize, Deserialize)]
528pub enum Modality {
529 Text,
530 Image,
531 Audio,
532 Video,
533}
534
535#[derive(Debug, Clone, Serialize, Deserialize)]
537pub enum FusionStrategy {
538 Concatenation,
540 WeightedAverage(Vec<f32>), Attention,
544 LateFusion,
546 MlpFusion,
548 TransformerFusion,
550}
551
552#[derive(Debug, Clone, Default, Serialize, Deserialize)]
554pub struct ProcessingStats {
555 pub processing_time_ms: u64,
557 pub total_chars: usize,
559 pub total_words: usize,
561 pub image_count: usize,
563 pub table_count: usize,
565 pub link_count: usize,
567 pub chunk_count: usize,
569 pub audio_count: usize,
571 pub video_count: usize,
573 pub cross_modal_embedding_count: usize,
575 pub image_processing_time_ms: u64,
577 pub audio_processing_time_ms: u64,
579 pub video_processing_time_ms: u64,
581 pub warnings: Vec<String>,
583}