// oxirs_vec/content_processing/types.rs

1//! Data types and structures for content processing
2//!
3//! This module contains all the core data types, enums, and structures
4//! used throughout the content processing system.
5
6use crate::Vector;
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9
10/// Document format types supported by the content processor
11#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
12pub enum DocumentFormat {
13    Pdf,
14    Html,
15    Xml,
16    Markdown,
17    PlainText,
18    Docx,
19    Pptx,
20    Xlsx,
21    Rtf,
22    Epub,
23    Json,
24    Csv,
25    // Multimedia formats
26    Image,
27    Audio,
28    Video,
29    Unknown,
30}
31
/// Content extraction configuration
///
/// Controls which artifacts are pulled from a document (text, metadata,
/// images, tables, links, multimedia features) and how the extracted text
/// is chunked. Sensible defaults are provided via the `Default` impl below.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentExtractionConfig {
    /// Extract text content
    pub extract_text: bool,
    /// Extract metadata
    pub extract_metadata: bool,
    /// Extract images
    pub extract_images: bool,
    /// Extract tables
    pub extract_tables: bool,
    /// Extract links
    pub extract_links: bool,
    /// Maximum content length to extract
    /// (presumably measured in bytes/characters of text — confirm with processor)
    pub max_content_length: usize,
    /// Preserve document structure (headings, sections) during extraction
    pub preserve_structure: bool,
    /// Extract page/section information
    pub extract_page_info: bool,
    /// Language detection
    pub detect_language: bool,
    /// Content chunking strategy used to split text for embedding
    pub chunking_strategy: ChunkingStrategy,
    /// Extract multimedia features (image analysis, audio analysis, etc.)
    pub extract_multimedia_features: bool,
    /// Generate image embeddings using computer vision models
    pub generate_image_embeddings: bool,
    /// Extract audio features and generate embeddings
    pub extract_audio_features: bool,
    /// Extract video keyframes and generate embeddings
    pub extract_video_features: bool,
    /// Maximum image processing resolution as (width, height);
    /// `None` means no resolution cap
    pub max_image_resolution: Option<(u32, u32)>,
}
66
67impl Default for ContentExtractionConfig {
68    fn default() -> Self {
69        Self {
70            extract_text: true,
71            extract_metadata: true,
72            extract_images: false,
73            extract_tables: true,
74            extract_links: true,
75            max_content_length: 1_000_000, // 1MB
76            preserve_structure: true,
77            extract_page_info: true,
78            detect_language: true,
79            chunking_strategy: ChunkingStrategy::Paragraph,
80            extract_multimedia_features: false,
81            generate_image_embeddings: false,
82            extract_audio_features: false,
83            extract_video_features: false,
84            max_image_resolution: Some((1920, 1080)), // Full HD max
85        }
86    }
87}
88
89/// Content chunking strategies
90#[derive(Debug, Clone, Serialize, Deserialize)]
91pub enum ChunkingStrategy {
92    /// Split by paragraphs
93    Paragraph,
94    /// Split by sentences
95    Sentence,
96    /// Split by fixed token count
97    FixedTokens(usize),
98    /// Split by semantic sections
99    Semantic,
100    /// Split by pages/slides
101    Page,
102    /// Custom regex pattern
103    Custom(String),
104}
105
/// Extracted document content
///
/// Aggregate result of processing one document: the raw text plus every
/// artifact requested via `ContentExtractionConfig` (images, tables, links,
/// audio, video), along with structure, embedding chunks, and statistics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedContent {
    /// Document format
    pub format: DocumentFormat,
    /// Raw text content
    pub text: String,
    /// Document metadata (free-form key/value pairs)
    pub metadata: HashMap<String, String>,
    /// Extracted images (base64 encoded)
    pub images: Vec<ExtractedImage>,
    /// Extracted tables
    pub tables: Vec<ExtractedTable>,
    /// Extracted links
    pub links: Vec<ExtractedLink>,
    /// Document structure information (title, headings, TOC)
    pub structure: DocumentStructure,
    /// Content chunks for embedding
    pub chunks: Vec<ContentChunk>,
    /// Detected language; `None` when detection is disabled or inconclusive
    pub language: Option<String>,
    /// Processing statistics
    pub processing_stats: ProcessingStats,
    /// Extracted audio content
    pub audio_content: Vec<ExtractedAudio>,
    /// Extracted video content
    pub video_content: Vec<ExtractedVideo>,
    /// Cross-modal embeddings (combining text, image, audio, video)
    pub cross_modal_embeddings: Vec<CrossModalEmbedding>,
}
136
/// Extracted image information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedImage {
    /// Image data (base64 encoded)
    pub data: String,
    /// Image format (JPEG, PNG, etc.)
    pub format: String,
    /// Width in pixels
    pub width: u32,
    /// Height in pixels
    pub height: u32,
    /// Alternative text, if present in the source document
    pub alt_text: Option<String>,
    /// Caption, if present in the source document
    pub caption: Option<String>,
    /// Page/location information within the document
    pub location: ContentLocation,
    /// Extracted visual features (SIFT, HOG, color histograms, etc.);
    /// `None` when feature extraction was not requested
    pub visual_features: Option<ImageFeatures>,
    /// Generated embedding vector, if image embeddings were computed
    pub embedding: Option<Vector>,
    /// Object detection results
    pub detected_objects: Vec<DetectedObject>,
    /// Image classification labels with confidence scores
    pub classification_labels: Vec<ClassificationLabel>,
}
163
/// Image feature extraction results
///
/// Each feature family is optional — `None` means that extractor was
/// not run (or produced nothing) for this image.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageFeatures {
    /// Color histogram features
    pub color_histogram: Option<Vec<f32>>,
    /// Texture features (LBP, GLCM, etc.)
    pub texture_features: Option<Vec<f32>>,
    /// Edge features
    pub edge_features: Option<Vec<f32>>,
    /// SIFT keypoints and descriptors
    pub sift_features: Option<Vec<f32>>,
    /// CNN features from pre-trained models
    pub cnn_features: Option<Vec<f32>>,
    /// Dominant colors as RGB tuples
    pub dominant_colors: Vec<(u8, u8, u8)>,
    /// Image complexity metrics
    pub complexity_metrics: ImageComplexityMetrics,
}
182
183/// Object detection result
184#[derive(Debug, Clone, Serialize, Deserialize)]
185pub struct DetectedObject {
186    /// Object class label
187    pub label: String,
188    /// Confidence score (0.0 to 1.0)
189    pub confidence: f32,
190    /// Bounding box coordinates (x, y, width, height)
191    pub bbox: (u32, u32, u32, u32),
192}
193
194/// Classification label with confidence
195#[derive(Debug, Clone, Serialize, Deserialize)]
196pub struct ClassificationLabel {
197    /// Class label
198    pub label: String,
199    /// Confidence score (0.0 to 1.0)
200    pub confidence: f32,
201}
202
203/// Image complexity metrics
204#[derive(Debug, Clone, Serialize, Deserialize)]
205pub struct ImageComplexityMetrics {
206    /// Edge density (0.0 to 1.0)
207    pub edge_density: f32,
208    /// Color diversity (0.0 to 1.0)
209    pub color_diversity: f32,
210    /// Texture complexity (0.0 to 1.0)
211    pub texture_complexity: f32,
212    /// Information entropy
213    pub entropy: f32,
214}
215
/// Extracted audio information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedAudio {
    /// Audio data (base64 encoded)
    pub data: String,
    /// Audio format (MP3, WAV, etc.)
    pub format: String,
    /// Duration in seconds
    pub duration: f32,
    /// Sample rate in Hz
    pub sample_rate: u32,
    /// Number of channels
    pub channels: u16,
    /// Extracted audio features; `None` when feature extraction was not run
    pub audio_features: Option<AudioFeatures>,
    /// Generated embedding vector, if audio embeddings were computed
    pub embedding: Option<Vector>,
    /// Transcribed text (if available)
    pub transcription: Option<String>,
    /// Music analysis (if music content)
    pub music_analysis: Option<MusicAnalysis>,
    /// Speech analysis (if speech content)
    pub speech_analysis: Option<SpeechAnalysis>,
}
240
/// Audio feature extraction results
///
/// Optional feature families are `None` when the corresponding extractor
/// was not run for this audio clip.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AudioFeatures {
    /// Mel-frequency cepstral coefficients
    pub mfcc: Option<Vec<f32>>,
    /// Spectral features (centroid, rolloff, etc.)
    pub spectral_features: Option<Vec<f32>>,
    /// Rhythm and tempo features
    pub rhythm_features: Option<Vec<f32>>,
    /// Harmonic features
    pub harmonic_features: Option<Vec<f32>>,
    /// Zero-crossing rate
    pub zero_crossing_rate: f32,
    /// Energy and loudness metrics
    pub energy_metrics: AudioEnergyMetrics,
}
257
258/// Audio energy and loudness metrics
259#[derive(Debug, Clone, Serialize, Deserialize)]
260pub struct AudioEnergyMetrics {
261    /// RMS energy
262    pub rms_energy: f32,
263    /// Peak amplitude
264    pub peak_amplitude: f32,
265    /// Average loudness (LUFS)
266    pub average_loudness: f32,
267    /// Dynamic range
268    pub dynamic_range: f32,
269}
270
271/// Music analysis results
272#[derive(Debug, Clone, Serialize, Deserialize)]
273pub struct MusicAnalysis {
274    /// Detected tempo (BPM)
275    pub tempo: Option<f32>,
276    /// Key signature
277    pub key: Option<String>,
278    /// Time signature
279    pub time_signature: Option<String>,
280    /// Genre classification
281    pub genre: Option<String>,
282    /// Mood/valence (-1.0 to 1.0)
283    pub valence: Option<f32>,
284    /// Energy level (0.0 to 1.0)
285    pub energy: Option<f32>,
286}
287
288/// Speech analysis results
289#[derive(Debug, Clone, Serialize, Deserialize)]
290pub struct SpeechAnalysis {
291    /// Detected language
292    pub language: Option<String>,
293    /// Speaker gender (if detectable)
294    pub speaker_gender: Option<String>,
295    /// Speaker emotion
296    pub emotion: Option<String>,
297    /// Speech rate (words per minute)
298    pub speech_rate: Option<f32>,
299    /// Pitch statistics
300    pub pitch_stats: Option<PitchStatistics>,
301}
302
303/// Pitch statistics for speech analysis
304#[derive(Debug, Clone, Serialize, Deserialize)]
305pub struct PitchStatistics {
306    /// Mean pitch (Hz)
307    pub mean_pitch: f32,
308    /// Pitch standard deviation
309    pub pitch_std: f32,
310    /// Pitch range (max - min)
311    pub pitch_range: f32,
312}
313
/// Extracted video information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedVideo {
    /// Video data (base64 encoded or file path)
    pub data: String,
    /// Video format (MP4, AVI, etc.)
    pub format: String,
    /// Duration in seconds
    pub duration: f32,
    /// Frame rate (fps)
    pub frame_rate: f32,
    /// Video resolution (width, height)
    pub resolution: (u32, u32),
    /// Extracted keyframes
    pub keyframes: Vec<VideoKeyframe>,
    /// Generated embedding vector, if video embeddings were computed
    pub embedding: Option<Vector>,
    /// Audio track analysis; `None` when the video has no audio track
    /// or audio analysis was not requested
    pub audio_analysis: Option<ExtractedAudio>,
    /// Video analysis results (scenes, motion, activity)
    pub video_analysis: Option<VideoAnalysis>,
}
336
/// Video keyframe information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VideoKeyframe {
    /// Timestamp in seconds from the start of the video
    pub timestamp: f32,
    /// Frame image data
    pub image: ExtractedImage,
    /// Scene change score (0.0 to 1.0) — higher values indicate a
    /// stronger scene boundary at this frame
    pub scene_change_score: f32,
}
347
/// Type alias for color timeline entries: `(timestamp, dominant_colors)`,
/// where colors are RGB tuples. Timestamps are presumably seconds,
/// matching `VideoKeyframe::timestamp` — confirm with the producer.
pub type ColorTimelineEntry = (f32, Vec<(u8, u8, u8)>);
350
/// Video analysis results
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VideoAnalysis {
    /// Detected scenes with timestamps
    pub scenes: Vec<VideoScene>,
    /// Motion analysis; `None` when motion analysis was not performed
    pub motion_analysis: Option<MotionAnalysis>,
    /// Visual activity level
    pub activity_level: f32,
    /// Color characteristics over time (see `ColorTimelineEntry`)
    pub color_timeline: Vec<ColorTimelineEntry>,
}
363
/// Video scene detection result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VideoScene {
    /// Scene start time in seconds
    pub start_time: f32,
    /// Scene end time in seconds
    pub end_time: f32,
    /// Scene description/label, if one was generated
    pub description: Option<String>,
    /// Representative keyframe chosen for this scene
    pub representative_frame: Option<ExtractedImage>,
}
376
/// Motion analysis for video
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MotionAnalysis {
    /// Average motion magnitude
    pub average_motion: f32,
    /// Motion variance
    pub motion_variance: f32,
    /// Camera motion type (pan, tilt, zoom, etc.);
    /// `None` when no camera motion was classified
    pub camera_motion: Option<String>,
    /// Object motion tracking results
    pub object_motion: Vec<ObjectMotion>,
}
389
390/// Object motion tracking result
391#[derive(Debug, Clone, Serialize, Deserialize)]
392pub struct ObjectMotion {
393    /// Object identifier
394    pub object_id: String,
395    /// Motion trajectory (time, x, y)
396    pub trajectory: Vec<(f32, f32, f32)>,
397    /// Motion speed (pixels per second)
398    pub speed: f32,
399}
400
/// Extracted table information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedTable {
    /// Table headers (one entry per column)
    pub headers: Vec<String>,
    /// Table rows (each row is a list of cell values)
    pub rows: Vec<Vec<String>>,
    /// Table caption, if present in the source document
    pub caption: Option<String>,
    /// Location in document
    pub location: ContentLocation,
}
413
/// Extracted link information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedLink {
    /// Link URL
    pub url: String,
    /// Link text (the anchor/display text)
    pub text: String,
    /// Link title attribute, if present
    pub title: Option<String>,
    /// Location in document
    pub location: ContentLocation,
}
426
/// Document structure information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentStructure {
    /// Document title, if one was found
    pub title: Option<String>,
    /// Headings hierarchy, in document order
    pub headings: Vec<Heading>,
    /// Page count
    pub page_count: usize,
    /// Section count
    pub section_count: usize,
    /// Table of contents entries
    pub table_of_contents: Vec<TocEntry>,
}
441
/// Heading information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Heading {
    /// Heading level (1-6, as in HTML h1-h6)
    pub level: usize,
    /// Heading text
    pub text: String,
    /// Location in document
    pub location: ContentLocation,
}
452
/// Table of contents entry
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TocEntry {
    /// Section title
    pub title: String,
    /// Section nesting level
    pub level: usize,
    /// Page number, if the format carries pagination
    pub page: Option<usize>,
    /// Location reference into the document
    pub location: ContentLocation,
}
465
/// Content chunk for embedding
///
/// One unit of text produced by the configured `ChunkingStrategy`,
/// optionally carrying its computed embedding.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentChunk {
    /// Chunk text content
    pub text: String,
    /// Chunk type (paragraph, heading, table, ...)
    pub chunk_type: ChunkType,
    /// Location in document
    pub location: ContentLocation,
    /// Associated metadata (free-form key/value pairs)
    pub metadata: HashMap<String, String>,
    /// Embedding vector (if computed)
    pub embedding: Option<Vector>,
}
480
481/// Content chunk types
482#[derive(Debug, Clone, Serialize, Deserialize)]
483pub enum ChunkType {
484    Paragraph,
485    Heading,
486    Table,
487    List,
488    Quote,
489    Code,
490    Caption,
491    Footnote,
492    Header,
493    Footer,
494}
495
496/// Content location information
497#[derive(Debug, Clone, Serialize, Deserialize)]
498pub struct ContentLocation {
499    /// Page number (1-indexed)
500    pub page: Option<usize>,
501    /// Section number
502    pub section: Option<usize>,
503    /// Character offset in document
504    pub char_offset: Option<usize>,
505    /// Line number
506    pub line: Option<usize>,
507    /// Column number
508    pub column: Option<usize>,
509}
510
/// Cross-modal embedding that combines multiple modalities
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossModalEmbedding {
    /// Combined embedding vector
    pub embedding: Vector,
    /// Modalities included in this embedding
    pub modalities: Vec<Modality>,
    /// Fusion strategy used to combine the per-modality embeddings
    pub fusion_strategy: FusionStrategy,
    /// Confidence score for the embedding quality
    pub confidence: f32,
    /// Identifiers of the content items that contributed to this embedding
    pub content_ids: Vec<String>,
}
525
526/// Modality types for cross-modal processing
527#[derive(Debug, Clone, Serialize, Deserialize)]
528pub enum Modality {
529    Text,
530    Image,
531    Audio,
532    Video,
533}
534
535/// Fusion strategies for combining modalities
536#[derive(Debug, Clone, Serialize, Deserialize)]
537pub enum FusionStrategy {
538    /// Simple concatenation of features
539    Concatenation,
540    /// Weighted average of embeddings
541    WeightedAverage(Vec<f32>), // weights for each modality
542    /// Attention-based fusion
543    Attention,
544    /// Late fusion with score combination
545    LateFusion,
546    /// Multi-layer perceptron fusion
547    MlpFusion,
548    /// Transformer-based fusion
549    TransformerFusion,
550}
551
552/// Processing statistics
553#[derive(Debug, Clone, Default, Serialize, Deserialize)]
554pub struct ProcessingStats {
555    /// Processing time in milliseconds
556    pub processing_time_ms: u64,
557    /// Total characters extracted
558    pub total_chars: usize,
559    /// Total words extracted
560    pub total_words: usize,
561    /// Number of images found
562    pub image_count: usize,
563    /// Number of tables found
564    pub table_count: usize,
565    /// Number of links found
566    pub link_count: usize,
567    /// Number of chunks created
568    pub chunk_count: usize,
569    /// Number of audio files processed
570    pub audio_count: usize,
571    /// Number of video files processed
572    pub video_count: usize,
573    /// Number of cross-modal embeddings generated
574    pub cross_modal_embedding_count: usize,
575    /// Total time spent on image processing (ms)
576    pub image_processing_time_ms: u64,
577    /// Total time spent on audio processing (ms)
578    pub audio_processing_time_ms: u64,
579    /// Total time spent on video processing (ms)
580    pub video_processing_time_ms: u64,
581    /// Processing warnings
582    pub warnings: Vec<String>,
583}