oxirs_vec/content_processing/
data_handlers.rs

//! Data format handlers for content processing
//!
//! This module provides handlers for JSON, CSV, and other structured data formats,
//! plus a generic fallback handler for formats without a dedicated handler.
4
5#[cfg(feature = "content-processing")]
6use crate::content_processing::{
7    ContentExtractionConfig, ContentLocation, DocumentFormat, DocumentStructure, ExtractedContent,
8    ExtractedTable, FormatHandler, ProcessingStats,
9};
10#[cfg(feature = "content-processing")]
11use anyhow::Result;
12#[cfg(feature = "content-processing")]
13use std::collections::HashMap;
14
/// Handler that extracts plain text from JSON documents.
#[cfg(feature = "content-processing")]
pub struct JsonHandler;

#[cfg(feature = "content-processing")]
impl FormatHandler for JsonHandler {
    /// Extracts the textual content of a JSON payload.
    ///
    /// `data` is decoded lossily as UTF-8. When the decoded text parses as JSON
    /// it is flattened with `extract_text_from_json`; when it does not parse,
    /// the decoded text is returned unchanged. All remaining fields of the
    /// result are empty or set to fixed defaults (one page, one section).
    ///
    /// This implementation always returns `Ok`.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let raw = String::from_utf8_lossy(data);

        // Fall back to the raw decoded text when the payload is not valid JSON.
        let text = serde_json::from_str::<serde_json::Value>(&raw)
            .map(|parsed| extract_text_from_json(&parsed))
            .unwrap_or_else(|_| raw.to_string());

        let structure = DocumentStructure {
            title: None,
            headings: Vec::new(),
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Json,
            text,
            metadata: HashMap::new(),
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when `data`, decoded lossily as UTF-8, parses as JSON.
    fn can_handle(&self, data: &[u8]) -> bool {
        let decoded = String::from_utf8_lossy(data);
        matches!(
            serde_json::from_str::<serde_json::Value>(&decoded),
            Ok(_)
        )
    }

    /// File extensions served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        Vec::from(["json"])
    }
}
66
/// Flattens a JSON value into a single string.
///
/// Strings are returned as-is, arrays and objects are flattened recursively
/// with their children's text joined by single spaces (object keys are not
/// included), and every other value is rendered through its `Display`
/// implementation.
#[cfg(feature = "content-processing")]
fn extract_text_from_json(value: &serde_json::Value) -> String {
    // Recursively renders each child and joins the pieces with one space.
    fn join_children<'a>(children: impl Iterator<Item = &'a serde_json::Value>) -> String {
        let mut joined = String::new();
        for (index, child) in children.enumerate() {
            if index > 0 {
                joined.push(' ');
            }
            joined.push_str(&extract_text_from_json(child));
        }
        joined
    }

    match value {
        serde_json::Value::String(text) => text.to_owned(),
        serde_json::Value::Array(items) => join_children(items.iter()),
        serde_json::Value::Object(fields) => join_children(fields.values()),
        other => other.to_string(),
    }
}
84
/// Handler that turns CSV text into a table plus a plain-text rendering.
#[cfg(feature = "content-processing")]
pub struct CsvHandler;

#[cfg(feature = "content-processing")]
impl FormatHandler for CsvHandler {
    /// Extracts the table and text rendering of a CSV payload.
    ///
    /// `data` is decoded lossily as UTF-8 and handed to `parse_csv`, whose
    /// text and tables populate the result. All remaining fields are empty or
    /// set to fixed defaults (one page, one section).
    ///
    /// This implementation always returns `Ok`.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let decoded = String::from_utf8_lossy(data);
        let (text, tables) = self.parse_csv(&decoded);

        let structure = DocumentStructure {
            title: None,
            headings: Vec::new(),
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Csv,
            text,
            metadata: HashMap::new(),
            images: Vec::new(),
            tables,
            links: Vec::new(),
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Heuristic CSV detection: returns `true` when the lossily decoded input
    /// contains at least one comma or semicolon anywhere.
    fn can_handle(&self, data: &[u8]) -> bool {
        String::from_utf8_lossy(data)
            .chars()
            .any(|ch| matches!(ch, ',' | ';'))
    }

    /// File extensions served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        Vec::from(["csv"])
    }
}
134
#[cfg(feature = "content-processing")]
impl CsvHandler {
    /// Parses delimited text into a text rendering and a single table.
    ///
    /// The first line supplies the headers; each following line becomes a row.
    /// Rows whose field count differs from the header count are skipped.
    /// The text rendering is one line per record with fields joined by `" | "`.
    ///
    /// The delimiter is detected from the header line (see
    /// `detect_delimiter`), so semicolon-separated files are split into
    /// columns as well. Double-quoted fields may contain the delimiter.
    /// Records are still read line by line, so a quoted field cannot span
    /// multiple lines.
    ///
    /// Returns an empty string and no tables when the input has no lines.
    fn parse_csv(&self, csv_str: &str) -> (String, Vec<ExtractedTable>) {
        let mut lines = csv_str.lines();
        let header_line = match lines.next() {
            Some(line) => line,
            None => return (String::new(), Vec::new()),
        };

        let delimiter = Self::detect_delimiter(header_line);
        let headers = Self::split_line(header_line, delimiter);

        // Keep only rows that line up with the header columns.
        let rows: Vec<Vec<String>> = lines
            .map(|line| Self::split_line(line, delimiter))
            .filter(|row| row.len() == headers.len())
            .collect();

        // Text representation: header line followed by one line per kept row.
        let mut text = headers.join(" | ");
        for row in &rows {
            text.push('\n');
            text.push_str(&row.join(" | "));
        }

        let table = ExtractedTable {
            headers,
            rows,
            caption: None,
            location: ContentLocation {
                page: Some(1),
                section: None,
                char_offset: None,
                line: None,
                column: None,
            },
        };

        (text, vec![table])
    }

    /// Picks the field delimiter from the header line.
    ///
    /// A comma wins whenever the header contains one, which keeps the result
    /// for comma-separated input unchanged. A semicolon is used only when the
    /// header has semicolons and no commas. With neither present, the comma is
    /// returned and the line parses as a single column.
    fn detect_delimiter(header_line: &str) -> char {
        if !header_line.contains(',') && header_line.contains(';') {
            ';'
        } else {
            ','
        }
    }

    /// Splits one record into trimmed fields, honouring double quotes.
    ///
    /// A `"` that starts a field opens a quoted section in which `delimiter`
    /// is literal text and `""` stands for one literal quote. A `"` appearing
    /// after other field content is kept as an ordinary character. Every field
    /// is trimmed of surrounding whitespace, as in the unquoted case.
    fn split_line(line: &str, delimiter: char) -> Vec<String> {
        let mut fields = Vec::new();
        let mut current = String::new();
        let mut in_quotes = false;
        let mut chars = line.chars().peekable();

        while let Some(ch) = chars.next() {
            if in_quotes {
                if ch != '"' {
                    current.push(ch);
                } else if chars.peek() == Some(&'"') {
                    // Doubled quote inside a quoted section is an escaped quote.
                    chars.next();
                    current.push('"');
                } else {
                    in_quotes = false;
                }
            } else if ch == '"' && current.trim().is_empty() {
                // Opening quote: drop any whitespace that preceded it.
                current.clear();
                in_quotes = true;
            } else if ch == delimiter {
                fields.push(current.trim().to_string());
                current.clear();
            } else {
                current.push(ch);
            }
        }

        // The last field has no trailing delimiter.
        fields.push(current.trim().to_string());
        fields
    }
}
178
/// Catch-all handler for formats without a dedicated handler.
///
/// The wrapped string is the format name; it is recorded in the extracted
/// metadata and quoted in the placeholder text produced for binary input.
#[cfg(feature = "content-processing")]
pub struct FallbackHandler(pub String);

#[cfg(feature = "content-processing")]
impl FormatHandler for FallbackHandler {
    /// Performs a best-effort text extraction.
    ///
    /// Input that is valid UTF-8 is returned verbatim as the text. Any other
    /// input yields a placeholder message stating the byte count and the
    /// format name. The metadata holds the format name under `"format"` and
    /// the byte length under `"size"`.
    ///
    /// This implementation always returns `Ok`.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let byte_count = data.len();

        let text = match std::str::from_utf8(data) {
            Ok(valid) => valid.to_owned(),
            Err(_) => format!(
                "Binary content ({} bytes) - {} format not fully supported",
                byte_count, self.0
            ),
        };

        let mut metadata = HashMap::new();
        metadata.insert("format".to_string(), self.0.clone());
        metadata.insert("size".to_string(), byte_count.to_string());

        let structure = DocumentStructure {
            title: None,
            headings: Vec::new(),
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Unknown,
            text,
            metadata,
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Accepts any input; this handler is the last resort.
    fn can_handle(&self, _data: &[u8]) -> bool {
        true
    }

    /// No extensions are claimed by the fallback.
    fn supported_extensions(&self) -> Vec<&'static str> {
        Vec::new()
    }
}
236}