// oxirs_vec/content_processing/data_handlers.rs
#[cfg(feature = "content-processing")]
6use crate::content_processing::{
7 ContentExtractionConfig, ContentLocation, DocumentFormat, DocumentStructure, ExtractedContent,
8 ExtractedTable, FormatHandler, ProcessingStats,
9};
10#[cfg(feature = "content-processing")]
11use anyhow::Result;
12#[cfg(feature = "content-processing")]
13use std::collections::HashMap;
14
#[cfg(feature = "content-processing")]
/// Format handler for JSON documents.
///
/// The handler carries no state; its behavior is defined entirely by its
/// [`FormatHandler`] implementation.
#[derive(Debug, Clone, Default)]
pub struct JsonHandler;
18
#[cfg(feature = "content-processing")]
impl FormatHandler for JsonHandler {
    /// Builds an [`ExtractedContent`] from a JSON payload.
    ///
    /// The bytes are decoded as UTF-8 with invalid sequences replaced rather
    /// than rejected. When the decoded text parses as JSON, the text is
    /// gathered by `extract_text_from_json`; otherwise the decoded input is
    /// used verbatim. Apart from `format` and `text`, every field is left
    /// empty or at its default, with a single page and a single section.
    ///
    /// This implementation never returns `Err`, and `_config` is not consulted.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let decoded = String::from_utf8_lossy(data);
        let parsed = serde_json::from_str::<serde_json::Value>(&decoded);

        let text = match parsed {
            Ok(document) => extract_text_from_json(&document),
            // Not valid JSON: keep the decoded input unchanged.
            Err(_) => decoded.into_owned(),
        };

        let structure = DocumentStructure {
            title: None,
            headings: Vec::new(),
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Json,
            text,
            metadata: HashMap::new(),
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when the lossily decoded bytes parse as a
    /// `serde_json::Value`.
    fn can_handle(&self, data: &[u8]) -> bool {
        let decoded = String::from_utf8_lossy(data);
        let parsed = serde_json::from_str::<serde_json::Value>(&decoded);
        parsed.is_ok()
    }

    /// File extensions served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["json"]
    }
}
66
#[cfg(feature = "content-processing")]
/// Flattens a JSON value into a single text string.
///
/// * Strings are returned as-is (without surrounding quotes).
/// * Arrays and objects are flattened recursively; the text of their
///   elements (for objects: of their values, keys are ignored) is joined
///   with a single space.
/// * Every other value is rendered with its `to_string()` form.
fn extract_text_from_json(value: &serde_json::Value) -> String {
    use serde_json::Value;

    /// Extracts the text of every child and joins the pieces with one space.
    fn join_texts<'a>(children: impl Iterator<Item = &'a Value>) -> String {
        let pieces: Vec<String> = children.map(extract_text_from_json).collect();
        pieces.join(" ")
    }

    match value {
        Value::String(text) => text.to_owned(),
        Value::Array(items) => join_texts(items.iter()),
        Value::Object(fields) => join_texts(fields.values()),
        scalar => scalar.to_string(),
    }
}
84
#[cfg(feature = "content-processing")]
/// Format handler for CSV documents.
///
/// The handler carries no state; its behavior is defined by its
/// [`FormatHandler`] implementation and its inherent parsing helper.
#[derive(Debug, Clone, Default)]
pub struct CsvHandler;
88
#[cfg(feature = "content-processing")]
impl FormatHandler for CsvHandler {
    /// Builds an [`ExtractedContent`] from CSV bytes.
    ///
    /// The bytes are decoded as UTF-8 with invalid sequences replaced, then
    /// handed to `parse_csv`, which supplies both the text rendering and the
    /// extracted tables. All remaining fields are left empty or at their
    /// defaults, with a single page and a single section.
    ///
    /// This implementation never returns `Err`, and `_config` is not consulted.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let decoded = String::from_utf8_lossy(data);
        let (text, tables) = self.parse_csv(decoded.as_ref());

        let structure = DocumentStructure {
            title: None,
            headings: Vec::new(),
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Csv,
            text,
            metadata: HashMap::new(),
            images: Vec::new(),
            tables,
            links: Vec::new(),
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Heuristic check: returns `true` when the lossily decoded bytes contain
    /// at least one comma or semicolon.
    fn can_handle(&self, data: &[u8]) -> bool {
        let decoded = String::from_utf8_lossy(data);
        decoded.contains(|ch: char| ch == ',' || ch == ';')
    }

    /// File extensions served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["csv"]
    }
}
134
#[cfg(feature = "content-processing")]
impl CsvHandler {
    /// Parses delimited text into a flat text rendering and a single table.
    ///
    /// The first line supplies the headers. Every following line becomes a
    /// row, but only if it has exactly as many fields as the header; other
    /// lines (including blank ones in multi-column data) are skipped.
    ///
    /// The delimiter is a comma unless the header line contains semicolons
    /// and no comma, in which case a semicolon is used. Fields may be wrapped
    /// in double quotes so they can contain the delimiter; a doubled quote
    /// inside a quoted field stands for one literal quote. Because the input
    /// is split into lines first, quoted fields spanning several lines are
    /// not supported. A leading UTF-8 byte-order mark is ignored.
    ///
    /// # Returns
    ///
    /// `(text, tables)` where `text` holds the header and each kept row joined
    /// with `" | "`, one per line, and `tables` holds exactly one table.
    /// Empty input yields an empty string and no tables.
    fn parse_csv(&self, csv_str: &str) -> (String, Vec<ExtractedTable>) {
        // A byte-order mark is not whitespace, so `trim` would leave it glued
        // to the first header name.
        let content = csv_str.strip_prefix('\u{feff}').unwrap_or(csv_str);

        let mut lines = content.lines();
        let header_line = match lines.next() {
            Some(line) => line,
            None => return (String::new(), Vec::new()),
        };

        let delimiter = Self::detect_delimiter(header_line);
        let headers = Self::split_fields(header_line, delimiter);

        let rows: Vec<Vec<String>> = lines
            .map(|line| Self::split_fields(line, delimiter))
            .filter(|row| row.len() == headers.len())
            .collect();

        let text = std::iter::once(&headers)
            .chain(rows.iter())
            .map(|cells| cells.join(" | "))
            .collect::<Vec<_>>()
            .join("\n");

        let table = ExtractedTable {
            headers,
            rows,
            caption: None,
            location: ContentLocation {
                page: Some(1),
                section: None,
                char_offset: None,
                line: None,
                column: None,
            },
        };

        (text, vec![table])
    }

    /// Chooses the field delimiter from the header line: `;` when the header
    /// contains semicolons but no comma, `,` otherwise.
    fn detect_delimiter(header_line: &str) -> char {
        if !header_line.contains(',') && header_line.contains(';') {
            ';'
        } else {
            ','
        }
    }

    /// Splits one line into trimmed fields, honoring double-quoted fields.
    ///
    /// A quote only opens a quoted section at the start of a field (ignoring
    /// leading whitespace); elsewhere it is kept as a literal character. For
    /// lines without quotes the result equals splitting on `delimiter` and
    /// trimming each piece.
    fn split_fields(line: &str, delimiter: char) -> Vec<String> {
        let mut fields = Vec::new();
        let mut current = String::new();
        let mut in_quotes = false;
        let mut chars = line.chars().peekable();

        while let Some(ch) = chars.next() {
            if in_quotes {
                match ch {
                    // A doubled quote inside a quoted field is one literal quote.
                    '"' if chars.peek() == Some(&'"') => {
                        chars.next();
                        current.push('"');
                    }
                    '"' => in_quotes = false,
                    other => current.push(other),
                }
            } else if ch == delimiter {
                fields.push(current.trim().to_string());
                current.clear();
            } else if ch == '"' && current.trim().is_empty() {
                // Opening quote: drop any whitespace collected before it.
                current.clear();
                in_quotes = true;
            } else {
                current.push(ch);
            }
        }

        fields.push(current.trim().to_string());
        fields
    }
}
178
#[cfg(feature = "content-processing")]
/// Catch-all format handler.
///
/// The wrapped string is the format name. It is recorded under the `format`
/// metadata key and appears in the placeholder text produced for input that
/// is not valid UTF-8.
#[derive(Debug, Clone)]
pub struct FallbackHandler(pub String);
182
#[cfg(feature = "content-processing")]
impl FormatHandler for FallbackHandler {
    /// Builds an [`ExtractedContent`] for input no dedicated handler claimed.
    ///
    /// Valid UTF-8 input is used as the text verbatim. Anything else yields a
    /// placeholder sentence stating the byte count and the format name held
    /// by this handler. The metadata records that format name under `format`
    /// and the input length in bytes under `size`. All remaining fields are
    /// left empty or at their defaults, with a single page and section.
    ///
    /// This implementation never returns `Err`, and `_config` is not consulted.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let byte_count = data.len();

        let text = match std::str::from_utf8(data) {
            Ok(valid) => valid.to_owned(),
            Err(_) => format!(
                "Binary content ({} bytes) - {} format not fully supported",
                byte_count, self.0
            ),
        };

        let mut metadata = HashMap::new();
        metadata.insert("format".to_string(), self.0.clone());
        metadata.insert("size".to_string(), byte_count.to_string());

        let structure = DocumentStructure {
            title: None,
            headings: Vec::new(),
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Unknown,
            text,
            metadata,
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Accepts every input unconditionally.
    fn can_handle(&self, _data: &[u8]) -> bool {
        true
    }

    /// Returns an empty list: this handler is not tied to any file extension.
    fn supported_extensions(&self) -> Vec<&'static str> {
        Vec::new()
    }
}