1#[cfg(feature = "content-processing")]
6use crate::content_processing::{
7 ContentExtractionConfig, ContentLocation, DocumentFormat, DocumentStructure, ExtractedContent,
8 FormatHandler, Heading, ProcessingStats,
9};
10#[cfg(feature = "content-processing")]
11use anyhow::Result;
12#[cfg(feature = "content-processing")]
13use std::collections::HashMap;
14
/// Format handler for plain-text documents.
///
/// The decoded input is returned as the document text as-is; no metadata,
/// links, headings or other structure are derived from it.
#[cfg(feature = "content-processing")]
pub struct PlainTextHandler;

#[cfg(feature = "content-processing")]
impl FormatHandler for PlainTextHandler {
    /// Decodes `data` as UTF-8 and returns it as the extracted text.
    ///
    /// Invalid UTF-8 sequences are replaced with U+FFFD instead of being
    /// rejected, so this implementation always returns `Ok`. The extraction
    /// configuration is not consulted.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        // `into_owned` reuses the allocation when the lossy decode already
        // had to build a `String`.
        let text = String::from_utf8_lossy(data).into_owned();

        Ok(ExtractedContent {
            format: DocumentFormat::PlainText,
            text,
            metadata: HashMap::new(),
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure: DocumentStructure {
                title: None,
                headings: Vec::new(),
                page_count: 1,
                section_count: 1,
                table_of_contents: Vec::new(),
            },
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when `data` is valid UTF-8 (this includes empty input).
    fn can_handle(&self, data: &[u8]) -> bool {
        // Validate in place: copying the whole buffer into a `String` only to
        // throw it away is wasted work on large inputs.
        std::str::from_utf8(data).is_ok()
    }

    /// File extensions (without the leading dot) served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["txt", "text"]
    }
}
60
/// Format handler for HTML documents.
///
/// Extraction is regex-based (see the inherent methods on this type); no
/// HTML parser is involved.
#[cfg(feature = "content-processing")]
pub struct HtmlHandler;

#[cfg(feature = "content-processing")]
impl FormatHandler for HtmlHandler {
    /// Extracts text, headings, metadata and (optionally) links from `data`.
    ///
    /// The bytes are decoded as UTF-8 with invalid sequences replaced by
    /// U+FFFD. Links are collected only when `config.extract_links` is set.
    /// The document title is taken from the `"title"` metadata entry, if any.
    /// Images and tables are not extracted. This implementation always
    /// returns `Ok`.
    fn extract_content(
        &self,
        data: &[u8],
        config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let html = String::from_utf8_lossy(data);

        let text = self.extract_text_from_html(&html);
        let headings = self.extract_headings(&html);
        let links = if config.extract_links {
            self.extract_links(&html)
        } else {
            Vec::new()
        };

        let metadata = self.extract_metadata(&html);
        let title = metadata.get("title").cloned();

        Ok(ExtractedContent {
            format: DocumentFormat::Html,
            text,
            metadata,
            images: Vec::new(),
            tables: Vec::new(),
            links,
            structure: DocumentStructure {
                title,
                headings,
                page_count: 1,
                section_count: 1,
                table_of_contents: Vec::new(),
            },
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when `data` contains an `<html` tag or a `<!DOCTYPE`
    /// declaration anywhere in the buffer.
    ///
    /// Matching is ASCII case-insensitive, so `<!doctype html>` and `<HTML>`
    /// are recognised as well.
    fn can_handle(&self, data: &[u8]) -> bool {
        // Both markers are pure ASCII, so the raw bytes can be searched
        // directly; there is no need to decode or lower-case a copy of the
        // document first.
        let contains_marker = |marker: &str| {
            let marker = marker.as_bytes();
            data.windows(marker.len())
                .any(|window| window.eq_ignore_ascii_case(marker))
        };

        contains_marker("<html") || contains_marker("<!doctype")
    }

    /// File extensions (without the leading dot) served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["html", "htm"]
    }
}
118
#[cfg(feature = "content-processing")]
impl HtmlHandler {
    /// Regex fragment matching one quoted attribute value.
    ///
    /// It contributes two consecutive capture groups: the first participates
    /// for a double-quoted value, the second for a single-quoted one. Keeping
    /// the two quote styles apart lets a value contain the other quote
    /// character (e.g. `content="It's here"`) without being cut short.
    const QUOTED_VALUE: &'static str = r#"(?:"([^"]*)"|'([^']*)')"#;

    /// Returns whichever of the two groups produced by [`Self::QUOTED_VALUE`]
    /// matched, where `first_group` is the index of the double-quoted group.
    fn quoted_value<'t>(capture: &regex::Captures<'t>, first_group: usize) -> Option<&'t str> {
        capture
            .get(first_group)
            .or_else(|| capture.get(first_group + 1))
            .map(|m| m.as_str())
    }

    /// Removes every match of `tag_re` from `fragment`, collapses whitespace
    /// runs to single spaces and trims both ends.
    fn fragment_text(tag_re: &regex::Regex, fragment: &str) -> String {
        tag_re
            .replace_all(fragment, "")
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ")
    }

    /// Reduces an HTML document to its text content.
    ///
    /// `<script>` and `<style>` elements are dropped together with their
    /// bodies, every remaining tag is replaced by a space, and whitespace
    /// runs are collapsed. Character references such as `&amp;` are left
    /// undecoded.
    fn extract_text_from_html(&self, html: &str) -> String {
        // `(?is)`: tag names are case-insensitive, and script/style bodies
        // normally span several lines, which `.` only crosses with `s` set.
        let script_re = regex::Regex::new(r"(?is)<script\b[^>]*>.*?</script\s*>")
            .expect("valid regex pattern");
        let style_re = regex::Regex::new(r"(?is)<style\b[^>]*>.*?</style\s*>")
            .expect("valid regex pattern");
        let tag_re = regex::Regex::new(r"<[^>]*>").expect("valid regex pattern");
        let whitespace_re = regex::Regex::new(r"\s+").expect("valid regex pattern");

        let text = script_re.replace_all(html, "");
        let text = style_re.replace_all(&text, "");
        // Tags become a space so that adjacent elements do not fuse into one word.
        let text = tag_re.replace_all(&text, " ");
        let text = whitespace_re.replace_all(&text, " ");

        text.trim().to_string()
    }

    /// Collects the `<h1>`..`<h6>` headings of `html`.
    ///
    /// Headings are emitted level by level (all `h1`, then all `h2`, ...),
    /// not in document order, and `location.section` is the index of the
    /// heading among the headings of the same level. Nested tags are removed
    /// from the heading text and its whitespace is normalised.
    fn extract_headings(&self, html: &str) -> Vec<Heading> {
        let mut headings = Vec::new();
        let tag_remove_re = regex::Regex::new(r"<[^>]*>").expect("valid regex pattern");

        for level in 1..=6 {
            // `(?is)` accepts upper-case tags and headings broken across lines.
            let pattern = format!(r"(?is)<h{}\b[^>]*>(.*?)</h{}\s*>", level, level);
            if let Ok(re) = regex::Regex::new(&pattern) {
                for (i, capture) in re.captures_iter(html).enumerate() {
                    if let Some(heading_text) = capture.get(1) {
                        headings.push(Heading {
                            level,
                            text: Self::fragment_text(&tag_remove_re, heading_text.as_str()),
                            location: ContentLocation {
                                page: None,
                                section: Some(i),
                                char_offset: None,
                                line: None,
                                column: None,
                            },
                        });
                    }
                }
            }
        }

        headings
    }

    /// Collects the `<a href="...">` links of `html` in document order.
    ///
    /// The URL is the quoted `href` value; the link text is the anchor body
    /// with nested tags removed and whitespace normalised. Anchors without a
    /// quoted `href` are skipped.
    fn extract_links(&self, html: &str) -> Vec<crate::content_processing::ExtractedLink> {
        let mut links = Vec::new();
        let tag_remove_re = regex::Regex::new(r"<[^>]*>").expect("valid regex pattern");

        // `<a\s` (rather than `<a`) keeps other tags starting with "a", such
        // as `<area href=...>`, from being treated as anchors.
        // Groups: 1/2 = href value (double/single quoted), 3 = anchor body.
        let pattern = format!(
            r"(?is)<a\s[^>]*href\s*=\s*{}[^>]*>(.*?)</a\s*>",
            Self::QUOTED_VALUE
        );

        if let Ok(re) = regex::Regex::new(&pattern) {
            for capture in re.captures_iter(html) {
                if let (Some(url), Some(text)) = (Self::quoted_value(&capture, 1), capture.get(3)) {
                    links.push(crate::content_processing::ExtractedLink {
                        url: url.to_string(),
                        text: Self::fragment_text(&tag_remove_re, text.as_str()),
                        title: None,
                        location: crate::content_processing::ContentLocation {
                            page: None,
                            section: None,
                            char_offset: None,
                            line: None,
                            column: None,
                        },
                    });
                }
            }
        }

        links
    }

    /// Collects document metadata from `html`.
    ///
    /// The first `<title>` element is stored under the key `"title"` with its
    /// whitespace normalised. Every `<meta>` tag carrying both a quoted
    /// `name` and a quoted `content` attribute, in either order, is stored as
    /// `name -> content`. Meta entries are inserted after the title, so a
    /// `<meta name="title">` overrides the `<title>` element, and a later
    /// duplicate name overrides an earlier one.
    fn extract_metadata(&self, html: &str) -> HashMap<String, String> {
        let mut metadata = HashMap::new();

        if let Ok(re) = regex::Regex::new(r"(?is)<title\b[^>]*>(.*?)</title\s*>") {
            if let Some(capture) = re.captures(html) {
                if let Some(title) = capture.get(1) {
                    let title = title
                        .as_str()
                        .split_whitespace()
                        .collect::<Vec<_>>()
                        .join(" ");
                    metadata.insert("title".to_string(), title);
                }
            }
        }

        // Each entry is (pattern, first group of `name`, first group of
        // `content`). Attribute order inside a tag is not significant in
        // HTML, so both orders are recognised.
        let meta_patterns = [
            (
                format!(
                    r"(?is)<meta\b[^>]*name\s*=\s*{0}[^>]*content\s*=\s*{0}[^>]*>",
                    Self::QUOTED_VALUE
                ),
                1,
                3,
            ),
            (
                format!(
                    r"(?is)<meta\b[^>]*content\s*=\s*{0}[^>]*name\s*=\s*{0}[^>]*>",
                    Self::QUOTED_VALUE
                ),
                3,
                1,
            ),
        ];

        for (pattern, name_group, content_group) in &meta_patterns {
            if let Ok(re) = regex::Regex::new(pattern) {
                for capture in re.captures_iter(html) {
                    if let (Some(name), Some(content)) = (
                        Self::quoted_value(&capture, *name_group),
                        Self::quoted_value(&capture, *content_group),
                    ) {
                        metadata.insert(name.to_string(), content.to_string());
                    }
                }
            }
        }

        metadata
    }
}
239
/// Format handler for XML documents.
///
/// Only the text between tags is extracted; element structure, attributes
/// and metadata are not interpreted.
#[cfg(feature = "content-processing")]
pub struct XmlHandler;

#[cfg(feature = "content-processing")]
impl FormatHandler for XmlHandler {
    /// Decodes `data` as UTF-8 (invalid sequences become U+FFFD) and returns
    /// its tag-stripped text.
    ///
    /// The extraction configuration is not consulted, and this implementation
    /// always returns `Ok`.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let decoded = String::from_utf8_lossy(data);
        let plain_text = self.extract_text_from_xml(&decoded);

        // XML input is reported as a single page / single section without
        // any heading information.
        let structure = DocumentStructure {
            title: None,
            headings: Vec::new(),
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Xml,
            text: plain_text,
            metadata: HashMap::new(),
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when the input starts (after leading whitespace) with
    /// an `<?xml` declaration, or when it contains both a `<` and a `>`
    /// anywhere.
    fn can_handle(&self, data: &[u8]) -> bool {
        let decoded = String::from_utf8_lossy(data);

        if decoded.trim_start().starts_with("<?xml") {
            return true;
        }

        // Fallback heuristic: anything that has both angle brackets.
        decoded.contains("<") && decoded.contains(">")
    }

    /// File extensions (without the leading dot) served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["xml"]
    }
}
288
#[cfg(feature = "content-processing")]
impl XmlHandler {
    /// Strips markup from `xml` and returns the remaining text.
    ///
    /// Every `<...>` span is replaced by a single space, whitespace runs are
    /// then collapsed to one space, and the result is trimmed.
    fn extract_text_from_xml(&self, xml: &str) -> String {
        let markup = regex::Regex::new(r"<[^>]*>").expect("valid regex pattern");
        let whitespace = regex::Regex::new(r"\s+").expect("valid regex pattern");

        // A space (not an empty string) keeps the text of adjacent elements
        // from running together.
        let without_tags = markup.replace_all(xml, " ");
        let collapsed = whitespace.replace_all(&without_tags, " ");

        collapsed.trim().to_string()
    }
}
306
/// Format handler for Markdown documents.
///
/// Extraction is regex-based (see the inherent methods on this type); no
/// Markdown parser is involved.
#[cfg(feature = "content-processing")]
pub struct MarkdownHandler;

#[cfg(feature = "content-processing")]
impl FormatHandler for MarkdownHandler {
    /// Extracts text, headings and (optionally) links from `data`.
    ///
    /// The bytes are decoded as UTF-8 with invalid sequences replaced by
    /// U+FFFD. Links are collected only when `config.extract_links` is set.
    /// No title or metadata is derived. This implementation always returns
    /// `Ok`.
    fn extract_content(
        &self,
        data: &[u8],
        config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let source = String::from_utf8_lossy(data);

        let plain_text = self.extract_text_from_markdown(&source);
        let found_headings = self.extract_headings(&source);
        let mut found_links = Vec::new();
        if config.extract_links {
            found_links = self.extract_links(&source);
        }

        let structure = DocumentStructure {
            title: None,
            headings: found_headings,
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        Ok(ExtractedContent {
            format: DocumentFormat::Markdown,
            text: plain_text,
            metadata: HashMap::new(),
            images: Vec::new(),
            tables: Vec::new(),
            links: found_links,
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when the decoded input contains at least one of the
    /// markers `#`, `*`, a triple backtick, or `[`.
    fn can_handle(&self, data: &[u8]) -> bool {
        let source = String::from_utf8_lossy(data);
        let markers = ["#", "*", "```", "["];

        markers.iter().any(|marker| source.contains(*marker))
    }

    /// File extensions (without the leading dot) served by this handler.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["md", "markdown"]
    }
}
364
#[cfg(feature = "content-processing")]
impl MarkdownHandler {
    /// Reduces a Markdown document to plain text.
    ///
    /// Steps, in order: fenced code blocks and inline code spans are removed
    /// together with their content; `*`/`_` emphasis markers are dropped;
    /// leading `#` heading markers are dropped; links and images are replaced
    /// by their text / alt text; whitespace runs are collapsed to single
    /// spaces and the result is trimmed.
    fn extract_text_from_markdown(&self, markdown: &str) -> String {
        let fenced_code_re = regex::Regex::new(r"```[\s\S]*?```").expect("valid regex pattern");
        let inline_code_re = regex::Regex::new(r"`[^`]*`").expect("valid regex pattern");
        let emphasis_re =
            regex::Regex::new(r"[*_]{1,2}([^*_]*)[*_]{1,2}").expect("valid regex pattern");
        // `(?m)` is essential: without it `^`/`$` anchor to the whole document
        // and the pattern can only match a one-line input, leaving the `#`
        // markers of every real document in the output. `[ \t]` instead of
        // `\s` keeps the match on the heading's own line; a space is required
        // after the markers, as in `extract_headings`.
        let heading_re =
            regex::Regex::new(r"(?m)^#{1,6}[ \t]+(.*)$").expect("valid regex pattern");
        // The optional `!` folds image syntax `![alt](src)` into the same
        // rule so no stray `!` is left in front of the alt text.
        let link_re =
            regex::Regex::new(r"!?\[([^\]]*)\]\([^)]*\)").expect("valid regex pattern");
        let whitespace_re = regex::Regex::new(r"\s+").expect("valid regex pattern");

        let text = fenced_code_re.replace_all(markdown, "");
        let text = inline_code_re.replace_all(&text, "");
        let text = emphasis_re.replace_all(&text, "$1");
        let text = heading_re.replace_all(&text, "$1");
        let text = link_re.replace_all(&text, "$1");
        let text = whitespace_re.replace_all(&text, " ");

        text.trim().to_string()
    }

    /// Collects the ATX headings (`#` through `######`) of `markdown` in
    /// document order.
    ///
    /// `level` is the number of leading `#` characters. Both
    /// `location.section` and `location.line` hold the zero-based line index
    /// of the heading. Lines inside fenced code blocks are ignored, so a
    /// `# comment` in a code sample is not reported as a heading.
    fn extract_headings(&self, markdown: &str) -> Vec<Heading> {
        let mut headings = Vec::new();
        let heading_re = regex::Regex::new(r"^(#{1,6})\s+(.+)$").expect("valid regex pattern");
        let mut in_code_fence = false;

        for (i, line) in markdown.lines().enumerate() {
            // Mirror the fence pairing of `extract_text_from_markdown`: every
            // triple-backtick delimiter toggles the state, so an odd number
            // of them on a line flips it. A heading has to start its line,
            // hence only the state at the start of the line matters.
            let line_starts_in_fence = in_code_fence;
            if line.matches("```").count() % 2 == 1 {
                in_code_fence = !in_code_fence;
            }
            if line_starts_in_fence {
                continue;
            }

            if let Some(captures) = heading_re.captures(line) {
                let level = captures[1].len();
                // Trailing blanks are not part of the heading text.
                let text = captures[2].trim_end().to_string();

                headings.push(Heading {
                    level,
                    text,
                    location: ContentLocation {
                        page: None,
                        section: Some(i),
                        char_offset: None,
                        line: Some(i),
                        column: None,
                    },
                });
            }
        }

        headings
    }

    /// Collects inline links of the form `[text](url)` in document order.
    ///
    /// The pattern also matches the `[alt](src)` part of image syntax, so
    /// images are reported as links too. Everything between the parentheses
    /// is returned as the URL verbatim.
    fn extract_links(&self, markdown: &str) -> Vec<crate::content_processing::ExtractedLink> {
        let mut links = Vec::new();

        if let Ok(re) = regex::Regex::new(r"\[([^\]]*)\]\(([^)]*)\)") {
            for capture in re.captures_iter(markdown) {
                if let (Some(text), Some(url)) = (capture.get(1), capture.get(2)) {
                    links.push(crate::content_processing::ExtractedLink {
                        url: url.as_str().to_string(),
                        text: text.as_str().to_string(),
                        title: None,
                        location: crate::content_processing::ContentLocation {
                            page: None,
                            section: None,
                            char_offset: None,
                            line: None,
                            column: None,
                        },
                    });
                }
            }
        }

        links
    }
}