oxirs_vec/content_processing/
text_handlers.rs

1//! Text format handlers for content processing
2//!
3//! This module provides handlers for plain text, HTML, XML, and Markdown documents.
4
5#[cfg(feature = "content-processing")]
6use crate::content_processing::{
7    ContentExtractionConfig, ContentLocation, DocumentFormat, DocumentStructure, ExtractedContent,
8    FormatHandler, Heading, ProcessingStats,
9};
10#[cfg(feature = "content-processing")]
11use anyhow::Result;
12#[cfg(feature = "content-processing")]
13use std::collections::HashMap;
14
/// Handler for plain-text documents.
///
/// The input is treated as UTF-8 text and returned as-is; no structure, links or
/// metadata are extracted.
#[cfg(feature = "content-processing")]
pub struct PlainTextHandler;

#[cfg(feature = "content-processing")]
impl FormatHandler for PlainTextHandler {
    /// Decodes `data` as UTF-8 and returns it as the document text.
    ///
    /// Invalid byte sequences are replaced with U+FFFD rather than reported as an
    /// error, so this never fails. The configuration is not consulted.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        // `into_owned` hands back the already-allocated buffer when lossy decoding had
        // to build a new string, instead of copying it a second time.
        let text = String::from_utf8_lossy(data).into_owned();

        Ok(ExtractedContent {
            format: DocumentFormat::PlainText,
            text,
            metadata: HashMap::new(),
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure: DocumentStructure {
                title: None,
                headings: Vec::new(),
                page_count: 1,
                section_count: 1,
                table_of_contents: Vec::new(),
            },
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when `data` is valid UTF-8.
    fn can_handle(&self, data: &[u8]) -> bool {
        // Validate in place; there is no need to copy the input into a `String`.
        std::str::from_utf8(data).is_ok()
    }

    /// File extensions (without the leading dot) associated with plain text.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["txt", "text"]
    }
}
60
/// Handler for HTML documents.
///
/// Extraction is regex-based rather than backed by a real HTML parser, so it is an
/// approximation. Images and tables are not extracted.
#[cfg(feature = "content-processing")]
pub struct HtmlHandler;

#[cfg(feature = "content-processing")]
impl FormatHandler for HtmlHandler {
    /// Extracts text, headings, metadata and (optionally) links from an HTML document.
    ///
    /// `data` is decoded as UTF-8 with invalid sequences replaced by U+FFFD. Links are
    /// collected only when `config.extract_links` is set. The document title is taken
    /// from the `"title"` entry of the extracted metadata, if present.
    fn extract_content(
        &self,
        data: &[u8],
        config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let html = String::from_utf8_lossy(data);

        // Simple HTML text extraction (in practice, would use a proper HTML parser)
        let text = self.extract_text_from_html(&html);
        let headings = self.extract_headings(&html);
        let links = if config.extract_links {
            self.extract_links(&html)
        } else {
            Vec::new()
        };

        let metadata = self.extract_metadata(&html);
        let title = metadata.get("title").cloned();

        Ok(ExtractedContent {
            format: DocumentFormat::Html,
            text,
            metadata,
            images: Vec::new(), // Would implement image extraction
            tables: Vec::new(), // Would implement table extraction
            links,
            structure: DocumentStructure {
                title,
                headings,
                page_count: 1,
                section_count: 1,
                table_of_contents: Vec::new(),
            },
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        })
    }

    /// Returns `true` when `data` contains an `<html` tag or a `<!DOCTYPE` declaration.
    ///
    /// The match is ASCII case-insensitive, so `<HTML>` and `<!doctype html>` are
    /// recognised as well.
    fn can_handle(&self, data: &[u8]) -> bool {
        /// Byte-wise substring search that ignores ASCII case. `needle` must be non-empty.
        fn contains_ignore_ascii_case(haystack: &[u8], needle: &[u8]) -> bool {
            haystack
                .windows(needle.len())
                .any(|window| window.eq_ignore_ascii_case(needle))
        }

        // Both markers are pure ASCII, so searching the raw bytes finds exactly the
        // occurrences a lossy UTF-8 decode would expose, without allocating.
        contains_ignore_ascii_case(data, b"<html")
            || contains_ignore_ascii_case(data, b"<!doctype")
    }

    /// File extensions (without the leading dot) associated with HTML.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["html", "htm"]
    }
}
118
#[cfg(feature = "content-processing")]
impl HtmlHandler {
    /// Strips markup from `html` and returns the remaining text with whitespace collapsed.
    ///
    /// `<script>` and `<style>` elements are removed together with their contents; every
    /// other tag is replaced by a single space. This is a regex-based approximation, not
    /// a real HTML parser, and character entities are left as written.
    fn extract_text_from_html(&self, html: &str) -> String {
        // `(?is)`: tag names are case-insensitive in HTML, and `.` has to match newlines
        // because script/style bodies normally span several lines. Without the `s` flag
        // multi-line scripts survive and their source code ends up in the text.
        let script_re =
            regex::Regex::new(r"(?is)<script[^>]*>.*?</script>").expect("valid regex pattern");
        let style_re =
            regex::Regex::new(r"(?is)<style[^>]*>.*?</style>").expect("valid regex pattern");
        let tag_re = regex::Regex::new(r"<[^>]*>").expect("valid regex pattern");
        let whitespace_re = regex::Regex::new(r"\s+").expect("valid regex pattern");

        // Remove script and style elements
        let without_scripts = script_re.replace_all(html, "");
        let without_styles = style_re.replace_all(&without_scripts, "");

        // Remove the remaining tags; a space keeps adjacent elements from fusing together
        let without_tags = tag_re.replace_all(&without_styles, " ");

        // Clean up whitespace
        let collapsed = whitespace_re.replace_all(&without_tags, " ");

        collapsed.trim().to_string()
    }

    /// Collects `<h1>`..`<h6>` headings from `html`.
    ///
    /// Nested tags inside a heading are stripped from its text. Results are grouped by
    /// level (all `<h1>` first, then `<h2>`, and so on) rather than in document order,
    /// and `location.section` is the index of the heading among headings of the same
    /// level.
    fn extract_headings(&self, html: &str) -> Vec<Heading> {
        let mut headings = Vec::new();
        let tag_remove_re = regex::Regex::new(r"<[^>]*>").expect("valid regex pattern");

        for level in 1..=6 {
            // `(?is)`: match `<H1>` as well as `<h1>`, and headings whose content wraps
            // onto several lines.
            let pattern = format!(r"(?is)<h{}[^>]*>(.*?)</h{}>", level, level);
            let heading_re = regex::Regex::new(&pattern).expect("valid regex pattern");

            for (i, capture) in heading_re.captures_iter(html).enumerate() {
                if let Some(heading_text) = capture.get(1) {
                    let text = tag_remove_re
                        .replace_all(heading_text.as_str(), "")
                        .trim()
                        .to_string();

                    headings.push(Heading {
                        level,
                        text,
                        location: ContentLocation {
                            page: None,
                            section: Some(i),
                            char_offset: None,
                            line: None,
                            column: None,
                        },
                    });
                }
            }
        }

        headings
    }

    /// Collects `<a href="...">...</a>` links from `html`.
    ///
    /// The `href` value must be quoted. Link text has nested tags stripped and is
    /// trimmed. No location information is recorded.
    fn extract_links(&self, html: &str) -> Vec<crate::content_processing::ExtractedLink> {
        let mut links = Vec::new();
        let tag_remove_re = regex::Regex::new(r"<[^>]*>").expect("valid regex pattern");

        // `<a\s` (rather than `<a`) keeps other elements whose name merely starts with
        // "a", such as `<area href=...>`, from being treated as anchors. `(?is)` allows
        // upper-case tags and link text that spans lines.
        let link_re = regex::Regex::new(
            r#"(?is)<a\s[^>]*href\s*=\s*["']([^"']*)["'][^>]*>(.*?)</a>"#,
        )
        .expect("valid regex pattern");

        for capture in link_re.captures_iter(html) {
            if let (Some(url), Some(text)) = (capture.get(1), capture.get(2)) {
                links.push(crate::content_processing::ExtractedLink {
                    url: url.as_str().to_string(),
                    text: tag_remove_re
                        .replace_all(text.as_str(), "")
                        .trim()
                        .to_string(),
                    title: None,
                    location: crate::content_processing::ContentLocation {
                        page: None,
                        section: None,
                        char_offset: None,
                        line: None,
                        column: None,
                    },
                });
            }
        }

        links
    }

    /// Builds a metadata map from the `<title>` element and `<meta name=... content=...>` tags.
    ///
    /// The title is stored under the key `"title"`. A `<meta>` tag is only recognised
    /// when its `name` attribute appears before `content` and both values are quoted.
    /// Entries are inserted in document order after the title, so a later entry with
    /// the same key (including a meta tag named `title`) replaces an earlier one.
    fn extract_metadata(&self, html: &str) -> HashMap<String, String> {
        let mut metadata = HashMap::new();

        // Extract title. `(?is)` accepts `<TITLE>` and titles broken across lines.
        let title_re =
            regex::Regex::new(r"(?is)<title[^>]*>(.*?)</title>").expect("valid regex pattern");
        if let Some(title) = title_re.captures(html).and_then(|capture| capture.get(1)) {
            metadata.insert("title".to_string(), title.as_str().trim().to_string());
        }

        // Extract meta tags (tag and attribute names matched case-insensitively)
        let meta_re = regex::Regex::new(
            r#"(?i)<meta[^>]*name\s*=\s*["']([^"']*)["'][^>]*content\s*=\s*["']([^"']*)["'][^>]*>"#,
        )
        .expect("valid regex pattern");
        for capture in meta_re.captures_iter(html) {
            if let (Some(name), Some(content)) = (capture.get(1), capture.get(2)) {
                metadata.insert(name.as_str().to_string(), content.as_str().to_string());
            }
        }

        metadata
    }
}
239
/// Handler for XML documents.
///
/// Only the text between tags is extracted; no metadata, links or structure are
/// derived from the markup.
#[cfg(feature = "content-processing")]
pub struct XmlHandler;

#[cfg(feature = "content-processing")]
impl FormatHandler for XmlHandler {
    /// Decodes `data` as UTF-8 (invalid sequences become U+FFFD) and returns the text
    /// left after stripping tags. The configuration is not consulted.
    fn extract_content(
        &self,
        data: &[u8],
        _config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let decoded = String::from_utf8_lossy(data);

        // Basic XML text extraction
        let plain_text = self.extract_text_from_xml(&decoded);

        // XML is reported as a single page / single section with no headings.
        let structure = DocumentStructure {
            title: None,
            headings: Vec::new(),
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        let extracted = ExtractedContent {
            format: DocumentFormat::Xml,
            text: plain_text,
            metadata: HashMap::new(),
            images: Vec::new(),
            tables: Vec::new(),
            links: Vec::new(),
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        };

        Ok(extracted)
    }

    /// Returns `true` when the input starts (after leading whitespace) with an
    /// `<?xml` declaration, or when it contains both a `<` and a `>` anywhere.
    fn can_handle(&self, data: &[u8]) -> bool {
        let content = String::from_utf8_lossy(data);

        if content.trim_start().starts_with("<?xml") {
            return true;
        }

        // Fallback heuristic: anything containing both angle brackets.
        content.contains('<') && content.contains('>')
    }

    /// File extensions (without the leading dot) associated with XML.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["xml"]
    }
}
288
#[cfg(feature = "content-processing")]
impl XmlHandler {
    /// Returns the text content of `xml`: every `<...>` span is replaced by a space,
    /// runs of whitespace are collapsed to a single space, and the result is trimmed.
    fn extract_text_from_xml(&self, xml: &str) -> String {
        let tag_pattern = regex::Regex::new(r"<[^>]*>").expect("valid regex pattern");
        let whitespace_pattern = regex::Regex::new(r"\s+").expect("valid regex pattern");

        // Replace tags with a space so neighbouring text nodes stay separated.
        let untagged = tag_pattern.replace_all(xml, " ");

        // Normalise whitespace left behind by indentation and removed tags.
        let collapsed = whitespace_pattern.replace_all(&untagged, " ");

        collapsed.trim().to_string()
    }
}
306
/// Handler for Markdown documents.
///
/// Produces plain text with formatting removed, the list of headings, and
/// (optionally) the inline links found in the source.
#[cfg(feature = "content-processing")]
pub struct MarkdownHandler;

#[cfg(feature = "content-processing")]
impl FormatHandler for MarkdownHandler {
    /// Extracts text, headings and (when `config.extract_links` is set) links from a
    /// Markdown document. `data` is decoded as UTF-8 with invalid sequences replaced
    /// by U+FFFD.
    fn extract_content(
        &self,
        data: &[u8],
        config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent> {
        let source = String::from_utf8_lossy(data);

        let text = self.extract_text_from_markdown(&source);
        let headings = self.extract_headings(&source);

        // Link extraction is opt-in; an empty list is reported otherwise.
        let links = config
            .extract_links
            .then(|| self.extract_links(&source))
            .unwrap_or_default();

        let structure = DocumentStructure {
            title: None,
            headings,
            page_count: 1,
            section_count: 1,
            table_of_contents: Vec::new(),
        };

        let extracted = ExtractedContent {
            format: DocumentFormat::Markdown,
            text,
            metadata: HashMap::new(),
            images: Vec::new(),
            tables: Vec::new(),
            links,
            structure,
            chunks: Vec::new(),
            language: None,
            processing_stats: ProcessingStats::default(),
            audio_content: Vec::new(),
            video_content: Vec::new(),
            cross_modal_embeddings: Vec::new(),
        };

        Ok(extracted)
    }

    /// Returns `true` when the input contains any of the characters or sequences
    /// commonly seen in Markdown: `#`, `*`, a triple backtick, or `[`.
    fn can_handle(&self, data: &[u8]) -> bool {
        let content = String::from_utf8_lossy(data);

        // Check for common markdown patterns
        ["#", "*", "```", "["]
            .iter()
            .any(|marker| content.contains(*marker))
    }

    /// File extensions (without the leading dot) associated with Markdown.
    fn supported_extensions(&self) -> Vec<&'static str> {
        vec!["md", "markdown"]
    }
}
364
#[cfg(feature = "content-processing")]
impl MarkdownHandler {
    /// Reduces `markdown` to plain text.
    ///
    /// Fenced code blocks and inline code are dropped entirely. Emphasis markers,
    /// leading `#` header markers and link targets are removed while the text they
    /// wrap is kept. Whitespace is then collapsed to single spaces and the result is
    /// trimmed.
    fn extract_text_from_markdown(&self, markdown: &str) -> String {
        let code_block_re = regex::Regex::new(r"```[\s\S]*?```").expect("valid regex pattern");
        let inline_code_re = regex::Regex::new(r"`[^`]*`").expect("valid regex pattern");
        let emphasis_re =
            regex::Regex::new(r"[*_]{1,2}([^*_]*)[*_]{1,2}").expect("valid regex pattern");
        // `(?m)` makes `^`/`$` match at every line boundary. Without it the pattern can
        // only fire when the whole document is a single header line, so `#` markers on
        // ordinary header lines would be left in the output. `[ \t]*` keeps the match
        // from running past the end of the header line.
        let header_re = regex::Regex::new(r"(?m)^#+[ \t]*(.*)$").expect("valid regex pattern");
        let link_re = regex::Regex::new(r"\[([^\]]*)\]\([^)]*\)").expect("valid regex pattern");
        let whitespace_re = regex::Regex::new(r"\s+").expect("valid regex pattern");

        // Remove code blocks
        let text = code_block_re.replace_all(markdown, "");

        // Remove inline code
        let text = inline_code_re.replace_all(&text, "");

        // Remove markdown formatting
        let text = emphasis_re.replace_all(&text, "$1");

        // Remove headers (keep the header text)
        let text = header_re.replace_all(&text, "$1");

        // Remove links (keep the link text)
        let text = link_re.replace_all(&text, "$1");

        // Clean up whitespace
        whitespace_re.replace_all(&text, " ").trim().to_string()
    }

    /// Collects ATX-style headings (`#` through `######`) in document order.
    ///
    /// Lines inside fenced code blocks are ignored, matching the text extraction which
    /// discards fenced code. Both `location.section` and `location.line` are set to
    /// the zero-based line index of the heading.
    fn extract_headings(&self, markdown: &str) -> Vec<Heading> {
        let mut headings = Vec::new();
        let heading_re = regex::Regex::new(r"^(#{1,6})\s+(.+)$").expect("valid regex pattern");

        let mut in_code_fence = false;
        for (i, line) in markdown.lines().enumerate() {
            // An odd number of fence markers on a line opens or closes a code block;
            // an even number (e.g. a fence opened and closed on one line) leaves the
            // state unchanged.
            let was_in_code_fence = in_code_fence;
            if line.matches("```").count() % 2 == 1 {
                in_code_fence = !in_code_fence;
            }
            if was_in_code_fence {
                // `# ...` inside code is a comment, not a heading.
                continue;
            }

            if let Some(captures) = heading_re.captures(line) {
                let level = captures[1].len();
                // Greedy `.+` also captures trailing whitespace; drop it.
                let text = captures[2].trim().to_string();

                headings.push(Heading {
                    level,
                    text,
                    location: ContentLocation {
                        page: None,
                        section: Some(i),
                        char_offset: None,
                        line: Some(i),
                        column: None,
                    },
                });
            }
        }

        headings
    }

    /// Collects inline links of the form `[text](url)`.
    ///
    /// Image references (`![alt](src)`) use the same bracket syntax and are therefore
    /// returned as well. No location information is recorded.
    fn extract_links(&self, markdown: &str) -> Vec<crate::content_processing::ExtractedLink> {
        let link_re = regex::Regex::new(r"\[([^\]]*)\]\(([^)]*)\)").expect("valid regex pattern");

        let mut links = Vec::new();
        for capture in link_re.captures_iter(markdown) {
            if let (Some(text), Some(url)) = (capture.get(1), capture.get(2)) {
                links.push(crate::content_processing::ExtractedLink {
                    url: url.as_str().to_string(),
                    text: text.as_str().to_string(),
                    title: None,
                    location: crate::content_processing::ContentLocation {
                        page: None,
                        section: None,
                        char_offset: None,
                        line: None,
                        column: None,
                    },
                });
            }
        }

        links
    }
}