kaccy_ai/
document.rs

1//! Document analysis module
2//!
3//! This module provides utilities for parsing and analyzing documents
4//! in various formats including Markdown, HTML, and PDF.
5
6use serde::{Deserialize, Serialize};
7use std::fmt::Write as _;
8
/// Supported document formats
///
/// Used both as the result of format detection and as the `format` tag
/// stored on a parsed `DocumentStructure`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DocumentFormat {
    /// Markdown document
    Markdown,
    /// HTML document
    Html,
    /// Plain text (also the fallback when no other format is recognised)
    PlainText,
    /// PDF document (binary; recognised by its `%PDF-` magic bytes or a `pdf` extension)
    Pdf,
}
21
22impl DocumentFormat {
23    /// Detect format from content
24    #[must_use]
25    pub fn detect(content: &str) -> Self {
26        let content_lower = content.to_lowercase();
27
28        // Check for HTML markers
29        if content_lower.contains("<!doctype html")
30            || content_lower.contains("<html")
31            || (content_lower.contains("<head") && content_lower.contains("<body"))
32            || content_lower.contains("<div")
33            || content_lower.contains("<p>")
34        {
35            return DocumentFormat::Html;
36        }
37
38        // Check for Markdown markers
39        if content.contains("# ")
40            || content.contains("## ")
41            || content.contains("```")
42            || content.contains("**")
43            || content.contains("__")
44            || content.contains("](") // Markdown link pattern [text](url)
45            || content.contains("![")
46            || content.contains("- [ ]")
47            || content.contains("- [x]")
48        {
49            return DocumentFormat::Markdown;
50        }
51
52        DocumentFormat::PlainText
53    }
54
55    /// Detect format from file extension
56    #[must_use]
57    pub fn from_extension(ext: &str) -> Self {
58        match ext.to_lowercase().as_str() {
59            "md" | "markdown" | "mdown" | "mkd" => DocumentFormat::Markdown,
60            "html" | "htm" | "xhtml" => DocumentFormat::Html,
61            "pdf" => DocumentFormat::Pdf,
62            _ => DocumentFormat::PlainText,
63        }
64    }
65
66    /// Detect format from binary data (for PDF detection)
67    #[must_use]
68    pub fn detect_from_bytes(data: &[u8]) -> Self {
69        // Check for PDF magic bytes (%PDF-)
70        if data.len() >= 5 && &data[0..5] == b"%PDF-" {
71            return DocumentFormat::Pdf;
72        }
73
74        // Fall back to string-based detection
75        if let Ok(content) = std::str::from_utf8(data) {
76            Self::detect(content)
77        } else {
78            // Binary content that's not PDF
79            DocumentFormat::PlainText
80        }
81    }
82}
83
/// Extracted document structure
///
/// This is the value returned by the `DocumentParser` entry points. The text
/// parsers in this module compute `word_count` and `char_count` from
/// `plain_text`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentStructure {
    /// Document format
    pub format: DocumentFormat,
    /// Title (if detected)
    pub title: Option<String>,
    /// Headings with their levels
    pub headings: Vec<Heading>,
    /// Extracted links
    pub links: Vec<Link>,
    /// Extracted images
    pub images: Vec<Image>,
    /// Code blocks
    pub code_blocks: Vec<CodeBlock>,
    /// Plain text content (markup stripped for Markdown and HTML input)
    pub plain_text: String,
    /// Word count (whitespace-separated words)
    pub word_count: usize,
    /// Character count (Unicode scalar values, not bytes)
    pub char_count: usize,
    /// Estimated reading time in minutes (the text parsers use 200 words per
    /// minute and never report less than 1)
    pub reading_time_minutes: u32,
    /// Key statistics
    pub stats: DocumentStats,
}
110
/// Document heading
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Heading {
    /// Heading level (1-6)
    pub level: u8,
    /// Heading text
    pub text: String,
    /// Anchor/ID (if available). The Markdown parser fills this with a
    /// lowercase, dash-separated slug of the text; the HTML parser leaves it
    /// as `None`.
    pub anchor: Option<String>,
}
121
/// Extracted link
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Link {
    /// Link URL
    pub url: String,
    /// Link text
    pub text: String,
    /// Link title (if available)
    pub title: Option<String>,
    /// Whether this is an external link, i.e. the URL starts with `http://`,
    /// `https://` or `//`
    pub is_external: bool,
}
134
/// Extracted image
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Image {
    /// Image URL/path
    pub src: String,
    /// Alt text (empty when the source provides none)
    pub alt: String,
    /// Title (if available)
    pub title: Option<String>,
}
145
/// Code block
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeBlock {
    /// Programming language (if specified). For Markdown this is the text
    /// following the opening fence; the HTML parser always leaves it `None`.
    pub language: Option<String>,
    /// Code content
    pub code: String,
    /// Line count of `code`
    pub line_count: usize,
}
156
/// Document statistics
///
/// NOTE(review): the Markdown and HTML parsers count some of these
/// differently. For Markdown, `list_count`, `table_count` and
/// `blockquote_count` are per-line tallies (one per list item, per line
/// starting with `|`, per `> ` line). For HTML they count opening tags.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DocumentStats {
    /// Number of headings
    pub heading_count: usize,
    /// Number of paragraphs
    pub paragraph_count: usize,
    /// Number of lists
    pub list_count: usize,
    /// Number of links
    pub link_count: usize,
    /// Number of images
    pub image_count: usize,
    /// Number of code blocks
    pub code_block_count: usize,
    /// Number of tables
    pub table_count: usize,
    /// Number of blockquotes
    pub blockquote_count: usize,
}
177
/// Document parser
///
/// A stateless unit struct: every parsing routine is an associated function,
/// so no instance is needed (e.g. `DocumentParser::parse(text)`).
pub struct DocumentParser;
180
181impl DocumentParser {
182    /// Parse a document and extract its structure
183    #[must_use]
184    pub fn parse(content: &str) -> DocumentStructure {
185        let format = DocumentFormat::detect(content);
186
187        match format {
188            DocumentFormat::Markdown => Self::parse_markdown(content),
189            DocumentFormat::Html => Self::parse_html(content),
190            DocumentFormat::PlainText => Self::parse_plain_text(content),
191            DocumentFormat::Pdf => Self::parse_plain_text(content), // PDF needs binary data
192        }
193    }
194
195    /// Parse a document with explicit format
196    #[must_use]
197    pub fn parse_with_format(content: &str, format: DocumentFormat) -> DocumentStructure {
198        match format {
199            DocumentFormat::Markdown => Self::parse_markdown(content),
200            DocumentFormat::Html => Self::parse_html(content),
201            DocumentFormat::PlainText => Self::parse_plain_text(content),
202            DocumentFormat::Pdf => Self::parse_plain_text(content), // PDF needs binary data
203        }
204    }
205
    /// Parse binary PDF data and extract its structure
    ///
    /// Thin wrapper that forwards `data` unchanged to `PdfParser::parse`.
    ///
    /// # Errors
    ///
    /// Returns whatever `PdfParseError` `PdfParser::parse` reports for `data`.
    pub fn parse_pdf(data: &[u8]) -> Result<DocumentStructure, PdfParseError> {
        PdfParser::parse(data)
    }
210
211    /// Parse binary PDF from a file path
212    pub fn parse_pdf_file(path: &std::path::Path) -> Result<DocumentStructure, PdfParseError> {
213        let data = std::fs::read(path).map_err(|e| PdfParseError::IoError(e.to_string()))?;
214        Self::parse_pdf(&data)
215    }
216
217    /// Parse Markdown document
218    fn parse_markdown(content: &str) -> DocumentStructure {
219        let mut headings = Vec::new();
220        let mut links = Vec::new();
221        let mut images = Vec::new();
222        let mut code_blocks = Vec::new();
223        let mut title = None;
224        let mut stats = DocumentStats::default();
225
226        let mut in_code_block = false;
227        let mut code_block_lang = None;
228        let mut code_block_content = String::new();
229
230        for line in content.lines() {
231            // Handle code blocks
232            if line.starts_with("```") {
233                if in_code_block {
234                    // End of code block
235                    code_blocks.push(CodeBlock {
236                        language: code_block_lang.take(),
237                        line_count: code_block_content.lines().count(),
238                        code: std::mem::take(&mut code_block_content),
239                    });
240                    stats.code_block_count += 1;
241                    in_code_block = false;
242                } else {
243                    // Start of code block
244                    let lang = line.trim_start_matches("```").trim();
245                    code_block_lang = if lang.is_empty() {
246                        None
247                    } else {
248                        Some(lang.to_string())
249                    };
250                    in_code_block = true;
251                }
252                continue;
253            }
254
255            if in_code_block {
256                code_block_content.push_str(line);
257                code_block_content.push('\n');
258                continue;
259            }
260
261            // Parse headings
262            if let Some(heading) = Self::parse_markdown_heading(line) {
263                if title.is_none() && heading.level == 1 {
264                    title = Some(heading.text.clone());
265                }
266                headings.push(heading);
267                stats.heading_count += 1;
268            }
269
270            // Parse links: [text](url) or [text](url "title")
271            Self::extract_markdown_links(line, &mut links);
272
273            // Parse images: ![alt](src) or ![alt](src "title")
274            Self::extract_markdown_images(line, &mut images);
275
276            // Count lists
277            if line.trim_start().starts_with("- ")
278                || line.trim_start().starts_with("* ")
279                || line.trim_start().starts_with("+ ")
280                || line
281                    .trim_start()
282                    .chars()
283                    .next()
284                    .is_some_and(|c| c.is_ascii_digit())
285                    && line.contains(". ")
286            {
287                stats.list_count += 1;
288            }
289
290            // Count blockquotes
291            if line.trim_start().starts_with("> ") {
292                stats.blockquote_count += 1;
293            }
294
295            // Count tables (simple detection)
296            if line.contains('|') && line.trim().starts_with('|') {
297                stats.table_count += 1;
298            }
299        }
300
301        stats.link_count = links.len();
302        stats.image_count = images.len();
303
304        // Calculate plain text
305        let plain_text = Self::markdown_to_plain_text(content);
306        let word_count = plain_text.split_whitespace().count();
307        let char_count = plain_text.chars().count();
308
309        // Count paragraphs (blank line separated blocks)
310        stats.paragraph_count = content
311            .split("\n\n")
312            .filter(|p| !p.trim().is_empty() && !p.trim().starts_with('#'))
313            .count();
314
315        DocumentStructure {
316            format: DocumentFormat::Markdown,
317            title,
318            headings,
319            links,
320            images,
321            code_blocks,
322            plain_text,
323            word_count,
324            char_count,
325            reading_time_minutes: (word_count / 200).max(1) as u32,
326            stats,
327        }
328    }
329
330    /// Parse a markdown heading
331    fn parse_markdown_heading(line: &str) -> Option<Heading> {
332        let trimmed = line.trim();
333        if !trimmed.starts_with('#') {
334            return None;
335        }
336
337        let mut level = 0u8;
338        for c in trimmed.chars() {
339            if c == '#' {
340                level += 1;
341            } else {
342                break;
343            }
344        }
345
346        if level > 6 {
347            return None;
348        }
349
350        let text = trimmed.trim_start_matches('#').trim().to_string();
351        if text.is_empty() {
352            return None;
353        }
354
355        // Generate anchor from text
356        let anchor = text
357            .to_lowercase()
358            .replace(' ', "-")
359            .chars()
360            .filter(|c| c.is_alphanumeric() || *c == '-')
361            .collect::<String>();
362
363        Some(Heading {
364            level,
365            text,
366            anchor: Some(anchor),
367        })
368    }
369
370    /// Extract markdown links from a line
371    fn extract_markdown_links(line: &str, links: &mut Vec<Link>) {
372        let mut remaining = line;
373
374        while let Some(start) = remaining.find('[') {
375            let after_start = &remaining[start + 1..];
376
377            // Find closing bracket
378            if let Some(close) = after_start.find(']') {
379                let text = &after_start[..close];
380                let after_close = &after_start[close + 1..];
381
382                // Check for (url) or (url "title")
383                if after_close.starts_with('(') {
384                    if let Some(paren_close) = after_close.find(')') {
385                        let url_part = &after_close[1..paren_close];
386
387                        // Parse URL and optional title
388                        let (url, title) = if let Some(quote_start) = url_part.find('"') {
389                            let url = url_part[..quote_start].trim().to_string();
390                            let title_part = &url_part[quote_start + 1..];
391                            let title = title_part.trim_end_matches('"').to_string();
392                            (url, Some(title))
393                        } else {
394                            (url_part.trim().to_string(), None)
395                        };
396
397                        // Skip image links (they start with !)
398                        if !remaining[..start].ends_with('!') && !url.is_empty() {
399                            let is_external = url.starts_with("http://")
400                                || url.starts_with("https://")
401                                || url.starts_with("//");
402
403                            links.push(Link {
404                                url,
405                                text: text.to_string(),
406                                title,
407                                is_external,
408                            });
409                        }
410
411                        remaining = &after_close[paren_close + 1..];
412                        continue;
413                    }
414                }
415            }
416
417            remaining = &remaining[start + 1..];
418        }
419    }
420
421    /// Extract markdown images from a line
422    fn extract_markdown_images(line: &str, images: &mut Vec<Image>) {
423        let mut remaining = line;
424
425        while let Some(start) = remaining.find("![") {
426            let after_start = &remaining[start + 2..];
427
428            // Find closing bracket
429            if let Some(close) = after_start.find(']') {
430                let alt = &after_start[..close];
431                let after_close = &after_start[close + 1..];
432
433                // Check for (src) or (src "title")
434                if after_close.starts_with('(') {
435                    if let Some(paren_close) = after_close.find(')') {
436                        let src_part = &after_close[1..paren_close];
437
438                        // Parse src and optional title
439                        let (src, title) = if let Some(quote_start) = src_part.find('"') {
440                            let src = src_part[..quote_start].trim().to_string();
441                            let title_part = &src_part[quote_start + 1..];
442                            let title = title_part.trim_end_matches('"').to_string();
443                            (src, Some(title))
444                        } else {
445                            (src_part.trim().to_string(), None)
446                        };
447
448                        if !src.is_empty() {
449                            images.push(Image {
450                                src,
451                                alt: alt.to_string(),
452                                title,
453                            });
454                        }
455
456                        remaining = &after_close[paren_close + 1..];
457                        continue;
458                    }
459                }
460            }
461
462            remaining = &remaining[start + 2..];
463        }
464    }
465
466    /// Convert markdown to plain text
467    fn markdown_to_plain_text(content: &str) -> String {
468        let mut result = String::new();
469        let mut in_code_block = false;
470
471        for line in content.lines() {
472            if line.starts_with("```") {
473                in_code_block = !in_code_block;
474                continue;
475            }
476
477            if in_code_block {
478                continue;
479            }
480
481            // Remove headings markers
482            let line = if line.starts_with('#') {
483                line.trim_start_matches('#').trim()
484            } else {
485                line
486            };
487
488            // Remove bold/italic markers
489            let line = line
490                .replace("**", "")
491                .replace("__", "")
492                .replace(['*', '_'], "");
493
494            // Remove inline code
495            let line = Self::remove_inline_code(&line);
496
497            // Remove links but keep text
498            let line = Self::remove_markdown_links(&line);
499
500            // Remove images
501            let line = Self::remove_markdown_images(&line);
502
503            if !line.trim().is_empty() {
504                result.push_str(&line);
505                result.push(' ');
506            }
507        }
508
509        result.trim().to_string()
510    }
511
512    /// Remove inline code markers
513    fn remove_inline_code(line: &str) -> String {
514        let mut result = String::new();
515        let mut in_code = false;
516
517        for c in line.chars() {
518            if c == '`' {
519                in_code = !in_code;
520            } else if !in_code {
521                result.push(c);
522            }
523        }
524
525        result
526    }
527
528    /// Remove markdown links but keep text
529    fn remove_markdown_links(line: &str) -> String {
530        let mut result = line.to_string();
531
532        // Simple replacement of [text](url) with text
533        while let Some(start) = result.find('[') {
534            if let Some(close) = result[start..].find(']') {
535                let absolute_close = start + close;
536                if result.len() > absolute_close + 1
537                    && result.as_bytes()[absolute_close + 1] == b'('
538                {
539                    if let Some(paren_close) = result[absolute_close..].find(')') {
540                        let text = &result[start + 1..absolute_close];
541                        let before = &result[..start];
542                        let after = &result[absolute_close + paren_close + 1..];
543                        result = format!("{before}{text}{after}");
544                        continue;
545                    }
546                }
547            }
548            break;
549        }
550
551        result
552    }
553
554    /// Remove markdown images
555    fn remove_markdown_images(line: &str) -> String {
556        let mut result = line.to_string();
557
558        while let Some(start) = result.find("![") {
559            if let Some(close) = result[start..].find(']') {
560                let absolute_close = start + close;
561                if result.len() > absolute_close + 1
562                    && result.as_bytes()[absolute_close + 1] == b'('
563                {
564                    if let Some(paren_close) = result[absolute_close..].find(')') {
565                        let before = &result[..start];
566                        let after = &result[absolute_close + paren_close + 1..];
567                        result = format!("{before}{after}");
568                        continue;
569                    }
570                }
571            }
572            break;
573        }
574
575        result
576    }
577
578    /// Parse HTML document
579    fn parse_html(content: &str) -> DocumentStructure {
580        let mut headings = Vec::new();
581        let mut links = Vec::new();
582        let mut images = Vec::new();
583        let mut code_blocks = Vec::new();
584        let mut title = None;
585        let mut stats = DocumentStats::default();
586
587        // Extract title from <title> tag
588        if let Some(title_text) = Self::extract_html_tag_content(content, "title") {
589            title = Some(title_text);
590        }
591
592        // Extract headings (h1-h6)
593        for level in 1..=6 {
594            let tag = format!("h{level}");
595            for text in Self::extract_all_html_tag_contents(content, &tag) {
596                if title.is_none() && level == 1 {
597                    title = Some(text.clone());
598                }
599                headings.push(Heading {
600                    level: level as u8,
601                    text,
602                    anchor: None,
603                });
604                stats.heading_count += 1;
605            }
606        }
607
608        // Extract links
609        Self::extract_html_links(content, &mut links);
610        stats.link_count = links.len();
611
612        // Extract images
613        Self::extract_html_images(content, &mut images);
614        stats.image_count = images.len();
615
616        // Extract code blocks (<pre><code> or <code>)
617        for code in Self::extract_all_html_tag_contents(content, "code") {
618            code_blocks.push(CodeBlock {
619                language: None,
620                line_count: code.lines().count(),
621                code,
622            });
623            stats.code_block_count += 1;
624        }
625
626        // Count other elements
627        stats.paragraph_count = Self::count_html_tags(content, "p");
628        stats.list_count =
629            Self::count_html_tags(content, "ul") + Self::count_html_tags(content, "ol");
630        stats.table_count = Self::count_html_tags(content, "table");
631        stats.blockquote_count = Self::count_html_tags(content, "blockquote");
632
633        // Get plain text
634        let plain_text = Self::html_to_plain_text(content);
635        let word_count = plain_text.split_whitespace().count();
636        let char_count = plain_text.chars().count();
637
638        DocumentStructure {
639            format: DocumentFormat::Html,
640            title,
641            headings,
642            links,
643            images,
644            code_blocks,
645            plain_text,
646            word_count,
647            char_count,
648            reading_time_minutes: (word_count / 200).max(1) as u32,
649            stats,
650        }
651    }
652
653    /// Extract content from an HTML tag
654    fn extract_html_tag_content(content: &str, tag: &str) -> Option<String> {
655        let open_tag = format!("<{tag}");
656        let close_tag = format!("</{tag}>");
657
658        let start = content.to_lowercase().find(&open_tag)?;
659        let after_open = &content[start..];
660
661        // Find the end of the opening tag
662        let tag_end = after_open.find('>')?;
663        let content_start = start + tag_end + 1;
664
665        let close_pos = content[content_start..].to_lowercase().find(&close_tag)?;
666
667        let text = &content[content_start..content_start + close_pos];
668        Some(Self::html_to_plain_text(text).trim().to_string())
669    }
670
671    /// Extract all contents from HTML tags
672    fn extract_all_html_tag_contents(content: &str, tag: &str) -> Vec<String> {
673        let mut results = Vec::new();
674        let content_lower = content.to_lowercase();
675        let open_tag = format!("<{tag}");
676        let close_tag = format!("</{tag}>");
677
678        let mut search_start = 0;
679        while let Some(start) = content_lower[search_start..].find(&open_tag) {
680            let absolute_start = search_start + start;
681            let after_open = &content[absolute_start..];
682
683            if let Some(tag_end) = after_open.find('>') {
684                let content_start = absolute_start + tag_end + 1;
685
686                if let Some(close_pos) = content_lower[content_start..].find(&close_tag) {
687                    let text = &content[content_start..content_start + close_pos];
688                    let clean_text = Self::html_to_plain_text(text).trim().to_string();
689                    if !clean_text.is_empty() {
690                        results.push(clean_text);
691                    }
692                    search_start = content_start + close_pos + close_tag.len();
693                    continue;
694                }
695            }
696
697            search_start = absolute_start + 1;
698        }
699
700        results
701    }
702
703    /// Count occurrences of an HTML tag
704    fn count_html_tags(content: &str, tag: &str) -> usize {
705        let open_tag = format!("<{tag}");
706        content.to_lowercase().matches(&open_tag).count()
707    }
708
709    /// Extract HTML links
710    fn extract_html_links(content: &str, links: &mut Vec<Link>) {
711        let content_lower = content.to_lowercase();
712        let mut search_start = 0;
713
714        while let Some(start) = content_lower[search_start..].find("<a ") {
715            let absolute_start = search_start + start;
716            let after_open = &content[absolute_start..];
717
718            if let Some(tag_end) = after_open.find('>') {
719                let tag_content = &after_open[..tag_end];
720
721                // Extract href
722                if let Some(href) = Self::extract_html_attribute(tag_content, "href") {
723                    let close_pos = content_lower[absolute_start..].find("</a>");
724
725                    let text = if let Some(close) = close_pos {
726                        let content_start = absolute_start + tag_end + 1;
727                        let content_end = absolute_start + close;
728                        Self::html_to_plain_text(&content[content_start..content_end])
729                            .trim()
730                            .to_string()
731                    } else {
732                        String::new()
733                    };
734
735                    let title = Self::extract_html_attribute(tag_content, "title");
736                    let is_external = href.starts_with("http://")
737                        || href.starts_with("https://")
738                        || href.starts_with("//");
739
740                    links.push(Link {
741                        url: href,
742                        text,
743                        title,
744                        is_external,
745                    });
746                }
747
748                search_start = absolute_start + tag_end;
749            } else {
750                search_start = absolute_start + 1;
751            }
752        }
753    }
754
755    /// Extract HTML images
756    fn extract_html_images(content: &str, images: &mut Vec<Image>) {
757        let content_lower = content.to_lowercase();
758        let mut search_start = 0;
759
760        while let Some(start) = content_lower[search_start..].find("<img ") {
761            let absolute_start = search_start + start;
762            let after_open = &content[absolute_start..];
763
764            if let Some(tag_end) = after_open.find('>').or_else(|| after_open.find("/>")) {
765                let tag_content = &after_open[..tag_end];
766
767                if let Some(src) = Self::extract_html_attribute(tag_content, "src") {
768                    let alt = Self::extract_html_attribute(tag_content, "alt").unwrap_or_default();
769                    let title = Self::extract_html_attribute(tag_content, "title");
770
771                    images.push(Image { src, alt, title });
772                }
773
774                search_start = absolute_start + tag_end;
775            } else {
776                search_start = absolute_start + 1;
777            }
778        }
779    }
780
781    /// Extract an HTML attribute value
782    fn extract_html_attribute(tag_content: &str, attr: &str) -> Option<String> {
783        let attr_pattern = format!("{attr}=");
784        let content_lower = tag_content.to_lowercase();
785
786        let attr_start = content_lower.find(&attr_pattern)?;
787        let after_attr = &tag_content[attr_start + attr_pattern.len()..];
788
789        // Handle quoted attribute values
790        let first_char = after_attr.chars().next()?;
791        if first_char == '"' || first_char == '\'' {
792            let quote = first_char;
793            let value_start = 1;
794            let value_end = after_attr[value_start..].find(quote)?;
795            return Some(after_attr[value_start..value_start + value_end].to_string());
796        }
797
798        // Handle unquoted attribute values
799        let value_end = after_attr.find(|c: char| c.is_whitespace() || c == '>')?;
800        Some(after_attr[..value_end].to_string())
801    }
802
803    /// Convert HTML to plain text
804    fn html_to_plain_text(content: &str) -> String {
805        let mut result = String::new();
806        let mut in_tag = false;
807        let mut in_script = false;
808        let mut in_style = false;
809
810        let content_lower = content.to_lowercase();
811        let chars: Vec<char> = content.chars().collect();
812        let chars_lower: Vec<char> = content_lower.chars().collect();
813
814        let mut i = 0;
815        while i < chars.len() {
816            // Check for script/style tags
817            if i + 7 < chars.len() {
818                let slice: String = chars_lower[i..i + 7].iter().collect();
819                if slice == "<script" {
820                    in_script = true;
821                } else if slice == "</scrip" {
822                    in_script = false;
823                }
824            }
825
826            if i + 6 < chars.len() {
827                let slice: String = chars_lower[i..i + 6].iter().collect();
828                if slice == "<style" {
829                    in_style = true;
830                } else if slice == "</styl" {
831                    in_style = false;
832                }
833            }
834
835            let c = chars[i];
836
837            if c == '<' {
838                in_tag = true;
839            } else if c == '>' {
840                in_tag = false;
841                // Add space after certain tags
842                result.push(' ');
843            } else if !in_tag && !in_script && !in_style {
844                result.push(c);
845            }
846
847            i += 1;
848        }
849
850        // Decode common HTML entities
851        let result = result
852            .replace("&nbsp;", " ")
853            .replace("&amp;", "&")
854            .replace("&lt;", "<")
855            .replace("&gt;", ">")
856            .replace("&quot;", "\"")
857            .replace("&apos;", "'")
858            .replace("&#39;", "'");
859
860        // Normalize whitespace
861        result.split_whitespace().collect::<Vec<_>>().join(" ")
862    }
863
864    /// Parse plain text document
865    fn parse_plain_text(content: &str) -> DocumentStructure {
866        let word_count = content.split_whitespace().count();
867        let char_count = content.chars().count();
868        let paragraph_count = content
869            .split("\n\n")
870            .filter(|p| !p.trim().is_empty())
871            .count();
872
873        DocumentStructure {
874            format: DocumentFormat::PlainText,
875            title: None,
876            headings: Vec::new(),
877            links: Vec::new(),
878            images: Vec::new(),
879            code_blocks: Vec::new(),
880            plain_text: content.to_string(),
881            word_count,
882            char_count,
883            reading_time_minutes: (word_count / 200).max(1) as u32,
884            stats: DocumentStats {
885                paragraph_count,
886                ..Default::default()
887            },
888        }
889    }
890}
891
/// Document quality analysis
///
/// Result of [`QualityAnalyzer::analyze`]: three scores plus the issues and
/// suggestions collected while inspecting the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentQuality {
    /// Overall quality score (0-100): the midpoint of `structure_score` and
    /// `readability_score`
    pub overall_score: u32,
    /// Readability score (0-100), derived from the average sentence length
    pub readability_score: u32,
    /// Structure score (0-100): starts at 100 and is reduced per issue found
    pub structure_score: u32,
    /// Issues found
    pub issues: Vec<QualityIssue>,
    /// Suggestions for improvement
    pub suggestions: Vec<String>,
}
906
/// Quality issue
///
/// A single finding reported by [`QualityAnalyzer::analyze`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityIssue {
    /// Issue severity
    pub severity: IssueSeverity,
    /// Issue description (human-readable, may quote heading text or URLs)
    pub description: String,
}
915
/// Issue severity levels
///
/// The severity determines how many points an issue removes from the
/// structure score computed by [`QualityAnalyzer`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum IssueSeverity {
    // Informational finding; costs 2 structure points.
    Info,
    // Likely problem; costs 5 structure points.
    Warning,
    // Definite problem (e.g. a link with an empty URL); costs 15 structure points.
    Error,
}
923
/// Document quality analyzer
///
/// Stateless unit type; all functionality is exposed as associated functions.
pub struct QualityAnalyzer;
926
927impl QualityAnalyzer {
928    /// Analyze document quality
929    #[must_use]
930    pub fn analyze(structure: &DocumentStructure) -> DocumentQuality {
931        let mut issues = Vec::new();
932        let mut suggestions = Vec::new();
933
934        // Check for title
935        if structure.title.is_none() {
936            issues.push(QualityIssue {
937                severity: IssueSeverity::Warning,
938                description: "Document has no title".to_string(),
939            });
940            suggestions
941                .push("Add a main heading (# Title) at the start of the document".to_string());
942        }
943
944        // Check heading structure
945        let mut prev_level = 0u8;
946        for heading in &structure.headings {
947            if heading.level > prev_level + 1 && prev_level > 0 {
948                issues.push(QualityIssue {
949                    severity: IssueSeverity::Warning,
950                    description: format!(
951                        "Heading level jumps from {} to {}: '{}'",
952                        prev_level, heading.level, heading.text
953                    ),
954                });
955            }
956            prev_level = heading.level;
957        }
958
959        // Check word count
960        if structure.word_count < 100 {
961            issues.push(QualityIssue {
962                severity: IssueSeverity::Info,
963                description: "Document is very short".to_string(),
964            });
965        } else if structure.word_count > 5000 {
966            suggestions.push("Consider breaking long documents into multiple sections".to_string());
967        }
968
969        // Check for broken/empty links
970        for link in &structure.links {
971            if link.url.is_empty() {
972                issues.push(QualityIssue {
973                    severity: IssueSeverity::Error,
974                    description: format!("Empty link URL for text: '{}'", link.text),
975                });
976            }
977            if link.text.is_empty() {
978                issues.push(QualityIssue {
979                    severity: IssueSeverity::Warning,
980                    description: format!("Link has no text: '{}'", link.url),
981                });
982            }
983        }
984
985        // Check for images without alt text
986        for image in &structure.images {
987            if image.alt.is_empty() {
988                issues.push(QualityIssue {
989                    severity: IssueSeverity::Warning,
990                    description: format!("Image missing alt text: '{}'", image.src),
991                });
992            }
993        }
994
995        // Calculate scores
996        let structure_score = Self::calculate_structure_score(structure, &issues);
997        let readability_score = Self::calculate_readability_score(structure);
998        let overall_score = u32::midpoint(structure_score, readability_score);
999
1000        DocumentQuality {
1001            overall_score,
1002            readability_score,
1003            structure_score,
1004            issues,
1005            suggestions,
1006        }
1007    }
1008
1009    /// Calculate structure score
1010    fn calculate_structure_score(structure: &DocumentStructure, issues: &[QualityIssue]) -> u32 {
1011        let mut score = 100u32;
1012
1013        // Deduct for issues
1014        for issue in issues {
1015            match issue.severity {
1016                IssueSeverity::Error => score = score.saturating_sub(15),
1017                IssueSeverity::Warning => score = score.saturating_sub(5),
1018                IssueSeverity::Info => score = score.saturating_sub(2),
1019            }
1020        }
1021
1022        // Bonus for good structure
1023        if structure.title.is_some() {
1024            score = score.saturating_add(5).min(100);
1025        }
1026        if !structure.headings.is_empty() {
1027            score = score.saturating_add(5).min(100);
1028        }
1029
1030        score
1031    }
1032
1033    /// Calculate readability score (simplified Flesch-Kincaid style)
1034    fn calculate_readability_score(structure: &DocumentStructure) -> u32 {
1035        let words = structure.word_count;
1036        if words == 0 {
1037            return 50;
1038        }
1039
1040        // Count sentences (rough estimate)
1041        let sentence_count = structure.plain_text.matches(['.', '!', '?']).count().max(1);
1042
1043        // Average words per sentence
1044        let avg_words_per_sentence = words as f64 / sentence_count as f64;
1045
1046        // Optimal is around 15-20 words per sentence
1047        let score = if avg_words_per_sentence < 10.0 {
1048            70 + ((avg_words_per_sentence / 10.0) * 20.0) as u32
1049        } else if avg_words_per_sentence <= 20.0 {
1050            90 + (10.0 - (avg_words_per_sentence - 15.0).abs()) as u32
1051        } else if avg_words_per_sentence <= 30.0 {
1052            70 - ((avg_words_per_sentence - 20.0) * 2.0) as u32
1053        } else {
1054            50
1055        };
1056
1057        score.min(100)
1058    }
1059}
1060
1061/// Document table of contents generator
1062pub struct TocGenerator;
1063
1064impl TocGenerator {
1065    /// Generate table of contents from document structure
1066    #[must_use]
1067    pub fn generate(structure: &DocumentStructure) -> Vec<TocEntry> {
1068        structure
1069            .headings
1070            .iter()
1071            .map(|h| TocEntry {
1072                level: h.level,
1073                text: h.text.clone(),
1074                anchor: h.anchor.clone(),
1075            })
1076            .collect()
1077    }
1078
1079    /// Generate table of contents as markdown
1080    #[must_use]
1081    pub fn generate_markdown(structure: &DocumentStructure) -> String {
1082        let mut result = String::new();
1083
1084        for heading in &structure.headings {
1085            let indent = "  ".repeat((heading.level - 1) as usize);
1086            let anchor = heading
1087                .anchor
1088                .as_ref()
1089                .map(|a| format!("#{a}"))
1090                .unwrap_or_default();
1091
1092            let _ = writeln!(result, "{}- [{}]({})", indent, heading.text, anchor);
1093        }
1094
1095        result
1096    }
1097}
1098
/// Table of contents entry
///
/// A copy of one heading's level, text and anchor, as returned by
/// [`TocGenerator::generate`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TocEntry {
    /// Heading level
    pub level: u8,
    /// Heading text
    pub text: String,
    /// Anchor link (`None` when the heading has no anchor)
    pub anchor: Option<String>,
}
1109
1110/// Document metadata extractor
1111pub struct MetadataExtractor;
1112
1113impl MetadataExtractor {
1114    /// Extract metadata from document
1115    #[must_use]
1116    pub fn extract(content: &str) -> DocumentMetadata {
1117        let structure = DocumentParser::parse(content);
1118        let quality = QualityAnalyzer::analyze(&structure);
1119
1120        DocumentMetadata {
1121            format: structure.format,
1122            title: structure.title,
1123            word_count: structure.word_count,
1124            char_count: structure.char_count,
1125            reading_time_minutes: structure.reading_time_minutes,
1126            heading_count: structure.stats.heading_count,
1127            link_count: structure.stats.link_count,
1128            image_count: structure.stats.image_count,
1129            code_block_count: structure.stats.code_block_count,
1130            quality_score: quality.overall_score,
1131            external_links: structure.links.iter().filter(|l| l.is_external).count(),
1132            internal_links: structure.links.iter().filter(|l| !l.is_external).count(),
1133        }
1134    }
1135}
1136
/// Document metadata summary
///
/// Flat, serializable digest of a parsed document as produced by
/// [`MetadataExtractor::extract`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMetadata {
    /// Document format
    pub format: DocumentFormat,
    /// Document title
    pub title: Option<String>,
    /// Word count
    pub word_count: usize,
    /// Character count
    pub char_count: usize,
    /// Estimated reading time, in minutes
    pub reading_time_minutes: u32,
    /// Number of headings
    pub heading_count: usize,
    /// Number of links
    pub link_count: usize,
    /// Number of images
    pub image_count: usize,
    /// Number of code blocks
    pub code_block_count: usize,
    /// Quality score (the overall score of the quality analysis)
    pub quality_score: u32,
    /// External link count (links flagged `is_external`)
    pub external_links: usize,
    /// Internal link count (links not flagged `is_external`)
    pub internal_links: usize,
}
1165
/// Error type for PDF parsing
///
/// Each variant carries a human-readable message; the `Display`
/// implementation prefixes it with the error category.
#[derive(Debug, Clone)]
pub enum PdfParseError {
    /// IO error reading PDF
    // NOTE(review): not constructed anywhere in the visible part of this file.
    IoError(String),
    /// Invalid PDF format
    // NOTE(review): not constructed anywhere in the visible part of this file.
    InvalidFormat(String),
    /// PDF parsing failed (the document could not be loaded)
    ParseError(String),
    /// Text extraction failed (a page or its content stream was unavailable)
    ExtractionError(String),
}
1178
1179impl std::fmt::Display for PdfParseError {
1180    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1181        match self {
1182            PdfParseError::IoError(e) => write!(f, "IO error: {e}"),
1183            PdfParseError::InvalidFormat(e) => write!(f, "Invalid PDF format: {e}"),
1184            PdfParseError::ParseError(e) => write!(f, "Parse error: {e}"),
1185            PdfParseError::ExtractionError(e) => write!(f, "Extraction error: {e}"),
1186        }
1187    }
1188}
1189
// Marker impl: relies on the trait's default methods, so no underlying
// `source()` is reported for any variant.
impl std::error::Error for PdfParseError {}
1191
/// PDF document parser
///
/// Stateless unit type; parsing and metadata extraction are exposed as
/// associated functions operating on in-memory PDF bytes.
pub struct PdfParser;
1194
1195impl PdfParser {
1196    /// Parse a PDF document from binary data
1197    pub fn parse(data: &[u8]) -> Result<DocumentStructure, PdfParseError> {
1198        use lopdf::Document;
1199
1200        let doc = Document::load_mem(data).map_err(|e| PdfParseError::ParseError(e.to_string()))?;
1201
1202        let mut all_text = String::new();
1203        let mut page_count = 0;
1204
1205        // Extract text from all pages
1206        let pages = doc.get_pages();
1207        for (page_num, _) in &pages {
1208            page_count += 1;
1209            if let Ok(text) = Self::extract_page_text(&doc, *page_num) {
1210                all_text.push_str(&text);
1211                all_text.push('\n');
1212            }
1213        }
1214
1215        let plain_text = Self::clean_extracted_text(&all_text);
1216        let word_count = plain_text.split_whitespace().count();
1217        let char_count = plain_text.chars().count();
1218
1219        // Try to extract title from metadata or first heading
1220        let title = Self::extract_title(&doc, &plain_text);
1221
1222        // Extract headings (based on text analysis)
1223        let headings = Self::detect_headings(&plain_text);
1224        let heading_count = headings.len();
1225
1226        // Extract links from PDF
1227        let links = Self::extract_links(&doc);
1228        let link_count = links.len();
1229
1230        Ok(DocumentStructure {
1231            format: DocumentFormat::Pdf,
1232            title,
1233            headings,
1234            links,
1235            images: Vec::new(), // PDF image extraction is complex
1236            code_blocks: Vec::new(),
1237            plain_text,
1238            word_count,
1239            char_count,
1240            reading_time_minutes: (word_count / 200).max(1) as u32,
1241            stats: DocumentStats {
1242                heading_count,
1243                paragraph_count: page_count,
1244                link_count,
1245                ..Default::default()
1246            },
1247        })
1248    }
1249
1250    /// Extract text from a specific page
1251    fn extract_page_text(doc: &lopdf::Document, page_num: u32) -> Result<String, PdfParseError> {
1252        let page_id = doc
1253            .page_iter()
1254            .nth((page_num - 1) as usize)
1255            .ok_or_else(|| PdfParseError::ExtractionError(format!("Page {page_num} not found")))?;
1256
1257        let content = doc
1258            .get_page_content(page_id)
1259            .map_err(|e| PdfParseError::ExtractionError(e.to_string()))?;
1260
1261        // Parse content stream for text
1262        let text = Self::parse_content_stream(&content, doc);
1263        Ok(text)
1264    }
1265
1266    /// Parse PDF content stream to extract text
1267    fn parse_content_stream(content: &[u8], doc: &lopdf::Document) -> String {
1268        use lopdf::content::Content;
1269
1270        let mut text = String::new();
1271
1272        if let Ok(content_obj) = Content::decode(content) {
1273            for operation in content_obj.operations {
1274                match operation.operator.as_str() {
1275                    "Tj" | "TJ" => {
1276                        // Text showing operators
1277                        for operand in &operation.operands {
1278                            Self::extract_text_from_object(operand, doc, &mut text);
1279                        }
1280                    }
1281                    "'" | "\"" => {
1282                        // Text with newline
1283                        text.push('\n');
1284                        for operand in &operation.operands {
1285                            Self::extract_text_from_object(operand, doc, &mut text);
1286                        }
1287                    }
1288                    _ => {}
1289                }
1290            }
1291        }
1292
1293        text
1294    }
1295
1296    /// Extract text from a PDF object
1297    fn extract_text_from_object(obj: &lopdf::Object, _doc: &lopdf::Document, text: &mut String) {
1298        use lopdf::Object;
1299
1300        match obj {
1301            Object::String(bytes, _) => {
1302                // Try UTF-8 first, then PDFDocEncoding (Latin-1)
1303                if let Ok(s) = std::str::from_utf8(bytes) {
1304                    text.push_str(s);
1305                } else {
1306                    // Fall back to Latin-1
1307                    let s: String = bytes.iter().map(|&b| b as char).collect();
1308                    text.push_str(&s);
1309                }
1310            }
1311            Object::Array(arr) => {
1312                for item in arr {
1313                    match item {
1314                        Object::String(bytes, _) => {
1315                            if let Ok(s) = std::str::from_utf8(bytes) {
1316                                text.push_str(s);
1317                            } else {
1318                                let s: String = bytes.iter().map(|&b| b as char).collect();
1319                                text.push_str(&s);
1320                            }
1321                        }
1322                        Object::Integer(n) => {
1323                            // Negative numbers indicate kerning/spacing
1324                            if *n < -100 {
1325                                text.push(' ');
1326                            }
1327                        }
1328                        Object::Real(n) => {
1329                            if *n < -100.0 {
1330                                text.push(' ');
1331                            }
1332                        }
1333                        _ => {}
1334                    }
1335                }
1336            }
1337            _ => {}
1338        }
1339    }
1340
1341    /// Clean extracted text
1342    fn clean_extracted_text(text: &str) -> String {
1343        // Remove excessive whitespace and normalize
1344        let mut result = String::new();
1345        let mut last_was_space = true;
1346        let mut last_was_newline = true;
1347
1348        for c in text.chars() {
1349            if c == '\n' || c == '\r' {
1350                if !last_was_newline {
1351                    result.push('\n');
1352                    last_was_newline = true;
1353                    last_was_space = true;
1354                }
1355            } else if c.is_whitespace() {
1356                if !last_was_space {
1357                    result.push(' ');
1358                    last_was_space = true;
1359                }
1360            } else if c.is_control() {
1361                // Skip control characters
1362            } else {
1363                result.push(c);
1364                last_was_space = false;
1365                last_was_newline = false;
1366            }
1367        }
1368
1369        result.trim().to_string()
1370    }
1371
1372    /// Extract title from PDF metadata or content
1373    fn extract_title(doc: &lopdf::Document, text: &str) -> Option<String> {
1374        // Try to get title from PDF metadata
1375        if let Ok(info) = doc.trailer.get(b"Info") {
1376            if let Ok(lopdf::Object::Dictionary(dict)) = doc.get_object(info.as_reference().ok()?) {
1377                if let Ok(lopdf::Object::String(bytes, _)) = dict.get(b"Title") {
1378                    if let Ok(s) = std::str::from_utf8(bytes) {
1379                        let title = s.trim();
1380                        if !title.is_empty() {
1381                            return Some(title.to_string());
1382                        }
1383                    }
1384                }
1385            }
1386        }
1387
1388        // Fall back to first line that looks like a title
1389        for line in text.lines().take(10) {
1390            let trimmed = line.trim();
1391            if trimmed.len() > 3 && trimmed.len() < 200 {
1392                // Likely a title if it's reasonably sized and not a full paragraph
1393                let word_count = trimmed.split_whitespace().count();
1394                if word_count <= 15 && !trimmed.ends_with('.') {
1395                    return Some(trimmed.to_string());
1396                }
1397            }
1398        }
1399
1400        None
1401    }
1402
1403    /// Detect headings from text structure
1404    fn detect_headings(text: &str) -> Vec<Heading> {
1405        let mut headings = Vec::new();
1406        let lines: Vec<&str> = text.lines().collect();
1407        let numbered_heading = regex::Regex::new(r"^(\d+\.)+\d*\s+[A-Z]").ok();
1408
1409        for (i, line) in lines.iter().enumerate() {
1410            let trimmed = line.trim();
1411
1412            // Skip empty or very long lines
1413            if trimmed.is_empty() || trimmed.len() > 200 {
1414                continue;
1415            }
1416
1417            // Detect numbered headings (e.g., "1. Introduction", "1.2.3 Methods")
1418            if let Some(re) = &numbered_heading {
1419                if re.is_match(trimmed) {
1420                    let depth = trimmed.matches('.').count();
1421                    let level = (depth.min(5) + 1) as u8;
1422                    headings.push(Heading {
1423                        level,
1424                        text: trimmed.to_string(),
1425                        anchor: None,
1426                    });
1427                    continue;
1428                }
1429            }
1430
1431            // Detect ALL CAPS headings (common in PDFs)
1432            let word_count = trimmed.split_whitespace().count();
1433            if (1..=10).contains(&word_count)
1434                && trimmed
1435                    .chars()
1436                    .filter(|c| c.is_alphabetic())
1437                    .all(char::is_uppercase)
1438                && trimmed.chars().any(char::is_alphabetic)
1439            {
1440                headings.push(Heading {
1441                    level: 2,
1442                    text: trimmed.to_string(),
1443                    anchor: None,
1444                });
1445                continue;
1446            }
1447
1448            // Detect headings followed by blank line or significantly shorter
1449            if i + 1 < lines.len() {
1450                let next_line = lines[i + 1].trim();
1451                if next_line.is_empty() && word_count <= 8 && !trimmed.ends_with('.') {
1452                    // Check if it's capitalized like a title
1453                    if trimmed.chars().next().is_some_and(char::is_uppercase) {
1454                        headings.push(Heading {
1455                            level: 3,
1456                            text: trimmed.to_string(),
1457                            anchor: None,
1458                        });
1459                    }
1460                }
1461            }
1462        }
1463
1464        headings
1465    }
1466
1467    /// Extract links from PDF annotations
1468    fn extract_links(doc: &lopdf::Document) -> Vec<Link> {
1469        let mut links = Vec::new();
1470
1471        for (_page_num, page_id) in doc.get_pages() {
1472            if let Ok(lopdf::Object::Dictionary(dict)) = doc.get_object(page_id) {
1473                if let Ok(annots) = dict.get(b"Annots") {
1474                    Self::extract_links_from_annotations(doc, annots, &mut links);
1475                }
1476            }
1477        }
1478
1479        links
1480    }
1481
1482    /// Extract links from annotation array
1483    fn extract_links_from_annotations(
1484        doc: &lopdf::Document,
1485        annots: &lopdf::Object,
1486        links: &mut Vec<Link>,
1487    ) {
1488        let annot_refs = match annots {
1489            lopdf::Object::Array(arr) => arr.clone(),
1490            lopdf::Object::Reference(r) => {
1491                if let Ok(lopdf::Object::Array(arr)) = doc.get_object(*r) {
1492                    arr.clone()
1493                } else {
1494                    return;
1495                }
1496            }
1497            _ => return,
1498        };
1499
1500        for annot_ref in annot_refs {
1501            let annot = match &annot_ref {
1502                lopdf::Object::Reference(r) => doc.get_object(*r).ok().cloned(),
1503                obj => Some(obj.clone()),
1504            };
1505
1506            if let Some(lopdf::Object::Dictionary(dict)) = annot {
1507                // Check if it's a link annotation
1508                if let Ok(lopdf::Object::Name(subtype)) = dict.get(b"Subtype") {
1509                    if subtype == b"Link" {
1510                        // Extract URL from action
1511                        if let Ok(action) = dict.get(b"A") {
1512                            Self::extract_url_from_action(doc, action, links);
1513                        }
1514                    }
1515                }
1516            }
1517        }
1518    }
1519
1520    /// Extract URL from PDF action
1521    fn extract_url_from_action(
1522        doc: &lopdf::Document,
1523        action: &lopdf::Object,
1524        links: &mut Vec<Link>,
1525    ) {
1526        let action_dict = match action {
1527            lopdf::Object::Dictionary(dict) => dict.clone(),
1528            lopdf::Object::Reference(r) => {
1529                if let Ok(lopdf::Object::Dictionary(dict)) = doc.get_object(*r) {
1530                    dict.clone()
1531                } else {
1532                    return;
1533                }
1534            }
1535            _ => return,
1536        };
1537
1538        // Check for URI action
1539        if let Ok(lopdf::Object::Name(s)) = action_dict.get(b"S") {
1540            if s == b"URI" {
1541                if let Ok(lopdf::Object::String(bytes, _)) = action_dict.get(b"URI") {
1542                    if let Ok(url) = std::str::from_utf8(bytes) {
1543                        let is_external = url.starts_with("http://")
1544                            || url.starts_with("https://")
1545                            || url.starts_with("mailto:");
1546                        links.push(Link {
1547                            url: url.to_string(),
1548                            text: String::new(), // PDF links often don't have separate text
1549                            title: None,
1550                            is_external,
1551                        });
1552                    }
1553                }
1554            }
1555        }
1556    }
1557}
1558
/// PDF document metadata
///
/// Produced by `PdfParser::extract_metadata`. The optional text fields are
/// read from the document's `Info` dictionary and are `None` when the entry
/// is missing or blank.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PdfMetadata {
    /// PDF version
    pub version: String,
    /// Page count
    pub page_count: usize,
    /// Document title (`Title` entry)
    pub title: Option<String>,
    /// Document author (`Author` entry)
    pub author: Option<String>,
    /// Document subject (`Subject` entry)
    pub subject: Option<String>,
    /// Document keywords (`Keywords` entry)
    pub keywords: Option<String>,
    /// Creator application (`Creator` entry)
    pub creator: Option<String>,
    /// Producer application (`Producer` entry)
    pub producer: Option<String>,
    /// Creation date (`CreationDate` entry, kept as the raw string)
    pub creation_date: Option<String>,
    /// Modification date (`ModDate` entry, kept as the raw string)
    pub modification_date: Option<String>,
    /// Whether the PDF is encrypted
    pub is_encrypted: bool,
}
1585
1586impl PdfParser {
1587    /// Extract metadata from PDF
1588    pub fn extract_metadata(data: &[u8]) -> Result<PdfMetadata, PdfParseError> {
1589        use lopdf::Document;
1590
1591        let doc = Document::load_mem(data).map_err(|e| PdfParseError::ParseError(e.to_string()))?;
1592
1593        let page_count = doc.get_pages().len();
1594        let version = doc.version.clone();
1595        let is_encrypted = doc.is_encrypted();
1596
1597        let mut metadata = PdfMetadata {
1598            version,
1599            page_count,
1600            title: None,
1601            author: None,
1602            subject: None,
1603            keywords: None,
1604            creator: None,
1605            producer: None,
1606            creation_date: None,
1607            modification_date: None,
1608            is_encrypted,
1609        };
1610
1611        // Extract info dictionary
1612        if let Ok(info_ref) = doc.trailer.get(b"Info") {
1613            if let Ok(r) = info_ref.as_reference() {
1614                if let Ok(lopdf::Object::Dictionary(dict)) = doc.get_object(r) {
1615                    metadata.title = Self::get_string_from_dict(dict, b"Title");
1616                    metadata.author = Self::get_string_from_dict(dict, b"Author");
1617                    metadata.subject = Self::get_string_from_dict(dict, b"Subject");
1618                    metadata.keywords = Self::get_string_from_dict(dict, b"Keywords");
1619                    metadata.creator = Self::get_string_from_dict(dict, b"Creator");
1620                    metadata.producer = Self::get_string_from_dict(dict, b"Producer");
1621                    metadata.creation_date = Self::get_string_from_dict(dict, b"CreationDate");
1622                    metadata.modification_date = Self::get_string_from_dict(dict, b"ModDate");
1623                }
1624            }
1625        }
1626
1627        Ok(metadata)
1628    }
1629
1630    /// Get string value from dictionary
1631    fn get_string_from_dict(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
1632        if let Ok(lopdf::Object::String(bytes, _)) = dict.get(key) {
1633            if let Ok(s) = std::str::from_utf8(bytes) {
1634                let trimmed = s.trim();
1635                if !trimmed.is_empty() {
1636                    return Some(trimmed.to_string());
1637                }
1638            }
1639        }
1640        None
1641    }
1642}
1643
// Unit tests for format detection, Markdown/HTML parsing and quality
// analysis. The PDF parser and the TOC generator are not covered here.
#[cfg(test)]
mod tests {
    use super::*;

    // A `# ` heading and `**bold**` markers identify Markdown.
    #[test]
    fn test_format_detection_markdown() {
        let content = "# Hello World\n\nThis is a **test** document.";
        assert_eq!(DocumentFormat::detect(content), DocumentFormat::Markdown);
    }

    // HTML markers are matched case-insensitively (`<!DOCTYPE html>`).
    #[test]
    fn test_format_detection_html() {
        let content = "<!DOCTYPE html><html><body><p>Hello</p></body></html>";
        assert_eq!(DocumentFormat::detect(content), DocumentFormat::Html);
    }

    // Heading levels follow the number of leading `#` characters.
    #[test]
    fn test_markdown_heading_parsing() {
        let content = "# Title\n\n## Section 1\n\n### Subsection\n\nSome text.";
        let structure = DocumentParser::parse(content);

        assert_eq!(structure.headings.len(), 3);
        assert_eq!(structure.headings[0].level, 1);
        assert_eq!(structure.headings[0].text, "Title");
        assert_eq!(structure.headings[1].level, 2);
        assert_eq!(structure.headings[2].level, 3);
    }

    // An absolute https URL is external; a relative path is internal.
    #[test]
    fn test_markdown_link_extraction() {
        let content = "Check out [Rust](https://rust-lang.org) and [this](./local.md).";
        let structure = DocumentParser::parse(content);

        assert_eq!(structure.links.len(), 2);
        assert!(structure.links[0].is_external);
        assert!(!structure.links[1].is_external);
    }

    // The quoted image title must not leak into `src`.
    #[test]
    fn test_markdown_image_extraction() {
        let content = "![Alt text](image.png \"Title\")";
        let structure = DocumentParser::parse(content);

        assert_eq!(structure.images.len(), 1);
        assert_eq!(structure.images[0].alt, "Alt text");
        assert_eq!(structure.images[0].src, "image.png");
    }

    // The info string after the opening fence becomes the block language.
    #[test]
    fn test_markdown_code_block_extraction() {
        let content = "```rust\nfn main() {}\n```";
        let structure = DocumentParser::parse(content);

        assert_eq!(structure.code_blocks.len(), 1);
        assert_eq!(structure.code_blocks[0].language, Some("rust".to_string()));
    }

    // Every closing `>` inserts a space, hence "world !" after whitespace
    // normalisation.
    #[test]
    fn test_html_to_plain_text() {
        let html = "<p>Hello <strong>world</strong>!</p>";
        let plain = DocumentParser::html_to_plain_text(html);
        assert_eq!(plain, "Hello world !");
    }

    // A titled, sectioned document scores above 70 and reports no errors.
    #[test]
    fn test_quality_analysis() {
        let content = "# My Document\n\nThis is a test document with some content.\n\n## Section\n\nMore content here.";
        let structure = DocumentParser::parse(content);
        let quality = QualityAnalyzer::analyze(&structure);

        assert!(quality.overall_score > 70);
        assert!(
            quality.issues.is_empty()
                || quality
                    .issues
                    .iter()
                    .all(|i| i.severity != IssueSeverity::Error)
        );
    }
}
kaccy_ai/document.rs

kaccy_ai/
document.rs