1use serde::{Deserialize, Serialize};
7use std::fmt::Write as _;
8
/// Document formats distinguished by the parsers in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DocumentFormat {
    /// Markdown text.
    Markdown,
    /// HTML markup.
    Html,
    /// Text with no recognised markup; also the fallback for unknown input.
    PlainText,
    /// PDF data, recognised by the `%PDF-` magic bytes or a `pdf` extension.
    Pdf,
}
21
22impl DocumentFormat {
23 #[must_use]
25 pub fn detect(content: &str) -> Self {
26 let content_lower = content.to_lowercase();
27
28 if content_lower.contains("<!doctype html")
30 || content_lower.contains("<html")
31 || (content_lower.contains("<head") && content_lower.contains("<body"))
32 || content_lower.contains("<div")
33 || content_lower.contains("<p>")
34 {
35 return DocumentFormat::Html;
36 }
37
38 if content.contains("# ")
40 || content.contains("## ")
41 || content.contains("```")
42 || content.contains("**")
43 || content.contains("__")
44 || content.contains("](") || content.contains("![")
46 || content.contains("- [ ]")
47 || content.contains("- [x]")
48 {
49 return DocumentFormat::Markdown;
50 }
51
52 DocumentFormat::PlainText
53 }
54
55 #[must_use]
57 pub fn from_extension(ext: &str) -> Self {
58 match ext.to_lowercase().as_str() {
59 "md" | "markdown" | "mdown" | "mkd" => DocumentFormat::Markdown,
60 "html" | "htm" | "xhtml" => DocumentFormat::Html,
61 "pdf" => DocumentFormat::Pdf,
62 _ => DocumentFormat::PlainText,
63 }
64 }
65
66 #[must_use]
68 pub fn detect_from_bytes(data: &[u8]) -> Self {
69 if data.len() >= 5 && &data[0..5] == b"%PDF-" {
71 return DocumentFormat::Pdf;
72 }
73
74 if let Ok(content) = std::str::from_utf8(data) {
76 Self::detect(content)
77 } else {
78 DocumentFormat::PlainText
80 }
81 }
82}
83
/// Result of parsing a document: extracted elements plus summary numbers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentStructure {
    /// Format the content was parsed as.
    pub format: DocumentFormat,
    /// Document title, when one could be determined.
    pub title: Option<String>,
    /// Headings found in the document.
    pub headings: Vec<Heading>,
    /// Hyperlinks found in the document.
    pub links: Vec<Link>,
    /// Images found in the document.
    pub images: Vec<Image>,
    /// Code blocks found in the document.
    pub code_blocks: Vec<CodeBlock>,
    /// Text content with markup removed.
    pub plain_text: String,
    /// Number of whitespace-separated words in `plain_text`.
    pub word_count: usize,
    /// Number of `char`s in `plain_text`.
    pub char_count: usize,
    /// Estimated reading time; the parsers use 200 words per minute, minimum 1.
    pub reading_time_minutes: u32,
    /// Per-element counters.
    pub stats: DocumentStats,
}
110
/// A heading extracted from a document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Heading {
    /// Heading depth; the Markdown and HTML parsers produce 1 through 6.
    pub level: u8,
    /// Heading text without markup.
    pub text: String,
    /// Anchor slug; only the Markdown parser fills this in.
    pub anchor: Option<String>,
}
121
/// A hyperlink extracted from a document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Link {
    /// Link destination.
    pub url: String,
    /// Visible link text.
    pub text: String,
    /// Optional title attribute.
    pub title: Option<String>,
    /// `true` when `url` starts with `http://`, `https://` or `//`.
    pub is_external: bool,
}
134
/// An image reference extracted from a document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Image {
    /// Image source path or URL.
    pub src: String,
    /// Alternative text; empty when none was given.
    pub alt: String,
    /// Optional title attribute.
    pub title: Option<String>,
}
145
/// A block of code extracted from a document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeBlock {
    /// Language tag from the opening Markdown fence, if any.
    pub language: Option<String>,
    /// The code itself.
    pub code: String,
    /// Number of lines in `code`.
    pub line_count: usize,
}
156
/// Element counters for a parsed document; all default to zero.
///
/// What each counter measures depends on the parser. For Markdown,
/// `list_count`, `table_count` and `blockquote_count` count matching lines;
/// for HTML they count opening tags.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DocumentStats {
    /// Number of headings.
    pub heading_count: usize,
    /// Number of paragraphs (the PDF parser stores its page count here).
    pub paragraph_count: usize,
    /// Number of list items (Markdown) or `<ul>`/`<ol>` tags (HTML).
    pub list_count: usize,
    /// Number of links.
    pub link_count: usize,
    /// Number of images.
    pub image_count: usize,
    /// Number of code blocks.
    pub code_block_count: usize,
    /// Number of table rows (Markdown) or `<table>` tags (HTML).
    pub table_count: usize,
    /// Number of block-quote lines (Markdown) or `<blockquote>` tags (HTML).
    pub blockquote_count: usize,
}
177
/// Stateless entry point for parsing Markdown, HTML, plain text and PDF documents.
pub struct DocumentParser;
180
181impl DocumentParser {
182 #[must_use]
184 pub fn parse(content: &str) -> DocumentStructure {
185 let format = DocumentFormat::detect(content);
186
187 match format {
188 DocumentFormat::Markdown => Self::parse_markdown(content),
189 DocumentFormat::Html => Self::parse_html(content),
190 DocumentFormat::PlainText => Self::parse_plain_text(content),
191 DocumentFormat::Pdf => Self::parse_plain_text(content), }
193 }
194
195 #[must_use]
197 pub fn parse_with_format(content: &str, format: DocumentFormat) -> DocumentStructure {
198 match format {
199 DocumentFormat::Markdown => Self::parse_markdown(content),
200 DocumentFormat::Html => Self::parse_html(content),
201 DocumentFormat::PlainText => Self::parse_plain_text(content),
202 DocumentFormat::Pdf => Self::parse_plain_text(content), }
204 }
205
/// Parses an in-memory PDF by delegating to [`PdfParser::parse`].
///
/// # Errors
///
/// Returns whatever [`PdfParseError`] the PDF parser reports.
pub fn parse_pdf(data: &[u8]) -> Result<DocumentStructure, PdfParseError> {
    PdfParser::parse(data)
}
210
211 pub fn parse_pdf_file(path: &std::path::Path) -> Result<DocumentStructure, PdfParseError> {
213 let data = std::fs::read(path).map_err(|e| PdfParseError::IoError(e.to_string()))?;
214 Self::parse_pdf(&data)
215 }
216
217 fn parse_markdown(content: &str) -> DocumentStructure {
219 let mut headings = Vec::new();
220 let mut links = Vec::new();
221 let mut images = Vec::new();
222 let mut code_blocks = Vec::new();
223 let mut title = None;
224 let mut stats = DocumentStats::default();
225
226 let mut in_code_block = false;
227 let mut code_block_lang = None;
228 let mut code_block_content = String::new();
229
230 for line in content.lines() {
231 if line.starts_with("```") {
233 if in_code_block {
234 code_blocks.push(CodeBlock {
236 language: code_block_lang.take(),
237 line_count: code_block_content.lines().count(),
238 code: std::mem::take(&mut code_block_content),
239 });
240 stats.code_block_count += 1;
241 in_code_block = false;
242 } else {
243 let lang = line.trim_start_matches("```").trim();
245 code_block_lang = if lang.is_empty() {
246 None
247 } else {
248 Some(lang.to_string())
249 };
250 in_code_block = true;
251 }
252 continue;
253 }
254
255 if in_code_block {
256 code_block_content.push_str(line);
257 code_block_content.push('\n');
258 continue;
259 }
260
261 if let Some(heading) = Self::parse_markdown_heading(line) {
263 if title.is_none() && heading.level == 1 {
264 title = Some(heading.text.clone());
265 }
266 headings.push(heading);
267 stats.heading_count += 1;
268 }
269
270 Self::extract_markdown_links(line, &mut links);
272
273 Self::extract_markdown_images(line, &mut images);
275
276 if line.trim_start().starts_with("- ")
278 || line.trim_start().starts_with("* ")
279 || line.trim_start().starts_with("+ ")
280 || line
281 .trim_start()
282 .chars()
283 .next()
284 .is_some_and(|c| c.is_ascii_digit())
285 && line.contains(". ")
286 {
287 stats.list_count += 1;
288 }
289
290 if line.trim_start().starts_with("> ") {
292 stats.blockquote_count += 1;
293 }
294
295 if line.contains('|') && line.trim().starts_with('|') {
297 stats.table_count += 1;
298 }
299 }
300
301 stats.link_count = links.len();
302 stats.image_count = images.len();
303
304 let plain_text = Self::markdown_to_plain_text(content);
306 let word_count = plain_text.split_whitespace().count();
307 let char_count = plain_text.chars().count();
308
309 stats.paragraph_count = content
311 .split("\n\n")
312 .filter(|p| !p.trim().is_empty() && !p.trim().starts_with('#'))
313 .count();
314
315 DocumentStructure {
316 format: DocumentFormat::Markdown,
317 title,
318 headings,
319 links,
320 images,
321 code_blocks,
322 plain_text,
323 word_count,
324 char_count,
325 reading_time_minutes: (word_count / 200).max(1) as u32,
326 stats,
327 }
328 }
329
330 fn parse_markdown_heading(line: &str) -> Option<Heading> {
332 let trimmed = line.trim();
333 if !trimmed.starts_with('#') {
334 return None;
335 }
336
337 let mut level = 0u8;
338 for c in trimmed.chars() {
339 if c == '#' {
340 level += 1;
341 } else {
342 break;
343 }
344 }
345
346 if level > 6 {
347 return None;
348 }
349
350 let text = trimmed.trim_start_matches('#').trim().to_string();
351 if text.is_empty() {
352 return None;
353 }
354
355 let anchor = text
357 .to_lowercase()
358 .replace(' ', "-")
359 .chars()
360 .filter(|c| c.is_alphanumeric() || *c == '-')
361 .collect::<String>();
362
363 Some(Heading {
364 level,
365 text,
366 anchor: Some(anchor),
367 })
368 }
369
370 fn extract_markdown_links(line: &str, links: &mut Vec<Link>) {
372 let mut remaining = line;
373
374 while let Some(start) = remaining.find('[') {
375 let after_start = &remaining[start + 1..];
376
377 if let Some(close) = after_start.find(']') {
379 let text = &after_start[..close];
380 let after_close = &after_start[close + 1..];
381
382 if after_close.starts_with('(') {
384 if let Some(paren_close) = after_close.find(')') {
385 let url_part = &after_close[1..paren_close];
386
387 let (url, title) = if let Some(quote_start) = url_part.find('"') {
389 let url = url_part[..quote_start].trim().to_string();
390 let title_part = &url_part[quote_start + 1..];
391 let title = title_part.trim_end_matches('"').to_string();
392 (url, Some(title))
393 } else {
394 (url_part.trim().to_string(), None)
395 };
396
397 if !remaining[..start].ends_with('!') && !url.is_empty() {
399 let is_external = url.starts_with("http://")
400 || url.starts_with("https://")
401 || url.starts_with("//");
402
403 links.push(Link {
404 url,
405 text: text.to_string(),
406 title,
407 is_external,
408 });
409 }
410
411 remaining = &after_close[paren_close + 1..];
412 continue;
413 }
414 }
415 }
416
417 remaining = &remaining[start + 1..];
418 }
419 }
420
421 fn extract_markdown_images(line: &str, images: &mut Vec<Image>) {
423 let mut remaining = line;
424
425 while let Some(start) = remaining.find("![") {
426 let after_start = &remaining[start + 2..];
427
428 if let Some(close) = after_start.find(']') {
430 let alt = &after_start[..close];
431 let after_close = &after_start[close + 1..];
432
433 if after_close.starts_with('(') {
435 if let Some(paren_close) = after_close.find(')') {
436 let src_part = &after_close[1..paren_close];
437
438 let (src, title) = if let Some(quote_start) = src_part.find('"') {
440 let src = src_part[..quote_start].trim().to_string();
441 let title_part = &src_part[quote_start + 1..];
442 let title = title_part.trim_end_matches('"').to_string();
443 (src, Some(title))
444 } else {
445 (src_part.trim().to_string(), None)
446 };
447
448 if !src.is_empty() {
449 images.push(Image {
450 src,
451 alt: alt.to_string(),
452 title,
453 });
454 }
455
456 remaining = &after_close[paren_close + 1..];
457 continue;
458 }
459 }
460 }
461
462 remaining = &remaining[start + 2..];
463 }
464 }
465
466 fn markdown_to_plain_text(content: &str) -> String {
468 let mut result = String::new();
469 let mut in_code_block = false;
470
471 for line in content.lines() {
472 if line.starts_with("```") {
473 in_code_block = !in_code_block;
474 continue;
475 }
476
477 if in_code_block {
478 continue;
479 }
480
481 let line = if line.starts_with('#') {
483 line.trim_start_matches('#').trim()
484 } else {
485 line
486 };
487
488 let line = line
490 .replace("**", "")
491 .replace("__", "")
492 .replace(['*', '_'], "");
493
494 let line = Self::remove_inline_code(&line);
496
497 let line = Self::remove_markdown_links(&line);
499
500 let line = Self::remove_markdown_images(&line);
502
503 if !line.trim().is_empty() {
504 result.push_str(&line);
505 result.push(' ');
506 }
507 }
508
509 result.trim().to_string()
510 }
511
/// Removes backtick-delimited inline code, including the backticks.
///
/// Each backtick toggles code mode, so text after an unmatched backtick is
/// dropped up to the end of the line.
fn remove_inline_code(line: &str) -> String {
    // Splitting on '`' yields segments alternating outside/inside code,
    // starting outside; keep only the even-indexed ones.
    line.split('`').step_by(2).collect()
}
527
/// Replaces every inline link `[text](url)` in `line` with just `text`.
///
/// Brackets that are not followed by `(...)` are left untouched, and scanning
/// continues past them so later links on the same line are still rewritten.
fn remove_markdown_links(line: &str) -> String {
    let mut result = line.to_string();
    let mut search_from = 0;

    while let Some(offset) = result[search_from..].find('[') {
        let start = search_from + offset;

        // Without a ']' after this '[', no later '[' can have one either.
        let Some(close) = result[start..].find(']').map(|i| start + i) else {
            break;
        };

        if result.as_bytes().get(close + 1) != Some(&b'(') {
            // Plain bracketed text, e.g. "[note]": skip it and keep looking.
            search_from = start + 1;
            continue;
        }

        // Without a ')' after this point, no later link can be complete.
        let Some(paren_close) = result[close..].find(')').map(|i| close + i) else {
            break;
        };

        // Drop "](url)" first so `start` stays valid, then the opening '['.
        result.replace_range(close..=paren_close, "");
        result.replace_range(start..=start, "");

        // Rescan from the link text itself: it may contain nested brackets.
        search_from = start;
    }

    result
}
553
/// Removes every inline image `![alt](src)` from `line`.
///
/// `![...]` sequences that are not followed by `(...)` are left untouched,
/// and scanning continues past them so later images are still removed.
fn remove_markdown_images(line: &str) -> String {
    let mut result = line.to_string();
    let mut search_from = 0;

    while let Some(offset) = result[search_from..].find("![") {
        let start = search_from + offset;

        // Without a ']' after this point, no later image can have one either.
        let Some(close) = result[start..].find(']').map(|i| start + i) else {
            break;
        };

        if result.as_bytes().get(close + 1) != Some(&b'(') {
            // Not an image, e.g. "![badge]": skip it and keep looking.
            search_from = start + 2;
            continue;
        }

        // Without a ')' after this point, no later image can be complete.
        let Some(paren_close) = result[close..].find(')').map(|i| close + i) else {
            break;
        };

        result.replace_range(start..=paren_close, "");

        // A '!' just before the removed image may now join a following '['
        // into a new "![", so step back one byte in that case.
        search_from = if result[..start].ends_with('!') {
            start - 1
        } else {
            start
        };
    }

    result
}
577
578 fn parse_html(content: &str) -> DocumentStructure {
580 let mut headings = Vec::new();
581 let mut links = Vec::new();
582 let mut images = Vec::new();
583 let mut code_blocks = Vec::new();
584 let mut title = None;
585 let mut stats = DocumentStats::default();
586
587 if let Some(title_text) = Self::extract_html_tag_content(content, "title") {
589 title = Some(title_text);
590 }
591
592 for level in 1..=6 {
594 let tag = format!("h{level}");
595 for text in Self::extract_all_html_tag_contents(content, &tag) {
596 if title.is_none() && level == 1 {
597 title = Some(text.clone());
598 }
599 headings.push(Heading {
600 level: level as u8,
601 text,
602 anchor: None,
603 });
604 stats.heading_count += 1;
605 }
606 }
607
608 Self::extract_html_links(content, &mut links);
610 stats.link_count = links.len();
611
612 Self::extract_html_images(content, &mut images);
614 stats.image_count = images.len();
615
616 for code in Self::extract_all_html_tag_contents(content, "code") {
618 code_blocks.push(CodeBlock {
619 language: None,
620 line_count: code.lines().count(),
621 code,
622 });
623 stats.code_block_count += 1;
624 }
625
626 stats.paragraph_count = Self::count_html_tags(content, "p");
628 stats.list_count =
629 Self::count_html_tags(content, "ul") + Self::count_html_tags(content, "ol");
630 stats.table_count = Self::count_html_tags(content, "table");
631 stats.blockquote_count = Self::count_html_tags(content, "blockquote");
632
633 let plain_text = Self::html_to_plain_text(content);
635 let word_count = plain_text.split_whitespace().count();
636 let char_count = plain_text.chars().count();
637
638 DocumentStructure {
639 format: DocumentFormat::Html,
640 title,
641 headings,
642 links,
643 images,
644 code_blocks,
645 plain_text,
646 word_count,
647 char_count,
648 reading_time_minutes: (word_count / 200).max(1) as u32,
649 stats,
650 }
651 }
652
653 fn extract_html_tag_content(content: &str, tag: &str) -> Option<String> {
655 let open_tag = format!("<{tag}");
656 let close_tag = format!("</{tag}>");
657
658 let start = content.to_lowercase().find(&open_tag)?;
659 let after_open = &content[start..];
660
661 let tag_end = after_open.find('>')?;
663 let content_start = start + tag_end + 1;
664
665 let close_pos = content[content_start..].to_lowercase().find(&close_tag)?;
666
667 let text = &content[content_start..content_start + close_pos];
668 Some(Self::html_to_plain_text(text).trim().to_string())
669 }
670
671 fn extract_all_html_tag_contents(content: &str, tag: &str) -> Vec<String> {
673 let mut results = Vec::new();
674 let content_lower = content.to_lowercase();
675 let open_tag = format!("<{tag}");
676 let close_tag = format!("</{tag}>");
677
678 let mut search_start = 0;
679 while let Some(start) = content_lower[search_start..].find(&open_tag) {
680 let absolute_start = search_start + start;
681 let after_open = &content[absolute_start..];
682
683 if let Some(tag_end) = after_open.find('>') {
684 let content_start = absolute_start + tag_end + 1;
685
686 if let Some(close_pos) = content_lower[content_start..].find(&close_tag) {
687 let text = &content[content_start..content_start + close_pos];
688 let clean_text = Self::html_to_plain_text(text).trim().to_string();
689 if !clean_text.is_empty() {
690 results.push(clean_text);
691 }
692 search_start = content_start + close_pos + close_tag.len();
693 continue;
694 }
695 }
696
697 search_start = absolute_start + 1;
698 }
699
700 results
701 }
702
/// Counts opening `<tag>` elements in `content`, ignoring ASCII case.
///
/// The tag name must be followed by `>`, `/` or whitespace, so counting `p`
/// does not also count `<pre>` or `<param>`. `tag` is expected in lowercase.
fn count_html_tags(content: &str, tag: &str) -> usize {
    let lower = content.to_ascii_lowercase();
    let open_tag = format!("<{tag}");

    lower
        .match_indices(open_tag.as_str())
        .filter(|(index, matched)| {
            matches!(
                lower.as_bytes().get(index + matched.len()),
                Some(b) if *b == b'>' || *b == b'/' || b.is_ascii_whitespace()
            )
        })
        .count()
}
708
709 fn extract_html_links(content: &str, links: &mut Vec<Link>) {
711 let content_lower = content.to_lowercase();
712 let mut search_start = 0;
713
714 while let Some(start) = content_lower[search_start..].find("<a ") {
715 let absolute_start = search_start + start;
716 let after_open = &content[absolute_start..];
717
718 if let Some(tag_end) = after_open.find('>') {
719 let tag_content = &after_open[..tag_end];
720
721 if let Some(href) = Self::extract_html_attribute(tag_content, "href") {
723 let close_pos = content_lower[absolute_start..].find("</a>");
724
725 let text = if let Some(close) = close_pos {
726 let content_start = absolute_start + tag_end + 1;
727 let content_end = absolute_start + close;
728 Self::html_to_plain_text(&content[content_start..content_end])
729 .trim()
730 .to_string()
731 } else {
732 String::new()
733 };
734
735 let title = Self::extract_html_attribute(tag_content, "title");
736 let is_external = href.starts_with("http://")
737 || href.starts_with("https://")
738 || href.starts_with("//");
739
740 links.push(Link {
741 url: href,
742 text,
743 title,
744 is_external,
745 });
746 }
747
748 search_start = absolute_start + tag_end;
749 } else {
750 search_start = absolute_start + 1;
751 }
752 }
753 }
754
755 fn extract_html_images(content: &str, images: &mut Vec<Image>) {
757 let content_lower = content.to_lowercase();
758 let mut search_start = 0;
759
760 while let Some(start) = content_lower[search_start..].find("<img ") {
761 let absolute_start = search_start + start;
762 let after_open = &content[absolute_start..];
763
764 if let Some(tag_end) = after_open.find('>').or_else(|| after_open.find("/>")) {
765 let tag_content = &after_open[..tag_end];
766
767 if let Some(src) = Self::extract_html_attribute(tag_content, "src") {
768 let alt = Self::extract_html_attribute(tag_content, "alt").unwrap_or_default();
769 let title = Self::extract_html_attribute(tag_content, "title");
770
771 images.push(Image { src, alt, title });
772 }
773
774 search_start = absolute_start + tag_end;
775 } else {
776 search_start = absolute_start + 1;
777 }
778 }
779 }
780
/// Extracts the value of attribute `attr` from the text of an opening tag.
///
/// `tag_content` is the tag without its closing `>`; `attr` is expected in
/// lowercase and is matched ignoring ASCII case. The attribute name must
/// start the string or follow whitespace, so `href` does not match inside
/// `data-href`. Quoted values (single or double) end at the matching quote;
/// unquoted values end at whitespace, `>` or the end of the tag text.
///
/// Returns `None` when the attribute is absent, has nothing after `=`, or has
/// an unterminated quoted value.
fn extract_html_attribute(tag_content: &str, attr: &str) -> Option<String> {
    let attr_pattern = format!("{attr}=");
    // ASCII lowercasing preserves byte offsets into `tag_content`.
    let lower = tag_content.to_ascii_lowercase();

    let attr_start = lower
        .match_indices(attr_pattern.as_str())
        .map(|(index, _)| index)
        .find(|&index| index == 0 || lower.as_bytes()[index - 1].is_ascii_whitespace())?;
    let value = &tag_content[attr_start + attr_pattern.len()..];

    match value.chars().next()? {
        quote @ ('"' | '\'') => {
            let inner = &value[1..];
            let end = inner.find(quote)?;
            Some(inner[..end].to_string())
        }
        _ => {
            // An unquoted value that is the last attribute runs to the end.
            let end = value
                .find(|c: char| c.is_whitespace() || c == '>')
                .unwrap_or(value.len());
            Some(value[..end].to_string())
        }
    }
}
802
/// Strips tags from HTML `content` and returns whitespace-normalised text.
///
/// Everything inside `<...>` is dropped, as is the content of `<script>` and
/// `<style>` elements. Each closing `>` contributes a space so adjacent
/// elements do not run together. A small set of entities (`&nbsp;`, `&lt;`,
/// `&gt;`, `&quot;`, `&#39;`, `&apos;`, `&amp;`) is decoded, and runs of
/// whitespace collapse to single spaces.
fn html_to_plain_text(content: &str) -> String {
    // ASCII lowercasing keeps byte offsets aligned with `content`, so the
    // marker checks below look at exactly the character being processed.
    let lower = content.to_ascii_lowercase();
    let mut text = String::with_capacity(content.len());
    let mut in_tag = false;
    let mut in_script = false;
    let mut in_style = false;

    for (index, ch) in content.char_indices() {
        match ch {
            '<' => {
                // Every marker starts with '<', so only check here.
                let upcoming = &lower[index..];
                if upcoming.starts_with("<script") {
                    in_script = true;
                } else if upcoming.starts_with("</scrip") {
                    in_script = false;
                }
                if upcoming.starts_with("<style") {
                    in_style = true;
                } else if upcoming.starts_with("</styl") {
                    in_style = false;
                }
                in_tag = true;
            }
            '>' => {
                in_tag = false;
                text.push(' ');
            }
            _ if !in_tag && !in_script && !in_style => text.push(ch),
            _ => {}
        }
    }

    // `&amp;` is decoded last so that escaped entities such as `&amp;lt;`
    // become the literal text `&lt;` instead of being decoded twice.
    let decoded = text
        .replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        .replace("&amp;", "&");

    decoded.split_whitespace().collect::<Vec<_>>().join(" ")
}
863
864 fn parse_plain_text(content: &str) -> DocumentStructure {
866 let word_count = content.split_whitespace().count();
867 let char_count = content.chars().count();
868 let paragraph_count = content
869 .split("\n\n")
870 .filter(|p| !p.trim().is_empty())
871 .count();
872
873 DocumentStructure {
874 format: DocumentFormat::PlainText,
875 title: None,
876 headings: Vec::new(),
877 links: Vec::new(),
878 images: Vec::new(),
879 code_blocks: Vec::new(),
880 plain_text: content.to_string(),
881 word_count,
882 char_count,
883 reading_time_minutes: (word_count / 200).max(1) as u32,
884 stats: DocumentStats {
885 paragraph_count,
886 ..Default::default()
887 },
888 }
889 }
890}
891
/// Quality report produced by [`QualityAnalyzer::analyze`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentQuality {
    /// Midpoint of `structure_score` and `readability_score`.
    pub overall_score: u32,
    /// Score derived from average sentence length (at most 100).
    pub readability_score: u32,
    /// Score derived from detected issues, title and headings (at most 100).
    pub structure_score: u32,
    /// Problems found in the document.
    pub issues: Vec<QualityIssue>,
    /// Human-readable improvement suggestions.
    pub suggestions: Vec<String>,
}
906
/// A single problem detected during quality analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityIssue {
    /// How serious the problem is.
    pub severity: IssueSeverity,
    /// Human-readable description of the problem.
    pub description: String,
}
915
/// Severity of a [`QualityIssue`]; the structure score subtracts 2, 5 and 15
/// points for `Info`, `Warning` and `Error` respectively.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum IssueSeverity {
    /// Informational note.
    Info,
    /// Should be fixed, but the document is still usable.
    Warning,
    /// Definite defect.
    Error,
}
923
/// Stateless analyzer that scores a parsed document and lists its issues.
pub struct QualityAnalyzer;
926
927impl QualityAnalyzer {
928 #[must_use]
930 pub fn analyze(structure: &DocumentStructure) -> DocumentQuality {
931 let mut issues = Vec::new();
932 let mut suggestions = Vec::new();
933
934 if structure.title.is_none() {
936 issues.push(QualityIssue {
937 severity: IssueSeverity::Warning,
938 description: "Document has no title".to_string(),
939 });
940 suggestions
941 .push("Add a main heading (# Title) at the start of the document".to_string());
942 }
943
944 let mut prev_level = 0u8;
946 for heading in &structure.headings {
947 if heading.level > prev_level + 1 && prev_level > 0 {
948 issues.push(QualityIssue {
949 severity: IssueSeverity::Warning,
950 description: format!(
951 "Heading level jumps from {} to {}: '{}'",
952 prev_level, heading.level, heading.text
953 ),
954 });
955 }
956 prev_level = heading.level;
957 }
958
959 if structure.word_count < 100 {
961 issues.push(QualityIssue {
962 severity: IssueSeverity::Info,
963 description: "Document is very short".to_string(),
964 });
965 } else if structure.word_count > 5000 {
966 suggestions.push("Consider breaking long documents into multiple sections".to_string());
967 }
968
969 for link in &structure.links {
971 if link.url.is_empty() {
972 issues.push(QualityIssue {
973 severity: IssueSeverity::Error,
974 description: format!("Empty link URL for text: '{}'", link.text),
975 });
976 }
977 if link.text.is_empty() {
978 issues.push(QualityIssue {
979 severity: IssueSeverity::Warning,
980 description: format!("Link has no text: '{}'", link.url),
981 });
982 }
983 }
984
985 for image in &structure.images {
987 if image.alt.is_empty() {
988 issues.push(QualityIssue {
989 severity: IssueSeverity::Warning,
990 description: format!("Image missing alt text: '{}'", image.src),
991 });
992 }
993 }
994
995 let structure_score = Self::calculate_structure_score(structure, &issues);
997 let readability_score = Self::calculate_readability_score(structure);
998 let overall_score = u32::midpoint(structure_score, readability_score);
999
1000 DocumentQuality {
1001 overall_score,
1002 readability_score,
1003 structure_score,
1004 issues,
1005 suggestions,
1006 }
1007 }
1008
1009 fn calculate_structure_score(structure: &DocumentStructure, issues: &[QualityIssue]) -> u32 {
1011 let mut score = 100u32;
1012
1013 for issue in issues {
1015 match issue.severity {
1016 IssueSeverity::Error => score = score.saturating_sub(15),
1017 IssueSeverity::Warning => score = score.saturating_sub(5),
1018 IssueSeverity::Info => score = score.saturating_sub(2),
1019 }
1020 }
1021
1022 if structure.title.is_some() {
1024 score = score.saturating_add(5).min(100);
1025 }
1026 if !structure.headings.is_empty() {
1027 score = score.saturating_add(5).min(100);
1028 }
1029
1030 score
1031 }
1032
1033 fn calculate_readability_score(structure: &DocumentStructure) -> u32 {
1035 let words = structure.word_count;
1036 if words == 0 {
1037 return 50;
1038 }
1039
1040 let sentence_count = structure.plain_text.matches(['.', '!', '?']).count().max(1);
1042
1043 let avg_words_per_sentence = words as f64 / sentence_count as f64;
1045
1046 let score = if avg_words_per_sentence < 10.0 {
1048 70 + ((avg_words_per_sentence / 10.0) * 20.0) as u32
1049 } else if avg_words_per_sentence <= 20.0 {
1050 90 + (10.0 - (avg_words_per_sentence - 15.0).abs()) as u32
1051 } else if avg_words_per_sentence <= 30.0 {
1052 70 - ((avg_words_per_sentence - 20.0) * 2.0) as u32
1053 } else {
1054 50
1055 };
1056
1057 score.min(100)
1058 }
1059}
1060
/// Stateless generator that builds a table of contents from document headings.
pub struct TocGenerator;
1063
1064impl TocGenerator {
1065 #[must_use]
1067 pub fn generate(structure: &DocumentStructure) -> Vec<TocEntry> {
1068 structure
1069 .headings
1070 .iter()
1071 .map(|h| TocEntry {
1072 level: h.level,
1073 text: h.text.clone(),
1074 anchor: h.anchor.clone(),
1075 })
1076 .collect()
1077 }
1078
1079 #[must_use]
1081 pub fn generate_markdown(structure: &DocumentStructure) -> String {
1082 let mut result = String::new();
1083
1084 for heading in &structure.headings {
1085 let indent = " ".repeat((heading.level - 1) as usize);
1086 let anchor = heading
1087 .anchor
1088 .as_ref()
1089 .map(|a| format!("#{a}"))
1090 .unwrap_or_default();
1091
1092 let _ = writeln!(result, "{}- [{}]({})", indent, heading.text, anchor);
1093 }
1094
1095 result
1096 }
1097}
1098
/// One table-of-contents entry, copied from a [`Heading`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TocEntry {
    /// Heading level of the entry.
    pub level: u8,
    /// Heading text.
    pub text: String,
    /// Anchor slug of the heading, if it has one.
    pub anchor: Option<String>,
}
1109
/// Stateless helper that parses a document and summarises it as [`DocumentMetadata`].
pub struct MetadataExtractor;
1112
1113impl MetadataExtractor {
1114 #[must_use]
1116 pub fn extract(content: &str) -> DocumentMetadata {
1117 let structure = DocumentParser::parse(content);
1118 let quality = QualityAnalyzer::analyze(&structure);
1119
1120 DocumentMetadata {
1121 format: structure.format,
1122 title: structure.title,
1123 word_count: structure.word_count,
1124 char_count: structure.char_count,
1125 reading_time_minutes: structure.reading_time_minutes,
1126 heading_count: structure.stats.heading_count,
1127 link_count: structure.stats.link_count,
1128 image_count: structure.stats.image_count,
1129 code_block_count: structure.stats.code_block_count,
1130 quality_score: quality.overall_score,
1131 external_links: structure.links.iter().filter(|l| l.is_external).count(),
1132 internal_links: structure.links.iter().filter(|l| !l.is_external).count(),
1133 }
1134 }
1135}
1136
/// Flat summary of a parsed document, as built by [`MetadataExtractor::extract`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMetadata {
    /// Detected document format.
    pub format: DocumentFormat,
    /// Document title, when one was found.
    pub title: Option<String>,
    /// Number of words in the plain text.
    pub word_count: usize,
    /// Number of characters in the plain text.
    pub char_count: usize,
    /// Estimated reading time in minutes.
    pub reading_time_minutes: u32,
    /// Number of headings.
    pub heading_count: usize,
    /// Number of links.
    pub link_count: usize,
    /// Number of images.
    pub image_count: usize,
    /// Number of code blocks.
    pub code_block_count: usize,
    /// Overall quality score from [`QualityAnalyzer::analyze`].
    pub quality_score: u32,
    /// Number of links flagged as external.
    pub external_links: usize,
    /// Number of links not flagged as external.
    pub internal_links: usize,
}
1165
/// Errors reported while reading or parsing a PDF document.
///
/// Every variant carries a human-readable detail message.
#[derive(Debug, Clone)]
pub enum PdfParseError {
    /// The PDF file could not be read.
    IoError(String),
    /// The data is not in a valid PDF format.
    InvalidFormat(String),
    /// The PDF document could not be loaded.
    ParseError(String),
    /// Text or a page could not be extracted from the loaded document.
    ExtractionError(String),
}

impl std::fmt::Display for PdfParseError {
    /// Formats the error as `<category>: <detail>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (category, detail) = match self {
            Self::IoError(detail) => ("IO error", detail),
            Self::InvalidFormat(detail) => ("Invalid PDF format", detail),
            Self::ParseError(detail) => ("Parse error", detail),
            Self::ExtractionError(detail) => ("Extraction error", detail),
        };
        write!(f, "{category}: {detail}")
    }
}

impl std::error::Error for PdfParseError {}
1191
/// Stateless PDF parser built on the `lopdf` crate.
pub struct PdfParser;
1194
1195impl PdfParser {
1196 pub fn parse(data: &[u8]) -> Result<DocumentStructure, PdfParseError> {
1198 use lopdf::Document;
1199
1200 let doc = Document::load_mem(data).map_err(|e| PdfParseError::ParseError(e.to_string()))?;
1201
1202 let mut all_text = String::new();
1203 let mut page_count = 0;
1204
1205 let pages = doc.get_pages();
1207 for (page_num, _) in &pages {
1208 page_count += 1;
1209 if let Ok(text) = Self::extract_page_text(&doc, *page_num) {
1210 all_text.push_str(&text);
1211 all_text.push('\n');
1212 }
1213 }
1214
1215 let plain_text = Self::clean_extracted_text(&all_text);
1216 let word_count = plain_text.split_whitespace().count();
1217 let char_count = plain_text.chars().count();
1218
1219 let title = Self::extract_title(&doc, &plain_text);
1221
1222 let headings = Self::detect_headings(&plain_text);
1224 let heading_count = headings.len();
1225
1226 let links = Self::extract_links(&doc);
1228 let link_count = links.len();
1229
1230 Ok(DocumentStructure {
1231 format: DocumentFormat::Pdf,
1232 title,
1233 headings,
1234 links,
1235 images: Vec::new(), code_blocks: Vec::new(),
1237 plain_text,
1238 word_count,
1239 char_count,
1240 reading_time_minutes: (word_count / 200).max(1) as u32,
1241 stats: DocumentStats {
1242 heading_count,
1243 paragraph_count: page_count,
1244 link_count,
1245 ..Default::default()
1246 },
1247 })
1248 }
1249
1250 fn extract_page_text(doc: &lopdf::Document, page_num: u32) -> Result<String, PdfParseError> {
1252 let page_id = doc
1253 .page_iter()
1254 .nth((page_num - 1) as usize)
1255 .ok_or_else(|| PdfParseError::ExtractionError(format!("Page {page_num} not found")))?;
1256
1257 let content = doc
1258 .get_page_content(page_id)
1259 .map_err(|e| PdfParseError::ExtractionError(e.to_string()))?;
1260
1261 let text = Self::parse_content_stream(&content, doc);
1263 Ok(text)
1264 }
1265
1266 fn parse_content_stream(content: &[u8], doc: &lopdf::Document) -> String {
1268 use lopdf::content::Content;
1269
1270 let mut text = String::new();
1271
1272 if let Ok(content_obj) = Content::decode(content) {
1273 for operation in content_obj.operations {
1274 match operation.operator.as_str() {
1275 "Tj" | "TJ" => {
1276 for operand in &operation.operands {
1278 Self::extract_text_from_object(operand, doc, &mut text);
1279 }
1280 }
1281 "'" | "\"" => {
1282 text.push('\n');
1284 for operand in &operation.operands {
1285 Self::extract_text_from_object(operand, doc, &mut text);
1286 }
1287 }
1288 _ => {}
1289 }
1290 }
1291 }
1292
1293 text
1294 }
1295
1296 fn extract_text_from_object(obj: &lopdf::Object, _doc: &lopdf::Document, text: &mut String) {
1298 use lopdf::Object;
1299
1300 match obj {
1301 Object::String(bytes, _) => {
1302 if let Ok(s) = std::str::from_utf8(bytes) {
1304 text.push_str(s);
1305 } else {
1306 let s: String = bytes.iter().map(|&b| b as char).collect();
1308 text.push_str(&s);
1309 }
1310 }
1311 Object::Array(arr) => {
1312 for item in arr {
1313 match item {
1314 Object::String(bytes, _) => {
1315 if let Ok(s) = std::str::from_utf8(bytes) {
1316 text.push_str(s);
1317 } else {
1318 let s: String = bytes.iter().map(|&b| b as char).collect();
1319 text.push_str(&s);
1320 }
1321 }
1322 Object::Integer(n) => {
1323 if *n < -100 {
1325 text.push(' ');
1326 }
1327 }
1328 Object::Real(n) => {
1329 if *n < -100.0 {
1330 text.push(' ');
1331 }
1332 }
1333 _ => {}
1334 }
1335 }
1336 }
1337 _ => {}
1338 }
1339 }
1340
/// Normalises whitespace in extracted text.
///
/// Runs of `\n`/`\r` collapse to a single `\n`, runs of other whitespace
/// collapse to a single space, remaining control characters are dropped, and
/// the result is trimmed at both ends.
fn clean_extracted_text(text: &str) -> String {
    let mut cleaned = String::new();
    // Both flags start `true` so leading blanks and newlines are dropped.
    let mut after_space = true;
    let mut after_newline = true;

    for ch in text.chars() {
        match ch {
            '\n' | '\r' => {
                if !after_newline {
                    cleaned.push('\n');
                    after_newline = true;
                    after_space = true;
                }
            }
            _ if ch.is_whitespace() => {
                if !after_space {
                    cleaned.push(' ');
                    after_space = true;
                }
            }
            // Non-whitespace control characters are discarded.
            _ if ch.is_control() => {}
            _ => {
                cleaned.push(ch);
                after_space = false;
                after_newline = false;
            }
        }
    }

    cleaned.trim().to_string()
}
1371
1372 fn extract_title(doc: &lopdf::Document, text: &str) -> Option<String> {
1374 if let Ok(info) = doc.trailer.get(b"Info") {
1376 if let Ok(lopdf::Object::Dictionary(dict)) = doc.get_object(info.as_reference().ok()?) {
1377 if let Ok(lopdf::Object::String(bytes, _)) = dict.get(b"Title") {
1378 if let Ok(s) = std::str::from_utf8(bytes) {
1379 let title = s.trim();
1380 if !title.is_empty() {
1381 return Some(title.to_string());
1382 }
1383 }
1384 }
1385 }
1386 }
1387
1388 for line in text.lines().take(10) {
1390 let trimmed = line.trim();
1391 if trimmed.len() > 3 && trimmed.len() < 200 {
1392 let word_count = trimmed.split_whitespace().count();
1394 if word_count <= 15 && !trimmed.ends_with('.') {
1395 return Some(trimmed.to_string());
1396 }
1397 }
1398 }
1399
1400 None
1401 }
1402
1403 fn detect_headings(text: &str) -> Vec<Heading> {
1405 let mut headings = Vec::new();
1406 let lines: Vec<&str> = text.lines().collect();
1407 let numbered_heading = regex::Regex::new(r"^(\d+\.)+\d*\s+[A-Z]").ok();
1408
1409 for (i, line) in lines.iter().enumerate() {
1410 let trimmed = line.trim();
1411
1412 if trimmed.is_empty() || trimmed.len() > 200 {
1414 continue;
1415 }
1416
1417 if let Some(re) = &numbered_heading {
1419 if re.is_match(trimmed) {
1420 let depth = trimmed.matches('.').count();
1421 let level = (depth.min(5) + 1) as u8;
1422 headings.push(Heading {
1423 level,
1424 text: trimmed.to_string(),
1425 anchor: None,
1426 });
1427 continue;
1428 }
1429 }
1430
1431 let word_count = trimmed.split_whitespace().count();
1433 if (1..=10).contains(&word_count)
1434 && trimmed
1435 .chars()
1436 .filter(|c| c.is_alphabetic())
1437 .all(char::is_uppercase)
1438 && trimmed.chars().any(char::is_alphabetic)
1439 {
1440 headings.push(Heading {
1441 level: 2,
1442 text: trimmed.to_string(),
1443 anchor: None,
1444 });
1445 continue;
1446 }
1447
1448 if i + 1 < lines.len() {
1450 let next_line = lines[i + 1].trim();
1451 if next_line.is_empty() && word_count <= 8 && !trimmed.ends_with('.') {
1452 if trimmed.chars().next().is_some_and(char::is_uppercase) {
1454 headings.push(Heading {
1455 level: 3,
1456 text: trimmed.to_string(),
1457 anchor: None,
1458 });
1459 }
1460 }
1461 }
1462 }
1463
1464 headings
1465 }
1466
1467 fn extract_links(doc: &lopdf::Document) -> Vec<Link> {
1469 let mut links = Vec::new();
1470
1471 for (_page_num, page_id) in doc.get_pages() {
1472 if let Ok(lopdf::Object::Dictionary(dict)) = doc.get_object(page_id) {
1473 if let Ok(annots) = dict.get(b"Annots") {
1474 Self::extract_links_from_annotations(doc, annots, &mut links);
1475 }
1476 }
1477 }
1478
1479 links
1480 }
1481
1482 fn extract_links_from_annotations(
1484 doc: &lopdf::Document,
1485 annots: &lopdf::Object,
1486 links: &mut Vec<Link>,
1487 ) {
1488 let annot_refs = match annots {
1489 lopdf::Object::Array(arr) => arr.clone(),
1490 lopdf::Object::Reference(r) => {
1491 if let Ok(lopdf::Object::Array(arr)) = doc.get_object(*r) {
1492 arr.clone()
1493 } else {
1494 return;
1495 }
1496 }
1497 _ => return,
1498 };
1499
1500 for annot_ref in annot_refs {
1501 let annot = match &annot_ref {
1502 lopdf::Object::Reference(r) => doc.get_object(*r).ok().cloned(),
1503 obj => Some(obj.clone()),
1504 };
1505
1506 if let Some(lopdf::Object::Dictionary(dict)) = annot {
1507 if let Ok(lopdf::Object::Name(subtype)) = dict.get(b"Subtype") {
1509 if subtype == b"Link" {
1510 if let Ok(action) = dict.get(b"A") {
1512 Self::extract_url_from_action(doc, action, links);
1513 }
1514 }
1515 }
1516 }
1517 }
1518 }
1519
1520 fn extract_url_from_action(
1522 doc: &lopdf::Document,
1523 action: &lopdf::Object,
1524 links: &mut Vec<Link>,
1525 ) {
1526 let action_dict = match action {
1527 lopdf::Object::Dictionary(dict) => dict.clone(),
1528 lopdf::Object::Reference(r) => {
1529 if let Ok(lopdf::Object::Dictionary(dict)) = doc.get_object(*r) {
1530 dict.clone()
1531 } else {
1532 return;
1533 }
1534 }
1535 _ => return,
1536 };
1537
1538 if let Ok(lopdf::Object::Name(s)) = action_dict.get(b"S") {
1540 if s == b"URI" {
1541 if let Ok(lopdf::Object::String(bytes, _)) = action_dict.get(b"URI") {
1542 if let Ok(url) = std::str::from_utf8(bytes) {
1543 let is_external = url.starts_with("http://")
1544 || url.starts_with("https://")
1545 || url.starts_with("mailto:");
1546 links.push(Link {
1547 url: url.to_string(),
1548 text: String::new(), title: None,
1550 is_external,
1551 });
1552 }
1553 }
1554 }
1555 }
1556 }
1557}
1558
/// Document-level metadata read from a PDF by `PdfParser::extract_metadata`.
///
/// The optional text fields come from the document's `Info` dictionary and are
/// `None` when the entry is absent, not a decodable string, or blank.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PdfMetadata {
    /// PDF version string as reported by the parsed document.
    pub version: String,
    /// Number of pages in the document.
    pub page_count: usize,
    /// `Title` entry of the Info dictionary.
    pub title: Option<String>,
    /// `Author` entry of the Info dictionary.
    pub author: Option<String>,
    /// `Subject` entry of the Info dictionary.
    pub subject: Option<String>,
    /// `Keywords` entry of the Info dictionary.
    pub keywords: Option<String>,
    /// `Creator` entry of the Info dictionary.
    pub creator: Option<String>,
    /// `Producer` entry of the Info dictionary.
    pub producer: Option<String>,
    /// `CreationDate` entry of the Info dictionary, kept as the raw string (not parsed).
    pub creation_date: Option<String>,
    /// `ModDate` entry of the Info dictionary, kept as the raw string (not parsed).
    pub modification_date: Option<String>,
    /// Whether the parsed document reports itself as encrypted.
    pub is_encrypted: bool,
}
1585
1586impl PdfParser {
1587 pub fn extract_metadata(data: &[u8]) -> Result<PdfMetadata, PdfParseError> {
1589 use lopdf::Document;
1590
1591 let doc = Document::load_mem(data).map_err(|e| PdfParseError::ParseError(e.to_string()))?;
1592
1593 let page_count = doc.get_pages().len();
1594 let version = doc.version.clone();
1595 let is_encrypted = doc.is_encrypted();
1596
1597 let mut metadata = PdfMetadata {
1598 version,
1599 page_count,
1600 title: None,
1601 author: None,
1602 subject: None,
1603 keywords: None,
1604 creator: None,
1605 producer: None,
1606 creation_date: None,
1607 modification_date: None,
1608 is_encrypted,
1609 };
1610
1611 if let Ok(info_ref) = doc.trailer.get(b"Info") {
1613 if let Ok(r) = info_ref.as_reference() {
1614 if let Ok(lopdf::Object::Dictionary(dict)) = doc.get_object(r) {
1615 metadata.title = Self::get_string_from_dict(dict, b"Title");
1616 metadata.author = Self::get_string_from_dict(dict, b"Author");
1617 metadata.subject = Self::get_string_from_dict(dict, b"Subject");
1618 metadata.keywords = Self::get_string_from_dict(dict, b"Keywords");
1619 metadata.creator = Self::get_string_from_dict(dict, b"Creator");
1620 metadata.producer = Self::get_string_from_dict(dict, b"Producer");
1621 metadata.creation_date = Self::get_string_from_dict(dict, b"CreationDate");
1622 metadata.modification_date = Self::get_string_from_dict(dict, b"ModDate");
1623 }
1624 }
1625 }
1626
1627 Ok(metadata)
1628 }
1629
1630 fn get_string_from_dict(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
1632 if let Ok(lopdf::Object::String(bytes, _)) = dict.get(key) {
1633 if let Ok(s) = std::str::from_utf8(bytes) {
1634 let trimmed = s.trim();
1635 if !trimmed.is_empty() {
1636 return Some(trimmed.to_string());
1637 }
1638 }
1639 }
1640 None
1641 }
1642}
1643
#[cfg(test)]
mod tests {
    use super::*;

    /// A heading marker and bold markup are enough to classify as Markdown.
    #[test]
    fn test_format_detection_markdown() {
        let content = "# Hello World\n\nThis is a **test** document.";
        assert_eq!(DocumentFormat::detect(content), DocumentFormat::Markdown);
    }

    /// HTML detection is case-insensitive with respect to the doctype.
    #[test]
    fn test_format_detection_html() {
        let content = "<!DOCTYPE html><html><body><p>Hello</p></body></html>";
        assert_eq!(DocumentFormat::detect(content), DocumentFormat::Html);
    }

    /// Headings of levels 1 through 3 are captured in order with their text.
    #[test]
    fn test_markdown_heading_parsing() {
        let content = "# Title\n\n## Section 1\n\n### Subsection\n\nSome text.";
        let structure = DocumentParser::parse(content);

        assert_eq!(structure.headings.len(), 3);
        assert_eq!(structure.headings[0].level, 1);
        assert_eq!(structure.headings[0].text, "Title");
        assert_eq!(structure.headings[1].level, 2);
        assert_eq!(structure.headings[2].level, 3);
    }

    /// Absolute URLs are external; relative paths are not.
    #[test]
    fn test_markdown_link_extraction() {
        let content = "Check out [Rust](https://rust-lang.org) and [this](./local.md).";
        let structure = DocumentParser::parse(content);

        assert_eq!(structure.links.len(), 2);
        assert!(structure.links[0].is_external);
        assert!(!structure.links[1].is_external);
    }

    /// A Markdown image yields one entry with its alt text and source.
    #[test]
    fn test_markdown_image_extraction() {
        // The input must actually contain the image the assertions describe.
        let content = "![Alt text](image.png)";
        let structure = DocumentParser::parse(content);

        assert_eq!(structure.images.len(), 1);
        assert_eq!(structure.images[0].alt, "Alt text");
        assert_eq!(structure.images[0].src, "image.png");
    }

    /// A fenced block records the language given on the opening fence.
    #[test]
    fn test_markdown_code_block_extraction() {
        let content = "```rust\nfn main() {}\n```";
        let structure = DocumentParser::parse(content);

        assert_eq!(structure.code_blocks.len(), 1);
        assert_eq!(structure.code_blocks[0].language, Some("rust".to_string()));
    }

    /// Tags are removed from HTML, leaving only the text content.
    #[test]
    fn test_html_to_plain_text() {
        let html = "<p>Hello <strong>world</strong>!</p>";
        let plain = DocumentParser::html_to_plain_text(html);
        assert_eq!(plain, "Hello world !");
    }

    /// A small, well-formed document scores above 70 and has no error-level issues.
    #[test]
    fn test_quality_analysis() {
        let content = "# My Document\n\nThis is a test document with some content.\n\n## Section\n\nMore content here.";
        let structure = DocumentParser::parse(content);
        let quality = QualityAnalyzer::analyze(&structure);

        assert!(quality.overall_score > 70);
        // `all` on an empty iterator is already true, so no separate emptiness check is needed.
        assert!(quality
            .issues
            .iter()
            .all(|issue| issue.severity != IssueSeverity::Error));
    }
}