Skip to main content

edgeparse_core/pdf/
annotation_enrichment.rs

//! Annotation enrichment — post-processing for extracted annotations.
//!
//! Groups, filters, and classifies annotations for output rendering
//! and content-aware processing, and provides summary statistics and a
//! Markdown rendering of an annotation list.
5
6use crate::pdf::annotation_extractor::{AnnotationType, PdfAnnotation};
7
8/// An annotation group — annotations of the same type on a page.
9#[derive(Debug, Clone)]
10pub struct AnnotationGroup {
11    /// Annotation type for this group.
12    pub annotation_type: AnnotationType,
13    /// Page number (1-based).
14    pub page_number: u32,
15    /// Annotations in this group.
16    pub annotations: Vec<PdfAnnotation>,
17}
18
/// Per-document annotation counters.
///
/// Every counter starts at zero via `Default`. The category fields are filled
/// in by `compute_stats`, which puts each annotation into exactly one of
/// `highlights`, `comments`, `links`, `stamps`, or `other`.
#[derive(Clone, Debug, Default)]
pub struct AnnotationStats {
    /// Total number of annotations.
    pub total: usize,
    /// Text-markup annotations: highlights, underlines, and strike-outs.
    pub highlights: usize,
    /// Comment/note annotations: `Text` and `FreeText`.
    pub comments: usize,
    /// Link annotations.
    pub links: usize,
    /// Stamp annotations.
    pub stamps: usize,
    /// Annotations of any type not counted by the fields above.
    pub other: usize,
    /// Number of distinct pages that contain at least one annotation.
    pub pages_with_annotations: usize,
}
37
38/// Group annotations by type and page.
39pub fn group_by_type_and_page(annotations: &[PdfAnnotation]) -> Vec<AnnotationGroup> {
40    use std::collections::BTreeMap;
41
42    // Group by (page_number, type_key)
43    let mut groups: BTreeMap<(u32, String), Vec<PdfAnnotation>> = BTreeMap::new();
44
45    for ann in annotations {
46        let type_key = type_to_key(&ann.annotation_type);
47        groups
48            .entry((ann.page_number, type_key))
49            .or_default()
50            .push(ann.clone());
51    }
52
53    groups
54        .into_iter()
55        .map(|((page_number, _), anns)| {
56            let annotation_type = anns[0].annotation_type.clone();
57            AnnotationGroup {
58                annotation_type,
59                page_number,
60                annotations: anns,
61            }
62        })
63        .collect()
64}
65
66/// Filter annotations to only include user-facing ones (exclude popups, links).
67pub fn filter_user_annotations(annotations: &[PdfAnnotation]) -> Vec<PdfAnnotation> {
68    annotations
69        .iter()
70        .filter(|a| {
71            !matches!(
72                a.annotation_type,
73                AnnotationType::Popup | AnnotationType::Link
74            )
75        })
76        .cloned()
77        .collect()
78}
79
80/// Filter annotations to only those with non-empty text content.
81pub fn filter_with_content(annotations: &[PdfAnnotation]) -> Vec<PdfAnnotation> {
82    annotations
83        .iter()
84        .filter(|a| a.contents.as_ref().is_some_and(|c| !c.trim().is_empty()))
85        .cloned()
86        .collect()
87}
88
89/// Compute annotation statistics.
90pub fn compute_stats(annotations: &[PdfAnnotation]) -> AnnotationStats {
91    let mut stats = AnnotationStats {
92        total: annotations.len(),
93        ..Default::default()
94    };
95
96    let mut pages = std::collections::HashSet::new();
97
98    for ann in annotations {
99        pages.insert(ann.page_number);
100        match &ann.annotation_type {
101            AnnotationType::Highlight | AnnotationType::Underline | AnnotationType::StrikeOut => {
102                stats.highlights += 1;
103            }
104            AnnotationType::Text | AnnotationType::FreeText => {
105                stats.comments += 1;
106            }
107            AnnotationType::Link => {
108                stats.links += 1;
109            }
110            AnnotationType::Stamp => {
111                stats.stamps += 1;
112            }
113            _ => {
114                stats.other += 1;
115            }
116        }
117    }
118
119    stats.pages_with_annotations = pages.len();
120    stats
121}
122
123/// Render annotations as a markdown summary.
124pub fn annotations_to_markdown(annotations: &[PdfAnnotation]) -> String {
125    if annotations.is_empty() {
126        return String::new();
127    }
128
129    let mut out = String::from("## Annotations\n\n");
130    let mut current_page = 0u32;
131
132    for ann in annotations {
133        if ann.page_number != current_page {
134            current_page = ann.page_number;
135            out.push_str(&format!("### Page {}\n\n", current_page));
136        }
137        let type_label = type_to_key(&ann.annotation_type);
138        let content = ann.contents.as_deref().unwrap_or("(no content)");
139        let author = ann
140            .author
141            .as_deref()
142            .map(|a| format!(" — {}", a))
143            .unwrap_or_default();
144        out.push_str(&format!("- **[{}]** {}{}\n", type_label, content, author));
145    }
146
147    out
148}
149
150fn type_to_key(t: &AnnotationType) -> String {
151    match t {
152        AnnotationType::Text => "Text".to_string(),
153        AnnotationType::Highlight => "Highlight".to_string(),
154        AnnotationType::Underline => "Underline".to_string(),
155        AnnotationType::StrikeOut => "StrikeOut".to_string(),
156        AnnotationType::FreeText => "FreeText".to_string(),
157        AnnotationType::Link => "Link".to_string(),
158        AnnotationType::Stamp => "Stamp".to_string(),
159        AnnotationType::Ink => "Ink".to_string(),
160        AnnotationType::FileAttachment => "FileAttachment".to_string(),
161        AnnotationType::Popup => "Popup".to_string(),
162        AnnotationType::Other(s) => s.clone(),
163    }
164}
165
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a minimal annotation with only type, page, and contents populated.
    fn make_ann(ann_type: AnnotationType, page: u32, content: Option<&str>) -> PdfAnnotation {
        PdfAnnotation {
            annotation_type: ann_type,
            contents: content.map(|s| s.to_string()),
            author: None,
            page_number: page,
            rect: None,
            subject: None,
            creation_date: None,
            modification_date: None,
        }
    }

    #[test]
    fn test_group_by_type_and_page() {
        let annotations = vec![
            make_ann(AnnotationType::Highlight, 1, Some("important")),
            make_ann(AnnotationType::Highlight, 1, Some("also important")),
            make_ann(AnnotationType::Text, 1, Some("note")),
            make_ann(AnnotationType::Highlight, 2, Some("another")),
        ];
        let groups = group_by_type_and_page(&annotations);
        assert_eq!(groups.len(), 3); // H-p1, T-p1, H-p2

        // Groups are ordered by page, then by type key.
        assert_eq!(groups[0].page_number, 1);
        assert!(matches!(
            groups[0].annotation_type,
            AnnotationType::Highlight
        ));
        assert_eq!(groups[0].annotations.len(), 2); // two highlights on p1

        assert_eq!(groups[1].page_number, 1);
        assert!(matches!(groups[1].annotation_type, AnnotationType::Text));
        assert_eq!(groups[1].annotations.len(), 1);

        assert_eq!(groups[2].page_number, 2);
        assert!(matches!(
            groups[2].annotation_type,
            AnnotationType::Highlight
        ));
        assert_eq!(groups[2].annotations.len(), 1);

        // Input order is preserved inside a group.
        assert_eq!(
            groups[0].annotations[0].contents.as_deref(),
            Some("important")
        );
        assert_eq!(
            groups[0].annotations[1].contents.as_deref(),
            Some("also important")
        );
    }

    #[test]
    fn test_group_by_type_and_page_empty() {
        assert!(group_by_type_and_page(&[]).is_empty());
    }

    #[test]
    fn test_filter_user_annotations() {
        let annotations = vec![
            make_ann(AnnotationType::Highlight, 1, Some("yes")),
            make_ann(AnnotationType::Popup, 1, None),
            make_ann(AnnotationType::Link, 1, None),
            make_ann(AnnotationType::Text, 2, Some("note")),
        ];
        let filtered = filter_user_annotations(&annotations);
        assert_eq!(filtered.len(), 2);
        assert!(matches!(
            filtered[0].annotation_type,
            AnnotationType::Highlight
        ));
        assert!(matches!(filtered[1].annotation_type, AnnotationType::Text));
    }

    #[test]
    fn test_filter_with_content() {
        let annotations = vec![
            make_ann(AnnotationType::Highlight, 1, Some("has content")),
            make_ann(AnnotationType::Highlight, 1, None),
            make_ann(AnnotationType::Highlight, 1, Some("  ")),
            make_ann(AnnotationType::Highlight, 1, Some("")),
        ];
        let filtered = filter_with_content(&annotations);
        assert_eq!(filtered.len(), 1);
        assert_eq!(filtered[0].contents.as_deref(), Some("has content"));
    }

    #[test]
    fn test_compute_stats() {
        let annotations = vec![
            make_ann(AnnotationType::Highlight, 1, Some("a")),
            make_ann(AnnotationType::Text, 1, Some("b")),
            make_ann(AnnotationType::Link, 2, None),
            make_ann(AnnotationType::Stamp, 3, None),
        ];
        let stats = compute_stats(&annotations);
        assert_eq!(stats.total, 4);
        assert_eq!(stats.highlights, 1);
        assert_eq!(stats.comments, 1);
        assert_eq!(stats.links, 1);
        assert_eq!(stats.stamps, 1);
        assert_eq!(stats.other, 0);
        assert_eq!(stats.pages_with_annotations, 3);
    }

    #[test]
    fn test_compute_stats_category_buckets() {
        let annotations = vec![
            make_ann(AnnotationType::Underline, 1, None),
            make_ann(AnnotationType::StrikeOut, 1, None),
            make_ann(AnnotationType::FreeText, 1, Some("typed")),
            make_ann(AnnotationType::Ink, 1, None),
            make_ann(AnnotationType::Popup, 1, None),
            make_ann(AnnotationType::Other("Custom".to_string()), 1, None),
        ];
        let stats = compute_stats(&annotations);
        assert_eq!(stats.total, 6);
        assert_eq!(stats.highlights, 2); // underline + strike-out
        assert_eq!(stats.comments, 1); // free text
        assert_eq!(stats.other, 3); // ink, popup, custom
        assert_eq!(stats.pages_with_annotations, 1);
    }

    #[test]
    fn test_compute_stats_empty() {
        let stats = compute_stats(&[]);
        assert_eq!(stats.total, 0);
        assert_eq!(stats.other, 0);
        assert_eq!(stats.pages_with_annotations, 0);
    }

    #[test]
    fn test_annotations_to_markdown() {
        let annotations = vec![make_ann(AnnotationType::Highlight, 1, Some("key point"))];
        let md = annotations_to_markdown(&annotations);
        assert!(md.contains("## Annotations"));
        assert!(md.contains("### Page 1"));
        assert!(md.contains("[Highlight]"));
        assert!(md.contains("key point"));
        assert_eq!(
            md,
            "## Annotations\n\n### Page 1\n\n- **[Highlight]** key point\n"
        );
    }

    #[test]
    fn test_annotations_to_markdown_empty() {
        assert_eq!(annotations_to_markdown(&[]), "");
    }

    #[test]
    fn test_annotations_to_markdown_placeholder_and_author() {
        let mut with_author = make_ann(AnnotationType::Text, 1, Some("note"));
        with_author.author = Some("Ada".to_string());
        let annotations = vec![with_author, make_ann(AnnotationType::Stamp, 1, None)];
        let md = annotations_to_markdown(&annotations);
        assert!(md.contains("- **[Text]** note — Ada\n"));
        assert!(md.contains("- **[Stamp]** (no content)\n"));
    }

    #[test]
    fn test_annotations_to_markdown_page_headings() {
        let annotations = vec![
            make_ann(AnnotationType::Highlight, 1, Some("a")),
            make_ann(AnnotationType::Text, 1, Some("b")),
            make_ann(AnnotationType::Highlight, 2, Some("c")),
        ];
        let md = annotations_to_markdown(&annotations);
        // One heading per run of annotations on the same page.
        assert_eq!(md.matches("### Page ").count(), 2);
        assert!(md.contains("### Page 1\n\n"));
        assert!(md.contains("### Page 2\n\n"));
    }
}