pdf_annotations_converter/
parse_goodreader.rs

1use regex::Regex;
2
3use crate::core::{ParseConfig, ParsedItem};
4
/// Marker remembered between two consecutive lines of the annotation summary.
///
/// GoodReader emits the annotation kind on one line (e.g. `Highlight:`) and the
/// annotated text on the following line, so the parser needs a one-line look-behind.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ParsedToken {
    /// The previous line announced a highlight; the current line is its text.
    Highlight,
    /// The previous line announced a (squiggly) underline; the current line is its text.
    Underline,
}
9
10/// Parses the provided GoodReader annotations emitting a [`ParsedItem`] for each identified
11/// annotation and page number indicators if so desired.
12/// Use the [`ParseConfig`] to configure if page numbers should be included, etc.
13pub fn parse_goodreader_annotations(annotations: &str, config: &ParseConfig) -> Vec<ParsedItem> {
14    let file_rx = Regex::new(r"^File: (.+)").unwrap();
15    let page_roman_rx = Regex::new(r"^--- Page ([mdclxvi]+) ---").unwrap();
16    let page_number_rx = Regex::new(r"^--- Page (\d+) ---").unwrap();
17    let highlight_rx = Regex::new(r"^Highlight( \([^)]+\))?:").unwrap();
18    let underline_rx = Regex::new(r"(?i)^(Squiggly )?Underline( \([^)]+\))?:").unwrap();
19
20    let lines = annotations.lines().filter(|&x| x.len() > 0);
21
22    let mut token: Option<ParsedToken> = None;
23    lines
24        .filter_map(|line| {
25            // Handling tokens we encountered on previous line
26            let item = match token {
27                Some(ParsedToken::Highlight) => {
28                    token = None;
29                    Some(ParsedItem::Highlight(line.to_string()))
30                }
31                Some(ParsedToken::Underline) => {
32                    token = None;
33                    Some(ParsedItem::Underline(line.to_string()))
34                }
35                None => None,
36            };
37            if item.is_some() {
38                return item;
39            }
40
41            // File: File_Name.pdf
42            if let Some(_) = file_rx.find(line) {
43                let c = file_rx.captures(line).unwrap();
44                let s: String = c.get(1).map(|x| x.as_str().to_string()).unwrap();
45                return Some(ParsedItem::File(s));
46            }
47
48            // --- Page NNN ---
49            if config.page_numbers {
50                if let Some(_) = page_number_rx.find(line) {
51                    let c = page_number_rx.captures(line).unwrap();
52                    let n: u32 = c
53                        .get(1)
54                        .map(|x| {
55                            let s = x.as_str().to_string();
56                            s.parse::<u32>().unwrap()
57                        })
58                        .unwrap();
59                    return match config.page_offset {
60                        // physical page is annotated, thus page offset is negative if set
61                        offset if offset < 0 => { 
62                            assert!(n as i32 > offset, format!("negative page offset {} seems to small, encountered physical page {}", offset, n));
63                            Some(ParsedItem::PageNumber((n as i32 + offset) as u32, n))
64                        }
65                        // page number is annotated, page offset is positive if set
66                        offset if offset > 0 => Some(ParsedItem::PageNumber(n, n + offset as u32)),
67                        // page offset not set, i.e. is 0
68                        _ =>Some(ParsedItem::PageNumber(n, n)), 
69                    };
70                }
71                if let Some(_) = page_roman_rx.find(line) {
72                    let c = page_roman_rx.captures(line).unwrap();
73                    let s: String = c
74                        .get(1)
75                        .map(|x| {
76                            x.as_str().to_string()
77                        })
78                        .unwrap();
79                    return Some(ParsedItem::PageRoman(s))
80                }
81            }
82
83            // Highlight:
84            // Highlighted Text
85            if let Some(_) = highlight_rx.find(line) {
86                token = Some(ParsedToken::Highlight);
87                return None;
88            }
89            // Underline:
90            // Underlined Text
91            if let Some(_) = underline_rx.find(line) {
92                token = Some(ParsedToken::Underline);
93                return None;
94            }
95            None
96        })
97        .collect()
98}
99
/// Unit tests for [`parse_goodreader_annotations`].
#[cfg(test)]
mod tests {
    use super::*;

    /// Page indicators are emitted, no offset applied.
    static WITH_PAGE_NUMBERS: ParseConfig = ParseConfig {
        page_offset: 0,
        page_numbers: true,
    };
    /// Page indicators are dropped from the output.
    static WITHOUT_PAGE_NUMBERS: ParseConfig = ParseConfig {
        page_offset: 0,
        page_numbers: false,
    };

    #[test]
    fn parsing_file_indicator() {
        let annotations = r#"
File: Hello_World.pdf
"#;
        let items = parse_goodreader_annotations(annotations, &WITHOUT_PAGE_NUMBERS);
        // Compare the whole output so unexpected extra items are caught as well.
        assert_eq!(
            items,
            vec![ParsedItem::File("Hello_World.pdf".to_string())]
        );
    }

    #[test]
    fn parsing_page_indicator() {
        // The last line is deliberately malformed (two dashes) and must be ignored.
        let annotations = r#"
--- Page 45 ---
--- Page 22 ---
-- Page 45 ---
"#;
        let items = parse_goodreader_annotations(annotations, &WITH_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![
                ParsedItem::PageNumber(45, 45),
                ParsedItem::PageNumber(22, 22)
            ]
        )
    }

    #[test]
    fn parsing_page_indicator_with_positive_page_offset() {
        let annotations = r#"
--- Page 45 ---
--- Page 22 ---
"#;
        let items = parse_goodreader_annotations(
            annotations,
            &ParseConfig {
                page_offset: 3,
                page_numbers: true,
            },
        );
        assert_eq!(
            items,
            vec![
                ParsedItem::PageNumber(45, 48),
                ParsedItem::PageNumber(22, 25)
            ]
        )
    }

    #[test]
    fn parsing_page_roman_indicator() {
        let annotations = r#"
--- Page xxxviii ---
--- Page xxxix ---
"#;
        let items = parse_goodreader_annotations(annotations, &WITH_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![
                ParsedItem::PageRoman("xxxviii".to_string()),
                ParsedItem::PageRoman("xxxix".to_string())
            ]
        )
    }

    #[test]
    fn parsing_page_indicator_with_negative_page_offset() {
        let annotations = r#"
--- Page 45 ---
--- Page 22 ---
"#;
        let items = parse_goodreader_annotations(
            annotations,
            &ParseConfig {
                page_offset: -3,
                page_numbers: true,
            },
        );
        assert_eq!(
            items,
            vec![
                ParsedItem::PageNumber(42, 45),
                ParsedItem::PageNumber(19, 22)
            ]
        )
    }

    #[test]
    fn parsing_highlight_indicator() {
        let annotations = r#"
Highlight:
Practical: A Simple Database
"#;
        let items = parse_goodreader_annotations(annotations, &WITHOUT_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![ParsedItem::Highlight(
                "Practical: A Simple Database".to_string()
            )]
        )
    }

    #[test]
    fn parsing_highlight_color_indicator() {
        let annotations = r#"
Highlight (blue):
Practical: A Simple Database
"#;
        let items = parse_goodreader_annotations(annotations, &WITHOUT_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![ParsedItem::Highlight(
                "Practical: A Simple Database".to_string()
            )],
        )
    }

    #[test]
    fn parsing_underline_indicator() {
        let annotations = r#"
Underline:
`(equal (getf cd ,field) ,value)
"#;
        let items = parse_goodreader_annotations(annotations, &WITHOUT_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![ParsedItem::Underline(
                "`(equal (getf cd ,field) ,value)".to_string()
            )],
        )
    }

    #[test]
    fn parsing_underline_color() {
        // The attribute sits between the keyword and the colon, mirroring
        // `Highlight (blue):`, so the optional parenthesized group is exercised.
        let annotations = r#"
Underline (color #6F77FF):
`(equal (getf cd ,field) ,value)
"#;
        let items = parse_goodreader_annotations(annotations, &WITHOUT_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![ParsedItem::Underline(
                "`(equal (getf cd ,field) ,value)".to_string()
            )],
        );
    }

    #[test]
    fn parsing_squiggly_underline_indicator() {
        let annotations = r#"
Squiggly Underline:
property list, or plist
"#;
        let items = parse_goodreader_annotations(annotations, &WITHOUT_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![ParsedItem::Underline("property list, or plist".to_string())],
        );
    }

    /// A realistic excerpt mixing file, page, highlight and underline indicators.
    static SECTION: &str = "
File: Practical_Common_Lisp.pdf

Annotation summary:

--- Page 45 ---

Highlight:
Practical: A Simple Database


--- Page 46 ---

Underline:
property list, or plist

Underline:
(list :a 1 :b 2 :c 3)

Underline:
GETF, which takes a plist and a symbol and returns the value in the plist

Underline:
(getf (list :a 1 :b 2 :c 3) :a)


--- Page 47 ---

Underline:
global variable, *db*, which you can define with the DEFVAR macro
";
    #[test]
    fn parsing_section_with_zero_based_page_numbers() {
        let items = parse_goodreader_annotations(SECTION, &WITH_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![
                ParsedItem::File("Practical_Common_Lisp.pdf".to_string()),
                ParsedItem::PageNumber(45, 45),
                ParsedItem::Highlight("Practical: A Simple Database".to_string()),
                ParsedItem::PageNumber(46, 46),
                ParsedItem::Underline("property list, or plist".to_string()),
                ParsedItem::Underline("(list :a 1 :b 2 :c 3)".to_string()),
                ParsedItem::Underline(
                    "GETF, which takes a plist and a symbol and returns the value in the plist"
                        .to_string()
                ),
                ParsedItem::Underline("(getf (list :a 1 :b 2 :c 3) :a)".to_string()),
                ParsedItem::PageNumber(47, 47),
                ParsedItem::Underline(
                    "global variable, *db*, which you can define with the DEFVAR macro".to_string()
                )
            ]
        );
    }

    #[test]
    fn parsing_section_with_10_based_page_numbers() {
        let items = parse_goodreader_annotations(
            SECTION,
            &ParseConfig {
                page_offset: 10,
                page_numbers: true,
            },
        );
        assert_eq!(
            items,
            vec![
                ParsedItem::File("Practical_Common_Lisp.pdf".to_string()),
                ParsedItem::PageNumber(45, 55),
                ParsedItem::Highlight("Practical: A Simple Database".to_string()),
                ParsedItem::PageNumber(46, 56),
                ParsedItem::Underline("property list, or plist".to_string()),
                ParsedItem::Underline("(list :a 1 :b 2 :c 3)".to_string()),
                ParsedItem::Underline(
                    "GETF, which takes a plist and a symbol and returns the value in the plist"
                        .to_string()
                ),
                ParsedItem::Underline("(getf (list :a 1 :b 2 :c 3) :a)".to_string()),
                ParsedItem::PageNumber(47, 57),
                ParsedItem::Underline(
                    "global variable, *db*, which you can define with the DEFVAR macro".to_string()
                )
            ]
        );
    }

    #[test]
    fn parsing_section_without_page_numbers() {
        let items = parse_goodreader_annotations(SECTION, &WITHOUT_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![
                ParsedItem::File("Practical_Common_Lisp.pdf".to_string()),
                ParsedItem::Highlight("Practical: A Simple Database".to_string()),
                ParsedItem::Underline("property list, or plist".to_string()),
                ParsedItem::Underline("(list :a 1 :b 2 :c 3)".to_string()),
                ParsedItem::Underline(
                    "GETF, which takes a plist and a symbol and returns the value in the plist"
                        .to_string()
                ),
                ParsedItem::Underline("(getf (list :a 1 :b 2 :c 3) :a)".to_string()),
                ParsedItem::Underline(
                    "global variable, *db*, which you can define with the DEFVAR macro".to_string()
                )
            ]
        );
    }
}
367}