pdf_annotations_converter/
parse_goodreader.rs1use regex::Regex;
2
3use crate::core::{ParseConfig, ParsedItem};
4
/// Kind of annotation header seen on the previous line.
///
/// The parser sets one of these when it meets a `Highlight…:` or `Underline…:` header line and
/// uses it to decide how to wrap the next non-empty line, which carries the annotation text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ParsedToken {
    /// The next line is the text of a highlight.
    Highlight,
    /// The next line is the text of an underline.
    Underline,
}
9
10pub fn parse_goodreader_annotations(annotations: &str, config: &ParseConfig) -> Vec<ParsedItem> {
14 let file_rx = Regex::new(r"^File: (.+)").unwrap();
15 let page_roman_rx = Regex::new(r"^--- Page ([mdclxvi]+) ---").unwrap();
16 let page_number_rx = Regex::new(r"^--- Page (\d+) ---").unwrap();
17 let highlight_rx = Regex::new(r"^Highlight( \([^)]+\))?:").unwrap();
18 let underline_rx = Regex::new(r"(?i)^(Squiggly )?Underline( \([^)]+\))?:").unwrap();
19
20 let lines = annotations.lines().filter(|&x| x.len() > 0);
21
22 let mut token: Option<ParsedToken> = None;
23 lines
24 .filter_map(|line| {
25 let item = match token {
27 Some(ParsedToken::Highlight) => {
28 token = None;
29 Some(ParsedItem::Highlight(line.to_string()))
30 }
31 Some(ParsedToken::Underline) => {
32 token = None;
33 Some(ParsedItem::Underline(line.to_string()))
34 }
35 None => None,
36 };
37 if item.is_some() {
38 return item;
39 }
40
41 if let Some(_) = file_rx.find(line) {
43 let c = file_rx.captures(line).unwrap();
44 let s: String = c.get(1).map(|x| x.as_str().to_string()).unwrap();
45 return Some(ParsedItem::File(s));
46 }
47
48 if config.page_numbers {
50 if let Some(_) = page_number_rx.find(line) {
51 let c = page_number_rx.captures(line).unwrap();
52 let n: u32 = c
53 .get(1)
54 .map(|x| {
55 let s = x.as_str().to_string();
56 s.parse::<u32>().unwrap()
57 })
58 .unwrap();
59 return match config.page_offset {
60 offset if offset < 0 => {
62 assert!(n as i32 > offset, format!("negative page offset {} seems to small, encountered physical page {}", offset, n));
63 Some(ParsedItem::PageNumber((n as i32 + offset) as u32, n))
64 }
65 offset if offset > 0 => Some(ParsedItem::PageNumber(n, n + offset as u32)),
67 _ =>Some(ParsedItem::PageNumber(n, n)),
69 };
70 }
71 if let Some(_) = page_roman_rx.find(line) {
72 let c = page_roman_rx.captures(line).unwrap();
73 let s: String = c
74 .get(1)
75 .map(|x| {
76 x.as_str().to_string()
77 })
78 .unwrap();
79 return Some(ParsedItem::PageRoman(s))
80 }
81 }
82
83 if let Some(_) = highlight_rx.find(line) {
86 token = Some(ParsedToken::Highlight);
87 return None;
88 }
89 if let Some(_) = underline_rx.find(line) {
92 token = Some(ParsedToken::Underline);
93 return None;
94 }
95 None
96 })
97 .collect()
98}
99
/// Unit tests for [`parse_goodreader_annotations`].
///
/// The fixture strings deliberately start every line at column 0: the parser's patterns are
/// anchored at the start of the line, so indenting the fixtures would stop them from matching.
#[cfg(test)]
mod tests {
    use super::*;

    static WITH_PAGE_NUMBERS: ParseConfig = ParseConfig {
        page_offset: 0,
        page_numbers: true,
    };
    static WITHOUT_PAGE_NUMBERS: ParseConfig = ParseConfig {
        page_offset: 0,
        page_numbers: false,
    };

    #[test]
    fn parsing_file_indicator() {
        let annotations = r#"
File: Hello_World.pdf
"#;
        let items = parse_goodreader_annotations(annotations, &WITHOUT_PAGE_NUMBERS);
        // Compare the whole result so that stray extra items are caught as well.
        assert_eq!(
            items,
            vec![ParsedItem::File("Hello_World.pdf".to_string())]
        );
    }

    #[test]
    fn parsing_page_indicator() {
        // The third marker has only two leading dashes and must be ignored.
        let annotations = r#"
--- Page 45 ---
--- Page 22 ---
-- Page 45 ---
"#;
        let items = parse_goodreader_annotations(annotations, &WITH_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![
                ParsedItem::PageNumber(45, 45),
                ParsedItem::PageNumber(22, 22)
            ]
        );
    }

    #[test]
    fn parsing_page_indicator_with_positive_page_offset() {
        let annotations = r#"
--- Page 45 ---
--- Page 22 ---
"#;
        let items = parse_goodreader_annotations(
            annotations,
            &ParseConfig {
                page_offset: 3,
                page_numbers: true,
            },
        );
        assert_eq!(
            items,
            vec![
                ParsedItem::PageNumber(45, 48),
                ParsedItem::PageNumber(22, 25)
            ]
        );
    }

    #[test]
    fn parsing_page_roman_indicator() {
        let annotations = r#"
--- Page xxxviii ---
--- Page xxxix ---
"#;
        let items = parse_goodreader_annotations(annotations, &WITH_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![
                ParsedItem::PageRoman("xxxviii".to_string()),
                ParsedItem::PageRoman("xxxix".to_string())
            ]
        );
    }

    #[test]
    fn parsing_page_indicator_with_negative_page_offset() {
        let annotations = r#"
--- Page 45 ---
--- Page 22 ---
"#;
        let items = parse_goodreader_annotations(
            annotations,
            &ParseConfig {
                page_offset: -3,
                page_numbers: true,
            },
        );
        assert_eq!(
            items,
            vec![
                ParsedItem::PageNumber(42, 45),
                ParsedItem::PageNumber(19, 22)
            ]
        );
    }

    #[test]
    fn parsing_highlight_indicator() {
        let annotations = r#"
Highlight:
Practical: A Simple Database
"#;
        let items = parse_goodreader_annotations(annotations, &WITHOUT_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![ParsedItem::Highlight(
                "Practical: A Simple Database".to_string()
            )]
        );
    }

    #[test]
    fn parsing_highlight_color_indicator() {
        let annotations = r#"
Highlight (blue):
Practical: A Simple Database
"#;
        let items = parse_goodreader_annotations(annotations, &WITHOUT_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![ParsedItem::Highlight(
                "Practical: A Simple Database".to_string()
            )]
        );
    }

    #[test]
    fn parsing_underline_indicator() {
        let annotations = r#"
Underline:
`(equal (getf cd ,field) ,value)
"#;
        let items = parse_goodreader_annotations(annotations, &WITHOUT_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![ParsedItem::Underline(
                "`(equal (getf cd ,field) ,value)".to_string()
            )]
        );
    }

    #[test]
    fn parsing_underline_color() {
        // The parenthesised suffix sits between `Underline` and the colon, which is the form
        // the header pattern's optional group is written for.
        let annotations = r#"
Underline (color #6F77FF):
`(equal (getf cd ,field) ,value)
"#;
        let items = parse_goodreader_annotations(annotations, &WITHOUT_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![ParsedItem::Underline(
                "`(equal (getf cd ,field) ,value)".to_string()
            )]
        );
    }

    #[test]
    fn parsing_underline_header_with_trailing_text() {
        // Anything after the first colon of a header is tolerated: the pattern only anchors
        // the start of the line.
        let annotations = r#"
Underline: (color #6F77FF):
`(equal (getf cd ,field) ,value)
"#;
        let items = parse_goodreader_annotations(annotations, &WITHOUT_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![ParsedItem::Underline(
                "`(equal (getf cd ,field) ,value)".to_string()
            )]
        );
    }

    #[test]
    fn parsing_squiggly_underline_indicator() {
        // Underline headers are matched case-insensitively and may carry a `Squiggly ` prefix.
        let annotations = r#"
Squiggly underline:
property list, or plist
"#;
        let items = parse_goodreader_annotations(annotations, &WITHOUT_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![ParsedItem::Underline("property list, or plist".to_string())]
        );
    }

    static SECTION: &str = "
File: Practical_Common_Lisp.pdf

Annotation summary:

--- Page 45 ---

Highlight:
Practical: A Simple Database


--- Page 46 ---

Underline:
property list, or plist

Underline:
(list :a 1 :b 2 :c 3)

Underline:
GETF, which takes a plist and a symbol and returns the value in the plist

Underline:
(getf (list :a 1 :b 2 :c 3) :a)


--- Page 47 ---

Underline:
global variable, *db*, which you can define with the DEFVAR macro
";

    #[test]
    fn parsing_section_with_zero_based_page_numbers() {
        let items = parse_goodreader_annotations(SECTION, &WITH_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![
                ParsedItem::File("Practical_Common_Lisp.pdf".to_string()),
                ParsedItem::PageNumber(45, 45),
                ParsedItem::Highlight("Practical: A Simple Database".to_string()),
                ParsedItem::PageNumber(46, 46),
                ParsedItem::Underline("property list, or plist".to_string()),
                ParsedItem::Underline("(list :a 1 :b 2 :c 3)".to_string()),
                ParsedItem::Underline(
                    "GETF, which takes a plist and a symbol and returns the value in the plist"
                        .to_string()
                ),
                ParsedItem::Underline("(getf (list :a 1 :b 2 :c 3) :a)".to_string()),
                ParsedItem::PageNumber(47, 47),
                ParsedItem::Underline(
                    "global variable, *db*, which you can define with the DEFVAR macro".to_string()
                )
            ]
        );
    }

    #[test]
    fn parsing_section_with_10_based_page_numbers() {
        let items = parse_goodreader_annotations(
            SECTION,
            &ParseConfig {
                page_offset: 10,
                page_numbers: true,
            },
        );
        assert_eq!(
            items,
            vec![
                ParsedItem::File("Practical_Common_Lisp.pdf".to_string()),
                ParsedItem::PageNumber(45, 55),
                ParsedItem::Highlight("Practical: A Simple Database".to_string()),
                ParsedItem::PageNumber(46, 56),
                ParsedItem::Underline("property list, or plist".to_string()),
                ParsedItem::Underline("(list :a 1 :b 2 :c 3)".to_string()),
                ParsedItem::Underline(
                    "GETF, which takes a plist and a symbol and returns the value in the plist"
                        .to_string()
                ),
                ParsedItem::Underline("(getf (list :a 1 :b 2 :c 3) :a)".to_string()),
                ParsedItem::PageNumber(47, 57),
                ParsedItem::Underline(
                    "global variable, *db*, which you can define with the DEFVAR macro".to_string()
                )
            ]
        );
    }

    #[test]
    fn parsing_section_without_page_numbers() {
        let items = parse_goodreader_annotations(SECTION, &WITHOUT_PAGE_NUMBERS);
        assert_eq!(
            items,
            vec![
                ParsedItem::File("Practical_Common_Lisp.pdf".to_string()),
                ParsedItem::Highlight("Practical: A Simple Database".to_string()),
                ParsedItem::Underline("property list, or plist".to_string()),
                ParsedItem::Underline("(list :a 1 :b 2 :c 3)".to_string()),
                ParsedItem::Underline(
                    "GETF, which takes a plist and a symbol and returns the value in the plist"
                        .to_string()
                ),
                ParsedItem::Underline("(getf (list :a 1 :b 2 :c 3) :a)".to_string()),
                ParsedItem::Underline(
                    "global variable, *db*, which you can define with the DEFVAR macro".to_string()
                )
            ]
        );
    }
}