lib/models/
epubcfi.rs

//! Defines a parser to convert an [epubcfi][epubcfi] into a sortable string for sorting annotations
//! into their order of appearance inside their respective books.
//!
//! [epubcfi]: https://w3c.github.io/epub-specs/epub33/epubcfi/
5
6use std::borrow::ToOwned;
7
8use once_cell::sync::Lazy;
9use regex::Regex;
10
11/// Capture a 'Step Reference' e.g. `/6` `/4`
12///
13/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-path-child-ref>
14static RE_STEP_REFERENCE: Lazy<Regex> = Lazy::new(|| Regex::new(r"/[0-9]+").unwrap());
15
16/// Captures an 'XML ID Assertion / Text Location Assertion' e.g. `[chap01]`
17///
18/// The specific difference between these two doesn't matter for our purposes. We just need to strip
19/// out anything that resembles an 'Assertion'.
20///
21/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-path-xmlid>
22/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-path-text-location>
23static RE_ASSERTIONS: Lazy<Regex> = Lazy::new(|| {
24    Regex::new(
25        r"(?x)
26        # Captures opening square bracket e.g. `[`
27        \[
28
29        # Captures anything but square brackets e.g. `chap01`
30        [^\[\]]*
31
32        # Captures closing square bracket e.g. `]`
33        \]
34    ",
35    )
36    .unwrap()
37});
38
39/// Captures a 'Character Offset' e.g. `:2` `:100`
40///
41/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-path-terminating-char>
42static RE_CHARACTER_OFFSET: Lazy<Regex> = Lazy::new(|| Regex::new(r":[0-9]+$").unwrap());
43
44/// Captures a 'Spacial Offset' e.g. `~23.5` `~42.43`
45///
46/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-path-terminating-spatial>
47static RE_TEMPORAL_OFFSET: Lazy<Regex> = Lazy::new(|| Regex::new(r"~[0-9]+\.[0-9]+").unwrap());
48
49/// Captures a 'Temporal Offset' e.g. `@100:100` `@5.75:97.6`
50///
51/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-path-terminating-temporal>
52static RE_SPACIAL_OFFSET: Lazy<Regex> = Lazy::new(|| Regex::new(r"@[0-9.]+:[0-9.]+").unwrap());
53
54/// Returns a simplified location string from a `epubcfi`.
55///
56/// This is a super simple EPUB CFI parser with a focus on extracting location information for
57/// sorting [`Annotation`][annotation]s.
58///
59/// Examples:
60///
61/// ```plaintext
62/// input:  epubcfi(/6/4[chap01ref]!/4[body01]/10[para05]/1:3[xx,y])
63/// output: 6.4.4.10.1:3
64/// ```
65/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#example-8>
66///
67/// ```plaintext
68/// input:  epubcfi(/6/4[chap01ref]!/4[body01]/10[para05],/2/1:1,/3:4)
69/// output: 6.4.4.10.2.1:1
70/// ```
71/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#example-23>
72///
73/// See <https://w3c.github.io/epub-specs/epub33/epubcfi/> for more
74/// information.
75///
76/// [annotation]: super::annotation::Annotation
77#[must_use]
78pub fn parse(raw: &str) -> String {
79    // Check that the incoming string is an `epubcfi`.
80    if !raw.starts_with("epubcfi(") && !raw.ends_with(')') {
81        return String::new();
82    }
83
84    // Starting with:
85    //
86    //    A: epubcfi(/6/4[chap01ref]!/4[body01]/10[para05],/2/1:1,/3:4)
87    //    B: epubcfi(/6/4[chap01ref]!/4[body01]/10[para05]/1:3[xx,y])
88    //    C: epubcfi(/2/4!/6[bar]/44!/3~1.11@1:1)
89
90    // Strip start and end: i.e. `epubcfi(` & `)`
91    //
92    // -> A: /6/4[chap01ref]!/4[body01]/10[para05],/2/1:1,/3:4
93    // -> B: /6/4[chap01ref]!/4[body01]/10[para05]/1:3[xx,y]
94    // -> C: /2/4!/6[bar]/44!/3~1.11@1:1
95    let mut location = raw[8..raw.len() - 1].to_owned();
96
97    // Dropping the following elements means they are not taken into consideration during sorting
98    // comparisons between `Annotation`s.
99
100    // Remove any type of 'Assertion'.
101    //
102    // -> A: /6/4!/4/10,/2/1:1,/3:4
103    // -> B: /6/4!/4/10/1:3
104    // -> C: /2/4!/6/44!/3~1.11@1:1
105    location = RE_ASSERTIONS.replace_all(&location, "").into_owned();
106
107    // Remove 'Temporal Offsets' (~)..
108    //
109    // -> A: ...
110    // -> B: ...
111    // -> C: /2/4!/6/44!/3@1:1
112    location = RE_TEMPORAL_OFFSET.replace_all(&location, "").into_owned();
113
114    // Remove 'Spacial Offsets' (@).
115    //
116    // -> A: ...
117    // -> B: ...
118    // -> C: /2/4!/6/44!/3
119    location = RE_SPACIAL_OFFSET.replace_all(&location, "").into_owned();
120
121    // "EPUB CFIs allow the expression of simple ranges extending from a start location to an end
122    // location."
123    //
124    // <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-ranges>
125    //
126    // For example:
127    //
128    //     epubcfi([parent-path],[range-start],[range-end])
129    //
130    // We only care about the [parent-path] and [range-start] which gives us the absolute path to
131    // where an `Annotation` begins.
132    let mut parts: Vec<&str> = location.split(',').collect();
133    parts = match parts[..] {
134        [parent_path, range_start, _] => {
135            vec![parent_path, range_start]
136        }
137        _ => parts,
138    };
139
140    // -> A: /6/4!/4/10,/2/1:1
141    // -> B: /6/4!/4/10/1:3
142    // -> C: /2/4!/6/44!/3
143    location = parts.join("");
144
145    // -> A: /6/4/4/10/2/1
146    // -> B: /6/4/4/10/1
147    // -> C: /2/4/6/44/3
148    let mut steps = RE_STEP_REFERENCE
149        .find_iter(&location)
150        .map(|m| m.as_str())
151        .map(ToOwned::to_owned)
152        .collect::<String>();
153
154    // -> A: 6/4/4/10/2/1
155    // -> B: 6/4/4/10/1
156    // -> C: 2/4/6/44/3
157    steps.remove(0);
158
159    // -> A: 6.4.4.10.2.1
160    // -> B: 6.4.4.10.1
161    // -> C: 2.4.6.44.3
162    steps = steps.replace('/', ".");
163
164    // Save the character offset found at the end of [range-start].
165    //
166    // -> A: :1
167    // -> B: :3
168    // -> C: N/A
169    let character_offset = RE_CHARACTER_OFFSET
170        .find(&location)
171        .map(|m| m.as_str())
172        .map_or_else(String::new, ToOwned::to_owned);
173
174    // -> A: 6.4.4.10.2.1:1
175    // -> B: 6.4.4.10.1:3
176    // -> C: 2.4.6.44.3
177    location = format!("{steps}{character_offset}");
178
179    location
180}
181
#[cfg(test)]
mod test {

    use super::parse;

    // Generates one `#[test]` function per `name: (input, expected)` entry, asserting that parsing
    // `input` yields exactly `expected`.
    //
    // https://stackoverflow.com/a/34666891/16968574
    macro_rules! parse_epubcfi {
        ($($name:ident: ($raw:expr, $expected:expr $(,)?),)*) => {
            $(
                #[test]
                fn $name() {
                    assert_eq!(parse($raw), $expected);
                }
            )*
        }
    }

    // Generates one `#[test]` function per `name: (lhs OP rhs)` entry, asserting that the parsed
    // forms of `lhs` and `rhs` satisfy the comparison operator `OP`.
    //
    // https://stackoverflow.com/a/34666891/16968574
    macro_rules! compare_epubcfi {
        ($($name:ident: ($lhs:tt $op:tt $rhs:tt),)*) => {
            $(
                #[test]
                fn $name() {
                    let (left, right) = (parse($lhs), parse($rhs));
                    assert!(left $op right);
                }
            )*
        }
    }

    // <https://github.com/fread-ink/epub-cfi-resolver/blob/master/tests/simple.js>
    parse_epubcfi! {
        parse_epubcfi_00: (
            "epubcfi(/1/2)",
            "1.2",
        ),
        parse_epubcfi_01: (
            "epubcfi(/1/0)",
            "1.0",
        ),
        parse_epubcfi_02: (
            "epubcfi(/1/2:3[pre,post])",
            "1.2:3",
        ),
        parse_epubcfi_03: (
            "epubcfi(/1/2:3[,post])",
            "1.2:3",
        ),
        parse_epubcfi_04: (
            "epubcfi(/1/2:3[pre,])",
            "1.2:3",
        ),
        parse_epubcfi_05: (
            "epubcfi(/1[^^^]])",
            "1",
        ),
        parse_epubcfi_06: (
            "epubcfi(/6/14[cha!/p05ref]!/4[bo!/dy01]/10/2/1[foo]:5[don't!/ panic;s=b])",
            "6.14.4.10.2.1:5",
        ),
        parse_epubcfi_07: (
            "epubcfi(/6/4[chap01ref]!/4[body01]/10[para05]/3:5)",
            "6.4.4.10.3:5",
        ),
        parse_epubcfi_08: (
            "epubcfi(/6/4[chap01ref]!/4/10/0)",
            "6.4.4.10.0",
        ),
        parse_epubcfi_09: (
            "epubcfi(/6/4[chap01ref]!/4/10/999)",
            "6.4.4.10.999",
        ),
        parse_epubcfi_10: (
            "epubcfi(/6/4[chap01ref]!/4[body01],/10[para05]/3:5,/10[para05]/3:8)",
            "6.4.4.10.3:5",
        ),
        parse_epubcfi_11: (
            "epubcfi(/6/4[chap01ref]!/4[body01]/10[para05]/3:3[34,67])",
            "6.4.4.10.3:3",
        ),
        parse_epubcfi_12: (
            "epubcfi(/6/14[cha!/p05ref]!/4[bo!/dy01]/10/2/1[foo]~42.43@100:101)",
            "6.14.4.10.2.1",
        ),
        // Temporal and spatial offsets are ignored on all but the last subpart.
        parse_epubcfi_13: (
            "epubcfi(/2~42.43@100:101/4!/6/8:100/6:200)",
            "2.4.6.8.6:200",
        ),
        // Vendor extensions are ignored by the parser.
        // <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-extensions>
        parse_epubcfi_14: (
            "epubcfi(/2/4vnd.foo/6foo.bar:20)",
            "2.4.6:20",
        ),
        parse_epubcfi_15: (
            "epubcfi(/6/4[chap01ref]!/4[body01]/10[para05],/2/1:1,/3:4)",
            "6.4.4.10.2.1:1",
        ),
        parse_epubcfi_16: (
            "epubcfi(/6/4[chap01ref]!/4[body01]/10[para05]/1:3[xx,y])",
            "6.4.4.10.1:3",
        ),
        parse_epubcfi_17: (
            "epubcfi(/6/28[chap06]!/4/24[para06]/1,:4,:44)",
            "6.28.4.24.1:4",
        ),
        parse_epubcfi_18: (
            "epubcfi(/2/4[node-id]!/6/7:5[pre,post;s=b])",
            "2.4.6.7:5",
        ),
        parse_epubcfi_19: (
            "epubcfi(/2/4@4:2)",
            "2.4",
        ),
        parse_epubcfi_20: (
            "epubcfi(/2/4~3.14)",
            "2.4",
        ),
        parse_epubcfi_21: (
            "epubcfi(/2/4~3.14@4:2)",
            "2.4",
        ),
    }

    // <https://github.com/fread-ink/epub-cfi-resolver/blob/master/tests/compare.js>
    compare_epubcfi! {
        compare_epubcfi_00: (
            "epubcfi(/2)" < "epubcfi(/6)"
        ),
        compare_epubcfi_01: (
            "epubcfi(/2/4!/6)" < "epubcfi(/2/4!/7)"
        ),
        compare_epubcfi_02: (
            "epubcfi(/2/4!/8)" > "epubcfi(/2/4!/7)"
        ),
        compare_epubcfi_03: (
            "epubcfi(/2/4!/6[foo]/42!/12:100[lol])" < "epubcfi(/2/4!/6[bar]/44!/12:100[cat])"
        ),
        // Node ids and text location assertions are ignored.
        compare_epubcfi_04: (
            "epubcfi(/2/4!/6[foo]/44!/12:100[lol])" == "epubcfi(/2/4!/6[bar]/44!/12:100[cat])"
        ),
        compare_epubcfi_05: (
            "epubcfi(/2/4!/6[bar]/44!/12:100[cat])" == "epubcfi(/2/4!/6[bar]/44!/12:100[cat])"
        ),
        // Temporal and spatial offsets are ignored on character (text/cdata) nodes.
        compare_epubcfi_06: (
            "epubcfi(/2/4!/6[bar]/44!/3~1.11@1:1)" == "epubcfi(/2/4!/6[bar]/44!/3~2.22@2:2)"
        ),
        // Identical ranges compare equal.
        compare_epubcfi_07: (
            "epubcfi(/2/4,/6/8,/10/12)" == "epubcfi(/2/4,/6/8,/10/12)"
        ),
        // Ranges with a different [range-start].
        compare_epubcfi_08: (
            "epubcfi(/2/4,/6/7,/10/11)" < "epubcfi(/2/4,/6/8,/10/12)"
        ),
        // Ranges with a different [parent-path].
        compare_epubcfi_09: (
            "epubcfi(/2/2,/6/8,/10/12)" < "epubcfi(/2/4,/6/8,/10/12)"
        ),
        // A range that starts after a non-range.
        compare_epubcfi_10: (
            "epubcfi(/2/4,/6/8,/10/13)" > "epubcfi(/2/4/6/7)"
        ),
        // A range that starts at the same location as a non-range.
        compare_epubcfi_11: (
            "epubcfi(/2/4,/6/8,/10/13)" == "epubcfi(/2/4/6/8)"
        ),
        compare_epubcfi_12: (
            "epubcfi(/2/4!/6[bar]/44!/12:100[hah])" < "epubcfi(/2/4!/6[bar]/44!/12:200[cat])"
        ),
    }
}