lib/models/epubcfi.rs
1//! Defines a parser to convert an [epubcfi][epubcfi] into a sortable string for sorting annotations
2//! into their order of appearance inside their respective books.
3//!
4//! [epubcfi]: https://w3c.github.io/epub-specs/epub33/epubcfi/
5
6use std::borrow::ToOwned;
7
8use once_cell::sync::Lazy;
9use regex::Regex;
10
/// Captures a 'Step Reference' e.g. `/6` `/4`
///
/// Matches a forward slash followed by one or more ASCII digits. The pattern is not anchored, so
/// it can be used to find every step inside a longer path.
///
/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-path-child-ref>
static RE_STEP_REFERENCE: Lazy<Regex> = Lazy::new(|| Regex::new(r"/[0-9]+").unwrap());
15
/// Captures an 'XML ID Assertion / Text Location Assertion' e.g. `[chap01]`
///
/// The specific difference between these two doesn't matter for our purposes. We just need to strip
/// out anything that resembles an 'Assertion'.
///
/// The pattern matches an opening `[`, any run of characters that are not square brackets, and a
/// closing `]`. Nested or escaped brackets are not treated specially: a match always ends at the
/// first `]` that follows the opening `[`.
///
/// The pattern is written with the `(?x)` flag so the whitespace and `#` comments inside the
/// string literal are ignored by the regex engine.
///
/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-path-xmlid>
/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-path-text-location>
static RE_ASSERTIONS: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?x)
        # Captures opening square bracket e.g. `[`
        \[

        # Captures anything but square brackets e.g. `chap01`
        [^\[\]]*

        # Captures closing square bracket e.g. `]`
        \]
    ",
    )
    .unwrap()
});
38
/// Captures a 'Character Offset' e.g. `:2` `:100`
///
/// The pattern is anchored with `$`, so only an offset at the very end of the searched string is
/// matched; a `:N` that appears earlier in the path is ignored.
///
/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-path-terminating-char>
static RE_CHARACTER_OFFSET: Lazy<Regex> = Lazy::new(|| Regex::new(r":[0-9]+$").unwrap());
43
/// Captures a 'Temporal Offset' e.g. `~23.5` `~42.43`
///
/// The pattern requires digits on both sides of a decimal point, so an integer-only offset such
/// as `~23` is not matched.
///
/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-path-terminating-temporal>
static RE_TEMPORAL_OFFSET: Lazy<Regex> = Lazy::new(|| Regex::new(r"~[0-9]+\.[0-9]+").unwrap());
48
/// Captures a 'Spatial Offset' e.g. `@100:100` `@5.75:97.6`
///
/// Matches an `@` followed by two `:`-separated runs of digits and/or dots.
///
/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-path-terminating-spatial>
static RE_SPACIAL_OFFSET: Lazy<Regex> = Lazy::new(|| Regex::new(r"@[0-9.]+:[0-9.]+").unwrap());
53
54/// Returns a simplified location string from a `epubcfi`.
55///
56/// This is a super simple EPUB CFI parser with a focus on extracting location information for
57/// sorting [`Annotation`][annotation]s.
58///
59/// Examples:
60///
61/// ```plaintext
62/// input: epubcfi(/6/4[chap01ref]!/4[body01]/10[para05]/1:3[xx,y])
63/// output: 6.4.4.10.1:3
64/// ```
65/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#example-8>
66///
67/// ```plaintext
68/// input: epubcfi(/6/4[chap01ref]!/4[body01]/10[para05],/2/1:1,/3:4)
69/// output: 6.4.4.10.2.1:1
70/// ```
71/// <https://w3c.github.io/epub-specs/epub33/epubcfi/#example-23>
72///
73/// See <https://w3c.github.io/epub-specs/epub33/epubcfi/> for more
74/// information.
75///
76/// [annotation]: super::annotation::Annotation
77#[must_use]
78pub fn parse(raw: &str) -> String {
79 // Check that the incoming string is an `epubcfi`.
80 if !raw.starts_with("epubcfi(") && !raw.ends_with(')') {
81 return String::new();
82 }
83
84 // Starting with:
85 //
86 // A: epubcfi(/6/4[chap01ref]!/4[body01]/10[para05],/2/1:1,/3:4)
87 // B: epubcfi(/6/4[chap01ref]!/4[body01]/10[para05]/1:3[xx,y])
88 // C: epubcfi(/2/4!/6[bar]/44!/3~1.11@1:1)
89
90 // Strip start and end: i.e. `epubcfi(` & `)`
91 //
92 // -> A: /6/4[chap01ref]!/4[body01]/10[para05],/2/1:1,/3:4
93 // -> B: /6/4[chap01ref]!/4[body01]/10[para05]/1:3[xx,y]
94 // -> C: /2/4!/6[bar]/44!/3~1.11@1:1
95 let mut location = raw[8..raw.len() - 1].to_owned();
96
97 // Dropping the following elements means they are not taken into consideration during sorting
98 // comparisons between `Annotation`s.
99
100 // Remove any type of 'Assertion'.
101 //
102 // -> A: /6/4!/4/10,/2/1:1,/3:4
103 // -> B: /6/4!/4/10/1:3
104 // -> C: /2/4!/6/44!/3~1.11@1:1
105 location = RE_ASSERTIONS.replace_all(&location, "").into_owned();
106
107 // Remove 'Temporal Offsets' (~)..
108 //
109 // -> A: ...
110 // -> B: ...
111 // -> C: /2/4!/6/44!/3@1:1
112 location = RE_TEMPORAL_OFFSET.replace_all(&location, "").into_owned();
113
114 // Remove 'Spacial Offsets' (@).
115 //
116 // -> A: ...
117 // -> B: ...
118 // -> C: /2/4!/6/44!/3
119 location = RE_SPACIAL_OFFSET.replace_all(&location, "").into_owned();
120
121 // "EPUB CFIs allow the expression of simple ranges extending from a start location to an end
122 // location."
123 //
124 // <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-ranges>
125 //
126 // For example:
127 //
128 // epubcfi([parent-path],[range-start],[range-end])
129 //
130 // We only care about the [parent-path] and [range-start] which gives us the absolute path to
131 // where an `Annotation` begins.
132 let mut parts: Vec<&str> = location.split(',').collect();
133 parts = match parts[..] {
134 [parent_path, range_start, _] => {
135 vec![parent_path, range_start]
136 }
137 _ => parts,
138 };
139
140 // -> A: /6/4!/4/10,/2/1:1
141 // -> B: /6/4!/4/10/1:3
142 // -> C: /2/4!/6/44!/3
143 location = parts.join("");
144
145 // -> A: /6/4/4/10/2/1
146 // -> B: /6/4/4/10/1
147 // -> C: /2/4/6/44/3
148 let mut steps = RE_STEP_REFERENCE
149 .find_iter(&location)
150 .map(|m| m.as_str())
151 .map(ToOwned::to_owned)
152 .collect::<String>();
153
154 // -> A: 6/4/4/10/2/1
155 // -> B: 6/4/4/10/1
156 // -> C: 2/4/6/44/3
157 steps.remove(0);
158
159 // -> A: 6.4.4.10.2.1
160 // -> B: 6.4.4.10.1
161 // -> C: 2.4.6.44.3
162 steps = steps.replace('/', ".");
163
164 // Save the character offset found at the end of [range-start].
165 //
166 // -> A: :1
167 // -> B: :3
168 // -> C: N/A
169 let character_offset = RE_CHARACTER_OFFSET
170 .find(&location)
171 .map(|m| m.as_str())
172 .map_or_else(String::new, ToOwned::to_owned);
173
174 // -> A: 6.4.4.10.2.1:1
175 // -> B: 6.4.4.10.1:3
176 // -> C: 2.4.6.44.3
177 location = format!("{steps}{character_offset}");
178
179 location
180}
181
#[cfg(test)]
mod test {

    use super::*;

    // Generates one `#[test]` function per `name: (raw, expected)` entry. Each generated test
    // asserts that `parse(raw)` is equal to `expected`.
    //
    // https://stackoverflow.com/a/34666891/16968574
    macro_rules! parse_epubcfi {
        ($($name:ident: $value:expr,)*) => {
            $(
                #[test]
                fn $name() {
                    let (raw, expected) = $value;
                    let parsed = parse(raw);
                    assert_eq!(parsed, expected);
                }
            )*
        }
    }

    // Generates one `#[test]` function per `name: (lhs OP rhs)` entry. Both sides are run through
    // `parse` and the resulting strings are compared with the given operator (`<`, `>`, `==`).
    //
    // https://stackoverflow.com/a/34666891/16968574
    macro_rules! compare_epubcfi {
        ($($name:ident: ($lhs:tt $cmp:tt $rhs:tt),)*) => {
            $(
                #[test]
                fn $name() {
                    let lhs_parsed = parse($lhs);
                    let rhs_parsed = parse($rhs);
                    assert!(lhs_parsed $cmp rhs_parsed);
                }
            )*
        }
    }

    // <https://github.com/fread-ink/epub-cfi-resolver/blob/master/tests/simple.js>
    parse_epubcfi! {
        parse_epubcfi_00: (
            "epubcfi(/1/2)",
            "1.2",
        ),
        parse_epubcfi_01: (
            "epubcfi(/1/0)",
            "1.0",
        ),
        parse_epubcfi_02: (
            "epubcfi(/1/2:3[pre,post])",
            "1.2:3",
        ),
        parse_epubcfi_03: (
            "epubcfi(/1/2:3[,post])",
            "1.2:3",
        ),
        parse_epubcfi_04: (
            "epubcfi(/1/2:3[pre,])",
            "1.2:3",
        ),
        parse_epubcfi_05: (
            "epubcfi(/1[^^^]])",
            "1",
        ),
        parse_epubcfi_06: (
            "epubcfi(/6/14[cha!/p05ref]!/4[bo!/dy01]/10/2/1[foo]:5[don't!/ panic;s=b])",
            "6.14.4.10.2.1:5",
        ),
        parse_epubcfi_07: (
            "epubcfi(/6/4[chap01ref]!/4[body01]/10[para05]/3:5)",
            "6.4.4.10.3:5",
        ),
        parse_epubcfi_08: (
            "epubcfi(/6/4[chap01ref]!/4/10/0)",
            "6.4.4.10.0",
        ),
        parse_epubcfi_09: (
            "epubcfi(/6/4[chap01ref]!/4/10/999)",
            "6.4.4.10.999",
        ),
        // A range: only [parent-path] and [range-start] contribute to the result.
        parse_epubcfi_10: (
            "epubcfi(/6/4[chap01ref]!/4[body01],/10[para05]/3:5,/10[para05]/3:8)",
            "6.4.4.10.3:5",
        ),
        parse_epubcfi_11: (
            "epubcfi(/6/4[chap01ref]!/4[body01]/10[para05]/3:3[34,67])",
            "6.4.4.10.3:3",
        ),
        parse_epubcfi_12: (
            "epubcfi(/6/14[cha!/p05ref]!/4[bo!/dy01]/10/2/1[foo]~42.43@100:101)",
            "6.14.4.10.2.1",
        ),
        // Test that 'Temporal' and 'Spatial' offsets are ignored on all but last subpart.
        parse_epubcfi_13: (
            "epubcfi(/2~42.43@100:101/4!/6/8:100/6:200)",
            "2.4.6.8.6:200",
        ),
        // Test that parser ignores vendor extensions.
        // <https://w3c.github.io/epub-specs/epub33/epubcfi/#sec-extensions>
        parse_epubcfi_14: (
            "epubcfi(/2/4vnd.foo/6foo.bar:20)",
            "2.4.6:20",
        ),
        parse_epubcfi_15: (
            "epubcfi(/6/4[chap01ref]!/4[body01]/10[para05],/2/1:1,/3:4)",
            "6.4.4.10.2.1:1",
        ),
        parse_epubcfi_16: (
            "epubcfi(/6/4[chap01ref]!/4[body01]/10[para05]/1:3[xx,y])",
            "6.4.4.10.1:3",
        ),
        // A range whose [range-start] consists of a character offset only.
        parse_epubcfi_17: (
            "epubcfi(/6/28[chap06]!/4/24[para06]/1,:4,:44)",
            "6.28.4.24.1:4",
        ),
        parse_epubcfi_18: (
            "epubcfi(/2/4[node-id]!/6/7:5[pre,post;s=b])",
            "2.4.6.7:5",
        ),
        // Spatial offset only.
        parse_epubcfi_19: (
            "epubcfi(/2/4@4:2)",
            "2.4",
        ),
        // Temporal offset only.
        parse_epubcfi_20: (
            "epubcfi(/2/4~3.14)",
            "2.4",
        ),
        // Temporal offset followed by a spatial offset.
        parse_epubcfi_21: (
            "epubcfi(/2/4~3.14@4:2)",
            "2.4",
        ),
    }

    // <https://github.com/fread-ink/epub-cfi-resolver/blob/master/tests/compare.js>
    compare_epubcfi! {
        compare_epubcfi_00: (
            "epubcfi(/2)" < "epubcfi(/6)"
        ),
        compare_epubcfi_01: (
            "epubcfi(/2/4!/6)" < "epubcfi(/2/4!/7)"
        ),
        compare_epubcfi_02: (
            "epubcfi(/2/4!/8)" > "epubcfi(/2/4!/7)"
        ),
        compare_epubcfi_03: (
            "epubcfi(/2/4!/6[foo]/42!/12:100[lol])" < "epubcfi(/2/4!/6[bar]/44!/12:100[cat])"
        ),
        // Test that node ids and text location assertions are ignored.
        compare_epubcfi_04: (
            "epubcfi(/2/4!/6[foo]/44!/12:100[lol])" == "epubcfi(/2/4!/6[bar]/44!/12:100[cat])"
        ),
        compare_epubcfi_05: (
            "epubcfi(/2/4!/6[bar]/44!/12:100[cat])" == "epubcfi(/2/4!/6[bar]/44!/12:100[cat])"
        ),
        // Test that temporal and spatial offsets are ignored on character (text/cdata) nodes.
        compare_epubcfi_06: (
            "epubcfi(/2/4!/6[bar]/44!/3~1.11@1:1)" == "epubcfi(/2/4!/6[bar]/44!/3~2.22@2:2)"
        ),
        // Compare identical ranges.
        compare_epubcfi_07: (
            "epubcfi(/2/4,/6/8,/10/12)" == "epubcfi(/2/4,/6/8,/10/12)"
        ),
        // Compare ranges with different [range-start].
        compare_epubcfi_08: (
            "epubcfi(/2/4,/6/7,/10/11)" < "epubcfi(/2/4,/6/8,/10/12)"
        ),
        // Compare ranges with different [parent-path].
        compare_epubcfi_09: (
            "epubcfi(/2/2,/6/8,/10/12)" < "epubcfi(/2/4,/6/8,/10/12)"
        ),
        // Compare a range against a non-range: the range's start sorts after the non-range.
        compare_epubcfi_10: (
            "epubcfi(/2/4,/6/8,/10/13)" > "epubcfi(/2/4/6/7)"
        ),
        // Compare a range against a non-range: the range's start is the same location as the
        // non-range, so [range-end] must not affect the result.
        compare_epubcfi_11: (
            "epubcfi(/2/4,/6/8,/10/13)" == "epubcfi(/2/4/6/8)"
        ),
        // Compare locations that differ only in their character offset.
        compare_epubcfi_12: (
            "epubcfi(/2/4!/6[bar]/44!/12:100[hah])" < "epubcfi(/2/4!/6[bar]/44!/12:200[cat])"
        ),
    }
}
359}