readstor 0.5.0

A CLI for Apple Books annotations
//! Defines a parser to convert an [epubcfi][epubcfi] into a sortable string
//! for sorting annotations into their order of appearance inside their
//! respective books.
//!
//! [epubcfi]: https://idpf.org/epub/linking/cfi/epub-cfi.html

use std::borrow::ToOwned;

use once_cell::sync::Lazy;
use regex::Regex;

/// Matches a single 'Step Reference': a `/` followed by one or more ASCII
/// digits, e.g. `/6` or `/4`.
static RE_STEP_REFERENCE: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"/[0-9]+";
    Regex::new(pattern).unwrap()
});

/// Captures an 'XML ID Assertion / Text Location Assertion' e.g. `[chap01]`
/// The specific difference between these two doesn't matter for our purposes.
/// We just need to strip out anything that resembles an 'Assertion'.
/// <>
/// <>
static RE_ASSERTIONS: Lazy<Regex> = Lazy::new(|| {
        # Captures opening square bracket e.g. `[`

        # Captures anything but square brackets e.g. `chap01`

        # Captures closing square bracket e.g. `]`

/// Matches a 'Character Offset', e.g. `:2` or `:100`.
///
/// The pattern is anchored with `$`, so only an offset sitting at the very
/// end of the searched string is matched.
static RE_CHARACTER_OFFSET: Lazy<Regex> = Lazy::new(|| {
    let pattern = r":[0-9]+$";
    Regex::new(pattern).unwrap()
});

/// Matches a 'Temporal Offset', e.g. `~23.5` or `~42.43`.
///
/// The pattern requires a fractional part: a `~` followed by digits, a `.`,
/// and more digits.
static RE_TEMPORAL_OFFSET: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"~[0-9]+\.[0-9]+";
    Regex::new(pattern).unwrap()
});

/// Matches a 'Spatial Offset', e.g. `@100:100` or `@5.75:97.6`.
///
/// The pattern is an `@` followed by two runs of digits/dots separated by a
/// `:`.
static RE_SPACIAL_OFFSET: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"@[0-9.]+:[0-9.]+";
    Regex::new(pattern).unwrap()
});

/// Returns a simplified location string from a `epubcfi`.
/// This is a super simple EPUB CFI parser with a focus on extracting location
/// information for sorting [`Annotation`][annotation]s.
/// Examples:
/// ```plaintext
/// input:  epubcfi(/6/4[chap01ref]!/4[body01]/10[para05]/1:3[xx,y])
/// output:
/// ```
/// <>
/// ```plaintext
/// input:  epubcfi(/6/4[chap01ref]!/4[body01]/10[para05],/2/1:1,/3:4)
/// output:
/// ```
/// <>
/// See <> for more
/// information.
/// [annotation]: super::annotation::Annotation
pub fn parse(raw: &str) -> String {
    // Check that the incoming string is an `epubcfi`.
    if !raw.starts_with("epubcfi(") && !raw.ends_with(')') {
        return String::new();

    // Starting with:
    //    A: epubcfi(/6/4[chap01ref]!/4[body01]/10[para05],/2/1:1,/3:4)
    //    B: epubcfi(/6/4[chap01ref]!/4[body01]/10[para05]/1:3[xx,y])
    //    C: epubcfi(/2/4!/6[bar]/44!/3~1.11@1:1)

    // Strip start and end: i.e. `epubcfi(` & `)`
    // -> A: /6/4[chap01ref]!/4[body01]/10[para05],/2/1:1,/3:4
    // -> B: /6/4[chap01ref]!/4[body01]/10[para05]/1:3[xx,y]
    // -> C: /2/4!/6[bar]/44!/3~1.11@1:1
    let mut location = raw[8..raw.len() - 1].to_owned();

    // Dropping the following elements means they are not taken into
    // consideration during sorting comparisons between `Annotation`s.

    // Remove any type of 'Assertion'.
    // -> A: /6/4!/4/10,/2/1:1,/3:4
    // -> B: /6/4!/4/10/1:3
    // -> C: /2/4!/6/44!/3~1.11@1:1
    location = RE_ASSERTIONS.replace_all(&location, "").into_owned();

    // Remove 'Temporal Offsets' (~)..
    // -> A: ...
    // -> B: ...
    // -> C: /2/4!/6/44!/3@1:1
    location = RE_TEMPORAL_OFFSET.replace_all(&location, "").into_owned();

    // Remove 'Spacial Offsets' (@).
    // -> A: ...
    // -> B: ...
    // -> C: /2/4!/6/44!/3
    location = RE_SPACIAL_OFFSET.replace_all(&location, "").into_owned();

    // "EPUB CFIs allow the expression of simple ranges extending from a start
    // location to an end location."
    // <>
    // For example:
    //     epubcfi([parent-path],[range-start],[range-end])
    // We only care about the [parent-path] and [range-start] which gives us
    // the absolute path to where an `Annotation` begins.
    let mut parts: Vec<&str> = location.split(',').collect();
    parts = match parts[..] {
        [parent_path, range_start, _] => {
            vec![parent_path, range_start]
        _ => parts,

    // -> A: /6/4!/4/10,/2/1:1
    // -> B: /6/4!/4/10/1:3
    // -> C: /2/4!/6/44!/3
    location = parts.join("");

    // -> A: /6/4/4/10/2/1
    // -> B: /6/4/4/10/1
    // -> C: /2/4/6/44/3
    let mut steps = RE_STEP_REFERENCE
        .map(|m| m.as_str())

    // -> A: 6/4/4/10/2/1
    // -> B: 6/4/4/10/1
    // -> C: 2/4/6/44/3

    // -> A:
    // -> B:
    // -> C:
    steps = steps.replace('/', ".");

    // Save the character offset found at the end of [range-start].
    // -> A: :1
    // -> B: :3
    // -> C: N/A
    let character_offset = RE_CHARACTER_OFFSET
        .map(|m| m.as_str())
        .map_or_else(String::new, ToOwned::to_owned);

    // -> A:
    // -> B:
    // -> C:
    location = format!("{steps}{character_offset}");


mod test_epubcfi_parser {

    use super::*;

    macro_rules! test_parse {
        ($($name:ident: $value:expr,)*) => {
                fn $name() {
                    let (raw, expected) = $value;
                    let parsed = parse(raw);
                    assert_eq!(parsed, expected);

    macro_rules! test_compare {
        ($($name:ident: ($lhs:tt $cmp:tt $rhs:tt),)*) => {
                fn $name() {
                    let lhs_parsed = parse($lhs);
                    let rhs_parsed = parse($rhs);
                    assert!(lhs_parsed $cmp rhs_parsed);

    // <>
    test_parse! {
        test_parse_00: (
        test_parse_01: (
        test_parse_02: (
        test_parse_03: (
        test_parse_04: (
        test_parse_05: (
        test_parse_06: (
            "epubcfi(/6/14[cha!/p05ref]!/4[bo!/dy01]/10/2/1[foo]:5[don't!/ panic;s=b])",
        test_parse_07: (
        test_parse_08: (
        test_parse_09: (
        test_parse_10: (
        test_parse_11: (
        test_parse_12: (
        test_parse_13: (
            // Test that 'Temporal' and 'Spatial' offsets are ignored on all
            // but last subpart.
        test_parse_14: (
            // Test that parser ignores vendor extensions.
            // <>
        test_parse_15: (
        test_parse_16: (
        test_parse_17: (
            // TODO: Could this --------------------^^ cause an error? Should it
            // be padded with a `0` so it doesn't look like its attached to the
            // wrong step? -> ''
        test_parse_18: (
        test_parse_19: (
        test_parse_20: (
        test_parse_21: (

    // <>
    test_compare! {
        test_compare_00: (
            "epubcfi(/2)" < "epubcfi(/6)"
        test_compare_01: (
            "epubcfi(/2/4!/6)" < "epubcfi(/2/4!/7)"
        test_compare_02: (
            "epubcfi(/2/4!/8)" > "epubcfi(/2/4!/7)"
        test_compare_03: (
            "epubcfi(/2/4!/6[foo]/42!/12:100[lol])" < "epubcfi(/2/4!/6[bar]/44!/12:100[cat])"
        test_compare_04: (
            // Test that node ids and text location assertions are ignored.
            "epubcfi(/2/4!/6[foo]/44!/12:100[lol])" == "epubcfi(/2/4!/6[bar]/44!/12:100[cat])"
        test_compare_05: (
            "epubcfi(/2/4!/6[bar]/44!/12:100[cat])" == "epubcfi(/2/4!/6[bar]/44!/12:100[cat])"
        test_compare_06: (
            // Test that temporal and spatial offsets are ignored on character
            // (text/cdata) nodes
            "epubcfi(/2/4!/6[bar]/44!/3~1.11@1:1)" == "epubcfi(/2/4!/6[bar]/44!/3~2.22@2:2)"
        test_compare_07: (
            // Compare identical ranges.
            "epubcfi(/2/4,/6/8,/10/12)" == "epubcfi(/2/4,/6/8,/10/12)"
        test_compare_08: (
            // Compare ranges with different [range-start].
            "epubcfi(/2/4,/6/7,/10/11)" < "epubcfi(/2/4,/6/8,/10/12)"
        test_compare_09: (
            // Compare ranges with different [parent-path].
            "epubcfi(/2/2,/6/8,/10/12)" < "epubcfi(/2/4,/6/8,/10/12)"
        test_compare_10: (
            // Compare a range against a non-range.
            "epubcfi(/2/4,/6/8,/10/13)" > "epubcfi(/2/4/6/7)"
        test_compare_11: (
            // Compare a range against a non-range
            "epubcfi(/2/4,/6/8,/10/13)" == "epubcfi(/2/4/6/8)"
        test_compare_12: (
            "epubcfi(/2/4!/6[bar]/44!/12:100[hah])" < "epubcfi(/2/4!/6[bar]/44!/12:200[cat])"