cartulary 0.3.0-alpha.1

The knowledge layer of your project — decisions, issues, docs, all in one place.
Documentation
//! URL-safe slug derived from a filesystem path segment.
//!
//! The transformation is path-driven: the filename is the source of
//! truth. No frontmatter override.

use std::fmt;

/// A URL-safe slug derived from a filesystem path segment.
///
/// The inner string is private, so values are only created through
/// [`Slug::from_segment`], which emits nothing but `a-z`, `0-9` and `-`.
///
/// `Hash` is derived alongside the equality and ordering traits so a
/// slug can be used as a key in hashed as well as ordered collections.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Slug(String);

/// Reasons a path segment cannot be turned into a [`Slug`].
///
/// Both variants carry the original, unmodified `segment` so the
/// caller can report which input was at fault.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SlugError {
    /// Nothing usable was left to build a slug from — for example the
    /// segment was empty or consisted solely of the `.md` extension.
    Empty { segment: String },
    /// The segment contained a character that has no defined
    /// transformation rule. `offending` is the first such character
    /// encountered when scanning the segment left to right.
    Rejected { segment: String, offending: char },
}

impl fmt::Display for SlugError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Empty { segment } => {
                write!(f, "slug derived from {segment:?} is empty")
            }
            Self::Rejected { segment, offending } => write!(
                f,
                "character {offending:?} in {segment:?} cannot be used in a slug",
            ),
        }
    }
}

// Empty impl: no trait method is overridden, so `SlugError` uses the
// defaults of `std::error::Error` and reports itself through the
// `Debug` and `Display` impls above.
impl std::error::Error for SlugError {}

impl Slug {
    /// Derive a slug from a single path segment (a filename, with or
    /// without the `.md` extension).
    ///
    /// Rules:
    /// - one trailing `.md` is stripped;
    /// - uppercase ASCII is lowercased;
    /// - common Latin accented letters are folded to ASCII;
    /// - space and underscore become `-`;
    /// - `a-z`, `0-9`, `-` pass through;
    /// - anything else is rejected.
    ///
    /// # Errors
    ///
    /// - [`SlugError::Rejected`] for the first character that none of
    ///   the rules above cover (scanning left to right);
    /// - [`SlugError::Empty`] when the result contains no letter or
    ///   digit: the basename was empty (`""`, `".md"`) or was made of
    ///   separators only (`" "`, `"_"`, `"--"`).
    ///
    /// Both errors carry the original `segment`, extension included.
    pub fn from_segment(segment: &str) -> Result<Self, SlugError> {
        let base = segment.strip_suffix(".md").unwrap_or(segment);

        let mut out = String::with_capacity(base.len());
        for c in base.chars() {
            match c {
                'a'..='z' | '0'..='9' | '-' => out.push(c),
                'A'..='Z' => out.push(c.to_ascii_lowercase()),
                ' ' | '_' => out.push('-'),
                other => {
                    // Last resort: the accent-folding table. A miss means
                    // there is no rule for this character at all.
                    let folded = fold_latin(other).ok_or_else(|| SlugError::Rejected {
                        segment: segment.to_owned(),
                        offending: other,
                    })?;
                    out.push_str(folded);
                }
            }
        }

        // A result without a single letter or digit is either empty or a
        // run of dashes produced by separators alone. Neither identifies
        // anything, so both are reported as `Empty` rather than handing
        // back a slug such as "-".
        if !out.bytes().any(|b| b.is_ascii_alphanumeric()) {
            return Err(SlugError::Empty {
                segment: segment.to_owned(),
            });
        }

        Ok(Self(out))
    }

    /// Borrow the slug text.
    pub fn as_str(&self) -> &str {
        &self.0
    }
}

/// Displays the slug text exactly as stored, delegating to `String`'s
/// own `Display` so width, fill, alignment and precision in the format
/// spec are applied.
impl fmt::Display for Slug {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Fully-qualified call on purpose: method syntax (`self.0.fmt(f)`)
        // only resolves when a `fmt` trait is imported by name, and it is
        // ambiguous as soon as both `Display` and `Debug` are in scope.
        fmt::Display::fmt(&self.0, f)
    }
}

/// Fold the common Latin-1 / Latin-Extended-A letters down to their
/// ASCII spine.
///
/// Every entry covers both the uppercase and the lowercase form and
/// maps to lowercase ASCII; ligatures and `ß` expand to two letters.
/// Returns `None` for any character outside that small table (plain
/// ASCII included) — callers treat that as a rejection.
fn fold_latin(c: char) -> Option<&'static str> {
    let folded = match c {
        'À' | 'Á' | 'Â' | 'Ã' | 'Ä' | 'Å' | 'à' | 'á' | 'â' | 'ã' | 'ä' | 'å' => "a",
        'Æ' | 'æ' => "ae",
        'Ç' | 'ç' => "c",
        'È' | 'É' | 'Ê' | 'Ë' | 'è' | 'é' | 'ê' | 'ë' => "e",
        'Ì' | 'Í' | 'Î' | 'Ï' | 'ì' | 'í' | 'î' | 'ï' => "i",
        'Ñ' | 'ñ' => "n",
        'Ò' | 'Ó' | 'Ô' | 'Õ' | 'Ö' | 'Ø' | 'ò' | 'ó' | 'ô' | 'õ' | 'ö' | 'ø' => "o",
        'Œ' | 'œ' => "oe",
        'ß' => "ss",
        'Ù' | 'Ú' | 'Û' | 'Ü' | 'ù' | 'ú' | 'û' | 'ü' => "u",
        // `Ÿ` (U+0178) sits in Latin Extended-A, away from `ÿ` (U+00FF),
        // and must be listed explicitly to keep both cases covered.
        'Ý' | 'ý' | 'Ÿ' | 'ÿ' => "y",
        _ => return None,
    };
    Some(folded)
}

#[cfg(test)]
pub mod strategy {
    use super::*;
    use proptest::prelude::*;

    /// Shape of a segment that is already a slug: `[a-z0-9-]` only,
    /// starting with a letter or digit, 1 to 41 characters long.
    const CANONICAL_PATTERN: &str = "[a-z0-9][a-z0-9-]{0,40}";

    /// Strategy yielding strings that are already in canonical slug
    /// form. Handy for round-trip and idempotence properties.
    pub fn arb_canonical_segment() -> impl Strategy<Value = String> {
        let generator = proptest::string::string_regex(CANONICAL_PATTERN);
        generator.unwrap()
    }

    /// Strategy yielding [`Slug`] values, built by running
    /// [`Slug::from_segment`] over canonical segments.
    pub fn arb_slug() -> impl Strategy<Value = Slug> {
        arb_canonical_segment().prop_map(|segment| {
            let derived = Slug::from_segment(&segment);
            derived.unwrap()
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Text of the slug derived from `segment`; panics when the
    /// segment is not accepted.
    fn slug_text(segment: &str) -> String {
        let slug = Slug::from_segment(segment).unwrap();
        slug.as_str().to_owned()
    }

    /// Asserts that `segment` fails with `Rejected` on `expected`.
    fn assert_rejected_on(segment: &str, expected: char) {
        let err = Slug::from_segment(segment).unwrap_err();
        assert!(matches!(
            err,
            SlugError::Rejected { offending, .. } if offending == expected
        ));
    }

    // --- accepted input -------------------------------------------------

    #[test]
    fn strips_md_extension() {
        assert_eq!(slug_text("getting-started.md"), "getting-started");
    }

    #[test]
    fn lowercases_ascii() {
        assert_eq!(slug_text("Foo.md"), "foo");
    }

    #[test]
    fn spaces_become_dashes() {
        assert_eq!(slug_text("Foo Bar.md"), "foo-bar");
    }

    #[test]
    fn underscores_become_dashes() {
        assert_eq!(slug_text("foo_bar.md"), "foo-bar");
    }

    #[test]
    fn keeps_numeric_prefix() {
        assert_eq!(slug_text("01-intro.md"), "01-intro");
    }

    #[test]
    fn folds_common_accents() {
        let cases = [
            ("café.md", "cafe"),
            ("naïve.md", "naive"),
            ("œuvre.md", "oeuvre"),
            ("straße.md", "strasse"),
        ];
        for (input, expected) in cases {
            assert_eq!(slug_text(input), expected);
        }
    }

    // --- rejected input -------------------------------------------------

    #[test]
    fn rejects_question_mark() {
        assert_rejected_on("a?b.md", '?');
    }

    #[test]
    fn rejects_hash() {
        assert_rejected_on("a#b.md", '#');
    }

    #[test]
    fn rejects_ampersand() {
        assert_rejected_on("a&b.md", '&');
    }

    #[test]
    fn rejects_inner_slash() {
        assert_rejected_on("a/b.md", '/');
    }

    #[test]
    fn rejects_dot_inside_basename() {
        assert_rejected_on("v1.2.md", '.');
    }

    #[test]
    fn rejects_unmapped_unicode() {
        let outcome = Slug::from_segment("漢字.md");
        assert!(matches!(outcome, Err(SlugError::Rejected { .. })));
    }

    #[test]
    fn empty_basename_is_rejected() {
        for segment in [".md", ""] {
            let outcome = Slug::from_segment(segment);
            assert!(matches!(outcome, Err(SlugError::Empty { .. })));
        }
    }

    #[test]
    fn segment_without_md_is_accepted() {
        assert_eq!(slug_text("foo"), "foo");
    }

    // --- properties -----------------------------------------------------

    proptest::proptest! {
        /// A segment already in canonical shape comes back untouched.
        #[test]
        fn prop_canonical_segments_are_accepted(segment in strategy::arb_canonical_segment()) {
            let derived = Slug::from_segment(&segment).unwrap();
            assert_eq!(derived.as_str(), segment);
        }

        /// Re-deriving a slug from its own text changes nothing.
        #[test]
        fn prop_double_application_is_idempotent(segment in strategy::arb_canonical_segment()) {
            let first = Slug::from_segment(&segment).unwrap();
            let second = Slug::from_segment(first.as_str()).unwrap();
            assert_eq!(first, second);
        }

        /// A `?` between two lowercase runs is always the offender.
        #[test]
        fn prop_question_mark_always_rejected(
            prefix in "[a-z]{1,5}", suffix in "[a-z]{1,5}",
        ) {
            let segment = format!("{prefix}?{suffix}.md");
            let outcome = Slug::from_segment(&segment);
            assert!(matches!(
                outcome,
                Err(SlugError::Rejected { offending: '?', .. }),
            ));
        }
    }
}