paft_utils/
string_canonical.rs

1//! Shared canonical string utilities for extensible enums.
2//!
3//! All extensible enum `Other` branches must construct their canonical token via
4//! [`Canonical::try_new`] to guarantee we never serialize an empty string and thus
5//! preserve serde/display round-trips.
6
7use std::{
8    borrow::{Borrow, Cow},
9    fmt,
10    str::FromStr,
11};
12
/// Canonical string wrapper used for `Other` variants.
///
/// The inner field is private. In this file the only constructor is
/// [`Canonical::try_new`] (which the `FromStr` impl delegates to), so the
/// wrapped string is always the non-empty output of [`canonicalize`].
///
/// Invariants:
/// - Non-empty
/// - Contains only ASCII uppercase letters, ASCII digits and `_`
/// - No leading, trailing or consecutive underscores: every run of other
///   characters (not just whitespace) collapses to a single `_`, and runs at
///   either end are trimmed
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Canonical(String);
21
22impl Canonical {
23    /// Attempts to create a new canonical string from arbitrary input, rejecting
24    /// values that would canonicalize to an empty token (e.g., strings composed
25    /// solely of separators or non-alphanumeric characters).
26    ///
27    /// This should be used by all enum `Other` variants to ensure the emitted
28    /// string is always non-empty and round-trips via serde and `Display`.
29    ///
30    /// # Errors
31    ///
32    /// Returns `CanonicalError::InvalidCanonicalToken` when the canonicalized token would
33    /// be empty.
34    pub fn try_new(input: &str) -> Result<Self, CanonicalError> {
35        let token = canonicalize(input);
36        if token.is_empty() {
37            return Err(CanonicalError::InvalidCanonicalToken {
38                value: input.to_string(),
39            });
40        }
41        Ok(Self(token.into_owned()))
42    }
43
44    /// Returns the inner canonical string slice.
45    #[inline]
46    #[must_use]
47    pub fn as_str(&self) -> &str {
48        &self.0
49    }
50
51    /// Consumes the `Canonical` and returns the inner `String`.
52    #[inline]
53    #[must_use]
54    pub fn into_inner(self) -> String {
55        self.0
56    }
57}
58
59impl fmt::Display for Canonical {
60    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
61        f.write_str(self.as_ref())
62    }
63}
64
65impl AsRef<str> for Canonical {
66    fn as_ref(&self) -> &str {
67        self.0.as_ref()
68    }
69}
70
71impl Borrow<str> for Canonical {
72    fn borrow(&self) -> &str {
73        self.as_ref()
74    }
75}
76
77impl FromStr for Canonical {
78    type Err = CanonicalError;
79
80    fn from_str(s: &str) -> Result<Self, Self::Err> {
81        Self::try_new(s)
82    }
83}
84
85/// Produces the canonical representation of an input string used across enums.
86///
87/// All `Display`/serde string forms across enums are canonical tokens produced by this function.
88///
89/// # Canonical Form Contract
90///
91/// Canonical form is `[A-Z0-9]+(?:_[A-Z0-9]+)*`. Non-ASCII and non-alphanumeric characters
92/// are treated as separators. Empty after normalization → error.
93///
94/// # Canonicalization Rules
95///
96/// - **ASCII-only**: Only ASCII uppercase letters (A-Z) and digits (0-9) are preserved as-is
97/// - **Case normalization**: ASCII lowercase letters are converted to uppercase
98/// - **Separators**: All non-alphanumeric ASCII characters and Unicode codepoints become separators
99/// - **Separator handling**: Contiguous separators collapse to a single underscore `_`
100/// - **Trimming**: Leading and trailing separators are removed
101/// - **Underscores**: Multiple underscores collapse to single underscores; no leading/trailing/double underscores
102///
103/// Returns `Cow::Borrowed(input)` if `input` is already canonical; otherwise returns an owned, normalized string.
104#[inline]
105#[must_use]
106pub fn canonicalize(input: &str) -> Cow<'_, str> {
107    // Fast path: check if input is already canonical
108    if is_canonical(input) {
109        return Cow::Borrowed(input);
110    }
111
112    let mut out = String::with_capacity(input.len());
113    let mut prev_sep = true; // treat start as "just saw a separator" to skip leading seps
114
115    for ch in input.chars() {
116        let c = ch.to_ascii_uppercase();
117        if c.is_ascii_alphanumeric() {
118            out.push(c);
119            prev_sep = false;
120        } else if !prev_sep {
121            out.push('_');
122            prev_sep = true;
123        }
124    }
125
126    if out.ends_with('_') {
127        out.pop(); // drop trailing separator without reallocation
128    }
129
130    Cow::Owned(out)
131}
132
/// Reports whether `input` is already a canonical token.
///
/// A string is canonical when all of the following hold:
/// - It is non-empty
/// - Every byte is an ASCII uppercase letter, an ASCII digit, or `_`
/// - It neither starts nor ends with `_`
/// - It contains no two consecutive `_`
#[inline]
fn is_canonical(input: &str) -> bool {
    if input.is_empty() || input.starts_with('_') || input.ends_with('_') {
        return false;
    }

    // Tracks whether the previously inspected byte was an underscore so that
    // a doubled separator can be rejected.
    let mut after_underscore = false;
    input.bytes().all(|byte| {
        let is_underscore = byte == b'_';
        let accepted = if is_underscore {
            !after_underscore
        } else {
            byte.is_ascii_uppercase() || byte.is_ascii_digit()
        };
        after_underscore = is_underscore;
        accepted
    })
}
155
/// Trait for enums that have a canonical string code.
///
/// Implemented via macros across the paft workspace.
pub trait StringCode {
    /// Returns the canonical string code for this value.
    ///
    /// The returned slice borrows from `self`.
    fn code(&self) -> &str;

    /// Whether this value is a canonical enum variant (not an `Other` payload).
    ///
    /// The provided default returns `true` unconditionally. Types that carry
    /// an `Other` payload presumably override it to return `false` for that
    /// variant; confirm against the implementing macros.
    fn is_canonical(&self) -> bool {
        true
    }
}
168
/// Errors that can occur when constructing canonical strings.
///
/// In this file the only producer is [`Canonical::try_new`] (and therefore the
/// `FromStr` impl that delegates to it).
#[derive(Debug, thiserror::Error, Clone, PartialEq, Eq)]
pub enum CanonicalError {
    /// Invalid canonical token produced by normalization helpers: the input
    /// normalized to an empty token, i.e. it contained no ASCII letters or
    /// digits.
    #[error("Invalid canonical token: '{value}' - canonicalized value must be non-empty")]
    InvalidCanonicalToken {
        /// The original, un-normalized input that failed to produce a canonical token.
        value: String,
    },
}