paft_utils/string_canonical.rs
1//! Shared canonical string utilities for extensible enums.
2//!
3//! All extensible enum `Other` branches must construct their canonical token via
4//! [`Canonical::try_new`] to guarantee we never serialize an empty string and thus
5//! preserve serde/display round-trips.
6
7use std::{
8 borrow::{Borrow, Cow},
9 fmt,
10 str::FromStr,
11};
12
/// Canonical string wrapper used for `Other` variants.
///
/// The inner `String` is private; in this file a value is only ever built by
/// [`Canonical::try_new`] (also reachable through `FromStr`), which normalizes
/// its input with [`canonicalize`] and rejects an empty result.
///
/// Invariants:
/// - Non-empty
/// - Only ASCII uppercase letters, ASCII digits and `_`
/// - Every run of other characters in the input (whitespace, punctuation,
///   non-ASCII) collapsed to a single underscore
/// - No leading or trailing underscore
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Canonical(String);
21
22impl Canonical {
23 /// Attempts to create a new canonical string from arbitrary input, rejecting
24 /// values that would canonicalize to an empty token (e.g., strings composed
25 /// solely of separators or non-alphanumeric characters).
26 ///
27 /// This should be used by all enum `Other` variants to ensure the emitted
28 /// string is always non-empty and round-trips via serde and `Display`.
29 ///
30 /// # Errors
31 ///
32 /// Returns `CanonicalError::InvalidCanonicalToken` when the canonicalized token would
33 /// be empty.
34 pub fn try_new(input: &str) -> Result<Self, CanonicalError> {
35 let token = canonicalize(input);
36 if token.is_empty() {
37 return Err(CanonicalError::InvalidCanonicalToken {
38 value: input.to_string(),
39 });
40 }
41 Ok(Self(token.into_owned()))
42 }
43
44 /// Returns the inner canonical string slice.
45 #[inline]
46 #[must_use]
47 pub fn as_str(&self) -> &str {
48 &self.0
49 }
50
51 /// Consumes the `Canonical` and returns the inner `String`.
52 #[inline]
53 #[must_use]
54 pub fn into_inner(self) -> String {
55 self.0
56 }
57}
58
59impl fmt::Display for Canonical {
60 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
61 f.write_str(self.as_ref())
62 }
63}
64
65impl AsRef<str> for Canonical {
66 fn as_ref(&self) -> &str {
67 self.0.as_ref()
68 }
69}
70
71impl Borrow<str> for Canonical {
72 fn borrow(&self) -> &str {
73 self.as_ref()
74 }
75}
76
77impl FromStr for Canonical {
78 type Err = CanonicalError;
79
80 fn from_str(s: &str) -> Result<Self, Self::Err> {
81 Self::try_new(s)
82 }
83}
84
85/// Produces the canonical representation of an input string used across enums.
86///
87/// All `Display`/serde string forms across enums are canonical tokens produced by this function.
88///
89/// # Canonical Form Contract
90///
91/// Canonical form is `[A-Z0-9]+(?:_[A-Z0-9]+)*`. Non-ASCII and non-alphanumeric characters
92/// are treated as separators. Empty after normalization → error.
93///
94/// # Canonicalization Rules
95///
96/// - **ASCII-only**: Only ASCII uppercase letters (A-Z) and digits (0-9) are preserved as-is
97/// - **Case normalization**: ASCII lowercase letters are converted to uppercase
98/// - **Separators**: All non-alphanumeric ASCII characters and Unicode codepoints become separators
99/// - **Separator handling**: Contiguous separators collapse to a single underscore `_`
100/// - **Trimming**: Leading and trailing separators are removed
101/// - **Underscores**: Multiple underscores collapse to single underscores; no leading/trailing/double underscores
102///
103/// Returns `Cow::Borrowed(input)` if `input` is already canonical; otherwise returns an owned, normalized string.
104#[inline]
105#[must_use]
106pub fn canonicalize(input: &str) -> Cow<'_, str> {
107 // Fast path: check if input is already canonical
108 if is_canonical(input) {
109 return Cow::Borrowed(input);
110 }
111
112 let mut out = String::with_capacity(input.len());
113 let mut prev_sep = true; // treat start as "just saw a separator" to skip leading seps
114
115 for ch in input.chars() {
116 let c = ch.to_ascii_uppercase();
117 if c.is_ascii_alphanumeric() {
118 out.push(c);
119 prev_sep = false;
120 } else if !prev_sep {
121 out.push('_');
122 prev_sep = true;
123 }
124 }
125
126 if out.ends_with('_') {
127 out.pop(); // drop trailing separator without reallocation
128 }
129
130 Cow::Owned(out)
131}
132
/// Reports whether `input` is already a canonical token.
///
/// A canonical token is one or more non-empty runs of ASCII uppercase letters
/// and digits, joined by single underscores. Consequently this returns `false`
/// for:
/// - the empty string,
/// - a leading or trailing underscore,
/// - two underscores in a row,
/// - any other byte (lowercase letters, whitespace, punctuation, non-ASCII).
#[inline]
fn is_canonical(input: &str) -> bool {
    // Splitting on `_` turns each structural violation (empty input, edge
    // underscore, doubled underscore) into an empty segment, so one check per
    // segment covers all of them.
    input.split('_').all(|segment| {
        !segment.is_empty()
            && segment
                .bytes()
                .all(|byte| byte.is_ascii_uppercase() || byte.is_ascii_digit())
    })
}
155
/// Trait for enums that have a canonical string code.
///
/// Implemented via macros across the paft workspace.
pub trait StringCode {
    /// Returns the canonical string code for this value.
    ///
    /// The returned slice borrows from `self`.
    fn code(&self) -> &str;

    /// Whether this value is a canonical enum variant (not an `Other` payload).
    ///
    /// The default implementation always returns `true`. Implementors with an
    /// `Other` variant are presumably expected to override this and return
    /// `false` for it (confirm against the implementing macros).
    fn is_canonical(&self) -> bool {
        true
    }
}
168
/// Errors that can occur when constructing canonical strings.
#[derive(Debug, thiserror::Error, Clone, PartialEq, Eq)]
pub enum CanonicalError {
    /// The input normalized to an empty token, so no canonical value can be
    /// built from it.
    ///
    /// In this file it is returned by [`Canonical::try_new`] (and therefore by
    /// the `FromStr` implementation, which delegates to it).
    #[error("Invalid canonical token: '{value}' - canonicalized value must be non-empty")]
    InvalidCanonicalToken {
        /// The original, un-normalized input that failed to produce a canonical token.
        value: String,
    },
}