Skip to main content

buffa_codegen/
idents.rs

1//! Rust identifier and path construction helpers.
2//!
3//! These are shared between buffa's codegen and downstream code generators
4//! (e.g. `connectrpc-codegen`) that emit Rust code alongside buffa's message
5//! types and need identical keyword-escaping and path-tokenization behavior.
6//!
7//! The guarantee is that if buffa generates `pub struct r#type::Foo { ... }`,
8//! downstream callers using [`rust_path_to_tokens`]`("type::Foo")` produce the
9//! matching `r#type::Foo` reference.
10
11use proc_macro2::{Ident, Span, TokenStream};
12use quote::{format_ident, quote};
13
14/// Parse a `::`-separated Rust path string into a [`TokenStream`], using raw
15/// identifiers (`r#type`) for segments that are Rust keywords.
16///
17/// Used instead of `syn::parse_str::<syn::Type>` because the latter cannot
18/// handle raw identifiers in path position: `"google::type::LatLng"` would
19/// fail to parse because `type` is a keyword, but this function correctly
20/// produces `google::r#type::LatLng`.
21///
22/// Path-position keywords (`self`, `super`, `Self`, `crate`) are emitted as
23/// plain idents (they're valid in paths) — this differs from
24/// [`make_field_ident`], which suffixes them with `_`.
25///
26/// Leading `::` (absolute path, e.g. `"::buffa::Message"`) is preserved.
27///
28/// # Panics
29///
30/// Panics (in debug) if `path` is empty.
31pub fn rust_path_to_tokens(path: &str) -> TokenStream {
32    debug_assert!(
33        !path.is_empty(),
34        "rust_path_to_tokens called with empty path"
35    );
36
37    // Handle absolute paths (starting with `::`, e.g. extern crate paths).
38    let (prefix, rest) = if let Some(stripped) = path.strip_prefix("::") {
39        (quote! { :: }, stripped)
40    } else {
41        (TokenStream::new(), path)
42    };
43
44    // For path segments, non-raw-able keywords (`self`, `super`, `Self`,
45    // `crate`) are emitted as plain idents because they are valid in path
46    // position. This differs from `make_field_ident`, which appends `_` for
47    // these keywords since they are invalid as struct field names.
48    let segments: Vec<Ident> = rest
49        .split("::")
50        .map(|seg| {
51            if is_rust_keyword(seg) && can_be_raw_ident(seg) {
52                Ident::new_raw(seg, Span::call_site())
53            } else {
54                Ident::new(seg, Span::call_site())
55            }
56        })
57        .collect();
58
59    quote! { #prefix #(#segments)::* }
60}
61
62/// Create a field identifier, escaping Rust keywords.
63///
64/// Most keywords use raw identifiers (`r#type`). The keywords `self`, `super`,
65/// `Self`, `crate` cannot be raw identifiers and are suffixed with `_` instead
66/// (e.g. `self_`), matching prost's convention.
67pub fn make_field_ident(name: &str) -> Ident {
68    if is_rust_keyword(name) {
69        if can_be_raw_ident(name) {
70            Ident::new_raw(name, Span::call_site())
71        } else {
72            format_ident!("{}_", name)
73        }
74    } else {
75        format_ident!("{}", name)
76    }
77}
78
/// Convert a protobuf enum value name to `UpperCamelCase`.
///
/// Words are delimited by underscores **and** by case transitions:
///
/// * `_` always ends the current word and is dropped from the output
///   (`RULE_LEVEL_HIGH` → `RuleLevelHigh`).
/// * A lower→upper transition starts a new word (`myValue` → `MyValue`).
/// * Inside a run of upper-case letters, the last one starts a new word when
///   it is followed by a lower-case letter (`HTTPServer` → `HttpServer`).
///
/// The first character of each word is upper-cased and every other character
/// is lower-cased.
///
/// The mapping is intentionally lossy: `FOO_BAR` and `FOO__BAR` both become
/// `FooBar`, and `HTTPServer` and `HTTP_SERVER` both become `HttpServer`.
/// Detecting such collisions is left to the caller.
///
/// Digits are copied through unchanged, so an input that begins with a digit
/// (reachable when the caller has stripped a prefix, e.g. `VERSION_2` → `2`)
/// produces output that begins with a digit. Callers that need a valid Rust
/// identifier must check for that themselves.
#[must_use]
pub fn to_upper_camel_case(s: &str) -> String {
    let mut out = String::new();
    let mut rest = s.chars().peekable();
    // Previous character of the current word; `None` means the next
    // non-underscore character opens a new word.
    let mut prev: Option<char> = None;

    while let Some(ch) = rest.next() {
        if ch == '_' {
            prev = None;
            continue;
        }

        let opens_word = match prev {
            None => true,
            Some(p) => {
                // Both case-transition rules require the current character to
                // be upper-case; they differ only in what precedes/follows it.
                let after_lower = p.is_lowercase();
                let closes_acronym =
                    p.is_uppercase() && rest.peek().is_some_and(|next| next.is_lowercase());
                ch.is_uppercase() && (after_lower || closes_acronym)
            }
        };

        if opens_word {
            out.extend(ch.to_uppercase());
        } else {
            out.extend(ch.to_lowercase());
        }
        prev = Some(ch);
    }

    out
}
126
/// Convert a type name to `SHOUTY_SNAKE_CASE`.
///
/// This rebuilds the conventional enum-value prefix from an enum's proto name
/// so the prefix can be stripped: `RuleLevel` → `RULE_LEVEL`, after which a
/// value such as `RULE_LEVEL_HIGH` can lose its `RULE_LEVEL_` prefix.
///
/// Every character is upper-cased. An underscore is inserted in front of an
/// upper-case character when:
///
/// * the previous character is lower-case or an ASCII digit, or
/// * the previous character is upper-case and the next one is lower-case
///   (acronym→word boundary, `HTTPServer` → `HTTP_SERVER`).
///
/// Underscores already present in the input are copied through and never
/// doubled.
#[must_use]
pub fn to_shouty_snake_case(s: &str) -> String {
    let mut out = String::new();
    let mut rest = s.chars().peekable();
    // Immediately preceding input character (underscores included).
    let mut prev: Option<char> = None;

    while let Some(ch) = rest.next() {
        if ch == '_' {
            out.push('_');
        } else {
            let needs_separator = match prev {
                // A preceding `_` satisfies none of the predicates below, so
                // an existing separator is never duplicated.
                Some(p) if ch.is_uppercase() => {
                    let after_lower_or_digit = p.is_lowercase() || p.is_ascii_digit();
                    let closes_acronym =
                        p.is_uppercase() && rest.peek().is_some_and(|next| next.is_lowercase());
                    after_lower_or_digit || closes_acronym
                }
                _ => false,
            };
            if needs_separator {
                out.push('_');
            }
            out.extend(ch.to_uppercase());
        }
        prev = Some(ch);
    }

    out
}
157
158/// Escape a proto package segment for use as a Rust `mod` name.
159///
160/// Returns `r#` prefix for raw-able keywords, `_` suffix for path-position
161/// keywords (which can't be raw), and the name as-is otherwise.
162///
163/// This is a `String` (not `Ident`) because callers typically emit it into
164/// source text (e.g. `pub mod {name} { ... }` via `format!`), not via `quote!`.
165pub fn escape_mod_ident(name: &str) -> String {
166    if is_rust_keyword(name) {
167        if can_be_raw_ident(name) {
168            format!("r#{name}")
169        } else {
170            format!("{name}_")
171        }
172    } else {
173        name.to_string()
174    }
175}
176
/// Report whether `name` is a Rust keyword in any edition up to 2024.
///
/// The table covers the keywords common to every edition, those added in
/// editions 2018 and 2024, and the words reserved for future use. Matching is
/// exact and case-sensitive (`type` is a keyword, `Type` is not).
///
/// See `scripts/check-keywords.py` for the maintenance script that diffs this
/// list against the upstream rustc source.
pub fn is_rust_keyword(name: &str) -> bool {
    const KEYWORDS: &[&str] = &[
        // Keywords in every edition
        "as", "break", "const", "continue", "crate", "else", "enum", "extern", "false", "fn",
        "for", "if", "impl", "in", "let", "loop", "match", "mod", "move", "mut", "pub", "ref",
        "return", "self", "Self", "static", "struct", "super", "trait", "true", "type", "unsafe",
        "use", "where", "while",
        // Added in edition 2018
        "async", "await", "dyn",
        // Added in edition 2024
        "gen",
        // Reserved for future use
        "abstract", "become", "box", "do", "final", "macro", "override", "priv", "try", "typeof",
        "unsized", "virtual", "yield",
    ];

    KEYWORDS.iter().any(|kw| *kw == name)
}
241
/// Report whether `name` may be written as a raw identifier (`r#name`).
///
/// Returns `false` only for `self`, `super`, `Self` and `crate`: these are
/// valid path segments and cannot take the `r#` prefix, so field and module
/// positions give them a `_` suffix instead. Every other string returns `true`.
fn can_be_raw_ident(name: &str) -> bool {
    !["self", "super", "Self", "crate"].contains(&name)
}
249
#[cfg(test)]
mod tests {
    use super::*;

    /// Run `rust_path_to_tokens` and render the token stream as a string.
    fn path_str(path: &str) -> String {
        rust_path_to_tokens(path).to_string()
    }

    /// Run `make_field_ident` and render the identifier as a string.
    fn field_str(name: &str) -> String {
        make_field_ident(name).to_string()
    }

    #[test]
    fn rust_path_simple() {
        assert_eq!(path_str("Foo"), "Foo");
    }

    #[test]
    fn rust_path_nested() {
        assert_eq!(path_str("foo::bar::Baz"), "foo :: bar :: Baz");
    }

    #[test]
    fn rust_path_keyword_segment() {
        // The keyword segment `type` must come out as a raw identifier.
        assert_eq!(
            path_str("google::type::LatLng"),
            "google :: r#type :: LatLng"
        );
    }

    #[test]
    fn rust_path_absolute() {
        assert_eq!(path_str("::buffa::Message"), ":: buffa :: Message");
    }

    #[test]
    fn rust_path_super_segment() {
        // `super` is legal in path position, so it gets neither `r#` nor `_`.
        assert_eq!(path_str("super::super::Foo"), "super :: super :: Foo");
    }

    #[test]
    fn field_ident_normal() {
        assert_eq!(field_str("foo"), "foo");
    }

    #[test]
    fn field_ident_keyword() {
        assert_eq!(field_str("type"), "r#type");
    }

    #[test]
    fn field_ident_non_raw_keyword() {
        // These keywords cannot be written as `r#…`, so they are suffixed.
        let cases = [
            ("self", "self_"),
            ("super", "super_"),
            ("crate", "crate_"),
            ("Self", "Self_"),
        ];
        for (input, expected) in cases {
            assert_eq!(field_str(input), expected);
        }
    }

    #[test]
    fn escape_mod_normal() {
        assert_eq!(escape_mod_ident("foo"), "foo");
    }

    #[test]
    fn escape_mod_keyword() {
        for (input, expected) in [("type", "r#type"), ("async", "r#async")] {
            assert_eq!(escape_mod_ident(input), expected);
        }
    }

    #[test]
    fn escape_mod_non_raw_keyword() {
        for (input, expected) in [("self", "self_"), ("super", "super_")] {
            assert_eq!(escape_mod_ident(input), expected);
        }
    }

    #[test]
    fn upper_camel_basic() {
        let cases = [
            ("RULE_LEVEL_HIGH", "RuleLevelHigh"),
            ("UNKNOWN", "Unknown"),
            ("low_priority", "LowPriority"),
            ("HTTP_SERVER", "HttpServer"),
        ];
        for (input, expected) in cases {
            assert_eq!(to_upper_camel_case(input), expected);
        }
    }

    #[test]
    fn upper_camel_lossy_collisions() {
        // Single and doubled underscores collapse to the same identifier; the
        // caller has to detect that.
        for input in ["FOO_BAR", "FOO__BAR"] {
            assert_eq!(to_upper_camel_case(input), "FooBar");
        }
        // The acronym and snake spellings must also land on one identifier so
        // the caller can spot the collision.
        for input in ["HTTPServer", "HTTP_SERVER"] {
            assert_eq!(to_upper_camel_case(input), "HttpServer");
        }
    }

    #[test]
    fn upper_camel_mixed_case_input() {
        // Case transitions count as word boundaries, so a value that is
        // already CamelCase round-trips (and is later skipped as a redundant
        // alias).
        let cases = [
            ("MyValue", "MyValue"),
            ("fooBar", "FooBar"),
            ("Active", "Active"),
        ];
        for (input, expected) in cases {
            assert_eq!(to_upper_camel_case(input), expected);
        }
    }

    #[test]
    fn upper_camel_digit_and_empty() {
        // Only reachable after a prefix strip; kept verbatim so the caller can
        // run its own validity check.
        let cases = [("2", "2"), ("", ""), ("FOO_2", "Foo2")];
        for (input, expected) in cases {
            assert_eq!(to_upper_camel_case(input), expected);
        }
    }

    #[test]
    fn upper_camel_keyword_source() {
        // `SELF` folds to the keyword `Self`; escaping the identifier is the
        // caller's job (via `make_field_ident`).
        assert_eq!(to_upper_camel_case("SELF"), "Self");
    }

    #[test]
    fn shouty_snake_basic() {
        let cases = [
            ("RuleLevel", "RULE_LEVEL"),
            ("NullValue", "NULL_VALUE"),
            ("Type", "TYPE"),
        ];
        for (input, expected) in cases {
            assert_eq!(to_shouty_snake_case(input), expected);
        }
    }

    #[test]
    fn shouty_snake_acronym() {
        assert_eq!(to_shouty_snake_case("HTTPServer"), "HTTP_SERVER");
    }

    #[test]
    fn shouty_snake_already_snakey() {
        // A name that already carries underscores comes back unchanged.
        assert_eq!(to_shouty_snake_case("RULE_LEVEL"), "RULE_LEVEL");
    }

    #[test]
    fn keyword_coverage() {
        // `gen` arrived with edition 2024; `yield` is a reserved word.
        for keyword in ["type", "async", "gen", "yield"] {
            assert!(is_rust_keyword(keyword));
        }
        // `Type` checks that matching is case-sensitive.
        for non_keyword in ["foo", "Type"] {
            assert!(!is_rust_keyword(non_keyword));
        }
    }
}