ploidy_core/codegen/
unique.rs

1use std::{borrow::Cow, collections::hash_map::Entry, iter::Peekable, str::CharIndices};
2
3use bumpalo::{
4    Bump,
5    collections::{CollectIn, Vec as BumpVec},
6};
7use rustc_hash::FxHashMap;
8use unicase::UniCase;
9
10/// Deduplicates names across case conventions.
11#[derive(Debug, Default)]
12pub struct UniqueNames(Bump);
13
14impl UniqueNames {
15    /// Creates a new arena for deduplicating names.
16    #[inline]
17    pub fn new() -> Self {
18        Self::default()
19    }
20
21    /// Creates a new, empty scope that's backed by this arena.
22    ///
23    /// A scope produces names that will never collide with other names
24    /// within the same scope, even when converted to a different case.
25    ///
26    /// This is useful for disambiguating type and property names that are
27    /// distinct in the source spec, but collide when transformed
28    /// to a different case. For example, `HTTP_Response` and `HTTPResponse`
29    /// are distinct, but both become `http_response` in snake case.
30    #[inline]
31    pub fn scope(&self) -> UniqueNamesScope<'_> {
32        UniqueNamesScope::new(&self.0)
33    }
34
35    /// Creates a new scope that's backed by this arena, and that
36    /// reserves the given names.
37    ///
38    /// This is useful for reserving variable names in generated code, or
39    /// reserving placeholder names that would be invalid identifiers
40    /// on their own.
41    ///
42    /// # Examples
43    ///
44    /// ```
45    /// # use ploidy_core::codegen::UniqueNames;
46    /// let unique = UniqueNames::new();
47    /// let mut scope = unique.scope_with_reserved(["_"]);
48    /// assert_eq!(scope.uniquify("_"), "_2");
49    /// assert_eq!(scope.uniquify("_"), "_3");
50    /// ```
51    #[inline]
52    pub fn scope_with_reserved<S: AsRef<str>>(
53        &self,
54        reserved: impl IntoIterator<Item = S>,
55    ) -> UniqueNamesScope<'_> {
56        UniqueNamesScope::with_reserved(&self.0, reserved)
57    }
58}
59
60/// A scope for unique names.
61#[derive(Debug)]
62pub struct UniqueNamesScope<'a> {
63    arena: &'a Bump,
64    space: FxHashMap<&'a [UniCase<&'a str>], usize>,
65}
66
67impl<'a> UniqueNamesScope<'a> {
68    fn new(arena: &'a Bump) -> Self {
69        Self {
70            arena,
71            space: FxHashMap::default(),
72        }
73    }
74
75    fn with_reserved<S: AsRef<str>>(
76        arena: &'a Bump,
77        reserved: impl IntoIterator<Item = S>,
78    ) -> Self {
79        let space = reserved
80            .into_iter()
81            .map(|name| arena.alloc_str(name.as_ref()))
82            .map(|name| {
83                WordSegments::new(name)
84                    .map(UniCase::new)
85                    .collect_in::<BumpVec<_>>(arena)
86            })
87            .fold(FxHashMap::default(), |mut names, segments| {
88                // Setting the count to 1 automatically filters out duplicates.
89                names.insert(segments.into_bump_slice(), 1);
90                names
91            });
92        Self { arena, space }
93    }
94
95    /// Adds a name to this scope. If the name doesn't exist within this scope
96    /// yet, returns the name as-is; otherwise, returns the name with a
97    /// unique numeric suffix.
98    ///
99    /// # Examples
100    ///
101    /// ```
102    /// # use ploidy_core::codegen::UniqueNames;
103    /// let unique = UniqueNames::new();
104    /// let mut scope = unique.scope();
105    /// assert_eq!(scope.uniquify("HTTPResponse"), "HTTPResponse");
106    /// assert_eq!(scope.uniquify("HTTP_Response"), "HTTP_Response2");
107    /// assert_eq!(scope.uniquify("httpResponse"), "httpResponse3");
108    /// ```
109    pub fn uniquify<'b>(&mut self, name: &'b str) -> Cow<'b, str> {
110        match self.space.entry(
111            WordSegments::new(name)
112                .map(|name| UniCase::new(&*self.arena.alloc_str(name)))
113                .collect_in::<BumpVec<_>>(self.arena)
114                .into_bump_slice(),
115        ) {
116            Entry::Occupied(mut entry) => {
117                let count = entry.get_mut();
118                *count += 1;
119                format!("{name}{count}").into()
120            }
121            Entry::Vacant(entry) => {
122                entry.insert(1);
123                name.into()
124            }
125        }
126    }
127}
128
/// Segments a string into words, detecting word boundaries for
/// case transformation.
///
/// Word boundaries occur on:
///
/// * Non-alphanumeric characters: underscores, hyphens, etc. These
///   separate words, and are never part of a word themselves.
/// * Lowercase-to-uppercase transitions (`httpResponse`).
/// * Uppercase-to-lowercase after an uppercase run (`XMLHttp`).
/// * Transitions from digits that begin a word to a cased letter
///   (`1099KStatus`, `250g`).
///
/// Digits that follow letters stay attached to their word, and don't
/// introduce a boundary: `HTTP2XML` is a single word, and `OAuth2Token`
/// splits into `O`, `Auth2`, and `Token`.
///
/// The leading-digit rule is stricter than Heck's segmentation,
/// to ensure that names like `1099KStatus` and `1099_K_Status` collide.
/// Without this rule, these cases would produce similar-but-distinct names
/// differing only in their internal capitalization.
///
/// # Examples
///
/// ```
/// # use itertools::Itertools;
/// # use ploidy_core::codegen::WordSegments;
/// assert_eq!(WordSegments::new("HTTPResponse").collect_vec(), vec!["HTTP", "Response"]);
/// assert_eq!(WordSegments::new("HTTP_Response").collect_vec(), vec!["HTTP", "Response"]);
/// assert_eq!(WordSegments::new("httpResponse").collect_vec(), vec!["http", "Response"]);
/// assert_eq!(WordSegments::new("XMLHttpRequest").collect_vec(), vec!["XML", "Http", "Request"]);
/// assert_eq!(WordSegments::new("1099KStatus").collect_vec(), vec!["1099", "K", "Status"]);
/// assert_eq!(WordSegments::new("250g").collect_vec(), vec!["250", "g"]);
/// ```
#[derive(Clone, Debug)]
pub struct WordSegments<'a> {
    /// The string being segmented; every yielded word is a slice of it.
    input: &'a str,
    /// The characters that haven't been consumed yet, paired with
    /// their byte offsets into `input`.
    chars: Peekable<CharIndices<'a>>,
    /// The byte offset at which the word in progress begins, if a word
    /// is in progress.
    current_word_starts_at: Option<usize>,
    /// The kind of character run that the iterator is currently inside.
    mode: WordMode,
}

impl<'a> WordSegments<'a> {
    /// Creates an iterator over the words of `input`.
    #[inline]
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            chars: input.char_indices().peekable(),
            current_word_starts_at: None,
            mode: WordMode::Boundary,
        }
    }

    /// Begins a new word at `index` and switches to `mode`. Returns the
    /// start of the word that was in progress until now, if there was one.
    fn begin_word(&mut self, index: usize, mode: WordMode) -> Option<usize> {
        self.mode = mode;
        self.current_word_starts_at.replace(index)
    }

    /// Handles the uppercase character at `index`. Returns the start of
    /// the word that ends just before `index`, if this character
    /// ends a word.
    fn uppercase(&mut self, index: usize) -> Option<usize> {
        match self.mode {
            // Either the first cased letter after a boundary (possibly
            // preceded by leading digits), or a camelCase hump: in both
            // cases, this character begins a new word.
            WordMode::Boundary | WordMode::Lowercase => {
                self.begin_word(index, WordMode::Uppercase)
            }
            WordMode::Uppercase => {
                let next_is_lowercase = self
                    .chars
                    .peek()
                    .is_some_and(|&(_, next)| next.is_lowercase());
                if !next_is_lowercase {
                    // Still inside the uppercase run.
                    return None;
                }
                // `XMLHttp` case: the run ends here, and this character
                // (the "H" in "Http") begins the next word.
                let start = self.current_word_starts_at?;
                self.current_word_starts_at = Some(index);
                Some(start)
            }
        }
    }

    /// Handles the lowercase character at `index`. Returns the start of
    /// the word that ends just before `index`, if this character
    /// ends a word.
    fn lowercase(&mut self, index: usize) -> Option<usize> {
        match self.mode {
            // The first cased letter after a boundary begins a new word,
            // which splits off any leading digits (`250g`).
            WordMode::Boundary => self.begin_word(index, WordMode::Lowercase),
            // A lowercase letter after other letters continues the word.
            WordMode::Lowercase | WordMode::Uppercase => {
                if self.current_word_starts_at.is_none() {
                    self.current_word_starts_at = Some(index);
                }
                self.mode = WordMode::Lowercase;
                None
            }
        }
    }
}

impl<'a> Iterator for WordSegments<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        let input = self.input;
        while let Some((index, c)) = self.chars.next() {
            // The start of the word that ends just before `index`, if the
            // current character ends one.
            let finished = if c.is_uppercase() {
                self.uppercase(index)
            } else if c.is_lowercase() {
                self.lowercase(index)
            } else if c.is_alphanumeric() {
                // Digit or uncased letter: begin a word if none is
                // in progress; otherwise, continue the current word.
                // The mode is left alone, so digits never introduce
                // a case transition on their own.
                if self.current_word_starts_at.is_none() {
                    self.current_word_starts_at = Some(index);
                }
                None
            } else {
                // Separator: ends the word in progress, and isn't
                // included in any word.
                self.mode = WordMode::Boundary;
                self.current_word_starts_at.take()
            };
            if let Some(start) = finished {
                return Some(&input[start..index]);
            }
        }
        // Out of characters: yield the trailing word, if any. Taking the
        // start leaves `None` behind, so later calls keep returning `None`.
        self.current_word_starts_at
            .take()
            .map(|start| &input[start..])
    }
}

// Once the characters and the trailing word are exhausted, `next` has
// nothing left to yield, and returns `None` on every subsequent call.
impl<'a> std::iter::FusedIterator for WordSegments<'a> {}

/// The current state of a [`WordSegments`] iterator.
#[derive(Clone, Copy, Debug)]
enum WordMode {
    /// At a word boundary: either at the start of a new word, or
    /// after a non-alphanumeric character. Leading digits don't
    /// leave this state.
    Boundary,
    /// Currently in a lowercase segment.
    Lowercase,
    /// Currently in an uppercase segment.
    Uppercase,
}
265
#[cfg(test)]
mod tests {
    use super::*;
    use itertools::Itertools;

    /// Splits `input` into its word segments.
    fn words(input: &str) -> Vec<&str> {
        WordSegments::new(input).collect_vec()
    }

    /// Adds each input to `scope`, in order, and checks the name that
    /// comes back against the expected one.
    fn assert_uniquified(scope: &mut UniqueNamesScope<'_>, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(scope.uniquify(input), expected);
        }
    }

    // Segmentation.

    #[test]
    fn test_segment_camel_case() {
        assert_eq!(words("camelCase"), ["camel", "Case"]);
        assert_eq!(words("httpResponse"), ["http", "Response"]);
    }

    #[test]
    fn test_segment_pascal_case() {
        assert_eq!(words("PascalCase"), ["Pascal", "Case"]);
        assert_eq!(words("HttpResponse"), ["Http", "Response"]);
    }

    #[test]
    fn test_segment_snake_case() {
        assert_eq!(words("snake_case"), ["snake", "case"]);
        assert_eq!(words("http_response"), ["http", "response"]);
    }

    #[test]
    fn test_segment_screaming_snake() {
        assert_eq!(words("SCREAMING_SNAKE"), ["SCREAMING", "SNAKE"]);
        assert_eq!(words("HTTP_RESPONSE"), ["HTTP", "RESPONSE"]);
    }

    #[test]
    fn test_segment_consecutive_uppercase() {
        assert_eq!(words("XMLHttpRequest"), ["XML", "Http", "Request"]);
        assert_eq!(words("HTTPResponse"), ["HTTP", "Response"]);
        assert_eq!(words("HTTP_Response"), ["HTTP", "Response"]);
        assert_eq!(words("ALLCAPS"), ["ALLCAPS"]);
    }

    #[test]
    fn test_segment_with_numbers() {
        assert_eq!(words("Response2"), ["Response2"]);
        assert_eq!(words("response_2"), ["response", "2"]);
        assert_eq!(words("HTTP2Protocol"), ["HTTP2", "Protocol"]);
        assert_eq!(words("OAuth2Token"), ["O", "Auth2", "Token"]);
        assert_eq!(words("HTTP2XML"), ["HTTP2XML"]);
        assert_eq!(words("1099KStatus"), ["1099", "K", "Status"]);
        assert_eq!(words("123abc"), ["123", "abc"]);
        assert_eq!(words("123ABC"), ["123", "ABC"]);
    }

    #[test]
    fn test_segment_empty_and_special() {
        assert!(words("").is_empty());
        assert!(words("___").is_empty());
        assert_eq!(words("a"), ["a"]);
        assert_eq!(words("A"), ["A"]);
    }

    #[test]
    fn test_segment_mixed_separators() {
        assert_eq!(words("foo-bar_baz"), ["foo", "bar", "baz"]);
        assert_eq!(words("foo--bar"), ["foo", "bar"]);
    }

    // Deduplication.

    #[test]
    fn test_deduplication_http_response_collision() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope();

        assert_uniquified(
            &mut scope,
            &[
                ("HTTPResponse", "HTTPResponse"),
                ("HTTP_Response", "HTTP_Response2"),
                ("httpResponse", "httpResponse3"),
                ("http_response", "http_response4"),
                // A single word, so `HTTPRESPONSE` doesn't collide
                // with any of the names above.
                ("HTTPRESPONSE", "HTTPRESPONSE"),
            ],
        );
    }

    #[test]
    fn test_deduplication_xml_http_request() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope();

        assert_uniquified(
            &mut scope,
            &[
                ("XMLHttpRequest", "XMLHttpRequest"),
                ("xml_http_request", "xml_http_request2"),
                ("XmlHttpRequest", "XmlHttpRequest3"),
            ],
        );
    }

    #[test]
    fn test_deduplication_preserves_original_casing() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope();

        assert_uniquified(
            &mut scope,
            &[
                ("HTTP_Response", "HTTP_Response"),
                ("httpResponse", "httpResponse2"),
            ],
        );
    }

    #[test]
    fn test_deduplication_same_prefix() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope();

        assert_uniquified(
            &mut scope,
            &[
                ("HttpRequest", "HttpRequest"),
                ("HttpResponse", "HttpResponse"),
                ("HttpError", "HttpError"),
            ],
        );
    }

    #[test]
    fn test_deduplication_with_numbers() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope();

        assert_uniquified(
            &mut scope,
            &[
                ("Response2", "Response2"),
                ("response_2", "response_2"),
                // Leading digits followed by an uppercase letter.
                ("1099KStatus", "1099KStatus"),
                ("1099K_Status", "1099K_Status2"),
                ("1099KStatus", "1099KStatus3"),
                ("1099_K_Status", "1099_K_Status4"),
                // Leading digits followed by a lowercase letter.
                ("123abc", "123abc"),
                ("123_abc", "123_abc2"),
            ],
        );
    }

    // Reserved names.

    #[test]
    fn test_with_reserved_underscore() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope_with_reserved(["_"]);

        // Because `_` is reserved, even its first use is suffixed.
        assert_uniquified(&mut scope, &[("_", "_2"), ("_", "_3")]);
    }

    #[test]
    fn test_with_reserved_multiple() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope_with_reserved(["_", "reserved"]);

        assert_uniquified(
            &mut scope,
            &[("_", "_2"), ("reserved", "reserved2"), ("other", "other")],
        );
    }

    #[test]
    fn test_with_reserved_empty() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope_with_reserved([""]);

        assert_uniquified(&mut scope, &[("", "2"), ("", "3")]);
    }
}