ploidy_core/codegen/
unique.rs

1use std::borrow::Cow;
2use std::collections::btree_map::Entry;
3use std::str::CharIndices;
4use std::{collections::BTreeMap, iter::Peekable};
5
6use bumpalo::{
7    Bump,
8    collections::{CollectIn, Vec as BumpVec},
9};
10use unicase::UniCase;
11
12/// Deduplicates names across case conventions.
13#[derive(Debug, Default)]
14pub struct UniqueNames(Bump);
15
16impl UniqueNames {
17    /// Creates a new arena for deduplicating names.
18    #[inline]
19    pub fn new() -> Self {
20        Self::default()
21    }
22
23    /// Creates a new, empty scope that's backed by this arena.
24    ///
25    /// A scope produces names that will never collide with other names
26    /// within the same scope, even when converted to a different case.
27    ///
28    /// This is useful for disambiguating type and property names that are
29    /// distinct in the source spec, but collide when transformed
30    /// to a different case. For example, `HTTP_Response` and `HTTPResponse`
31    /// are distinct, but both become `http_response` in snake case.
32    #[inline]
33    pub fn scope(&self) -> UniqueNamesScope<'_> {
34        UniqueNamesScope::new(&self.0)
35    }
36
37    /// Creates a new scope that's backed by this arena, and that
38    /// reserves the given names.
39    ///
40    /// This is useful for reserving variable names in generated code, or
41    /// reserving placeholder names that would be invalid identifiers
42    /// on their own.
43    ///
44    /// # Examples
45    ///
46    /// ```
47    /// # use ploidy_core::codegen::UniqueNames;
48    /// let unique = UniqueNames::new();
49    /// let mut scope = unique.scope_with_reserved(["_"]);
50    /// assert_eq!(scope.uniquify("_"), "_2");
51    /// assert_eq!(scope.uniquify("_"), "_3");
52    /// ```
53    #[inline]
54    pub fn scope_with_reserved<S: AsRef<str>>(
55        &self,
56        reserved: impl IntoIterator<Item = S>,
57    ) -> UniqueNamesScope<'_> {
58        UniqueNamesScope::with_reserved(&self.0, reserved)
59    }
60}
61
62/// A scope for unique names.
63#[derive(Debug)]
64pub struct UniqueNamesScope<'a> {
65    arena: &'a Bump,
66    space: BTreeMap<&'a [UniCase<&'a str>], usize>,
67}
68
69impl<'a> UniqueNamesScope<'a> {
70    fn new(arena: &'a Bump) -> Self {
71        Self {
72            arena,
73            space: BTreeMap::new(),
74        }
75    }
76
77    fn with_reserved<S: AsRef<str>>(
78        arena: &'a Bump,
79        reserved: impl IntoIterator<Item = S>,
80    ) -> Self {
81        let space = reserved
82            .into_iter()
83            .map(|name| arena.alloc_str(name.as_ref()))
84            .map(|name| {
85                WordSegments::new(name)
86                    .map(UniCase::new)
87                    .collect_in::<BumpVec<_>>(arena)
88            })
89            .fold(BTreeMap::new(), |mut names, segments| {
90                // Setting the count to 1 automatically filters out duplicates.
91                names.insert(segments.into_bump_slice(), 1);
92                names
93            });
94        Self { arena, space }
95    }
96
97    /// Adds a name to this scope. If the name doesn't exist within this scope
98    /// yet, returns the name as-is; otherwise, returns the name with a
99    /// unique numeric suffix.
100    ///
101    /// # Examples
102    ///
103    /// ```
104    /// # use ploidy_core::codegen::UniqueNames;
105    /// let unique = UniqueNames::new();
106    /// let mut scope = unique.scope();
107    /// assert_eq!(scope.uniquify("HTTPResponse"), "HTTPResponse");
108    /// assert_eq!(scope.uniquify("HTTP_Response"), "HTTP_Response2");
109    /// assert_eq!(scope.uniquify("httpResponse"), "httpResponse3");
110    /// ```
111    pub fn uniquify<'b>(&mut self, name: &'b str) -> Cow<'b, str> {
112        match self.space.entry(
113            WordSegments::new(name)
114                .map(|name| UniCase::new(&*self.arena.alloc_str(name)))
115                .collect_in::<BumpVec<_>>(self.arena)
116                .into_bump_slice(),
117        ) {
118            Entry::Occupied(mut entry) => {
119                let count = entry.get_mut();
120                *count += 1;
121                format!("{name}{count}").into()
122            }
123            Entry::Vacant(entry) => {
124                entry.insert(1);
125                name.into()
126            }
127        }
128    }
129}
130
/// An iterator that splits a string into words, so that the same name
/// written in different case conventions yields the same words.
///
/// A new word begins:
///
/// * After any run of non-alphanumeric characters (underscores, hyphens,
///   etc.); those characters are never part of a word.
/// * At an uppercase letter that follows a lowercase word (`httpResponse`).
/// * At the last uppercase letter of an uppercase run, when a lowercase
///   letter follows it (`XMLHttp`).
/// * At the first letter after digits that begin a word (`1099KStatus`,
///   `250g`). Digits elsewhere simply extend the current word
///   (`Response2`, `HTTP2XML`).
///
/// The leading-digit rule is stricter than Heck's segmentation, to ensure
/// that names like `1099KStatus` and `1099_K_Status` collide. Without it,
/// these would produce similar-but-distinct names differing only in their
/// internal capitalization.
///
/// # Examples
///
/// ```
/// # use itertools::Itertools;
/// # use ploidy_core::codegen::WordSegments;
/// assert_eq!(WordSegments::new("HTTPResponse").collect_vec(), vec!["HTTP", "Response"]);
/// assert_eq!(WordSegments::new("HTTP_Response").collect_vec(), vec!["HTTP", "Response"]);
/// assert_eq!(WordSegments::new("httpResponse").collect_vec(), vec!["http", "Response"]);
/// assert_eq!(WordSegments::new("XMLHttpRequest").collect_vec(), vec!["XML", "Http", "Request"]);
/// assert_eq!(WordSegments::new("1099KStatus").collect_vec(), vec!["1099", "K", "Status"]);
/// assert_eq!(WordSegments::new("250g").collect_vec(), vec!["250", "g"]);
/// ```
pub struct WordSegments<'a> {
    /// The full string being segmented; yielded words are slices of it.
    text: &'a str,
    /// Remaining characters with their byte offsets into `text`.
    cursor: Peekable<CharIndices<'a>>,
    /// Byte offset at which the word currently being read began, if any.
    open_at: Option<usize>,
    /// What kind of character run the iterator is currently inside.
    state: WordMode,
}

impl<'a> WordSegments<'a> {
    /// Creates an iterator over the words of `input`.
    #[inline]
    pub fn new(input: &'a str) -> Self {
        let cursor = input.char_indices().peekable();
        Self {
            text: input,
            cursor,
            open_at: None,
            state: WordMode::Boundary,
        }
    }
}

impl<'a> Iterator for WordSegments<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        let text = self.text;
        loop {
            let (pos, ch) = match self.cursor.next() {
                Some(pair) => pair,
                // Input exhausted: whatever is still open is the last word.
                None => return self.open_at.take().map(|from| &text[from..]),
            };

            // Start offset of a word that ends right before `pos`, if this
            // character closes one.
            let finished = if ch.is_uppercase() {
                // Peek unconditionally; only the uppercase-run case uses it.
                let lowercase_follows =
                    matches!(self.cursor.peek(), Some(&(_, next)) if next.is_lowercase());
                match self.state {
                    // Either the first letter after a boundary (possibly
                    // preceded by leading digits), or a camelCase hump:
                    // this uppercase letter opens a new word.
                    WordMode::Boundary | WordMode::Lowercase => {
                        self.state = WordMode::Uppercase;
                        self.open_at.replace(pos)
                    }
                    // `XMLHttp` case: the last capital of a run belongs to
                    // the lowercase word that follows it (the "H" in "Http").
                    WordMode::Uppercase if lowercase_follows && self.open_at.is_some() => {
                        self.open_at.replace(pos)
                    }
                    // Otherwise, the uppercase run simply continues.
                    WordMode::Uppercase => None,
                }
            } else if ch.is_lowercase() {
                match self.state {
                    // First letter after a boundary (possibly preceded by
                    // leading digits): this lowercase letter opens a new word.
                    WordMode::Boundary => {
                        self.state = WordMode::Lowercase;
                        self.open_at.replace(pos)
                    }
                    // Continue the current word, now in lowercase.
                    WordMode::Lowercase | WordMode::Uppercase => {
                        self.open_at = self.open_at.or(Some(pos));
                        self.state = WordMode::Lowercase;
                        None
                    }
                }
            } else if ch.is_alphanumeric() {
                // Digit or uncased letter: extend the current word, or open
                // one if none is in progress. The mode is left untouched.
                self.open_at = self.open_at.or(Some(pos));
                None
            } else {
                // Separator: close the current word, if any, and skip it.
                self.state = WordMode::Boundary;
                self.open_at.take()
            };

            if let Some(from) = finished {
                return Some(&text[from..pos]);
            }
        }
    }
}

/// The kind of character run a [`WordSegments`] iterator is inside.
#[derive(Clone, Copy)]
enum WordMode {
    /// No cased letter has been seen since the start of the input or the
    /// last non-alphanumeric character.
    Boundary,
    /// The most recent cased letter was lowercase.
    Lowercase,
    /// The most recent cased letter was uppercase.
    Uppercase,
}
267
#[cfg(test)]
mod tests {
    use super::*;
    use itertools::Itertools;

    /// Collects the word segments of `input`.
    fn words(input: &str) -> Vec<&str> {
        WordSegments::new(input).collect_vec()
    }

    /// Feeds each input to `scope` in order, checking the name it produces.
    fn assert_uniquified(scope: &mut UniqueNamesScope<'_>, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(scope.uniquify(input), expected);
        }
    }

    #[test]
    fn test_segment_camel_case() {
        assert_eq!(words("camelCase"), ["camel", "Case"]);
        assert_eq!(words("httpResponse"), ["http", "Response"]);
    }

    #[test]
    fn test_segment_pascal_case() {
        assert_eq!(words("PascalCase"), ["Pascal", "Case"]);
        assert_eq!(words("HttpResponse"), ["Http", "Response"]);
    }

    #[test]
    fn test_segment_snake_case() {
        assert_eq!(words("snake_case"), ["snake", "case"]);
        assert_eq!(words("http_response"), ["http", "response"]);
    }

    #[test]
    fn test_segment_screaming_snake() {
        assert_eq!(words("SCREAMING_SNAKE"), ["SCREAMING", "SNAKE"]);
        assert_eq!(words("HTTP_RESPONSE"), ["HTTP", "RESPONSE"]);
    }

    #[test]
    fn test_segment_consecutive_uppercase() {
        assert_eq!(words("XMLHttpRequest"), ["XML", "Http", "Request"]);
        assert_eq!(words("HTTPResponse"), ["HTTP", "Response"]);
        assert_eq!(words("HTTP_Response"), ["HTTP", "Response"]);
        assert_eq!(words("ALLCAPS"), ["ALLCAPS"]);
    }

    #[test]
    fn test_segment_with_numbers() {
        assert_eq!(words("Response2"), ["Response2"]);
        assert_eq!(words("response_2"), ["response", "2"]);
        assert_eq!(words("HTTP2Protocol"), ["HTTP2", "Protocol"]);
        assert_eq!(words("OAuth2Token"), ["O", "Auth2", "Token"]);
        assert_eq!(words("HTTP2XML"), ["HTTP2XML"]);
        assert_eq!(words("1099KStatus"), ["1099", "K", "Status"]);
        assert_eq!(words("123abc"), ["123", "abc"]);
        assert_eq!(words("123ABC"), ["123", "ABC"]);
    }

    #[test]
    fn test_segment_empty_and_special() {
        assert!(words("").is_empty());
        assert!(words("___").is_empty());
        assert_eq!(words("a"), ["a"]);
        assert_eq!(words("A"), ["A"]);
    }

    #[test]
    fn test_segment_mixed_separators() {
        assert_eq!(words("foo-bar_baz"), ["foo", "bar", "baz"]);
        assert_eq!(words("foo--bar"), ["foo", "bar"]);
    }

    #[test]
    fn test_deduplication_http_response_collision() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope();

        assert_uniquified(
            &mut scope,
            &[
                ("HTTPResponse", "HTTPResponse"),
                ("HTTP_Response", "HTTP_Response2"),
                ("httpResponse", "httpResponse3"),
                ("http_response", "http_response4"),
                // `HTTPRESPONSE` isn't a collision; it's a single word.
                ("HTTPRESPONSE", "HTTPRESPONSE"),
            ],
        );
    }

    #[test]
    fn test_deduplication_xml_http_request() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope();

        assert_uniquified(
            &mut scope,
            &[
                ("XMLHttpRequest", "XMLHttpRequest"),
                ("xml_http_request", "xml_http_request2"),
                ("XmlHttpRequest", "XmlHttpRequest3"),
            ],
        );
    }

    #[test]
    fn test_deduplication_preserves_original_casing() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope();

        assert_uniquified(
            &mut scope,
            &[
                ("HTTP_Response", "HTTP_Response"),
                ("httpResponse", "httpResponse2"),
            ],
        );
    }

    #[test]
    fn test_deduplication_same_prefix() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope();

        assert_uniquified(
            &mut scope,
            &[
                ("HttpRequest", "HttpRequest"),
                ("HttpResponse", "HttpResponse"),
                ("HttpError", "HttpError"),
            ],
        );
    }

    #[test]
    fn test_deduplication_with_numbers() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope();

        assert_uniquified(
            &mut scope,
            &[
                ("Response2", "Response2"),
                ("response_2", "response_2"),
                // Digit-to-uppercase collisions.
                ("1099KStatus", "1099KStatus"),
                ("1099K_Status", "1099K_Status2"),
                ("1099KStatus", "1099KStatus3"),
                ("1099_K_Status", "1099_K_Status4"),
                // Digit-to-lowercase collisions.
                ("123abc", "123abc"),
                ("123_abc", "123_abc2"),
            ],
        );
    }

    #[test]
    fn test_with_reserved_underscore() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope_with_reserved(["_"]);

        // `_` is reserved, so the first use gets a suffix.
        assert_uniquified(&mut scope, &[("_", "_2"), ("_", "_3")]);
    }

    #[test]
    fn test_with_reserved_multiple() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope_with_reserved(["_", "reserved"]);

        assert_uniquified(
            &mut scope,
            &[("_", "_2"), ("reserved", "reserved2"), ("other", "other")],
        );
    }

    #[test]
    fn test_with_reserved_empty() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope_with_reserved([""]);

        assert_uniquified(&mut scope, &[("", "2"), ("", "3")]);
    }
}