Skip to main content

ploidy_core/codegen/
unique.rs

1use std::{borrow::Cow, collections::hash_map::Entry, iter::Peekable, str::CharIndices};
2
3use rustc_hash::FxHashMap;
4use unicase::UniCase;
5
6use crate::arena::Arena;
7
8/// Deduplicates names across case conventions.
9#[derive(Debug, Default)]
10pub struct UniqueNames(Arena);
11
12impl UniqueNames {
13    /// Creates a new arena for deduplicating names.
14    #[inline]
15    pub fn new() -> Self {
16        Self::default()
17    }
18
19    /// Creates a new, empty scope that's backed by this arena.
20    ///
21    /// A scope produces names that will never collide with other names
22    /// within the same scope, even when converted to a different case.
23    ///
24    /// This is useful for disambiguating type and property names that are
25    /// distinct in the source spec, but collide when transformed
26    /// to a different case. For example, `HTTP_Response` and `HTTPResponse`
27    /// are distinct, but both become `http_response` in snake case.
28    #[inline]
29    pub fn scope(&self) -> UniqueNamesScope<'_> {
30        UniqueNamesScope::new(&self.0)
31    }
32
33    /// Creates a new scope that's backed by this arena, and that
34    /// reserves the given names.
35    ///
36    /// This is useful for reserving variable names in generated code, or
37    /// reserving placeholder names that would be invalid identifiers
38    /// on their own.
39    ///
40    /// # Examples
41    ///
42    /// ```
43    /// # use ploidy_core::codegen::UniqueNames;
44    /// let unique = UniqueNames::new();
45    /// let mut scope = unique.scope_with_reserved(["_"]);
46    /// assert_eq!(scope.uniquify("_"), "_2");
47    /// assert_eq!(scope.uniquify("_"), "_3");
48    /// ```
49    #[inline]
50    pub fn scope_with_reserved<S: AsRef<str>>(
51        &self,
52        reserved: impl IntoIterator<Item = S>,
53    ) -> UniqueNamesScope<'_> {
54        UniqueNamesScope::with_reserved(&self.0, reserved)
55    }
56}
57
58/// A scope for unique names.
59#[derive(Debug)]
60pub struct UniqueNamesScope<'a> {
61    arena: &'a Arena,
62    space: FxHashMap<&'a [UniCase<&'a str>], usize>,
63}
64
65impl<'a> UniqueNamesScope<'a> {
66    fn new(arena: &'a Arena) -> Self {
67        Self {
68            arena,
69            space: FxHashMap::default(),
70        }
71    }
72
73    fn with_reserved<S: AsRef<str>>(
74        arena: &'a Arena,
75        reserved: impl IntoIterator<Item = S>,
76    ) -> Self {
77        let space = reserved
78            .into_iter()
79            .map(|name| arena.alloc_str(name.as_ref()))
80            .map(|name| arena.alloc_slice(WordSegments::new(name).map(UniCase::new)))
81            .fold(FxHashMap::default(), |mut names, segments| {
82                // Setting the count to 1 automatically filters out duplicates.
83                names.insert(&*segments, 1);
84                names
85            });
86        Self { arena, space }
87    }
88
89    /// Adds a name to this scope. If the name doesn't exist within this scope
90    /// yet, returns the name as-is; otherwise, returns the name with a
91    /// unique numeric suffix.
92    ///
93    /// # Examples
94    ///
95    /// ```
96    /// # use ploidy_core::codegen::UniqueNames;
97    /// let unique = UniqueNames::new();
98    /// let mut scope = unique.scope();
99    /// assert_eq!(scope.uniquify("HTTPResponse"), "HTTPResponse");
100    /// assert_eq!(scope.uniquify("HTTP_Response"), "HTTP_Response2");
101    /// assert_eq!(scope.uniquify("httpResponse"), "httpResponse3");
102    /// ```
103    pub fn uniquify<'b>(&mut self, name: &'b str) -> Cow<'b, str> {
104        match self.space.entry(self.arena.alloc_slice(
105            WordSegments::new(name).map(|name| UniCase::new(&*self.arena.alloc_str(name))),
106        )) {
107            Entry::Occupied(mut entry) => {
108                let count = entry.get_mut();
109                *count += 1;
110                format!("{name}{count}").into()
111            }
112            Entry::Vacant(entry) => {
113                entry.insert(1);
114                name.into()
115            }
116        }
117    }
118}
119
/// An iterator over the words of a string, split at the boundaries
/// that matter for case transformation.
///
/// A new word begins:
///
/// * After any run of non-alphanumeric characters (underscores, hyphens,
///   and so on); those characters are never part of a word.
/// * At an uppercase letter that follows a lowercase one (`httpResponse`).
/// * At the last uppercase letter of an uppercase run, when a lowercase
///   letter comes right after it (`XMLHttp`).
/// * At the first cased letter of a word that starts with digits
///   (`1099KStatus`, `250g`).
///
/// Digits that appear after a word has already started with a letter
/// stay in that word (`Response2`, `HTTP2XML`).
///
/// The leading-digit rule is stricter than Heck's segmentation, so that
/// names like `1099KStatus` and `1099_K_Status` collide. Without it,
/// these would yield similar-but-distinct names that differ only in
/// their internal capitalization.
///
/// # Examples
///
/// ```
/// # use itertools::Itertools;
/// # use ploidy_core::codegen::WordSegments;
/// assert_eq!(WordSegments::new("HTTPResponse").collect_vec(), vec!["HTTP", "Response"]);
/// assert_eq!(WordSegments::new("HTTP_Response").collect_vec(), vec!["HTTP", "Response"]);
/// assert_eq!(WordSegments::new("httpResponse").collect_vec(), vec!["http", "Response"]);
/// assert_eq!(WordSegments::new("XMLHttpRequest").collect_vec(), vec!["XML", "Http", "Request"]);
/// assert_eq!(WordSegments::new("1099KStatus").collect_vec(), vec!["1099", "K", "Status"]);
/// assert_eq!(WordSegments::new("250g").collect_vec(), vec!["250", "g"]);
/// ```
pub struct WordSegments<'a> {
    input: &'a str,
    chars: Peekable<CharIndices<'a>>,
    current_word_starts_at: Option<usize>,
    mode: WordMode,
}

impl<'a> WordSegments<'a> {
    /// Creates an iterator over the words of `input`.
    #[inline]
    pub fn new(input: &'a str) -> Self {
        let chars = input.char_indices().peekable();
        Self {
            input,
            chars,
            current_word_starts_at: None,
            mode: WordMode::Boundary,
        }
    }
}

impl<'a> Iterator for WordSegments<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        let input = self.input;
        while let Some((index, c)) = self.chars.next() {
            // Byte offset where the word that ends just before `c` began,
            // if `c` closes a word.
            let finished_word_start = if c.is_uppercase() {
                match self.mode {
                    WordMode::Boundary | WordMode::Lowercase => {
                        // Either the first letter after a boundary, or a
                        // camelCase hump: this letter opens a new word.
                        self.mode = WordMode::Uppercase;
                        self.current_word_starts_at.replace(index)
                    }
                    WordMode::Uppercase => {
                        let lowercase_follows = matches!(
                            self.chars.peek(),
                            Some(&(_, following)) if following.is_lowercase()
                        );
                        if lowercase_follows && self.current_word_starts_at.is_some() {
                            // `XMLHttp` case: this letter (the "H" in "Http")
                            // belongs to the next word, not the uppercase run.
                            self.current_word_starts_at.replace(index)
                        } else {
                            // Still inside the uppercase run.
                            None
                        }
                    }
                }
            } else if c.is_lowercase() {
                let at_boundary = matches!(self.mode, WordMode::Boundary);
                self.mode = WordMode::Lowercase;
                if at_boundary {
                    // Opens a new word; any leading digits collected since
                    // the boundary form a word of their own.
                    self.current_word_starts_at.replace(index)
                } else {
                    // Extends the word that's already in progress.
                    self.current_word_starts_at = self.current_word_starts_at.or(Some(index));
                    None
                }
            } else if c.is_alphanumeric() {
                // Digit or uncased letter: joins the current word, or opens
                // one without leaving the current mode.
                self.current_word_starts_at = self.current_word_starts_at.or(Some(index));
                None
            } else {
                // Separator: closes the current word, if any.
                self.mode = WordMode::Boundary;
                self.current_word_starts_at.take()
            };
            if let Some(start) = finished_word_start {
                return Some(&input[start..index]);
            }
        }
        // Out of characters; whatever is still open is the trailing word.
        self.current_word_starts_at
            .take()
            .map(|start| &input[start..])
    }
}

/// The state that a [`WordSegments`] iterator is in between characters.
#[derive(Clone, Copy)]
enum WordMode {
    /// No cased letter has been seen since the start of the input or
    /// the last non-alphanumeric character.
    Boundary,
    /// The most recent cased letter was lowercase.
    Lowercase,
    /// The most recent cased letter was uppercase.
    Uppercase,
}
256
#[cfg(test)]
mod tests {
    use super::*;
    use itertools::Itertools;

    /// Asserts that `input` segments into exactly the `expected` words.
    #[track_caller]
    fn assert_words(input: &str, expected: &[&str]) {
        assert_eq!(WordSegments::new(input).collect_vec(), expected);
    }

    /// Feeds each name to `scope` in order, asserting that it comes back
    /// as the paired expected name.
    #[track_caller]
    fn assert_uniquified(scope: &mut UniqueNamesScope<'_>, cases: &[(&str, &str)]) {
        for &(name, expected) in cases {
            assert_eq!(scope.uniquify(name), expected);
        }
    }

    #[test]
    fn test_segment_camel_case() {
        assert_words("camelCase", &["camel", "Case"]);
        assert_words("httpResponse", &["http", "Response"]);
    }

    #[test]
    fn test_segment_pascal_case() {
        assert_words("PascalCase", &["Pascal", "Case"]);
        assert_words("HttpResponse", &["Http", "Response"]);
    }

    #[test]
    fn test_segment_snake_case() {
        assert_words("snake_case", &["snake", "case"]);
        assert_words("http_response", &["http", "response"]);
    }

    #[test]
    fn test_segment_screaming_snake() {
        assert_words("SCREAMING_SNAKE", &["SCREAMING", "SNAKE"]);
        assert_words("HTTP_RESPONSE", &["HTTP", "RESPONSE"]);
    }

    #[test]
    fn test_segment_consecutive_uppercase() {
        assert_words("XMLHttpRequest", &["XML", "Http", "Request"]);
        assert_words("HTTPResponse", &["HTTP", "Response"]);
        assert_words("HTTP_Response", &["HTTP", "Response"]);
        assert_words("ALLCAPS", &["ALLCAPS"]);
    }

    #[test]
    fn test_segment_with_numbers() {
        assert_words("Response2", &["Response2"]);
        assert_words("response_2", &["response", "2"]);
        assert_words("HTTP2Protocol", &["HTTP2", "Protocol"]);
        assert_words("OAuth2Token", &["O", "Auth2", "Token"]);
        assert_words("HTTP2XML", &["HTTP2XML"]);
        assert_words("1099KStatus", &["1099", "K", "Status"]);
        assert_words("123abc", &["123", "abc"]);
        assert_words("123ABC", &["123", "ABC"]);
    }

    #[test]
    fn test_segment_empty_and_special() {
        assert_words("", &[]);
        assert_words("___", &[]);
        assert_words("a", &["a"]);
        assert_words("A", &["A"]);
    }

    #[test]
    fn test_segment_mixed_separators() {
        assert_words("foo-bar_baz", &["foo", "bar", "baz"]);
        assert_words("foo--bar", &["foo", "bar"]);
    }

    #[test]
    fn test_deduplication_http_response_collision() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope();

        assert_uniquified(
            &mut scope,
            &[
                ("HTTPResponse", "HTTPResponse"),
                ("HTTP_Response", "HTTP_Response2"),
                ("httpResponse", "httpResponse3"),
                ("http_response", "http_response4"),
                // `HTTPRESPONSE` isn't a collision; it's a single word.
                ("HTTPRESPONSE", "HTTPRESPONSE"),
            ],
        );
    }

    #[test]
    fn test_deduplication_xml_http_request() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope();

        assert_uniquified(
            &mut scope,
            &[
                ("XMLHttpRequest", "XMLHttpRequest"),
                ("xml_http_request", "xml_http_request2"),
                ("XmlHttpRequest", "XmlHttpRequest3"),
            ],
        );
    }

    #[test]
    fn test_deduplication_preserves_original_casing() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope();

        assert_uniquified(
            &mut scope,
            &[
                ("HTTP_Response", "HTTP_Response"),
                ("httpResponse", "httpResponse2"),
            ],
        );
    }

    #[test]
    fn test_deduplication_same_prefix() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope();

        assert_uniquified(
            &mut scope,
            &[
                ("HttpRequest", "HttpRequest"),
                ("HttpResponse", "HttpResponse"),
                ("HttpError", "HttpError"),
            ],
        );
    }

    #[test]
    fn test_deduplication_with_numbers() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope();

        assert_uniquified(
            &mut scope,
            &[
                ("Response2", "Response2"),
                ("response_2", "response_2"),
                // Digit-to-uppercase collisions.
                ("1099KStatus", "1099KStatus"),
                ("1099K_Status", "1099K_Status2"),
                ("1099KStatus", "1099KStatus3"),
                ("1099_K_Status", "1099_K_Status4"),
                // Digit-to-lowercase collisions.
                ("123abc", "123abc"),
                ("123_abc", "123_abc2"),
            ],
        );
    }

    #[test]
    fn test_with_reserved_underscore() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope_with_reserved(["_"]);

        // `_` is reserved, so the first use gets a suffix.
        assert_uniquified(&mut scope, &[("_", "_2"), ("_", "_3")]);
    }

    #[test]
    fn test_with_reserved_multiple() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope_with_reserved(["_", "reserved"]);

        assert_uniquified(
            &mut scope,
            &[("_", "_2"), ("reserved", "reserved2"), ("other", "other")],
        );
    }

    #[test]
    fn test_with_reserved_empty() {
        let unique = UniqueNames::new();
        let mut scope = unique.scope_with_reserved([""]);

        assert_uniquified(&mut scope, &[("", "2"), ("", "3")]);
    }
}