Skip to main content

ploidy_core/codegen/
unique.rs

1use std::{iter::Peekable, str::CharIndices};
2
3use itertools::Itertools;
4use rustc_hash::{FxHashMap, FxHashSet};
5use unicase::UniCase;
6
7use crate::arena::Arena;
8
9/// Deduplicates names across case conventions.
10#[derive(Debug)]
11pub struct UniqueNames<'a> {
12    arena: &'a Arena,
13    space: FxHashMap<&'a [UniCase<&'a str>], FxHashSet<usize>>,
14}
15
16impl<'a> UniqueNames<'a> {
17    pub fn new(arena: &'a Arena) -> Self {
18        Self {
19            arena,
20            space: FxHashMap::default(),
21        }
22    }
23
24    pub fn with_reserved<S: AsRef<str>>(
25        arena: &'a Arena,
26        reserved: impl IntoIterator<Item = S>,
27    ) -> Self {
28        let mut names = Self::new(arena);
29        for name in reserved {
30            names.reserve(name.as_ref());
31        }
32        names
33    }
34
35    /// Adds a name to this scope and returns a unique form.
36    ///
37    /// Names without word content use numeric forms starting at `1`.
38    ///
39    /// # Examples
40    ///
41    /// ```
42    /// # use ploidy_core::{arena::Arena, codegen::UniqueNames};
43    /// let arena = Arena::new();
44    /// let mut names = UniqueNames::new(&arena);
45    /// assert_eq!(names.uniquify("HTTPResponse"), "HTTPResponse");
46    /// assert_eq!(names.uniquify("HTTP_Response"), "HTTP_Response2");
47    /// assert_eq!(names.uniquify("httpResponse"), "httpResponse3");
48    /// assert_eq!(names.uniquify("http_response_2"), "http_response4");
49    /// ```
50    pub fn uniquify(&mut self, name: &str) -> &'a str {
51        let parsed = ParsedName::parse(name);
52        let segments = self.arena.alloc_slice_exact(
53            parsed
54                .segments()
55                .iter()
56                .map(|(_, segment)| UniCase::new(&*self.arena.alloc_str(segment))),
57        );
58        let occupied = self.space.entry(segments).or_default();
59
60        let suffix = if occupied.insert(parsed.suffix()) {
61            parsed.suffix()
62        } else {
63            let mut suffix = parsed.min_suffix();
64            while !occupied.insert(suffix) {
65                suffix = suffix.checked_add(1).unwrap();
66            }
67            suffix
68        };
69
70        match parsed {
71            ParsedName::Empty | ParsedName::Numeric(_) => self.arena.alloc_str(&suffix.to_string()),
72            ParsedName::Stemmed { stem, .. } if suffix > 0 => {
73                self.arena.alloc_str(&format!("{stem}{suffix}"))
74            }
75            ParsedName::Stemmed { stem, .. } => self.arena.alloc_str(stem),
76        }
77    }
78
79    fn reserve(&mut self, name: &str) {
80        let parsed = ParsedName::parse(name);
81        let segments = self.arena.alloc_slice_exact(
82            parsed
83                .segments()
84                .iter()
85                .map(|(_, segment)| UniCase::new(&*self.arena.alloc_str(segment))),
86        );
87        self.space
88            .entry(segments)
89            .or_default()
90            .insert(parsed.reserved_suffix());
91    }
92}
93
94/// Segments a string into words and their byte offsets,
95/// detecting word boundaries for case transformations.
96///
97/// Word boundaries occur on:
98///
99/// * Non-alphanumeric characters: underscores, hyphens, etc.
100/// * Lowercase-to-uppercase transitions (`httpResponse`).
101/// * Uppercase-to-lowercase after an uppercase run (`XMLHttp`).
102/// * Letter-to-digit and digit-to-letter transitions (`Response2`, `250g`).
103///
104/// The digit transition rules are stricter than Heck's segmentation,
105/// to ensure that names like (`Response2`, `Response_2`) and
106/// (`1099KStatus`, `1099_K_Status`) collide. Without this rule, these cases
107/// would produce similar-but-distinct names differing only in their
108/// internal capitalization.
109///
110/// # Examples
111///
112/// ```
113/// # use itertools::Itertools;
114/// # use ploidy_core::codegen::WordSegments;
115/// assert_eq!(
116///     WordSegments::new("HTTPResponse").collect_vec(),
117///     vec![(0, "HTTP"), (4, "Response")]
118/// );
119/// assert_eq!(
120///     WordSegments::new("HTTP_Response").collect_vec(),
121///     vec![(0, "HTTP"), (5, "Response")]
122/// );
123/// assert_eq!(
124///     WordSegments::new("httpResponse").collect_vec(),
125///     vec![(0, "http"), (4, "Response")]
126/// );
127/// assert_eq!(
128///     WordSegments::new("XMLHttpRequest").collect_vec(),
129///     vec![(0, "XML"), (3, "Http"), (7, "Request")]
130/// );
131/// assert_eq!(
132///     WordSegments::new("Response2").collect_vec(),
133///     vec![(0, "Response"), (8, "2")]
134/// );
135/// assert_eq!(
136///     WordSegments::new("250g").collect_vec(),
137///     vec![(0, "250"), (3, "g")]
138/// );
139/// ```
140pub struct WordSegments<'a> {
141    input: &'a str,
142    chars: Peekable<CharIndices<'a>>,
143    current_word_starts_at: Option<usize>,
144    mode: WordMode,
145}
146
147impl<'a> WordSegments<'a> {
148    #[inline]
149    pub fn new(input: &'a str) -> Self {
150        Self {
151            input,
152            chars: input.char_indices().peekable(),
153            current_word_starts_at: None,
154            mode: WordMode::Boundary,
155        }
156    }
157}
158
159impl<'a> Iterator for WordSegments<'a> {
160    type Item = (usize, &'a str);
161
162    fn next(&mut self) -> Option<Self::Item> {
163        while let Some((index, c)) = self.chars.next() {
164            if c.is_uppercase() {
165                match self.mode {
166                    WordMode::Boundary | WordMode::Digit => {
167                        // Start a new word with this uppercase character.
168                        let start = self.current_word_starts_at.replace(index);
169                        self.mode = WordMode::Uppercase;
170                        if let Some(start) = start {
171                            return Some((start, &self.input[start..index]));
172                        }
173                    }
174                    WordMode::Lowercase => {
175                        // camelCased word (previous was lowercase;
176                        // current is uppercase), start a new word.
177                        let start = self.current_word_starts_at.replace(index);
178                        self.mode = WordMode::Uppercase;
179                        if let Some(start) = start {
180                            return Some((start, &self.input[start..index]));
181                        }
182                    }
183                    WordMode::Uppercase => {
184                        let next_is_lowercase = self
185                            .chars
186                            .peek()
187                            .map(|&(_, next)| next.is_lowercase())
188                            .unwrap_or(false);
189                        if next_is_lowercase && let Some(start) = self.current_word_starts_at {
190                            // `XMLHttp` case; start a new word with this uppercase
191                            // character (the "H" in "Http").
192                            self.current_word_starts_at = Some(index);
193                            return Some((start, &self.input[start..index]));
194                        }
195                        // (Stay in uppercase mode).
196                    }
197                }
198            } else if c.is_lowercase() {
199                match self.mode {
200                    WordMode::Boundary | WordMode::Digit => {
201                        // Start a new word with this lowercase character.
202                        let start = self.current_word_starts_at.replace(index);
203                        self.mode = WordMode::Lowercase;
204                        if let Some(start) = start {
205                            return Some((start, &self.input[start..index]));
206                        }
207                    }
208                    WordMode::Lowercase | WordMode::Uppercase => {
209                        if self.current_word_starts_at.is_none() {
210                            // Start or continue the current word.
211                            self.current_word_starts_at = Some(index);
212                        }
213                        self.mode = WordMode::Lowercase;
214                    }
215                }
216            } else if c.is_ascii_digit() {
217                match self.mode {
218                    WordMode::Boundary | WordMode::Digit => {
219                        if self.current_word_starts_at.is_none() {
220                            self.current_word_starts_at = Some(index);
221                        }
222                        self.mode = WordMode::Digit;
223                    }
224                    WordMode::Lowercase | WordMode::Uppercase => {
225                        let start = self.current_word_starts_at.replace(index);
226                        self.mode = WordMode::Digit;
227                        if let Some(start) = start {
228                            return Some((start, &self.input[start..index]));
229                        }
230                    }
231                }
232            } else if !c.is_alphanumeric() {
233                // Start a new word at this non-alphanumeric character.
234                let start = std::mem::take(&mut self.current_word_starts_at);
235                self.mode = WordMode::Boundary;
236                if let Some(start) = start {
237                    return Some((start, &self.input[start..index]));
238                }
239            } else {
240                // Other alphanumeric character: continue the current word.
241                if self.current_word_starts_at.is_none() {
242                    self.current_word_starts_at = Some(index);
243                }
244            }
245        }
246        if let Some(start) = std::mem::take(&mut self.current_word_starts_at) {
247            // Trailing word.
248            return Some((start, &self.input[start..]));
249        }
250        None
251    }
252}
253
254/// The current state of a [`WordSegments`] iterator.
255#[derive(Clone, Copy)]
256enum WordMode {
257    /// At a word boundary: either at the start of a new word, or
258    /// after a non-alphanumeric character.
259    Boundary,
260    /// Currently in a lowercase segment.
261    Lowercase,
262    /// Currently in an uppercase segment.
263    Uppercase,
264    /// Currently in a digit segment.
265    Digit,
266}
267
268enum ParsedName<'a> {
269    Empty,
270    Numeric(usize),
271    Stemmed {
272        segments: Vec<(usize, &'a str)>,
273        stem: &'a str,
274        suffix: usize,
275    },
276}
277
278impl<'a> ParsedName<'a> {
279    fn parse(name: &'a str) -> Self {
280        let mut segments = WordSegments::new(name).collect_vec();
281        if segments.is_empty() {
282            return Self::Empty;
283        }
284
285        let Some((suffix_start, suffix)) = segments
286            .iter()
287            .last()
288            .and_then(|&(offset, segment)| Some((offset, segment.parse::<usize>().ok()?)))
289        else {
290            return Self::Stemmed {
291                segments,
292                stem: name,
293                suffix: 0,
294            };
295        };
296
297        segments.pop();
298        if segments.is_empty() {
299            return Self::Numeric(suffix);
300        }
301
302        let stem = name[..suffix_start].trim_end_matches(|c: char| !c.is_alphanumeric());
303        Self::Stemmed {
304            segments,
305            stem,
306            suffix,
307        }
308    }
309
310    fn segments(&self) -> &[(usize, &'a str)] {
311        match self {
312            Self::Empty | Self::Numeric(_) => &[],
313            Self::Stemmed { segments, .. } => segments,
314        }
315    }
316
317    fn suffix(&self) -> usize {
318        match self {
319            Self::Empty | Self::Numeric(0) => 1,
320            &(Self::Numeric(suffix) | Self::Stemmed { suffix, .. }) => suffix,
321        }
322    }
323
324    fn reserved_suffix(&self) -> usize {
325        match self {
326            Self::Empty | Self::Numeric(0) => 0,
327            &(Self::Numeric(suffix) | Self::Stemmed { suffix, .. }) => suffix,
328        }
329    }
330
331    fn min_suffix(&self) -> usize {
332        match self {
333            Self::Empty => 1,
334            &Self::Numeric(suffix) => suffix.max(1),
335            &Self::Stemmed { suffix, .. } => suffix.max(2),
336        }
337    }
338}
339
340#[cfg(test)]
341mod tests {
342    use super::*;
343    use itertools::Itertools;
344
345    #[test]
346    fn test_segment_camel_case() {
347        assert_eq!(
348            WordSegments::new("camelCase").collect_vec(),
349            vec![(0, "camel"), (5, "Case")]
350        );
351        assert_eq!(
352            WordSegments::new("httpResponse").collect_vec(),
353            vec![(0, "http"), (4, "Response")]
354        );
355    }
356
357    #[test]
358    fn test_segment_pascal_case() {
359        assert_eq!(
360            WordSegments::new("PascalCase").collect_vec(),
361            vec![(0, "Pascal"), (6, "Case")]
362        );
363        assert_eq!(
364            WordSegments::new("HttpResponse").collect_vec(),
365            vec![(0, "Http"), (4, "Response")]
366        );
367    }
368
369    #[test]
370    fn test_segment_snake_case() {
371        assert_eq!(
372            WordSegments::new("snake_case").collect_vec(),
373            vec![(0, "snake"), (6, "case")]
374        );
375        assert_eq!(
376            WordSegments::new("http_response").collect_vec(),
377            vec![(0, "http"), (5, "response")]
378        );
379    }
380
381    #[test]
382    fn test_segment_screaming_snake() {
383        assert_eq!(
384            WordSegments::new("SCREAMING_SNAKE").collect_vec(),
385            vec![(0, "SCREAMING"), (10, "SNAKE")]
386        );
387        assert_eq!(
388            WordSegments::new("HTTP_RESPONSE").collect_vec(),
389            vec![(0, "HTTP"), (5, "RESPONSE")]
390        );
391    }
392
393    #[test]
394    fn test_segment_consecutive_uppercase() {
395        assert_eq!(
396            WordSegments::new("XMLHttpRequest").collect_vec(),
397            vec![(0, "XML"), (3, "Http"), (7, "Request")]
398        );
399        assert_eq!(
400            WordSegments::new("HTTPResponse").collect_vec(),
401            vec![(0, "HTTP"), (4, "Response")]
402        );
403        assert_eq!(
404            WordSegments::new("HTTP_Response").collect_vec(),
405            vec![(0, "HTTP"), (5, "Response")]
406        );
407        assert_eq!(
408            WordSegments::new("ALLCAPS").collect_vec(),
409            vec![(0, "ALLCAPS")]
410        );
411    }
412
413    #[test]
414    fn test_segment_with_numbers() {
415        assert_eq!(
416            WordSegments::new("Response2").collect_vec(),
417            vec![(0, "Response"), (8, "2")]
418        );
419        assert_eq!(
420            WordSegments::new("response_2").collect_vec(),
421            vec![(0, "response"), (9, "2")]
422        );
423        assert_eq!(
424            WordSegments::new("HTTP2Protocol").collect_vec(),
425            vec![(0, "HTTP"), (4, "2"), (5, "Protocol")]
426        );
427        assert_eq!(
428            WordSegments::new("OAuth2Token").collect_vec(),
429            vec![(0, "O"), (1, "Auth"), (5, "2"), (6, "Token")]
430        );
431        assert_eq!(
432            WordSegments::new("HTTP2XML").collect_vec(),
433            vec![(0, "HTTP"), (4, "2"), (5, "XML")]
434        );
435        assert_eq!(
436            WordSegments::new("1099KStatus").collect_vec(),
437            vec![(0, "1099"), (4, "K"), (5, "Status")]
438        );
439        assert_eq!(
440            WordSegments::new("123abc").collect_vec(),
441            vec![(0, "123"), (3, "abc")]
442        );
443        assert_eq!(
444            WordSegments::new("123ABC").collect_vec(),
445            vec![(0, "123"), (3, "ABC")]
446        );
447    }
448
449    #[test]
450    fn test_segment_empty_and_special() {
451        assert!(WordSegments::new("").collect_vec().is_empty());
452        assert!(WordSegments::new("___").collect_vec().is_empty());
453        assert_eq!(WordSegments::new("a").collect_vec(), vec![(0, "a")]);
454        assert_eq!(WordSegments::new("A").collect_vec(), vec![(0, "A")]);
455    }
456
457    #[test]
458    fn test_segment_mixed_separators() {
459        assert_eq!(
460            WordSegments::new("foo-bar_baz").collect_vec(),
461            vec![(0, "foo"), (4, "bar"), (8, "baz")]
462        );
463        assert_eq!(
464            WordSegments::new("foo--bar").collect_vec(),
465            vec![(0, "foo"), (5, "bar")]
466        );
467    }
468
469    #[test]
470    fn test_deduplication_http_response_collision() {
471        let arena = Arena::new();
472        let mut names = UniqueNames::new(&arena);
473
474        assert_eq!(names.uniquify("HTTPResponse"), "HTTPResponse");
475        assert_eq!(names.uniquify("HTTP_Response"), "HTTP_Response2");
476        assert_eq!(names.uniquify("httpResponse"), "httpResponse3");
477        assert_eq!(names.uniquify("http_response"), "http_response4");
478        // `HTTPRESPONSE` isn't a collision; it's a single word.
479        assert_eq!(names.uniquify("HTTPRESPONSE"), "HTTPRESPONSE");
480    }
481
482    #[test]
483    fn test_deduplication_xml_http_request() {
484        let arena = Arena::new();
485        let mut names = UniqueNames::new(&arena);
486
487        assert_eq!(names.uniquify("XMLHttpRequest"), "XMLHttpRequest");
488        assert_eq!(names.uniquify("xml_http_request"), "xml_http_request2");
489        assert_eq!(names.uniquify("XmlHttpRequest"), "XmlHttpRequest3");
490    }
491
492    #[test]
493    fn test_deduplication_preserves_original_casing() {
494        let arena = Arena::new();
495        let mut names = UniqueNames::new(&arena);
496
497        assert_eq!(names.uniquify("HTTP_Response"), "HTTP_Response");
498        assert_eq!(names.uniquify("httpResponse"), "httpResponse2");
499    }
500
501    #[test]
502    fn test_deduplication_same_prefix() {
503        let arena = Arena::new();
504        let mut names = UniqueNames::new(&arena);
505
506        assert_eq!(names.uniquify("HttpRequest"), "HttpRequest");
507        assert_eq!(names.uniquify("HttpResponse"), "HttpResponse");
508        assert_eq!(names.uniquify("HttpError"), "HttpError");
509    }
510
511    #[test]
512    fn test_deduplication_with_numbers() {
513        let arena = Arena::new();
514        let mut names = UniqueNames::new(&arena);
515
516        assert_eq!(names.uniquify("Response2"), "Response2");
517        assert_eq!(names.uniquify("response_2"), "response3");
518
519        // `0` becomes the bare stem.
520        assert_eq!(names.uniquify("Response0"), "Response");
521        assert_eq!(names.uniquify("response"), "response4");
522
523        // Digit-to-uppercase collisions.
524        assert_eq!(names.uniquify("1099KStatus"), "1099KStatus");
525        assert_eq!(names.uniquify("1099K_Status"), "1099K_Status2");
526        assert_eq!(names.uniquify("1099KStatus"), "1099KStatus3");
527        assert_eq!(names.uniquify("1099_K_Status"), "1099_K_Status4");
528
529        // Digit-to-lowercase collisions.
530        assert_eq!(names.uniquify("123abc"), "123abc");
531        assert_eq!(names.uniquify("123_abc"), "123_abc2");
532    }
533
534    #[test]
535    fn test_deduplication_numeric_suffixes() {
536        let arena = Arena::new();
537        let mut names = UniqueNames::new(&arena);
538
539        assert_eq!(names.uniquify("OAuth2"), "OAuth2");
540        assert_eq!(names.uniquify("OAuth_2"), "OAuth3");
541        assert_eq!(names.uniquify("OAuth"), "OAuth");
542        assert_eq!(names.uniquify("OAuth0"), "OAuth4");
543    }
544
545    #[test]
546    fn test_deduplication_empty_names_start_at_one() {
547        let arena = Arena::new();
548        let mut names = UniqueNames::new(&arena);
549
550        assert_eq!(names.uniquify(""), "1");
551        assert_eq!(names.uniquify("_"), "2");
552        assert_eq!(names.uniquify("---"), "3");
553    }
554
555    #[test]
556    fn test_deduplication_numeric_names_share_empty_stem() {
557        let arena = Arena::new();
558        let mut names = UniqueNames::new(&arena);
559
560        assert_eq!(names.uniquify("2"), "2");
561        assert_eq!(names.uniquify(""), "1");
562        assert_eq!(names.uniquify("2"), "3");
563
564        let arena = Arena::new();
565        let mut names = UniqueNames::new(&arena);
566
567        assert_eq!(names.uniquify("0"), "1");
568        assert_eq!(names.uniquify(""), "2");
569    }
570
571    #[test]
572    fn test_with_reserved_empty_stem_uses_zero_slot() {
573        let arena = Arena::new();
574        let mut names = UniqueNames::with_reserved(&arena, [""]);
575
576        assert_eq!(names.uniquify(""), "1");
577        assert_eq!(names.uniquify(""), "2");
578
579        let arena = Arena::new();
580        let mut names = UniqueNames::with_reserved(&arena, [""]);
581
582        assert_eq!(names.uniquify("0"), "1");
583        assert_eq!(names.uniquify(""), "2");
584
585        let arena = Arena::new();
586        let mut names = UniqueNames::with_reserved(&arena, ["0"]);
587
588        assert_eq!(names.uniquify("0"), "1");
589        assert_eq!(names.uniquify(""), "2");
590
591        let arena = Arena::new();
592        let mut names = UniqueNames::with_reserved(&arena, ["_"]);
593
594        assert_eq!(names.uniquify("_"), "1");
595        assert_eq!(names.uniquify("_"), "2");
596    }
597
598    #[test]
599    fn test_with_reserved_multiple() {
600        let arena = Arena::new();
601        let mut names = UniqueNames::with_reserved(&arena, ["_", "reserved"]);
602
603        assert_eq!(names.uniquify("_"), "1");
604        assert_eq!(names.uniquify("reserved"), "reserved2");
605        assert_eq!(names.uniquify("other"), "other");
606    }
607
608    #[test]
609    fn test_with_reserved_numeric_suffixes() {
610        let arena = Arena::new();
611        let mut names = UniqueNames::with_reserved(&arena, ["crate"]);
612
613        assert_eq!(names.uniquify("crate"), "crate2");
614        assert_eq!(names.uniquify("crate2"), "crate3");
615    }
616}