Skip to main content

convert_case/
boundary.rs

1use unicode_segmentation::UnicodeSegmentation;
2
3use alloc::vec::Vec;
4
/// Returns `true` when every byte of the grapheme is an ASCII digit (`0`-`9`).
///
/// Any non-ASCII grapheme is rejected because its UTF-8 bytes are never ASCII
/// digits.  An empty string is vacuously accepted.
fn grapheme_is_digit(c: &&str) -> bool {
    c.bytes().all(|byte| byte.is_ascii_digit())
}
8
/// Returns `true` when the grapheme is cased and already equals its own
/// uppercase form.
///
/// Graphemes without a case distinction (digits, punctuation, ...) have
/// identical upper- and lowercase forms and are therefore never uppercase.
fn grapheme_is_uppercase(c: &&str) -> bool {
    let grapheme: &str = c;
    let upper = grapheme.to_uppercase();
    let is_cased = upper != grapheme.to_lowercase();
    is_cased && upper == grapheme
}
12
/// Returns `true` when the grapheme is cased and already equals its own
/// lowercase form.
///
/// Graphemes without a case distinction (digits, punctuation, ...) have
/// identical upper- and lowercase forms and are therefore never lowercase.
fn grapheme_is_lowercase(c: &&str) -> bool {
    let grapheme: &str = c;
    let lower = grapheme.to_lowercase();
    let is_cased = grapheme.to_uppercase() != lower;
    is_cased && lower == grapheme
}
16
/// Conditions for splitting an identifier into words.
///
/// Some boundaries, [`Hyphen`](Boundary::Hyphen), [`Underscore`](Boundary::Underscore), and [`Space`](Boundary::Space),
/// consume the character they split on, whereas the other boundaries do not.
///
/// `Boundary` includes methods that return useful groups of boundaries.  It also
/// contains the [`defaults_from`](Boundary::defaults_from) method which will generate a subset
/// of default boundaries based on the boundaries present in a string.
///
/// You can also create custom delimiter boundaries using the [`separator`](crate::separator)
/// macro or directly instantiate `Boundary` for complex boundary conditions.
/// ```
/// use convert_case::{Boundary, Case, Casing, Converter};
///
/// assert_eq!(
///     "TransformationsIn3D"
///         .from_case(Case::Camel)
///         .remove_boundaries(&Boundary::digit_letter())
///         .to_case(Case::Snake),
///     "transformations_in_3d",
/// );
///
/// let conv = Converter::new()
///     .set_boundaries(&Boundary::defaults_from("aA "))
///     .to_case(Case::Title);
/// assert_eq!(conv.convert("myVariable Name"), "My Variable Name");
/// ```
///
/// ## Example
///
/// For more complex boundaries, such as splitting when the first character is a certain
/// symbol and the second character is lowercase, you can instantiate a boundary directly.
///
/// ```
/// # use convert_case::{Boundary, Case, Casing};
/// let at_then_letter = Boundary::Custom {
///     condition: |s| {
///         s.get(0).map(|c| *c == "@") == Some(true)
///             && s.get(1).map(|c| *c == c.to_lowercase()) == Some(true)
///     },
///     start: 1,
///     len: 0,
/// };
/// assert_eq!(
///     "name@domain"
///         .set_boundaries(&[at_then_letter])
///         .to_case(Case::Title),
///     "Name@ Domain",
/// )
/// ```
#[derive(Debug, Clone, Copy)]
pub enum Boundary {
    /// A user-defined boundary described by a matching function together with
    /// the position and length of the boundary relative to the match.
    Custom {
        /// A function that determines if this boundary is present at the start
        /// of the given graphemes.  It receives the slice of graphemes that
        /// begins at the position currently being tested.
        condition: fn(&[&str]) -> bool,
        /// Where the beginning of the boundary is, as an offset in graphemes
        /// from the position at which `condition` matched.
        start: usize,
        /// The length of the boundary.  This is the number of graphemes that
        /// are removed when splitting.
        len: usize,
    },

    /// Splits on `-`, consuming the character on segmentation.
    /// ```
    /// # use convert_case::Boundary;
    /// assert_eq!(
    ///     Boundary::defaults_from("-"),
    ///     vec![Boundary::Hyphen],
    /// );
    /// ```
    Hyphen,

    /// Splits on `_`, consuming the character on segmentation.
    /// ```
    /// # use convert_case::Boundary;
    /// assert_eq!(
    ///     Boundary::defaults_from("_"),
    ///     vec![Boundary::Underscore],
    /// );
    /// ```
    Underscore,

    /// Splits on space, consuming the character on segmentation.
    /// ```
    /// # use convert_case::Boundary;
    /// assert_eq!(
    ///     Boundary::defaults_from(" "),
    ///     vec![Boundary::Space],
    /// );
    /// ```
    Space,

    /// Splits where an uppercase letter is followed by a lowercase letter.  This is seldom used,
    /// and is **not** included in the [defaults](Boundary::defaults).
    /// ```
    /// # use convert_case::Boundary;
    /// assert!(Boundary::defaults_from("Aa").is_empty());
    /// ```
    UpperLower,

    /// Splits where a lowercase letter is followed by an uppercase letter.
    /// ```
    /// # use convert_case::Boundary;
    /// assert_eq!(
    ///     Boundary::defaults_from("aA"),
    ///     vec![Boundary::LowerUpper],
    /// );
    /// ```
    LowerUpper,

    /// Splits where a digit is followed by an uppercase letter.
    /// ```
    /// # use convert_case::Boundary;
    /// assert_eq!(
    ///     Boundary::defaults_from("1A"),
    ///     vec![Boundary::DigitUpper],
    /// );
    /// ```
    DigitUpper,

    /// Splits where an uppercase letter is followed by a digit.
    /// ```
    /// # use convert_case::Boundary;
    /// assert_eq!(
    ///     Boundary::defaults_from("A1"),
    ///     vec![Boundary::UpperDigit],
    /// );
    /// ```
    UpperDigit,

    /// Splits where a digit is followed by a lowercase letter.
    /// ```
    /// # use convert_case::Boundary;
    /// assert_eq!(
    ///     Boundary::defaults_from("1a"),
    ///     vec![Boundary::DigitLower],
    /// );
    /// ```
    DigitLower,

    /// Splits where a lowercase letter is followed by a digit.
    /// ```
    /// # use convert_case::Boundary;
    /// assert_eq!(
    ///     Boundary::defaults_from("a1"),
    ///     vec![Boundary::LowerDigit],
    /// );
    /// ```
    LowerDigit,

    /// Acronyms are identified by two uppercase letters followed by a lowercase letter.
    /// The word boundary is between the two uppercase letters.  For example, "HTTPRequest"
    /// would have an acronym boundary identified at "PRe" and split into "HTTP" and "Request".
    /// ```
    /// # use convert_case::Boundary;
    /// assert_eq!(
    ///     Boundary::defaults_from("AAa"),
    ///     vec![Boundary::Acronym],
    /// );
    /// ```
    Acronym,
}
180
181impl Boundary {
182    pub fn matches(self, s: &[&str]) -> bool {
183        use Boundary::*;
184        match self {
185            Underscore => s.first() == Some(&"_"),
186            Hyphen => s.first() == Some(&"-"),
187            Space => s.first() == Some(&" "),
188            Acronym => {
189                s.first().map(grapheme_is_uppercase) == Some(true)
190                    && s.get(1).map(grapheme_is_uppercase) == Some(true)
191                    && s.get(2).map(grapheme_is_lowercase) == Some(true)
192            }
193            LowerUpper => {
194                s.first().map(grapheme_is_lowercase) == Some(true)
195                    && s.get(1).map(grapheme_is_uppercase) == Some(true)
196            }
197            UpperLower => {
198                s.first().map(grapheme_is_uppercase) == Some(true)
199                    && s.get(1).map(grapheme_is_lowercase) == Some(true)
200            }
201            LowerDigit => {
202                s.first().map(grapheme_is_lowercase) == Some(true)
203                    && s.get(1).map(grapheme_is_digit) == Some(true)
204            }
205            UpperDigit => {
206                s.first().map(grapheme_is_uppercase) == Some(true)
207                    && s.get(1).map(grapheme_is_digit) == Some(true)
208            }
209            DigitLower => {
210                s.first().map(grapheme_is_digit) == Some(true)
211                    && s.get(1).map(grapheme_is_lowercase) == Some(true)
212            }
213            DigitUpper => {
214                s.first().map(grapheme_is_digit) == Some(true)
215                    && s.get(1).map(grapheme_is_uppercase) == Some(true)
216            }
217            Custom { condition, .. } => condition(s),
218        }
219    }
220
221    /// The number of graphemes consumed when splitting at the boundary.
222    pub fn len(self) -> usize {
223        use Boundary::*;
224        match self {
225            Underscore | Hyphen | Space => 1,
226            LowerUpper | UpperLower | LowerDigit | UpperDigit | DigitLower | DigitUpper
227            | Acronym => 0,
228            Custom { len, .. } => len,
229        }
230    }
231
232    /// Returns true if this boundary consumes no graphemes when splitting.
233    pub fn is_empty(self) -> bool {
234        self.len() == 0
235    }
236
237    /// The index of the character to split at.
238    pub fn start(self) -> usize {
239        use Boundary::*;
240        match self {
241            Underscore | Hyphen | Space => 0,
242            LowerUpper | UpperLower | LowerDigit | UpperDigit | DigitLower | DigitUpper
243            | Acronym => 1,
244            Custom { start, .. } => start,
245        }
246    }
247
248    /// The default list of boundaries used when `Casing::to_case` is called directly
249    /// and in a `Converter` generated from `Converter::new()`.
250    /// ```
251    /// # use convert_case::Boundary;
252    /// assert_eq!(
253    ///     Boundary::defaults(),
254    ///     [
255    ///         Boundary::Underscore,
256    ///         Boundary::Hyphen,
257    ///         Boundary::Space,
258    ///         Boundary::LowerUpper,
259    ///         Boundary::LowerDigit,
260    ///         Boundary::UpperDigit,
261    ///         Boundary::DigitLower,
262    ///         Boundary::DigitUpper,
263    ///         Boundary::Acronym,
264    ///     ],
265    /// );
266    /// ```
267    pub const fn defaults() -> [Boundary; 9] {
268        [
269            Boundary::Underscore,
270            Boundary::Hyphen,
271            Boundary::Space,
272            Boundary::LowerUpper,
273            Boundary::LowerDigit,
274            Boundary::UpperDigit,
275            Boundary::DigitLower,
276            Boundary::DigitUpper,
277            Boundary::Acronym,
278        ]
279    }
280
281    /// Returns the boundaries that involve digits.
282    /// ```
283    /// # use convert_case::Boundary;
284    /// assert_eq!(
285    ///     Boundary::digits(),
286    ///     [
287    ///         Boundary::LowerDigit,
288    ///         Boundary::UpperDigit,
289    ///         Boundary::DigitLower,
290    ///         Boundary::DigitUpper,
291    ///     ],
292    /// );
293    /// ```
294    pub const fn digits() -> [Boundary; 4] {
295        [
296            Boundary::LowerDigit,
297            Boundary::UpperDigit,
298            Boundary::DigitLower,
299            Boundary::DigitUpper,
300        ]
301    }
302
303    /// Returns the boundaries that are letters followed by digits.
304    /// ```
305    /// # use convert_case::Boundary;
306    /// assert_eq!(
307    ///     Boundary::letter_digit(),
308    ///     [
309    ///         Boundary::LowerDigit,
310    ///         Boundary::UpperDigit,
311    ///     ],
312    /// );
313    /// ```
314    pub const fn letter_digit() -> [Boundary; 2] {
315        [Boundary::LowerDigit, Boundary::UpperDigit]
316    }
317
318    /// Returns the boundaries that are digits followed by letters.
319    /// ```
320    /// # use convert_case::Boundary;
321    /// assert_eq!(
322    ///     Boundary::digit_letter(),
323    ///     [
324    ///         Boundary::DigitLower,
325    ///         Boundary::DigitUpper
326    ///     ],
327    /// );
328    /// ```
329    pub const fn digit_letter() -> [Boundary; 2] {
330        [Boundary::DigitLower, Boundary::DigitUpper]
331    }
332
333    /// Returns a list of all boundaries that are identified within the given string.
334    /// Could be a short of writing out all the boundaries in a list directly.  This will not
335    /// identify boundary `UpperLower` if it also used as part of `Acronym`.
336    ///
337    /// If you want to be very explicit and not overlap boundaries, it is recommended to use a colon
338    /// character.
339    /// ```
340    /// # use convert_case::Boundary;
341    /// assert_eq!(
342    ///     Boundary::defaults_from("aA8a -"),
343    ///     vec![
344    ///         Boundary::Hyphen,
345    ///         Boundary::Space,
346    ///         Boundary::LowerUpper,
347    ///         Boundary::UpperDigit,
348    ///         Boundary::DigitLower,
349    ///     ],
350    /// );
351    /// assert_eq!(
352    ///     Boundary::defaults_from("bD:0B:_:AAa"),
353    ///     vec![
354    ///         Boundary::Underscore,
355    ///         Boundary::LowerUpper,
356    ///         Boundary::DigitUpper,
357    ///         Boundary::Acronym,
358    ///     ],
359    /// );
360    /// ```
361    pub fn defaults_from(pattern: &str) -> Vec<Boundary> {
362        let mut boundaries = Vec::new();
363        for boundary in Boundary::defaults() {
364            let parts = split(&pattern, &[boundary]);
365            if parts.len() > 1 || parts.is_empty() || parts[0] != pattern {
366                boundaries.push(boundary);
367            }
368        }
369        boundaries
370    }
371}
372
373impl PartialEq for Boundary {
374    fn eq(&self, other: &Self) -> bool {
375        match (self, other) {
376            (Self::Hyphen, Self::Hyphen) => true,
377            (Self::Underscore, Self::Underscore) => true,
378            (Self::Space, Self::Space) => true,
379            (Self::UpperLower, Self::UpperLower) => true,
380            (Self::LowerUpper, Self::LowerUpper) => true,
381            (Self::DigitUpper, Self::DigitUpper) => true,
382            (Self::UpperDigit, Self::UpperDigit) => true,
383            (Self::DigitLower, Self::DigitLower) => true,
384            (Self::LowerDigit, Self::LowerDigit) => true,
385            (Self::Acronym, Self::Acronym) => true,
386            // Custom boundaries are never equal because they contain function pointers,
387            // which cannot be reliably compared.
388            (Self::Custom { .. }, Self::Custom { .. }) => false,
389            _ => false,
390        }
391    }
392}
393
// Marker impl so `Boundary` can be used where `Eq` is required.
// NOTE(review): `PartialEq` for `Boundary` returns `false` whenever a `Custom`
// boundary is compared, so `a == a` does not hold for `Custom` values; confirm
// that consumers relying on `Eq` never store `Custom` boundaries.
impl Eq for Boundary {}
395
396impl core::hash::Hash for Boundary {
397    fn hash<H: core::hash::Hasher>(&self, state: &mut H) {
398        // Hash only the discriminant. Custom variants can't be meaningfully
399        // compared or hashed by their function pointer, so all Custom variants
400        // hash to the same value (their discriminant).
401        core::mem::discriminant(self).hash(state);
402    }
403}
404
405/// Split an identifier into a list of words using the list of boundaries.
406///
407/// This is used internally for splitting an identifier before mutating by
408/// a pattern and joining again with a delimiter.
409/// ```
410/// use convert_case::{Boundary, split};
411/// assert_eq!(
412///     split(&"one_two-three.four", &[Boundary::Underscore, Boundary::Hyphen]),
413///     vec!["one", "two", "three.four"],
414/// )
415/// ```
416pub fn split<'s, T>(s: &'s T, boundaries: &[Boundary]) -> Vec<&'s str>
417where
418    T: AsRef<str>,
419{
420    let s = s.as_ref();
421
422    if s.is_empty() {
423        return Vec::new();
424    }
425
426    let mut words = Vec::new();
427    let mut last_boundary_end = 0;
428
429    let (indices, graphemes): (Vec<_>, Vec<_>) = s.grapheme_indices(true).unzip();
430    let grapheme_length = indices[graphemes.len() - 1] + graphemes[graphemes.len() - 1].len();
431
432    // TODO:
433    // swapping the order of these would be faster
434    // end the loop sooner if any boundary condition is met
435    // could also hit a bitvector and do the splitting at the end?  May or may not be faster
436    for i in 0..graphemes.len() {
437        for boundary in boundaries {
438            //let byte_index = indices[i];
439
440            if boundary.matches(&graphemes[i..]) {
441                // What if we find a condition at the end of the array?
442                // Maybe we can stop early based on length
443                // To do this, need to switch the loops
444                // TODO
445                let boundary_byte_start: usize = *indices
446                    .get(i + boundary.start())
447                    .unwrap_or(&grapheme_length);
448                let boundary_byte_end: usize = *indices
449                    .get(i + boundary.start() + boundary.len())
450                    .unwrap_or(&grapheme_length);
451
452                // todo clean this up a bit
453                words.push(&s[last_boundary_end..boundary_byte_start]);
454                last_boundary_end = boundary_byte_end;
455                break;
456            }
457        }
458    }
459    words.push(&s[last_boundary_end..]);
460    //words.into_iter().filter(|s| !s.is_empty()).collect()
461    words.into_iter().collect()
462}
463
/// Create a new boundary based on a string.
///
/// This is shorthand for creating a boundary that splits on a specific string, and
/// omits that string from the list of words.  For more information, see [`Boundary`].
///
/// The number of graphemes removed is taken to be the number of `char`s in the
/// delimiter, so delimiters containing graphemes made of several code points
/// are not supported by this shorthand.
/// ```
/// # use convert_case::{Case, Converter, separator};
/// let conv = Converter::new()
///     .set_boundaries(&[separator!("::")])
///     .to_case(Case::Camel);
///
/// assert_eq!(
///     conv.convert("my::var::name"),
///     "myVarName",
/// )
/// ```
#[macro_export]
macro_rules! separator {
    ($delim:expr) => {
        // `$crate` keeps the path valid when the crate is renamed by the
        // caller or when the macro is used inside this crate.
        $crate::Boundary::Custom {
            condition: |s| s.join("").starts_with($delim),
            start: 0,
            // `len` is measured in graphemes, not bytes, so count characters
            // rather than using the byte length of the delimiter.
            len: $delim.chars().count(),
        }
    };
}
489
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    /// Custom boundaries are never equal because they contain function
    /// pointers; this holds even for a plain copy of the same value.
    #[test]
    fn custom_boundary_inequality() {
        let original = Boundary::Custom {
            condition: |_| true,
            start: 0,
            len: 0,
        };
        let copy = original;

        assert_ne!(original, copy)
    }

    /// Built-in boundaries compare equal exactly when they are the same variant.
    #[test]
    fn default_boundary_equality() {
        assert_eq!(Boundary::Hyphen, Boundary::Hyphen);
        assert_eq!(Boundary::Space, Boundary::Space);
        assert_ne!(Boundary::Hyphen, Boundary::Space);
    }

    /// Each built-in boundary, used on its own, splits a representative input.
    #[rstest]
    #[case(Boundary::Hyphen, "a-b-c", vec!["a", "b", "c"])]
    #[case(Boundary::Underscore, "a_b_c", vec!["a", "b", "c"])]
    #[case(Boundary::Space, "a b c", vec!["a", "b", "c"])]
    #[case(Boundary::LowerUpper, "lowerUpperUpper", vec!["lower", "Upper", "Upper"])]
    #[case(Boundary::UpperLower, "ABc", vec!["AB", "c"])]
    #[case(Boundary::Acronym, "XMLRequest", vec!["XML", "Request"])]
    #[case(Boundary::LowerDigit, "abc123", vec!["abc", "123"])]
    #[case(Boundary::UpperDigit, "ABC123", vec!["ABC", "123"])]
    #[case(Boundary::DigitLower, "123abc", vec!["123", "abc"])]
    #[case(Boundary::DigitUpper, "123ABC", vec!["123", "ABC"])]
    fn split_on_boundary(
        #[case] boundary: Boundary,
        #[case] input: &str,
        #[case] expected: Vec<&str>,
    ) {
        let words = split(&input, &[boundary]);
        assert_eq!(words, expected);
    }

    /// Several delimiter boundaries can be active at the same time.
    #[test]
    fn split_on_multiple_delimiters() {
        let delimiters = [Boundary::Space, Boundary::Underscore, Boundary::Hyphen];
        let input = "aaa-bbb_ccc ddd ddd-eee";
        let words = split(&input, &delimiters);
        assert_eq!(words, vec!["aaa", "bbb", "ccc", "ddd", "ddd", "eee"]);
    }

    /// `defaults_from` reports the default boundaries present in a pattern,
    /// in the order of `Boundary::defaults`.
    #[test]
    fn boundaries_found_in_string() {
        // upper lower is no longer a default
        assert!(Boundary::defaults_from(".Aaaa").is_empty());

        let lower_upper_and_digit = Boundary::defaults_from("a8.Aa.aA");
        assert_eq!(
            lower_upper_and_digit,
            vec![Boundary::LowerUpper, Boundary::LowerDigit]
        );

        let all_digit_boundaries = Boundary::defaults_from("b1B1b");
        assert_eq!(all_digit_boundaries, Boundary::digits().to_vec());

        let delimiters_and_acronym = Boundary::defaults_from("AAa -_");
        assert_eq!(
            delimiters_and_acronym,
            vec![
                Boundary::Underscore,
                Boundary::Hyphen,
                Boundary::Space,
                Boundary::Acronym,
            ]
        );
    }
}