bp3d_util/
string.rs

1// Copyright (c) 2025, BlockProject 3D
2//
3// All rights reserved.
4//
5// Redistribution and use in source and binary forms, with or without modification,
6// are permitted provided that the following conditions are met:
7//
8//     * Redistributions of source code must retain the above copyright notice,
9//       this list of conditions and the following disclaimer.
10//     * Redistributions in binary form must reproduce the above copyright notice,
11//       this list of conditions and the following disclaimer in the documentation
12//       and/or other materials provided with the distribution.
13//     * Neither the name of BlockProject 3D nor the names of its contributors
14//       may be used to endorse or promote products derived from this software
15//       without specific prior written permission.
16//
17// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29//! String utilities.
30
31use crate::extension;
32use std::borrow::Cow;
33
/// The range trait that represents all supported range types for the
/// [sub_nearest](StrTools::sub_nearest) method.
///
/// This file implements it for `std::ops::Range<usize>`, `std::ops::RangeTo<usize>` and
/// `std::ops::RangeFrom<usize>`.
pub trait Range {
    /// The actual implementation of nearest substring, see [sub_nearest](StrTools::sub_nearest) for
    /// more information.
    ///
    /// `self` is the range to apply and `obj` is the string to cut. The returned slice is tied
    /// to the lifetime of `obj`.
    fn sub_nearest<'a>(&self, obj: &'a str) -> &'a str;
}
40
// NOTE(review): `extension!` is imported from `crate::extension`; presumably it declares the
// `StrTools` and `BufTools` traits that are implemented further down in this file - confirm
// against the macro definition.
extension! {
    /// The main StrTools extension trait.
    pub extension StrTools: str {
        /// A substring method which truncates strings at the nearest UTF-8 character boundary
        /// rather than panicking.
        ///
        /// # Panics
        ///
        /// This function still panics if the given range is out of bounds. It however does not panic
        /// if the passed range falls within a UTF-8 code.
        fn sub_nearest(&self, range: impl Range) -> &str;

        /// A string capitalize function which operates on UTF-8 strings.
        ///
        /// Only the first character of the string is considered.
        fn capitalise(&self) -> Cow<str>;

        /// A string decapitalize function which operates on UTF-8 strings. This essentially does
        /// the inverse of the [capitalise](StrTools::capitalise) function.
        fn decapitalise(&self) -> Cow<str>;
    }

    /// The main string tools operating on raw byte slices.
    pub extension BufTools: [u8] {
        /// A string capitalize function which operates on ASCII only strings.
        ///
        /// Only the first byte of the buffer is considered.
        fn capitalise_ascii(&self) -> Cow<[u8]>;

        /// A string decapitalize function which operates on ASCII only strings. This essentially does
        /// the inverse of the [capitalise](BufTools::capitalise_ascii) function.
        fn decapitalise_ascii(&self) -> Cow<[u8]>;
    }
}
71
/// Truncates `buf` to at most `max` bytes without splitting a UTF-8 sequence.
///
/// Returns the longest prefix of `buf` that is no longer than `max` bytes and that ends on a
/// character boundary. `buf` is expected to contain well-formed UTF-8 (callers pass the bytes
/// of a `str`).
///
/// # Panics
///
/// Panics if `max` is greater than `buf.len()`.
fn utf8_max(buf: &[u8], max: usize) -> &[u8] {
    let mut end = max;
    // `end` lies inside a character exactly when the byte right after the prefix is a
    // continuation byte (0b10xx_xxxx). Step back until the prefix stops just before a lead
    // byte. `end == buf.len()` is always a boundary (`get` returns `None`), `end == 0` is
    // always a boundary, and an out-of-range `max` falls through to the checked slice below.
    while end > 0 && buf.get(end).is_some_and(|byte| byte & 0xC0 == 0x80) {
        end -= 1;
    }
    &buf[..end]
}
94
/// Drops the first `start` bytes of `buf`, skipping forward past any partial UTF-8 sequence.
///
/// Returns the suffix of `buf` beginning at the first character boundary located at or after
/// `start`. `buf` is expected to contain well-formed UTF-8 (callers pass the bytes of a `str`).
///
/// # Panics
///
/// Panics if `start` is greater than `buf.len()`.
fn utf8_min(buf: &[u8], start: usize) -> &[u8] {
    let mut begin = start;
    // Continuation bytes (0b10xx_xxxx) belong to a character that started before `start`;
    // skip them. `get` returns `None` at or past the end, so the scan never reads out of
    // bounds and an out-of-range `start` falls through to the checked slice below.
    while buf.get(begin).is_some_and(|byte| byte & 0xC0 == 0x80) {
        begin += 1;
    }
    &buf[begin..]
}
108
109impl Range for std::ops::Range<usize> {
110    fn sub_nearest<'a>(&self, obj: &'a str) -> &'a str {
111        let bytes = obj.as_bytes();
112        let bytes = utf8_max(bytes, self.end);
113        if bytes.is_empty() {
114            return "";
115        }
116        let bytes = utf8_min(bytes, self.start);
117        unsafe { std::str::from_utf8(bytes).unwrap_unchecked() }
118    }
119}
120
121impl Range for std::ops::RangeTo<usize> {
122    fn sub_nearest<'a>(&self, obj: &'a str) -> &'a str {
123        let bytes = obj.as_bytes();
124        let bytes = utf8_max(bytes, self.end);
125        unsafe { std::str::from_utf8(bytes).unwrap_unchecked() }
126    }
127}
128
129impl Range for std::ops::RangeFrom<usize> {
130    fn sub_nearest<'a>(&self, obj: &'a str) -> &'a str {
131        let bytes = obj.as_bytes();
132        let bytes = utf8_min(bytes, self.start);
133        unsafe { std::str::from_utf8(bytes).unwrap_unchecked() }
134    }
135}
136
137impl StrTools for str {
138    fn sub_nearest(&self, range: impl Range) -> &str {
139        range.sub_nearest(self)
140    }
141
142    fn capitalise(&self) -> Cow<str> {
143        if self.is_empty() {
144            return self.into();
145        }
146        let first = unsafe { self.chars().next().unwrap_unchecked() };
147        if first.is_uppercase() {
148            self.into()
149        } else {
150            (self.sub_nearest(..1).to_uppercase() + self.sub_nearest(1..)).into()
151        }
152    }
153
154    fn decapitalise(&self) -> Cow<str> {
155        if self.is_empty() {
156            return self.into();
157        }
158        let first = unsafe { self.chars().next().unwrap_unchecked() };
159        if first.is_uppercase() {
160            (self.sub_nearest(..1).to_lowercase() + self.sub_nearest(1..)).into()
161        } else {
162            self.into()
163        }
164    }
165}
166
167impl BufTools for [u8] {
168    fn capitalise_ascii(&self) -> Cow<[u8]> {
169        if self.is_empty() {
170            return self.into();
171        }
172        if self[0] >= b'A' && self[0] <= b'Z' {
173            self.into()
174        } else {
175            let mut v: Vec<u8> = self.into();
176            v[0] = v[0].to_ascii_uppercase();
177            v.into()
178        }
179    }
180
181    fn decapitalise_ascii(&self) -> Cow<[u8]> {
182        if self.is_empty() {
183            return self.into();
184        }
185        if self[0] >= b'A' && self[0] <= b'Z' {
186            let mut v: Vec<u8> = self.into();
187            v[0] = v[0].to_ascii_lowercase();
188            v.into()
189        } else {
190            self.into()
191        }
192    }
193}
194
#[cfg(test)]
mod tests {
    use crate::string::{BufTools, StrTools};
    use std::borrow::Cow;

    /// Pure ASCII input: prefix and suffix cuts behave like plain slicing.
    #[test]
    fn sub_basic() {
        let text = "Hello";
        assert_eq!(text.sub_nearest(..1), "H");
        assert_eq!(text.sub_nearest(1..), "ello");
    }

    /// Pure ASCII input: a bounded range matches the result of regular indexing.
    #[test]
    fn truncate_ascii() {
        let sentence = "this is a test";
        assert_eq!(sentence.sub_nearest(..4), "this");
        assert_eq!(&sentence[4..7], " is");
        assert_eq!(sentence.sub_nearest(4..7), " is");
    }

    /// A single 3-byte character is either kept whole or dropped entirely.
    #[test]
    fn truncate_utf8() {
        let single = "我";
        assert_eq!(single.sub_nearest(..3), "我");
        assert_eq!(single.sub_nearest(..1), "");
        assert_eq!(single.sub_nearest(1..), "");
    }

    /// Two 3-byte characters: cuts inside either character move to the nearest boundary.
    #[test]
    fn truncate_utf82() {
        let pair = "我是";
        assert_eq!(pair.sub_nearest(..6), "我是");
        assert_eq!(pair.sub_nearest(..5), "我");
        assert_eq!(pair.sub_nearest(1..), "是");
    }

    /// A 3-byte character followed by ASCII, cut with prefix, suffix and bounded ranges.
    #[test]
    fn truncate_utf83() {
        let mixed = "我abcd";
        let len = mixed.len();
        assert_eq!(mixed.sub_nearest(..6), "我abc");
        assert_eq!(mixed.sub_nearest(1..), "abcd");
        assert_eq!(mixed.sub_nearest(1..2), "");
        assert_eq!(mixed.sub_nearest(1..4), "a");
        assert_eq!(mixed.sub_nearest(1..5), "ab");
        assert_eq!(mixed.sub_nearest(1..len), "abcd");
        assert_eq!(mixed.sub_nearest(1..len - 1), "abc");
    }

    /// `capitalise` / `decapitalise` on ASCII strings, including the borrowed fast path.
    #[test]
    fn basic_capitalize() {
        let lower = "abc";
        let upper = "Abc";
        assert_eq!(lower.capitalise(), "Abc");
        assert_eq!(upper.capitalise(), "Abc");
        assert!(matches!(upper.capitalise(), Cow::Borrowed(_)));
        assert_eq!(upper.decapitalise(), "abc");
    }

    /// Byte-slice variants of the capitalisation helpers, including the borrowed fast path.
    #[test]
    fn ascii_capitalize() {
        let lower = "abc".as_bytes();
        let upper = "Abc".as_bytes();
        assert_eq!(&*lower.capitalise_ascii(), b"Abc");
        assert_eq!(&*upper.capitalise_ascii(), b"Abc");
        assert!(matches!(upper.capitalise_ascii(), Cow::Borrowed(_)));
        assert_eq!(&*upper.decapitalise_ascii(), b"abc");
    }
}