bp3d_util/
string.rs

1// Copyright (c) 2025, BlockProject 3D
2//
3// All rights reserved.
4//
5// Redistribution and use in source and binary forms, with or without modification,
6// are permitted provided that the following conditions are met:
7//
8//     * Redistributions of source code must retain the above copyright notice,
9//       this list of conditions and the following disclaimer.
10//     * Redistributions in binary form must reproduce the above copyright notice,
11//       this list of conditions and the following disclaimer in the documentation
12//       and/or other materials provided with the distribution.
13//     * Neither the name of BlockProject 3D nor the names of its contributors
14//       may be used to endorse or promote products derived from this software
15//       without specific prior written permission.
16//
17// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29//! String utilities.
30
31use crate::extension;
32use std::borrow::Cow;
33
/// Abstraction over the range types accepted by [sub_nearest](StrTools::sub_nearest).
///
/// Each supported range type implements this trait to describe how it cuts a string.
pub trait Range {
    /// Extracts from `obj` the sub-string selected by this range.
    ///
    /// This is the per-range-type implementation behind
    /// [sub_nearest](StrTools::sub_nearest); see that method for the user-facing description.
    /// The returned slice borrows from `obj`.
    fn sub_nearest<'s>(&self, obj: &'s str) -> &'s str;
}
40
extension! {
    /// The main StrTools extension trait.
    ///
    /// Declared through the crate's `extension!` macro as an extension of `str`.
    pub extension StrTools: str {
        /// A substring method which truncates strings at the nearest UTF-8 character boundary
        /// rather than panicking.
        ///
        /// `range` may be any type implementing [Range]; the work is delegated to
        /// [Range::sub_nearest].
        ///
        /// # Panics
        ///
        /// This function still panics if the given range is out of bounds. It however does not
        /// panic if the passed range falls within a UTF-8 encoded character.
        fn sub_nearest(&self, range: impl Range) -> &str;

        /// A string capitalise function which operates on UTF-8 strings.
        ///
        /// The result is a [Cow]: the implementation for `str` in this file hands back the
        /// input borrowed when it is empty or already starts with an uppercase character.
        fn capitalise(&self) -> Cow<str>;

        /// A string decapitalise function which operates on UTF-8 strings. This essentially does
        /// the inverse of the [capitalise](StrTools::capitalise) function.
        ///
        /// The result is a [Cow]: the implementation for `str` in this file hands back the
        /// input borrowed when it is empty or does not start with an uppercase character.
        fn decapitalise(&self) -> Cow<str>;
    }

    /// The main string tools operating on raw byte slices.
    ///
    /// Declared through the crate's `extension!` macro as an extension of `[u8]`.
    pub extension BufTools: [u8] {
        /// A string capitalise function which operates on ASCII only strings.
        ///
        /// Only the first byte is considered; the implementation for `[u8]` in this file
        /// returns the input borrowed when it is empty or its first byte is in `A..=Z`.
        fn capitalise_ascii(&self) -> Cow<[u8]>;

        /// A string decapitalise function which operates on ASCII only strings. This essentially
        /// does the inverse of the [capitalise_ascii](BufTools::capitalise_ascii) function.
        ///
        /// Only the first byte is considered; the implementation for `[u8]` in this file
        /// returns the input borrowed when it is empty or its first byte is not in `A..=Z`.
        fn decapitalise_ascii(&self) -> Cow<[u8]>;
    }
}
71
/// Truncates `buf` to at most `max` bytes without splitting a UTF-8 sequence.
///
/// `buf` is expected to hold valid UTF-8 (the callers in this file pass `str::as_bytes`).
/// When byte offset `max` falls inside a multi-byte sequence, that partial sequence is dropped
/// entirely, so the result may be shorter than `max` bytes but never ends in the middle of a
/// character.
///
/// # Panics
///
/// Panics if `max > buf.len()`.
fn utf8_max(buf: &[u8], max: usize) -> &[u8] {
    // Checked slicing turns an out-of-range `max` into a panic and makes `max == 0` a plain
    // empty slice instead of an index underflow.
    let head = &buf[..max];
    // Find the first byte of the last (possibly cut) sequence, i.e. the last byte which is not
    // a `10xxxxxx` continuation byte.
    let Some(lead) = head.iter().rposition(|&byte| byte & 0xC0 != 0x80) else {
        // Empty input, or nothing but continuation bytes: there is no complete character.
        return &head[..0];
    };
    // Total sequence length announced by the lead byte.
    let expected = match head[lead] {
        byte if byte & 0x80 == 0x00 => 1,
        byte if byte & 0xE0 == 0xC0 => 2,
        byte if byte & 0xF0 == 0xE0 => 3,
        _ => 4,
    };
    if head.len() - lead == expected {
        // The last sequence is complete: the cut already sits on a character boundary.
        head
    } else {
        // The last sequence is cut short: drop it.
        &head[..lead]
    }
}
94
/// Drops the first `start` bytes of `buf`, plus any UTF-8 continuation bytes directly after
/// them, so that the result begins at the start of a character.
///
/// `start == buf.len()` is valid and yields an empty slice.
///
/// # Panics
///
/// Panics if `start > buf.len()`.
fn utf8_min(buf: &[u8], start: usize) -> &[u8] {
    // Checked slicing: an out-of-range `start` panics rather than reading past the buffer.
    let tail = &buf[start..];
    // `10xxxxxx` bytes at the front belong to a character which began before `start`.
    let skipped = tail
        .iter()
        .take_while(|&&byte| byte & 0xC0 == 0x80)
        .count();
    &tail[skipped..]
}
109
110impl Range for std::ops::Range<usize> {
111    fn sub_nearest<'a>(&self, obj: &'a str) -> &'a str {
112        let bytes = obj.as_bytes();
113        let bytes = utf8_max(bytes, self.end);
114        if bytes.is_empty() {
115            return "";
116        }
117        let bytes = utf8_min(bytes, self.start);
118        unsafe { std::str::from_utf8(bytes).unwrap_unchecked() }
119    }
120}
121
122impl Range for std::ops::RangeTo<usize> {
123    fn sub_nearest<'a>(&self, obj: &'a str) -> &'a str {
124        let bytes = obj.as_bytes();
125        let bytes = utf8_max(bytes, self.end);
126        unsafe { std::str::from_utf8(bytes).unwrap_unchecked() }
127    }
128}
129
130impl Range for std::ops::RangeFrom<usize> {
131    fn sub_nearest<'a>(&self, obj: &'a str) -> &'a str {
132        let bytes = obj.as_bytes();
133        let bytes = utf8_min(bytes, self.start);
134        unsafe { std::str::from_utf8(bytes).unwrap_unchecked() }
135    }
136}
137
138impl StrTools for str {
139    fn sub_nearest(&self, range: impl Range) -> &str {
140        range.sub_nearest(self)
141    }
142
143    fn capitalise(&self) -> Cow<str> {
144        if self.is_empty() {
145            return self.into();
146        }
147        let first = unsafe { self.chars().next().unwrap_unchecked() };
148        if first.is_uppercase() {
149            self.into()
150        } else {
151            (self.sub_nearest(..1).to_uppercase() + self.sub_nearest(1..)).into()
152        }
153    }
154
155    fn decapitalise(&self) -> Cow<str> {
156        if self.is_empty() {
157            return self.into();
158        }
159        let first = unsafe { self.chars().next().unwrap_unchecked() };
160        if first.is_uppercase() {
161            (self.sub_nearest(..1).to_lowercase() + self.sub_nearest(1..)).into()
162        } else {
163            self.into()
164        }
165    }
166}
167
168impl BufTools for [u8] {
169    fn capitalise_ascii(&self) -> Cow<[u8]> {
170        if self.is_empty() {
171            return self.into();
172        }
173        if self[0] >= b'A' && self[0] <= b'Z' {
174            self.into()
175        } else {
176            let mut v: Vec<u8> = self.into();
177            v[0] = v[0].to_ascii_uppercase();
178            v.into()
179        }
180    }
181
182    fn decapitalise_ascii(&self) -> Cow<[u8]> {
183        if self.is_empty() {
184            return self.into();
185        }
186        if self[0] >= b'A' && self[0] <= b'Z' {
187            let mut v: Vec<u8> = self.into();
188            v[0] = v[0].to_ascii_lowercase();
189            v.into()
190        } else {
191            self.into()
192        }
193    }
194}
195
#[cfg(test)]
mod tests {
    use crate::string::{BufTools, StrTools};
    use std::borrow::Cow;

    /// Plain ASCII input split after its first byte.
    #[test]
    fn sub_basic() {
        let word = "Hello";
        assert_eq!(word.sub_nearest(..1), "H");
        assert_eq!(word.sub_nearest(1..), "ello");
    }

    /// On ASCII input the result matches ordinary slicing.
    #[test]
    fn truncate_ascii() {
        let sentence = "this is a test";
        assert_eq!(sentence.sub_nearest(..4), "this");
        assert_eq!(&sentence[4..7], " is");
        assert_eq!(sentence.sub_nearest(4..7), " is");
    }

    /// A single three-byte character is either kept whole or dropped.
    #[test]
    fn truncate_utf8() {
        let single = "我";
        assert_eq!(single.sub_nearest(..3), "我");
        assert_eq!(single.sub_nearest(..1), "");
        assert_eq!(single.sub_nearest(1..), "");
    }

    /// Two three-byte characters, cut inside the second and inside the first.
    #[test]
    fn truncate_utf82() {
        let pair = "我是";
        assert_eq!(pair.sub_nearest(..6), "我是");
        assert_eq!(pair.sub_nearest(..5), "我");
        assert_eq!(pair.sub_nearest(1..), "是");
    }

    /// A three-byte character followed by ASCII, with the start inside the first character.
    #[test]
    fn truncate_utf83() {
        let mixed = "我abcd";
        assert_eq!(mixed.sub_nearest(..6), "我abc");
        assert_eq!(mixed.sub_nearest(1..), "abcd");
        let bounded = [
            (2, ""),
            (4, "a"),
            (5, "ab"),
            (mixed.len(), "abcd"),
            (mixed.len() - 1, "abc"),
        ];
        for (end, expected) in bounded {
            assert_eq!(mixed.sub_nearest(1..end), expected);
        }
    }

    /// Capitalising and decapitalising an ASCII word; an unchanged word stays borrowed.
    #[test]
    fn basic_capitalize() {
        let lower = "abc";
        let upper = "Abc";
        assert_eq!(lower.capitalise(), "Abc");
        assert_eq!(upper.capitalise(), "Abc");
        let unchanged = upper.capitalise();
        assert!(matches!(unchanged, Cow::Borrowed(_)));
        assert_eq!(upper.decapitalise(), "abc");
    }

    /// Same checks as `basic_capitalize`, through the byte-slice API.
    #[test]
    fn ascii_capitalize() {
        let lower = "abc".as_bytes();
        let upper = "Abc".as_bytes();
        assert_eq!(&*lower.capitalise_ascii(), b"Abc");
        assert_eq!(&*upper.capitalise_ascii(), b"Abc");
        let unchanged = upper.capitalise_ascii();
        assert!(matches!(unchanged, Cow::Borrowed(_)));
        assert_eq!(&*upper.decapitalise_ascii(), b"abc");
    }
}
266}