1#[cfg(test)]
2mod tests;
3
4use std::borrow::Cow;
5use std::ops::{Bound, RangeBounds};
6use std::{fmt, slice};
7
8use memchr::memmem;
9
10use crate::chars;
11
12#[inline]
20fn has_ascii_graphemes(string: &str) -> bool {
21 string.is_ascii() && memmem::find(string.as_bytes(), b"\r\n").is_none()
22}
23
/// A borrowed, `Copy` string view that is stored either as raw ASCII bytes or
/// as a slice of `char`s.
///
/// Every element (a byte in [`Utf32Str::Ascii`], a `char` in
/// [`Utf32Str::Unicode`]) counts as one position for `len`, `get` and the
/// slicing methods.
///
/// The comparison traits are derived, so the variant is compared first: any
/// `Ascii` value orders before any `Unicode` value. Reordering the variants
/// would change that ordering.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
pub enum Utf32Str<'a> {
    /// Byte-per-element representation; each byte is turned into a `char`
    /// with a plain `as char` cast when it is read.
    Ascii(&'a [u8]),
    /// `char`-per-element representation.
    Unicode(&'a [char]),
}
104
105impl<'a> Utf32Str<'a> {
106 pub fn new(str: &'a str, buf: &'a mut Vec<char>) -> Self {
108 if has_ascii_graphemes(str) {
109 Utf32Str::Ascii(str.as_bytes())
110 } else {
111 buf.clear();
112 buf.extend(crate::chars::graphemes(str));
113 Utf32Str::Unicode(buf)
114 }
115 }
116
117 #[inline]
119 pub fn len(self) -> usize {
120 match self {
121 Utf32Str::Unicode(codepoints) => codepoints.len(),
122 Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(),
123 }
124 }
125
126 #[inline]
128 pub fn is_empty(self) -> bool {
129 match self {
130 Utf32Str::Unicode(codepoints) => codepoints.is_empty(),
131 Utf32Str::Ascii(ascii_bytes) => ascii_bytes.is_empty(),
132 }
133 }
134
135 #[inline]
138 pub fn slice(self, range: impl RangeBounds<usize>) -> Utf32Str<'a> {
139 let start = match range.start_bound() {
140 Bound::Included(&start) => start,
141 Bound::Excluded(&start) => start + 1,
142 Bound::Unbounded => 0,
143 };
144 let end = match range.end_bound() {
145 Bound::Included(&end) => end + 1,
146 Bound::Excluded(&end) => end,
147 Bound::Unbounded => self.len(),
148 };
149 match self {
150 Utf32Str::Ascii(bytes) => Utf32Str::Ascii(&bytes[start..end]),
151 Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
152 }
153 }
154
155 #[inline]
157 pub(crate) fn leading_white_space(self) -> usize {
158 match self {
159 Utf32Str::Ascii(bytes) => bytes
160 .iter()
161 .position(|b| !b.is_ascii_whitespace())
162 .unwrap_or(0),
163 Utf32Str::Unicode(codepoints) => codepoints
164 .iter()
165 .position(|c| !c.is_whitespace())
166 .unwrap_or(0),
167 }
168 }
169
170 #[inline]
172 pub(crate) fn trailing_white_space(self) -> usize {
173 match self {
174 Utf32Str::Ascii(bytes) => bytes
175 .iter()
176 .rev()
177 .position(|b| !b.is_ascii_whitespace())
178 .unwrap_or(0),
179 Utf32Str::Unicode(codepoints) => codepoints
180 .iter()
181 .rev()
182 .position(|c| !c.is_whitespace())
183 .unwrap_or(0),
184 }
185 }
186
187 #[inline]
190 pub fn slice_u32(self, range: impl RangeBounds<u32>) -> Utf32Str<'a> {
191 let start = match range.start_bound() {
192 Bound::Included(&start) => start as usize,
193 Bound::Excluded(&start) => start as usize + 1,
194 Bound::Unbounded => 0,
195 };
196 let end = match range.end_bound() {
197 Bound::Included(&end) => end as usize + 1,
198 Bound::Excluded(&end) => end as usize,
199 Bound::Unbounded => self.len(),
200 };
201 match self {
202 Utf32Str::Ascii(bytes) => Utf32Str::Ascii(&bytes[start..end]),
203 Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
204 }
205 }
206
207 pub fn is_ascii(self) -> bool {
213 matches!(self, Utf32Str::Ascii(_))
214 }
215
216 pub fn get(self, n: u32) -> char {
218 match self {
219 Utf32Str::Ascii(bytes) => bytes[n as usize] as char,
220 Utf32Str::Unicode(codepoints) => codepoints[n as usize],
221 }
222 }
223
224 pub(crate) fn last(self) -> char {
228 match self {
229 Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char,
230 Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1],
231 }
232 }
233
234 pub(crate) fn first(self) -> char {
238 match self {
239 Utf32Str::Ascii(bytes) => bytes[0] as char,
240 Utf32Str::Unicode(codepoints) => codepoints[0],
241 }
242 }
243
244 pub fn chars(self) -> Chars<'a> {
246 match self {
247 Utf32Str::Ascii(bytes) => Chars::Ascii(bytes.iter()),
248 Utf32Str::Unicode(codepoints) => Chars::Unicode(codepoints.iter()),
249 }
250 }
251}
252
253impl fmt::Debug for Utf32Str<'_> {
254 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
255 use std::fmt::Write;
256 f.write_char('"')?;
257 for c in self.chars() {
258 for c in c.escape_debug() {
259 f.write_char(c)?;
260 }
261 }
262 f.write_char('"')
263 }
264}
265
266impl fmt::Display for Utf32Str<'_> {
267 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
268 use std::fmt::Write;
269 for c in self.chars() {
270 f.write_char(c)?;
271 }
272 Ok(())
273 }
274}
275
/// Double-ended iterator over the elements of a string, yielding `char`s.
///
/// Each variant wraps the slice iterator of the corresponding storage: bytes
/// for `Ascii` (converted with an `as char` cast) and `char`s for `Unicode`.
#[derive(Clone, Debug)]
pub enum Chars<'a> {
    /// Iterates over byte storage.
    Ascii(slice::Iter<'a, u8>),
    /// Iterates over `char` storage.
    Unicode(slice::Iter<'a, char>),
}

impl Iterator for Chars<'_> {
    type Item = char;

    fn next(&mut self) -> Option<Self::Item> {
        match self {
            Chars::Ascii(iter) => iter.next().map(|&c| c as char),
            Chars::Unicode(iter) => iter.next().copied(),
        }
    }

    /// Forwards the exact bounds of the wrapped slice iterator so that
    /// consumers such as `collect` and `extend` can reserve capacity up front
    /// (the default implementation reports `(0, None)`).
    fn size_hint(&self) -> (usize, Option<usize>) {
        match self {
            Chars::Ascii(iter) => iter.size_hint(),
            Chars::Unicode(iter) => iter.size_hint(),
        }
    }
}

impl DoubleEndedIterator for Chars<'_> {
    fn next_back(&mut self) -> Option<Self::Item> {
        match self {
            Chars::Ascii(iter) => iter.next_back().map(|&c| c as char),
            Chars::Unicode(iter) => iter.next_back().copied(),
        }
    }
}

// Both wrapped slice iterators know their remaining length exactly, and
// `size_hint` above forwards it unchanged.
impl ExactSizeIterator for Chars<'_> {}
300
/// An owned string that is stored either as a boxed `str` or as a boxed slice
/// of `char`s.
///
/// The comparison traits are derived, so the variant is compared first: any
/// `Ascii` value orders before any `Unicode` value.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
pub enum Utf32String {
    /// Text kept as a boxed `str`; its bytes are the elements.
    Ascii(Box<str>),
    /// Text kept as one `char` per element.
    Unicode(Box<[char]>),
}

impl Default for Utf32String {
    /// Returns the empty string in its `Ascii` representation.
    fn default() -> Self {
        // `Box<str>::default()` is the empty boxed string.
        Utf32String::Ascii(Box::default())
    }
}
319
320impl Utf32String {
321 #[inline]
323 pub fn len(&self) -> usize {
324 match self {
325 Utf32String::Unicode(codepoints) => codepoints.len(),
326 Utf32String::Ascii(ascii_bytes) => ascii_bytes.len(),
327 }
328 }
329
330 #[inline]
332 pub fn is_empty(&self) -> bool {
333 match self {
334 Utf32String::Unicode(codepoints) => codepoints.is_empty(),
335 Utf32String::Ascii(ascii_bytes) => ascii_bytes.is_empty(),
336 }
337 }
338
339 #[inline]
342 pub fn slice(&self, range: impl RangeBounds<usize>) -> Utf32Str<'_> {
343 let start = match range.start_bound() {
344 Bound::Included(&start) => start,
345 Bound::Excluded(&start) => start + 1,
346 Bound::Unbounded => 0,
347 };
348 let end = match range.end_bound() {
349 Bound::Included(&end) => end + 1,
350 Bound::Excluded(&end) => end,
351 Bound::Unbounded => self.len(),
352 };
353 match self {
354 Utf32String::Ascii(bytes) => Utf32Str::Ascii(&bytes.as_bytes()[start..end]),
355 Utf32String::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
356 }
357 }
358
359 #[inline]
362 pub fn slice_u32(&self, range: impl RangeBounds<u32>) -> Utf32Str<'_> {
363 let start = match range.start_bound() {
364 Bound::Included(&start) => start,
365 Bound::Excluded(&start) => start + 1,
366 Bound::Unbounded => 0,
367 };
368 let end = match range.end_bound() {
369 Bound::Included(&end) => end + 1,
370 Bound::Excluded(&end) => end,
371 Bound::Unbounded => self.len() as u32,
372 };
373 match self {
374 Utf32String::Ascii(bytes) => {
375 Utf32Str::Ascii(&bytes.as_bytes()[start as usize..end as usize])
376 }
377 Utf32String::Unicode(codepoints) => {
378 Utf32Str::Unicode(&codepoints[start as usize..end as usize])
379 }
380 }
381 }
382}
383
384impl From<&str> for Utf32String {
385 #[inline]
386 fn from(value: &str) -> Self {
387 if has_ascii_graphemes(value) {
388 Self::Ascii(value.to_owned().into_boxed_str())
389 } else {
390 Self::Unicode(chars::graphemes(value).collect())
391 }
392 }
393}
394
395impl From<Box<str>> for Utf32String {
396 fn from(value: Box<str>) -> Self {
397 if has_ascii_graphemes(&value) {
398 Self::Ascii(value)
399 } else {
400 Self::Unicode(chars::graphemes(&value).collect())
401 }
402 }
403}
404
405impl From<String> for Utf32String {
406 #[inline]
407 fn from(value: String) -> Self {
408 value.into_boxed_str().into()
409 }
410}
411
412impl<'a> From<Cow<'a, str>> for Utf32String {
413 #[inline]
414 fn from(value: Cow<'a, str>) -> Self {
415 match value {
416 Cow::Borrowed(value) => value.into(),
417 Cow::Owned(value) => value.into(),
418 }
419 }
420}
421
422impl fmt::Debug for Utf32String {
423 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
424 fmt::Debug::fmt(&self.slice(..), f)
425 }
426}
427
428impl fmt::Display for Utf32String {
429 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
430 fmt::Display::fmt(&self.slice(..), f)
431 }
432}