ib_unicode/str.rs
1use crate::Sealed;
2
3/// Polyfill for unstable [`#![feature(round_char_boundary)]`](https://github.com/rust-lang/rust/issues/93743)
4pub trait RoundCharBoundaryExt: Sealed {
5 /// Finds the closest `x` not exceeding `index` where [`is_char_boundary(x)`] is `true`.
6 ///
7 /// This method can help you truncate a string so that it's still valid UTF-8, but doesn't
8 /// exceed a given number of bytes. Note that this is done purely at the character level
9 /// and can still visually split graphemes, even though the underlying characters aren't
10 /// split. For example, the emoji ๐งโ๐ฌ (scientist) could be split so that the string only
11 /// includes ๐ง (person) instead.
12 ///
13 /// [`is_char_boundary(x)`]: str::is_char_boundary
14 ///
15 /// # Examples
16 ///
17 /// ```
18 /// use ib_unicode::str::RoundCharBoundaryExt;
19 ///
20 /// let s = "โค๏ธ๐งก๐๐๐๐";
21 /// assert_eq!(s.len(), 26);
22 /// assert!(!s.is_char_boundary(13));
23 ///
24 /// let closest = s.floor_char_boundary_ib(13);
25 /// assert_eq!(closest, 10);
26 /// assert_eq!(&s[..closest], "โค๏ธ๐งก");
27 /// ```
28 fn floor_char_boundary_ib(&self, index: usize) -> usize;
29
30 /// Finds the closest `x` not below `index` where [`is_char_boundary(x)`] is `true`.
31 ///
32 /// If `index` is greater than the length of the string, this returns the length of the string.
33 ///
34 /// This method is the natural complement to [`floor_char_boundary`]. See that method
35 /// for more details.
36 ///
37 /// [`floor_char_boundary`]: str::floor_char_boundary
38 /// [`is_char_boundary(x)`]: str::is_char_boundary
39 ///
40 /// # Examples
41 ///
42 /// ```
43 /// use ib_unicode::str::RoundCharBoundaryExt;
44 ///
45 /// let s = "โค๏ธ๐งก๐๐๐๐";
46 /// assert_eq!(s.len(), 26);
47 /// assert!(!s.is_char_boundary(13));
48 ///
49 /// let closest = s.ceil_char_boundary_ib(13);
50 /// assert_eq!(closest, 14);
51 /// assert_eq!(&s[..closest], "โค๏ธ๐งก๐");
52 /// ```
53 fn ceil_char_boundary_ib(&self, index: usize) -> usize;
54}
55
56impl RoundCharBoundaryExt for str {
57 #[inline]
58 fn floor_char_boundary_ib(&self, index: usize) -> usize {
59 if index >= self.len() {
60 self.len()
61 } else {
62 let lower_bound = index.saturating_sub(3);
63 let new_index = self.as_bytes()[lower_bound..=index].iter().rposition(|&b| {
64 // b.is_utf8_char_boundary()
65 (b as i8) >= -0x40
66 });
67
68 // SAFETY: we know that the character boundary will be within four bytes
69 unsafe { lower_bound + new_index.unwrap_unchecked() }
70 }
71 }
72
73 #[inline]
74 fn ceil_char_boundary_ib(&self, index: usize) -> usize {
75 if index > self.len() {
76 self.len()
77 } else {
78 let upper_bound = Ord::min(index + 4, self.len());
79 self.as_bytes()[index..upper_bound]
80 .iter()
81 .position(|&b| {
82 // b.is_utf8_char_boundary()
83 (b as i8) >= -0x40
84 })
85 .map_or(upper_bound, |pos| pos + index)
86 }
87 }
88}