ib_unicode/
str.rs

1use crate::Sealed;
2
3/// Polyfill for unstable [`#![feature(round_char_boundary)]`](https://github.com/rust-lang/rust/issues/93743)
4pub trait RoundCharBoundaryExt: Sealed {
5    /// Finds the closest `x` not exceeding `index` where [`is_char_boundary(x)`] is `true`.
6    ///
7    /// This method can help you truncate a string so that it's still valid UTF-8, but doesn't
8    /// exceed a given number of bytes. Note that this is done purely at the character level
9    /// and can still visually split graphemes, even though the underlying characters aren't
10    /// split. For example, the emoji ๐Ÿง‘โ€๐Ÿ”ฌ (scientist) could be split so that the string only
11    /// includes ๐Ÿง‘ (person) instead.
12    ///
13    /// [`is_char_boundary(x)`]: str::is_char_boundary
14    ///
15    /// # Examples
16    ///
17    /// ```
18    /// use ib_unicode::str::RoundCharBoundaryExt;
19    ///
20    /// let s = "โค๏ธ๐Ÿงก๐Ÿ’›๐Ÿ’š๐Ÿ’™๐Ÿ’œ";
21    /// assert_eq!(s.len(), 26);
22    /// assert!(!s.is_char_boundary(13));
23    ///
24    /// let closest = s.floor_char_boundary_ib(13);
25    /// assert_eq!(closest, 10);
26    /// assert_eq!(&s[..closest], "โค๏ธ๐Ÿงก");
27    /// ```
28    fn floor_char_boundary_ib(&self, index: usize) -> usize;
29
30    /// Finds the closest `x` not below `index` where [`is_char_boundary(x)`] is `true`.
31    ///
32    /// If `index` is greater than the length of the string, this returns the length of the string.
33    ///
34    /// This method is the natural complement to [`floor_char_boundary`]. See that method
35    /// for more details.
36    ///
37    /// [`floor_char_boundary`]: str::floor_char_boundary
38    /// [`is_char_boundary(x)`]: str::is_char_boundary
39    ///
40    /// # Examples
41    ///
42    /// ```
43    /// use ib_unicode::str::RoundCharBoundaryExt;
44    ///
45    /// let s = "โค๏ธ๐Ÿงก๐Ÿ’›๐Ÿ’š๐Ÿ’™๐Ÿ’œ";
46    /// assert_eq!(s.len(), 26);
47    /// assert!(!s.is_char_boundary(13));
48    ///
49    /// let closest = s.ceil_char_boundary_ib(13);
50    /// assert_eq!(closest, 14);
51    /// assert_eq!(&s[..closest], "โค๏ธ๐Ÿงก๐Ÿ’›");
52    /// ```
53    fn ceil_char_boundary_ib(&self, index: usize) -> usize;
54}
55
56impl RoundCharBoundaryExt for str {
57    #[inline]
58    fn floor_char_boundary_ib(&self, index: usize) -> usize {
59        if index >= self.len() {
60            self.len()
61        } else {
62            let lower_bound = index.saturating_sub(3);
63            let new_index = self.as_bytes()[lower_bound..=index].iter().rposition(|&b| {
64                // b.is_utf8_char_boundary()
65                (b as i8) >= -0x40
66            });
67
68            // SAFETY: we know that the character boundary will be within four bytes
69            unsafe { lower_bound + new_index.unwrap_unchecked() }
70        }
71    }
72
73    #[inline]
74    fn ceil_char_boundary_ib(&self, index: usize) -> usize {
75        if index > self.len() {
76            self.len()
77        } else {
78            let upper_bound = Ord::min(index + 4, self.len());
79            self.as_bytes()[index..upper_bound]
80                .iter()
81                .position(|&b| {
82                    // b.is_utf8_char_boundary()
83                    (b as i8) >= -0x40
84                })
85                .map_or(upper_bound, |pos| pos + index)
86        }
87    }
88}