rustrict/
width.rs

1use crate::is_whitespace;
2use std::str::from_utf8;
3
/// Width, in tenths of an `m`, reported for any character that has no entry in `WIDTHS`.
///
/// The first byte of `character_widths.bin` must equal this value; that is asserted when
/// `WIDTHS` is first built.
const MODE_WIDTH: u8 = 10;
5
6lazy_static::lazy_static! {
7    static ref WIDTHS: Vec<(char, u8)> = {
8        use std::io::Read;
9        // Format of this file is documented in character_analyzer.rs
10        let mut raw = include_bytes!("character_widths.bin").as_slice();
11
12        // First byte is mode length.
13        let mut mode = [0u8];
14        raw.read(&mut mode).unwrap();
15        let mode = mode[0];
16
17        assert_eq!(mode, MODE_WIDTH);
18
19        let mut widths = Vec::new();
20
21        while !raw.is_empty() {
22            // Read one UTF-8 character.
23            // TODO: Once stable, use: utf8_char_width(raw[0])
24            let s = from_utf8(&raw[..1])
25                .or_else(|_| from_utf8(&raw[..2]))
26                .or_else(|_| from_utf8(&raw[..3]))
27                .or_else(|_| from_utf8(&raw[..4]))
28                .unwrap();
29            let c = s.chars().next().unwrap();
30            raw = &raw[c.len_utf8()..];
31
32            // After character comes a byte of length.
33            let mut len = [0u8];
34            raw.read(&mut len).unwrap();
35            let len = len[0];
36
37            widths.push((c, len));
38        }
39
40        widths
41    };
42}
43
44/// Returns an estimate of the worst-case display width in milli-`m`'s (thousandths of the
45/// the width of an `m` character).
46///
47/// For example, `width('m')` returns 1000 and `width('\u{FDFD}')` returns 10300 (wouldn't you like
48/// to know if your user's text is 10.3X longer per character than you might have expected?).
49///
50/// Precision is not necessarily 1 milli-`m` (currently, it is 100 milli-`m`'s).
51#[cfg_attr(doc, doc(cfg(feature = "width")))]
52pub fn width(c: char) -> usize {
53    let width = match WIDTHS.binary_search_by_key(&c, |&(c, _)| c) {
54        Ok(idx) => WIDTHS[idx].1,
55        Err(_) => MODE_WIDTH,
56    } as usize;
57
58    width * 100
59}
60
61/// Convenience method for getting the width, in `m`'s, of an entire string.
62///
63/// Warning: If the width overflows, the result is undefined (e.g. panic or overflow).
64#[cfg_attr(doc, doc(cfg(feature = "width")))]
65pub fn width_str(s: &str) -> usize {
66    s.chars().map(|c| width(c) / 100).sum::<usize>() / 10
67}
68
/// How text is expected to be displayed, i.e. which line-breaking rule the renderer applies.
///
/// Only `BreakAll` exists today. The enum is `#[non_exhaustive]` so that further modes can be
/// added without breaking callers; eventually, `BreakWord` will be supported.
#[derive(Copy, Clone, Debug)]
#[non_exhaustive]
pub enum WordBreak {
    // TODO: BreakWord
    /// Same as CSS's `word-break: break-all;`.
    BreakAll,
}
79
80/// Like `width_str` but computes the width of the max unbroken (no line break) part of the string.
81///
82/// In certain cases, not even CSS's `word-break: break-all;` (or equivalents) will be able to
83/// break a string, so it's good to know how long the lines might get.
84///
85/// For example, try selecting the following unbroken part: ௌௌௌௌ
86pub fn width_str_max_unbroken(s: &str, _word_break: WordBreak) -> usize {
87    let mut start = 0;
88    break_all_linebreaks(&s)
89        .map(|p| {
90            let unbroken = &s[start..p];
91            start = p;
92            width_str(unbroken.trim_end_matches(is_whitespace))
93        })
94        .max()
95        .unwrap_or(0)
96}
97
98// TODO unicode-linebreak = { version = "0.1.5", optional = true }
99
100fn break_all_linebreaks(s: &str) -> impl Iterator<Item = usize> + '_ {
101    use finl_unicode::categories::{CharacterCategories, MinorCategory};
102
103    use itertools::Itertools;
104    s.char_indices()
105        .tuple_windows()
106        .filter_map(|((_, c1), (p, c2))| {
107            let c1 = c1.get_minor_category();
108            let c2 = c2.get_minor_category();
109            let break_all = !matches!(c1, MinorCategory::Mn | MinorCategory::Mc)
110                && !matches!(c2, MinorCategory::Mn | MinorCategory::Mc);
111            if break_all
112                || [c1, c2]
113                    .into_iter()
114                    .any(|c| matches!(c, MinorCategory::Zs | MinorCategory::Zl))
115            {
116                Some(p)
117            } else {
118                None
119            }
120        })
121        .chain(std::iter::once(s.len()))
122}
123
124/// Trims a string to a maximum number of `m`'s. A budget of 5 would allow five m, or more narrower
125/// characters, or fewer wider characters.
126pub fn trim_to_width(s: &str, mut budget: usize) -> &str {
127    // Convert to milli-`m`'s.
128    budget *= 10;
129    for (idx, c) in s.char_indices() {
130        match budget.checked_sub(width(c) / 100) {
131            Some(new_budget) => budget = new_budget,
132            None => return &s[..idx],
133        }
134    }
135    return s;
136}
137
138#[cfg(test)]
139mod test {
140    use crate::width::{trim_to_width, width_str, WordBreak};
141    use crate::{width, width_str_max_unbroken, CensorStr};
142    use serial_test::serial;
143
144    /*
145    #[test]
146    pub fn i() {
147        assert_eq!(width('i'), 600);
148    }
149     */
150
151    #[test]
152    pub fn unbroken() {
153        let tests = [
154            ("", 0),
155            ("m", 1),
156            ("mm", 1),
157            ("m m", 1),
158            ("m     m", 1),
159            ("mm m", 1),
160            ("m mm", 1),
161            ("m;m", 1),
162        ];
163        for (s, w) in tests {
164            assert_eq!(width_str_max_unbroken(s, WordBreak::BreakAll), w, "{s} {w}");
165        }
166    }
167
168    #[test]
169    pub fn m() {
170        assert_eq!(width('m'), 1000);
171    }
172
173    #[test]
174    pub fn fdfd() {
175        // https://commons.wikimedia.org/wiki/File:Lateef_unicode_U%2BFDFD_2020-03-09_122519.png
176        assert_eq!(width('\u{FDFD}'), 10300)
177    }
178
179    #[test]
180    pub fn three_em_dash() {
181        assert!(width('⸻') >= 2500);
182    }
183
184    #[test]
185    pub fn lattice() {
186        assert!(width('𒐫') >= 3000);
187    }
188
189    #[test]
190    pub fn cuneiform() {
191        assert!(width('𒈙') >= 3000);
192    }
193
194    #[test]
195    pub fn javanese() {
196        assert!(width('꧅') >= 1500);
197    }
198
199    #[test]
200    pub fn tamil() {
201        assert_eq!(
202            width_str_max_unbroken("abc ௌௌௌௌ def", WordBreak::BreakAll),
203            10
204        );
205        assert_eq!(width_str_max_unbroken("abc ௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌௌ", WordBreak::BreakAll), 345);
206    }
207
208    #[test]
209    pub fn emoji() {
210        assert_eq!(width_str("😀🐿"), 4);
211    }
212
213    #[test]
214    pub fn cjk() {
215        assert_eq!(width_str("大はㅂ"), 6)
216    }
217
218    #[test]
219    pub fn string() {
220        //assert_eq!(width_str("abc‱DŽဪ"), 7);
221        assert_eq!(width_str("abc‱DŽဪ"), 8);
222    }
223
224    #[test]
225    #[serial]
226    pub fn tall() {
227        assert_eq!("a꧁a".censor(), "aa");
228    }
229
230    #[test]
231    #[serial]
232    pub fn trim() {
233        assert_eq!(trim_to_width("aa", 0), "");
234        assert_eq!(trim_to_width("mmm", 1), "m");
235        assert_eq!(trim_to_width("mmm", 2), "mm");
236        assert_eq!(trim_to_width("mmm", 3), "mmm");
237        assert_eq!(trim_to_width("mmm", 4), "mmm");
238
239        let mut s = String::new();
240        for u in 0..10000 {
241            if let Some(c) = char::from_u32(u) {
242                s.push(c);
243            }
244        }
245        for b in 0..1000 {
246            let t = trim_to_width(&s, b);
247            let w = width_str(t);
248            assert!(w <= b);
249            assert!(w + 15 >= b)
250        }
251    }
252}