melib/text/
wcwidth.rs

1/*
2 * meli - text mod.
3 *
4 * Copyright 2017-2020 Manos Pitsidianakis
5 *
6 * This file is part of meli.
7 *
8 * meli is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
12 *
13 * meli is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with meli. If not, see <http://www.gnu.org/licenses/>.
20 */
21
22/*
23 * This is an implementation of wcwidth() and wcswidth() as defined in
24 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
25 * <http://www.UNIX-systems.org/online.html>
26 *
27 * Markus Kuhn -- 2001-09-08 -- public domain
28 */
29
30// Update to Unicode 12
31
/// Expands to the integer literal `1` when the boolean expression passed in
/// evaluates to `true`, and to `0` otherwise.
///
/// The argument is evaluated exactly once. The integer type of the result is
/// left to inference at the call site.
#[macro_export]
macro_rules! big_if_true {
    ($cond:expr) => {
        if $cond { 1 } else { 0 }
    };
}
42
/// A single code point value, stored as a `u32`. `CodePointsIterator`
/// produces these by converting each `char` of a string.
type WChar = u32;
/// One entry of a lookup table: a `(first, last)` pair of code points.
/// `bisearch` treats both ends as inclusive.
type Interval = (WChar, WChar);
45
/// Iterator over the code points of a string slice, yielded as `u32` values.
///
/// Obtained through `CodePointsIter::code_points`. It borrows the underlying
/// string for `'a` and is cheap to clone, since it only wraps a
/// `std::str::Chars` cursor.
#[derive(Debug, Clone)]
pub struct CodePointsIterator<'a> {
    /// Characters of the source string that have not been yielded yet.
    rest: std::str::Chars<'a>,
}
49
50/*
51 * UTF-8 uses a system of binary prefixes, in which the high bits of each
52 * byte mark whether it’s a single byte, the beginning of a multi-byte
53 * sequence, or a continuation byte; the remaining bits, concatenated, give
54 * the code point index. This table shows how it works:
55 *
56 * ```text
57 * UTF-8 (binary)                      |Code point (binary)    |Range
58 * ------------------------------------+-----------------------+-------
59 * 0xxxxxxx                            |xxxxxxx                |U+0000–U+007F
60 * 110xxxxx 10yyyyyy                   |xxxxxyyyyyy            |U+0080–U+07FF
61 * 1110xxxx 10yyyyyy 10zzzzzz          |xxxxyyyyyyzzzzzz       |U+0800–U+FFFF
62 * 11110xxx 10yyyyyy 10zzzzzz 10wwwwww |xxxyyyyyyzzzzzzwwwwww  |U+10000–U+10FFFF
63 * ```
64 *
65 */
66impl Iterator for CodePointsIterator<'_> {
67    type Item = WChar;
68
69    fn next(&mut self) -> Option<WChar> {
70        self.rest.next().map(|c| c as WChar)
71    }
72}
/// Extension trait that exposes a value's contents as a sequence of code
/// points. This file implements it for `str` and `&str`.
pub trait CodePointsIter {
    /// Returns an iterator yielding each code point of `self` as a `u32`
    /// value; the iterator borrows from `self`.
    fn code_points(&self) -> CodePointsIterator<'_>;
}
76
77impl CodePointsIter for str {
78    fn code_points(&self) -> CodePointsIterator<'_> {
79        CodePointsIterator { rest: self.chars() }
80    }
81}
82impl CodePointsIter for &str {
83    fn code_points(&self) -> CodePointsIterator<'_> {
84        CodePointsIterator { rest: self.chars() }
85    }
86}
87
88/* auxiliary function for binary search in Interval table */
89fn bisearch(ucs: WChar, table: &'static [Interval]) -> bool {
90    let mut min = 0;
91    let mut mid;
92
93    if table.is_empty() {
94        return false;
95    }
96    let mut max = table.len() - 1;
97
98    if ucs < table[0].0 || ucs > table[max].1 {
99        return false;
100    }
101    while max >= min {
102        mid = (min + max) / 2;
103        if ucs > table[mid].1 {
104            min = mid + 1;
105        } else if ucs < table[mid].0 {
106            max = mid - 1;
107        } else {
108            return true;
109        }
110    }
111
112    false
113}
114
115pub fn wcwidth(ucs: WChar) -> Option<usize> {
116    if bisearch(ucs, super::tables::ASCII) {
117        Some(1)
118    } else if bisearch(ucs, super::tables::PRIVATE)
119        || bisearch(ucs, super::tables::NONPRINT)
120        || bisearch(ucs, super::tables::COMBINING)
121    {
122        None
123    } else if bisearch(ucs, super::tables::DOUBLEWIDE) {
124        Some(2)
125    } else if bisearch(ucs, super::tables::AMBIGUOUS) {
126        Some(1)
127    } else if bisearch(ucs, super::tables::UNASSIGNED) || bisearch(ucs, super::tables::WIDENEDIN9) {
128        Some(2)
129    } else {
130        Some(1)
131    }
132}
133
134pub fn wcswidth(mut pwcs: WChar, mut n: usize) -> Option<usize> {
135    let mut width = 0;
136
137    while pwcs > 0 && n > 0 {
138        if let Some(w) = wcwidth(pwcs) {
139            width += w;
140        } else {
141            return None;
142        }
143
144        pwcs += 1;
145        n -= 1;
146    }
147
148    Some(width)
149}
150
// Unit tests covering `code_points()` from this file and the
// `grapheme_width()` / `text_pr()` helpers imported from `crate::text`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::text::{grapheme_clusters::TextProcessing, TextPresentation};

    /// Checks the code point sequences produced by `code_points()` and the
    /// widths reported by `grapheme_width()` for a set of sample strings.
    #[test]
    fn test_wcwidth() {
        // `code_points()` yields every character, including an embedded NUL.
        assert_eq!(
            &"abc\0".code_points().collect::<Vec<_>>(),
            &[0x61, 0x62, 0x63, 0x0]
        );
        // Characters inside and outside the Basic Multilingual Plane.
        assert_eq!(&"●".code_points().collect::<Vec<_>>(), &[0x25cf]);
        assert_eq!(&"📎".code_points().collect::<Vec<_>>(), &[0x1f4ce]);
        assert_eq!(
            &"𐼹𐼺𐼻𐼼𐼽".code_points().collect::<Vec<_>>(),
            &[0x10F39, 0x10F3A, 0x10F3B, 0x10F3C, 0x10F3D]
        ); // Sogdian alphabet
        assert_eq!(
            &"𐼹a𐼽b".code_points().collect::<Vec<_>>(),
            &[0x10F39, 0x61, 0x10F3D, 0x62]
        ); // Sogdian alphabet
        // A trailing U+FE0E is reported as its own code point.
        assert_eq!(
            &"📎\u{FE0E}".code_points().collect::<Vec<_>>(),
            &[0x1f4ce, 0xfe0e]
        );
        // Expected widths for single symbols and emoji, with and without
        // U+FE0E appended.
        assert_eq!("●".grapheme_width(), 1);
        assert_eq!("●📎".grapheme_width(), 3);
        assert_eq!("●📎︎".grapheme_width(), 3);
        assert_eq!("●\u{FE0E}📎\u{FE0E}".grapheme_width(), 3);
        assert_eq!("🎃".grapheme_width(), 2);
        assert_eq!("👻".grapheme_width(), 2);
        assert_eq!("🛡︎".grapheme_width(), 2);
        assert_eq!("🛡︎".text_pr().grapheme_width(), 2);

        // Japanese text: seven characters are expected to measure 14, and
        // the same text interleaved with six symbols 20.
        assert_eq!("こんにちわ世界".grapheme_width(), 14);
        assert_eq!("こ★ん■に●ち▲わ☆世◆界".grapheme_width(), 20);
    }
}
188}