fontdue/unicode/
mod.rs

1mod tables;
2
3use crate::unicode::tables::*;
4use alloc::string::String;
5
/// Mask selecting the low six bits of a byte, i.e. the payload of a UTF-8 continuation byte.
const CONT_MASK: u8 = 0b0011_1111;

/// Shifts the accumulator `ch` left by six bits and ORs in the low six bits of `byte`.
///
/// The two high bits of `byte` are discarded by `CONT_MASK`.
#[inline(always)]
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    let payload = u32::from(byte & CONT_MASK);
    (ch << 6) | payload
}
12
/// Decodes big-endian UTF-16 `bytes` into a `String`.
///
/// Every consecutive pair of bytes is read as one big-endian code unit. Malformed input never
/// panics and never produces an invalid `char`:
///
/// * an unpaired surrogate decodes to U+FFFD (REPLACEMENT CHARACTER);
/// * a trailing odd byte, which cannot form a code unit, yields one final U+FFFD.
///
/// Well-formed input decodes to exactly the characters it encodes.
pub fn decode_utf16(bytes: &[u8]) -> String {
    let pairs = bytes.chunks_exact(2);
    // Remember whether a lone byte is left over before the iterator is consumed.
    let has_dangling_byte = !pairs.remainder().is_empty();
    let units = pairs.map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
    let mut output: String = core::char::decode_utf16(units)
        .map(|decoded| decoded.unwrap_or(core::char::REPLACEMENT_CHARACTER))
        .collect();
    if has_dangling_byte {
        output.push(core::char::REPLACEMENT_CHARACTER);
    }
    output
}
21
/// Reads the big-endian UTF-16 code unit that starts at `offset`.
///
/// Returns `None` when fewer than two bytes are available at `offset`.
fn utf16_unit_at(bytes: &[u8], offset: usize) -> Option<u16> {
    let end = offset.checked_add(2)?;
    let pair = bytes.get(offset..end)?;
    Some(u16::from_be_bytes([pair[0], pair[1]]))
}

/// Decodes one character of big-endian UTF-16 from `bytes`, starting at `*offset`.
///
/// `offset` is advanced past the bytes that were consumed: two for a character in the Basic
/// Multilingual Plane, four for a valid surrogate pair.
///
/// Malformed input yields U+FFFD (REPLACEMENT CHARACTER) instead of an invalid `char`:
///
/// * An unpaired surrogate consumes only its own two bytes, so the following unit is decoded by
///   the next call.
/// * If fewer than two bytes remain, `offset` is moved to `bytes.len()` when it was before the
///   end (it is left unchanged when already at or past the end) and U+FFFD is returned.
pub fn read_utf16(bytes: &[u8], offset: &mut usize) -> char {
    const REPLACEMENT: char = core::char::REPLACEMENT_CHARACTER;

    let first = match utf16_unit_at(bytes, *offset) {
        Some(unit) => unit,
        None => {
            // Swallow a dangling byte so loops of the form `while offset < len` terminate.
            if *offset < bytes.len() {
                *offset = bytes.len();
            }
            return REPLACEMENT;
        }
    };
    *offset += 2;

    let scalar = match first {
        // High surrogate: only meaningful when immediately followed by a low surrogate.
        0xD800..=0xDBFF => match utf16_unit_at(bytes, *offset) {
            Some(second @ 0xDC00..=0xDFFF) => {
                *offset += 2;
                0x1_0000 + ((u32::from(first - 0xD800) << 10) | u32::from(second - 0xDC00))
            }
            _ => return REPLACEMENT,
        },
        // Low surrogate without a preceding high surrogate.
        0xDC00..=0xDFFF => return REPLACEMENT,
        _ => u32::from(first),
    };
    core::char::from_u32(scalar).unwrap_or(REPLACEMENT)
}
34
35/// Returns (length, character). Cannot be run at the end of the string.
36pub fn read_utf8(bytes: &[u8], byte_offset: &mut usize) -> char {
37    let x = bytes[*byte_offset];
38    *byte_offset += 1;
39    if x < 128 {
40        return unsafe { core::char::from_u32_unchecked(x as u32) };
41    }
42    let init = (x & (0x7F >> 2)) as u32;
43    let y = bytes[*byte_offset];
44    *byte_offset += 1;
45    let mut ch = utf8_acc_cont_byte(init, y);
46    if x >= 0xE0 {
47        let z = bytes[*byte_offset];
48        *byte_offset += 1;
49        let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
50        ch = init << 12 | y_z;
51        if x >= 0xF0 {
52            let w = bytes[*byte_offset];
53            *byte_offset += 1;
54            ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
55        }
56    }
57    unsafe { core::char::from_u32_unchecked(ch) }
58}
59
/// A small bit set describing line breaks.
///
/// The derived ordering compares the raw bits, so
/// `LINEBREAK_HARD > LINEBREAK_SOFT > LINEBREAK_NONE`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct LinebreakData {
    bits: u8,
}

/// Value with no break bits set.
pub const LINEBREAK_NONE: LinebreakData = LinebreakData::new(LinebreakData::NONE);
/// Value with only the soft-break bit set.
pub const LINEBREAK_SOFT: LinebreakData = LinebreakData::new(LinebreakData::SOFT);
/// Value with only the hard-break bit set.
pub const LINEBREAK_HARD: LinebreakData = LinebreakData::new(LinebreakData::HARD);

impl LinebreakData {
    const NONE: u8 = 0b0000_0000;
    const SOFT: u8 = 0b0000_0001;
    const HARD: u8 = 0b0000_0010;

    /// Wraps raw bits without any validation.
    const fn new(bits: u8) -> LinebreakData {
        LinebreakData { bits }
    }

    /// Builds a mask of the break kinds that are enabled.
    ///
    /// The hard bit is set when `wrap_hard_breaks` is true. The soft bit is set only when both
    /// `wrap_soft_breaks` and `has_width` are true.
    pub fn from_mask(
        wrap_soft_breaks: bool,
        wrap_hard_breaks: bool,
        has_width: bool,
    ) -> LinebreakData {
        let hard = if wrap_hard_breaks { Self::HARD } else { Self::NONE };
        let soft = if wrap_soft_breaks && has_width { Self::SOFT } else { Self::NONE };
        Self::new(hard | soft)
    }

    /// Returns true when the bits are exactly the hard-break bit.
    ///
    /// A value that has both bits set is not reported as hard.
    pub fn is_hard(&self) -> bool {
        Self::HARD == self.bits
    }

    /// Returns true when the bits are exactly the soft-break bit.
    ///
    /// A value that has both bits set is not reported as soft.
    pub fn is_soft(&self) -> bool {
        Self::SOFT == self.bits
    }

    /// Returns the bitwise AND of `self` and `other`.
    pub fn mask(&self, other: LinebreakData) -> LinebreakData {
        LinebreakData {
            bits: other.bits & self.bits,
        }
    }
}
106
107#[derive(Debug, Copy, Clone)]
108pub struct Linebreaker {
109    state: u8,
110}
111
112impl Linebreaker {
113    pub fn new() -> Linebreaker {
114        Linebreaker {
115            state: 0,
116        }
117    }
118
119    pub fn reset(&mut self) {
120        self.state = 0;
121    }
122
123    // [See license/xi-editor/xi-unicode] Copyright 2016 The xi-editor Authors
124    pub fn next(&mut self, codepoint: char) -> LinebreakData {
125        let cp = codepoint as usize;
126        let lb = if cp < 0x800 {
127            LINEBREAK_1_2[cp]
128        } else if cp < 0x10000 {
129            let child = LINEBREAK_3_ROOT[cp >> 6];
130            LINEBREAK_3_CHILD[(child as usize) * 0x40 + (cp & 0x3f)]
131        } else {
132            let mid = LINEBREAK_4_ROOT[cp >> 12];
133            let leaf = LINEBREAK_4_MID[(mid as usize) * 0x40 + ((cp >> 6) & 0x3f)];
134            LINEBREAK_4_LEAVES[(leaf as usize) * 0x40 + (cp & 0x3f)]
135        };
136        let i = (self.state as usize) * N_LINEBREAK_CATEGORIES + (lb as usize);
137        let new = LINEBREAK_STATE_MACHINE[i];
138        if (new as i8) < 0 {
139            self.state = new & 0x3f;
140            if new >= 0xc0 {
141                LINEBREAK_HARD
142            } else {
143                LINEBREAK_SOFT
144            }
145        } else {
146            self.state = new;
147            LINEBREAK_NONE
148        }
149    }
150}
151
/// Per-character flags used during layout: whitespace, control, and missing-from-font.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct CharacterData {
    bits: u8,
}

impl CharacterData {
    const WHITESPACE: u8 = 0b0000_0001;
    const CONTROL: u8 = 0b0000_0010;
    const MISSING: u8 = 0b0000_0100;

    /// Classifies the character `c`, whose glyph index in the font is `index`.
    ///
    /// * An `index` of `0` marks the character as missing.
    /// * Tab, line feed, form feed, carriage return and space are marked as whitespace.
    /// * U+0000 through U+001F and U+007F are marked as control characters.
    pub fn classify(c: char, index: u16) -> CharacterData {
        let missing = if index == 0 { Self::MISSING } else { 0 };
        let whitespace = if c.is_ascii_whitespace() { Self::WHITESPACE } else { 0 };
        let control = if c.is_ascii_control() { Self::CONTROL } else { 0 };
        CharacterData {
            bits: missing | whitespace | control,
        }
    }

    /// Returns true when every bit of `flag` that is set in `self` is non-empty, i.e. the flag
    /// is present.
    fn has(&self, flag: u8) -> bool {
        (self.bits & flag) != 0
    }

    /// A heuristic for whether the glyph this was classified from should be rasterized.
    /// Returns true only when no flag is set, so missing glyphs, whitespace, and control
    /// characters all return false.
    pub fn rasterize(&self) -> bool {
        0 == self.bits
    }

    /// Returns true if the character was classified as ASCII whitespace.
    pub fn is_whitespace(&self) -> bool {
        self.has(Self::WHITESPACE)
    }

    /// Returns true if the character was classified as an ASCII control character.
    pub fn is_control(&self) -> bool {
        self.has(Self::CONTROL)
    }

    /// Returns true if the character was classified as missing from its font (glyph index `0`).
    pub fn is_missing(&self) -> bool {
        self.has(Self::MISSING)
    }
}