Skip to main content

kas_text/display/
text_runs.rs

1// Licensed under the Apache License, Version 2.0 (the "License");
2// you may not use this file except in compliance with the License.
3// You may obtain a copy of the License in the LICENSE-APACHE file or at:
4//     https://www.apache.org/licenses/LICENSE-2.0
5
6//! Text preparation: line breaking and BIDI
7
8#![allow(clippy::unnecessary_unwrap)]
9
10use super::TextDisplay;
11use crate::conv::{to_u32, to_usize};
12use crate::fonts::{self, FaceId, FontSelector, NoFontMatch};
13use crate::format::FormattableText;
14use crate::util::ends_with_hard_break;
15use crate::{Direction, Range, shaper};
16use icu_properties::props::{EmojiModifier, EmojiPresentation, RegionalIndicator, Script};
17use icu_properties::{CodePointMapData, CodePointSetData};
18use icu_segmenter::LineSegmenter;
19use std::sync::OnceLock;
20use unicode_bidi::{BidiInfo, LTR_LEVEL, RTL_LEVEL};
21
/// Special handling which applies to the end (or the whole) of a shaped run
///
/// Equality is total, so `Eq` is derived alongside `PartialEq` (matching the
/// other plain enums in this module).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum RunSpecial {
    /// No special handling
    None,
    /// Run ends with a hard break
    HardBreak,
    /// Run does not end with a break
    NoBreak,
    /// Run is a horizontal tab (run is a single char only)
    HTab,
}
32
33impl TextDisplay {
34    /// Update font size
35    ///
36    /// [Requires status][Self#status-of-preparation]: level runs have been
37    /// prepared and are valid in all ways except size (`dpem`).
38    ///
39    /// This updates the result of [`TextDisplay::prepare_runs`] due to change
40    /// in font size.
41    pub fn resize_runs<F: FormattableText + ?Sized>(&mut self, text: &F, mut dpem: f32) {
42        let mut font_tokens = text.font_tokens(dpem);
43        let mut next_fmt = font_tokens.next();
44
45        let text = text.as_str();
46
47        for run in &mut self.runs {
48            while let Some(fmt) = next_fmt.as_ref() {
49                if fmt.start > run.range.start {
50                    break;
51                }
52                dpem = fmt.dpem;
53                next_fmt = font_tokens.next();
54            }
55
56            let input = shaper::Input {
57                text,
58                dpem,
59                level: run.level,
60                script: run.script,
61            };
62            let mut breaks = Default::default();
63            std::mem::swap(&mut breaks, &mut run.breaks);
64            if run.level.is_rtl() {
65                breaks.reverse();
66            }
67            *run = shaper::shape(input, run.range, run.face_id, breaks, run.special);
68        }
69    }
70
71    /// Resolve font face and shape run
72    ///
73    /// This may sub-divide text as required to find matching fonts.
74    fn push_run(
75        &mut self,
76        font: FontSelector,
77        input: shaper::Input,
78        range: Range,
79        mut breaks: tinyvec::TinyVec<[shaper::GlyphBreak; 4]>,
80        special: RunSpecial,
81        first_real: Option<char>,
82    ) -> Result<(), NoFontMatch> {
83        let fonts = fonts::library();
84        let font_id = fonts.select_font(&font, input.script.into())?;
85        let text = &input.text[range.to_std()];
86
87        // Find a font face
88        let mut face_id = None;
89        if let Some(c) = first_real {
90            face_id = fonts
91                .face_for_char(font_id, None, c)
92                .expect("invalid FontId");
93        }
94
95        let mut face = match face_id {
96            Some(id) => id,
97            None => {
98                // We failed to find a font face for the run
99                fonts.first_face_for(font_id).expect("invalid FontId")
100            }
101        };
102
103        let mut start = 0;
104        for (index, c) in text.char_indices() {
105            let index = to_u32(index);
106            if let Some(new_face) = fonts
107                .face_for_char(font_id, Some(face), c)
108                .expect("invalid FontId")
109                && new_face != face
110            {
111                if index > start {
112                    let sub_range = Range {
113                        start: range.start + start,
114                        end: range.start + index,
115                    };
116                    let mut j = 0;
117                    for i in 0..breaks.len() {
118                        if breaks[i].index < sub_range.end {
119                            j = i + 1;
120                        }
121                    }
122                    let rest = breaks.split_off(j);
123
124                    self.runs.push(shaper::shape(
125                        input,
126                        sub_range,
127                        face,
128                        breaks,
129                        RunSpecial::NoBreak,
130                    ));
131                    breaks = rest;
132                    start = index;
133                }
134
135                face = new_face;
136            }
137        }
138
139        let sub_range = Range {
140            start: range.start + start,
141            end: range.end,
142        };
143        self.runs
144            .push(shaper::shape(input, sub_range, face, breaks, special));
145        Ok(())
146    }
147
148    /// Break text into level runs
149    ///
150    /// [Requires status][Self#status-of-preparation]: none.
151    ///
152    /// Must be called again if any of `text`, `direction` or `font` change.
153    /// If only `dpem` changes, [`Self::resize_runs`] may be called instead.
154    ///
155    /// The text is broken into a set of contiguous "level runs". These runs are
156    /// maximal slices of the `text` which do not contain explicit line breaks
157    /// and have a single text direction according to the
158    /// [Unicode Bidirectional Algorithm](http://www.unicode.org/reports/tr9/).
159    pub fn prepare_runs<F: FormattableText + ?Sized>(
160        &mut self,
161        text: &F,
162        direction: Direction,
163        mut font: FontSelector,
164        mut dpem: f32,
165    ) -> Result<(), NoFontMatch> {
166        // This method constructs a list of "hard lines" (the initial line and any
167        // caused by a hard break), each composed of a list of "level runs" (the
168        // result of splitting and reversing according to Unicode TR9 aka
169        // Bidirectional algorithm), plus a list of "soft break" positions
170        // (where wrapping may introduce new lines depending on available space).
171
172        self.runs.clear();
173
174        let mut font_tokens = text.font_tokens(dpem);
175        let mut next_fmt = font_tokens.next();
176        if let Some(fmt) = next_fmt.as_ref()
177            && fmt.start == 0
178        {
179            font = fmt.font;
180            dpem = fmt.dpem;
181            next_fmt = font_tokens.next();
182        }
183
184        let text = text.as_str();
185
186        let default_para_level = match direction {
187            Direction::Auto => None,
188            Direction::AutoRtl => {
189                use unicode_bidi::Direction::*;
190                match unicode_bidi::get_base_direction(text) {
191                    Ltr | Rtl => None,
192                    Mixed => Some(RTL_LEVEL),
193                }
194            }
195            Direction::Ltr => Some(LTR_LEVEL),
196            Direction::Rtl => Some(RTL_LEVEL),
197        };
198        let info = BidiInfo::new(text, default_para_level);
199        let levels = info.levels;
200        assert_eq!(text.len(), levels.len());
201
202        let mut input = shaper::Input {
203            text,
204            dpem,
205            level: levels.first().cloned().unwrap_or(LTR_LEVEL),
206            script: Script::Unknown,
207        };
208
209        let mut start = 0;
210        let mut breaks = Default::default();
211
212        // TODO: allow segmenter configuration
213        let segmenter = LineSegmenter::new_auto(Default::default());
214        let mut break_iter = segmenter.segment_str(text);
215        let mut next_break = break_iter.next();
216
217        let mut first_real = None;
218        let mut emoji_state = EmojiState::None;
219        let mut emoji_start = 0;
220        let mut emoji_end = 0;
221
222        let mut last_is_control = false;
223        let mut last_is_htab = false;
224        let mut non_control_end = 0;
225
226        for (index, c) in text
227            .char_indices()
228            .chain(std::iter::once((text.len(), '\0')))
229        {
230            // Handling for control chars
231            if !last_is_control {
232                non_control_end = index;
233            }
234            let is_control = c.is_control();
235            let is_htab = c == '\t';
236            let mut require_break = is_htab;
237
238            // Is wrapping allowed at this position?
239            let is_break = next_break == Some(index);
240            let hard_break = is_break && ends_with_hard_break(&text[..index]);
241            if is_break {
242                next_break = break_iter.next();
243            }
244
245            let script = CodePointMapData::<Script>::new().get(c);
246
247            let emoji_break = emoji_state.advance(c);
248            let mut new_emoji_start = emoji_start;
249            let mut is_emoji = false;
250            let prohibit_break = match emoji_break {
251                EmojiBreak::None => false,
252                EmojiBreak::Start => {
253                    require_break = true;
254                    new_emoji_start = index;
255                    false
256                }
257                EmojiBreak::Prohibit => {
258                    emoji_end = index;
259                    true
260                }
261                EmojiBreak::End => {
262                    require_break = true;
263                    emoji_end = index;
264                    debug_assert!(emoji_end > emoji_start);
265                    is_emoji = true;
266                    false
267                }
268                EmojiBreak::Restart => {
269                    require_break = true;
270                    emoji_end = index;
271                    new_emoji_start = index;
272                    debug_assert!(emoji_end > emoji_start);
273                    is_emoji = true;
274                    false
275                }
276                EmojiBreak::Error => {
277                    is_emoji = emoji_end > emoji_start;
278                    require_break = is_emoji;
279                    false
280                }
281            };
282
283            // Force end of current run?
284            require_break |= levels
285                .get(index)
286                .map(|level| *level != input.level)
287                .unwrap_or(true);
288
289            if let Some(fmt) = next_fmt.as_ref()
290                && to_usize(fmt.start) == index
291            {
292                font = fmt.font;
293                dpem = fmt.dpem;
294                next_fmt = font_tokens.next();
295            }
296
297            let mut new_script = None;
298            if is_real(script) {
299                if first_real.is_none() {
300                    first_real = Some(c);
301                }
302                if script != input.script {
303                    new_script = Some(script);
304                    require_break |= is_real(input.script);
305                }
306            }
307
308            if !prohibit_break && (hard_break || require_break) {
309                let special = match () {
310                    _ if hard_break => RunSpecial::HardBreak,
311                    _ if last_is_htab => RunSpecial::HTab,
312                    _ if last_is_control || is_break => RunSpecial::None,
313                    _ => RunSpecial::NoBreak,
314                };
315
316                if is_emoji {
317                    let range = (emoji_start..emoji_end).into();
318                    let face = emoji_face_id()?;
319                    self.runs
320                        .push(shaper::shape(input, range, face, breaks, special));
321                } else {
322                    // NOTE: the range may be empty; we need it anyway (unless
323                    // we modify the last run's special property).
324                    let range = (start..non_control_end).into();
325                    self.push_run(font, input, range, breaks, special, first_real)?;
326                };
327                first_real = None;
328
329                start = index;
330                non_control_end = index;
331                if let Some(level) = levels.get(index) {
332                    input.level = *level;
333                }
334                input.script = script;
335                breaks = Default::default();
336            } else if is_break && !is_control {
337                // We do break runs when hitting control chars, but only when
338                // encountering the next non-control character.
339                breaks.push(shaper::GlyphBreak::new(to_u32(index)));
340            }
341
342            last_is_control = is_control;
343            last_is_htab = is_htab;
344            emoji_start = new_emoji_start;
345            input.dpem = dpem;
346            if let Some(script) = new_script {
347                input.script = script;
348            }
349        }
350
351        let hard_break = ends_with_hard_break(&text);
352
353        // Following a hard break we have an implied empty line.
354        if hard_break {
355            let range = (text.len()..text.len()).into();
356            input.level = default_para_level.unwrap_or(LTR_LEVEL);
357            breaks = Default::default();
358            self.push_run(font, input, range, breaks, RunSpecial::None, None)?;
359        }
360
361        /*
362        println!("text: {}", text);
363        let fonts = fonts::library();
364        for run in &self.runs {
365            let slice = &text[run.range];
366            print!(
367                "\t{:?}, text[{}..{}]: '{}', ",
368                run.level, run.range.start, run.range.end, slice
369            );
370            match run.special {
371                RunSpecial::None => (),
372                RunSpecial::HardBreak => print!("HardBreak, "),
373                RunSpecial::NoBreak => print!("NoBreak, "),
374                RunSpecial::HTab => print!("HTab, "),
375            }
376            print!("breaks=[");
377            let mut iter = run.breaks.iter();
378            if let Some(b) = iter.next() {
379                print!("{}", b.index);
380            }
381            for b in iter {
382                print!(", {}", b.index);
383            }
384            print!("]");
385            if let Some(name) = fonts.get_face_store(run.face_id).name_full() {
386                print!(", {name}");
387            }
388            println!();
389        }
390        */
391        Ok(())
392    }
393}
394
395fn is_real(script: Script) -> bool {
396    !matches!(script, Script::Common | Script::Unknown | Script::Inherited)
397}
398
399fn emoji_face_id() -> Result<FaceId, NoFontMatch> {
400    static ONCE: OnceLock<Result<FaceId, NoFontMatch>> = OnceLock::new();
401    *ONCE.get_or_init(|| {
402        let fonts = fonts::library();
403        let font = fonts.select_font(&FontSelector::EMOJI, Script::Common.into());
404        font.map(|font_id| fonts.first_face_for(font_id).expect("invalid FontId"))
405    })
406}
407
/// Result of feeding one character to [`EmojiState::advance`]
///
/// Describes what should happen at the position *before* that character.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum EmojiBreak {
    /// Not an Emoji
    ///
    /// Also returned for characters which continue a sequence without
    /// requiring any action at this position.
    None,
    /// Start of an Emoji sequence
    Start,
    /// Mid Emoji sequence, valid
    ///
    /// A run break must not be made at this position.
    Prohibit,
    /// End of a valid Emoji sequence
    End,
    /// End of one Emoji and start of another
    Restart,
    /// Error; revert to last known good index
    Error,
}
423
/// State of the emoji sequence recogniser (see [`EmojiState::advance`])
///
/// Each variant names what was consumed most recently.
enum EmojiState {
    /// Not within an emoji sequence
    None,
    /// One regional indicator consumed
    RI1,
    /// A pair of regional indicators consumed
    RI2,
    /// An emoji character consumed
    Emoji,
    /// An emoji modifier consumed; only a ZWJ can extend the sequence
    EMod,
    /// A variation selector (U+FE0F) consumed
    VarSelector,
    /// Within a tag sequence (U+E0020..=U+E007E)
    TagModifier,
    /// A zero-width joiner (U+200D) consumed; another element must follow
    ZWJ,
}
434
435impl EmojiState {
436    /// Advance the emoji state machine
437    ///
438    /// Returns whether a break should occur before `c`.
439    fn advance(&mut self, c: char) -> EmojiBreak {
440        // Reference: https://unicode.org/reports/tr51/#EBNF_and_Regex
441        #[allow(non_snake_case)]
442        fn end_unless_ZWJ(c: char, b: &mut EmojiBreak) -> EmojiState {
443            if c == '\u{200D}' {
444                EmojiState::ZWJ
445            } else {
446                *b = EmojiBreak::End;
447                EmojiState::None
448            }
449        }
450        let mut b = EmojiBreak::None;
451        *self = match *self {
452            EmojiState::None => {
453                if CodePointSetData::new::<RegionalIndicator>().contains(c) {
454                    b = EmojiBreak::Start;
455                    EmojiState::RI1
456                } else if CodePointSetData::new::<EmojiPresentation>().contains(c) {
457                    b = EmojiBreak::Start;
458                    EmojiState::Emoji
459                } else {
460                    EmojiState::None
461                }
462            }
463            EmojiState::RI1 => {
464                if CodePointSetData::new::<RegionalIndicator>().contains(c) {
465                    b = EmojiBreak::Prohibit;
466                    EmojiState::RI2
467                } else {
468                    b = EmojiBreak::Error;
469                    EmojiState::None
470                }
471            }
472            EmojiState::RI2 => end_unless_ZWJ(c, &mut b),
473            EmojiState::Emoji => {
474                if CodePointSetData::new::<EmojiModifier>().contains(c) {
475                    EmojiState::EMod
476                } else if c == '\u{FE0F}' {
477                    EmojiState::VarSelector
478                } else if ('\u{E0020}'..='\u{E007E}').contains(&c) {
479                    EmojiState::TagModifier
480                } else if c == '\u{200D}' {
481                    EmojiState::ZWJ
482                } else {
483                    b = EmojiBreak::End;
484                    EmojiState::None
485                }
486            }
487            EmojiState::EMod => end_unless_ZWJ(c, &mut b),
488            EmojiState::VarSelector => {
489                if c == '\u{20E3}' {
490                    end_unless_ZWJ(c, &mut b)
491                } else {
492                    b = EmojiBreak::End;
493                    EmojiState::None
494                }
495            }
496            EmojiState::TagModifier => {
497                if ('\u{E0020}'..='\u{E007E}').contains(&c) {
498                    EmojiState::TagModifier
499                } else if c == '\u{E007F}' {
500                    end_unless_ZWJ(c, &mut b)
501                } else {
502                    b = EmojiBreak::Error;
503                    EmojiState::None
504                }
505            }
506            EmojiState::ZWJ => {
507                if CodePointSetData::new::<RegionalIndicator>().contains(c) {
508                    EmojiState::RI1
509                } else if CodePointSetData::new::<EmojiPresentation>().contains(c) {
510                    EmojiState::Emoji
511                } else {
512                    b = EmojiBreak::Error;
513                    EmojiState::None
514                }
515            }
516        };
517        if b == EmojiBreak::End {
518            *self = if CodePointSetData::new::<RegionalIndicator>().contains(c) {
519                b = EmojiBreak::Restart;
520                EmojiState::RI1
521            } else if CodePointSetData::new::<EmojiPresentation>().contains(c) {
522                b = EmojiBreak::Restart;
523                EmojiState::Emoji
524            } else {
525                EmojiState::None
526            };
527        }
528        b
529    }
530}