Skip to main content

kas_text/display/
text_runs.rs

1// Licensed under the Apache License, Version 2.0 (the "License");
2// you may not use this file except in compliance with the License.
3// You may obtain a copy of the License in the LICENSE-APACHE file or at:
4//     https://www.apache.org/licenses/LICENSE-2.0
5
6//! Text preparation: line breaking and BIDI
7
8#![allow(clippy::unnecessary_unwrap)]
9
10use super::TextDisplay;
11use crate::conv::{to_u32, to_usize};
12use crate::fonts::{self, FaceId, FontSelector, NoFontMatch};
13use crate::format::FormattableText;
14use crate::util::ends_with_hard_break;
15use crate::{Direction, Range, shaper};
16use icu_properties::props::{EmojiModifier, EmojiPresentation, RegionalIndicator, Script};
17use icu_properties::{CodePointMapData, CodePointSetData};
18use icu_segmenter::LineSegmenter;
19use std::sync::OnceLock;
20use unicode_bidi::{BidiInfo, LTR_LEVEL, RTL_LEVEL};
21
/// Special property of a run
///
/// Assigned by `TextDisplay::prepare_runs` / `TextDisplay::push_run` and
/// passed to `shaper::shape` together with the run's range.
#[derive(Clone, Copy, Debug, PartialEq)]
pub(crate) enum RunSpecial {
    /// No special property
    None,
    /// Run ends with a hard break
    HardBreak,
    /// Run does not end with a break
    NoBreak,
    /// Run is a horizontal tab (run is a single char only)
    HTab,
}
32
33impl TextDisplay {
34    /// Update font size
35    ///
36    /// [Requires status][Self#status-of-preparation]: level runs have been
37    /// prepared and are valid in all ways except size (`dpem`).
38    ///
39    /// This updates the result of [`TextDisplay::prepare_runs`] due to change
40    /// in font size.
41    pub fn resize_runs<F: FormattableText + ?Sized>(&mut self, text: &F, mut dpem: f32) {
42        let mut font_tokens = text.font_tokens(dpem);
43        let mut next_fmt = font_tokens.next();
44
45        let text = text.as_str();
46
47        for run in &mut self.runs {
48            while let Some(fmt) = next_fmt.as_ref() {
49                if fmt.start > run.range.start {
50                    break;
51                }
52                dpem = fmt.dpem;
53                next_fmt = font_tokens.next();
54            }
55
56            let input = shaper::Input {
57                text,
58                dpem,
59                level: run.level,
60                script: run.script,
61            };
62            let mut breaks = Default::default();
63            std::mem::swap(&mut breaks, &mut run.breaks);
64            if run.level.is_rtl() {
65                breaks.reverse();
66            }
67            *run = shaper::shape(input, run.range, run.face_id, breaks, run.special);
68        }
69    }
70
71    /// Resolve font face and shape run
72    ///
73    /// This may sub-divide text as required to find matching fonts.
74    fn push_run(
75        &mut self,
76        font: FontSelector,
77        input: shaper::Input,
78        range: Range,
79        mut breaks: tinyvec::TinyVec<[shaper::GlyphBreak; 4]>,
80        special: RunSpecial,
81        first_real: Option<char>,
82    ) -> Result<(), NoFontMatch> {
83        let fonts = fonts::library();
84        let font_id = fonts.select_font(&font, input.script.into())?;
85        let text = &input.text[range.to_std()];
86
87        // Find a font face
88        let mut face_id = None;
89        if let Some(c) = first_real {
90            face_id = fonts
91                .face_for_char(font_id, None, c)
92                .expect("invalid FontId");
93        }
94
95        let mut face = match face_id {
96            Some(id) => id,
97            None => {
98                // We failed to find a font face for the run
99                fonts.first_face_for(font_id).expect("invalid FontId")
100            }
101        };
102
103        let mut start = 0;
104        for (index, c) in text.char_indices() {
105            let index = to_u32(index);
106            if let Some(new_face) = fonts
107                .face_for_char(font_id, Some(face), c)
108                .expect("invalid FontId")
109                && new_face != face
110            {
111                if index > start {
112                    let sub_range = Range {
113                        start: range.start + start,
114                        end: range.start + index,
115                    };
116                    let mut j = 0;
117                    for i in 0..breaks.len() {
118                        if breaks[i].index < sub_range.end {
119                            j = i + 1;
120                        }
121                    }
122                    let rest = breaks.split_off(j);
123
124                    self.runs.push(shaper::shape(
125                        input,
126                        sub_range,
127                        face,
128                        breaks,
129                        RunSpecial::NoBreak,
130                    ));
131                    breaks = rest;
132                    start = index;
133                }
134
135                face = new_face;
136            }
137        }
138
139        let sub_range = Range {
140            start: range.start + start,
141            end: range.end,
142        };
143        self.runs
144            .push(shaper::shape(input, sub_range, face, breaks, special));
145        Ok(())
146    }
147
    /// Break text into level runs
    ///
    /// [Requires status][Self#status-of-preparation]: none.
    ///
    /// Must be called again if any of `text`, `direction` or `font` change.
    /// If only `dpem` changes, [`Self::resize_runs`] may be called instead.
    ///
    /// The text is broken into a set of contiguous "level runs". These runs are
    /// maximal slices of the `text` which do not contain explicit line breaks
    /// and have a single text direction according to the
    /// [Unicode Bidirectional Algorithm](http://www.unicode.org/reports/tr9/).
    ///
    /// Runs are additionally ended at font-token boundaries, where the script
    /// changes between two "real" scripts, after a horizontal tab and at the
    /// boundaries of recognised emoji sequences.
    ///
    /// # Errors
    ///
    /// Returns [`NoFontMatch`] when this is reported while pushing a run or
    /// while resolving the emoji font face.
    ///
    /// # Panics
    ///
    /// Panics if the number of BIDI levels differs from `text.len()`.
    pub fn prepare_runs<F: FormattableText + ?Sized>(
        &mut self,
        text: &F,
        direction: Direction,
        mut font: FontSelector,
        mut dpem: f32,
    ) -> Result<(), NoFontMatch> {
        // This method constructs a list of "hard lines" (the initial line and any
        // caused by a hard break), each composed of a list of "level runs" (the
        // result of splitting and reversing according to Unicode TR9 aka
        // Bidirectional algorithm), plus a list of "soft break" positions
        // (where wrapping may introduce new lines depending on available space).

        self.runs.clear();

        // A font token starting at 0 is applied up-front; otherwise it would
        // be seen as a format change at index 0 inside the loop below.
        let mut font_tokens = text.font_tokens(dpem);
        let mut next_fmt = font_tokens.next();
        if let Some(fmt) = next_fmt.as_ref()
            && fmt.start == 0
        {
            font = fmt.font;
            dpem = fmt.dpem;
            next_fmt = font_tokens.next();
        }

        let text = text.as_str();

        // `None` lets BidiInfo determine the paragraph level itself.
        let default_para_level = match direction {
            Direction::Auto => None,
            Direction::AutoRtl => {
                use unicode_bidi::Direction::*;
                match unicode_bidi::get_base_direction(text) {
                    Ltr | Rtl => None,
                    Mixed => Some(RTL_LEVEL),
                }
            }
            Direction::Ltr => Some(LTR_LEVEL),
            Direction::Rtl => Some(RTL_LEVEL),
        };
        let info = BidiInfo::new(text, default_para_level);
        let levels = info.levels;
        // `levels` is indexed below using byte offsets into `text`
        assert_eq!(text.len(), levels.len());

        // Properties of the run currently being accumulated
        let mut input = shaper::Input {
            text,
            dpem,
            level: levels.first().cloned().unwrap_or(LTR_LEVEL),
            script: Script::Unknown,
        };

        // Start of the current run and the soft breaks found within it
        let mut start = 0;
        let mut breaks = Default::default();

        // TODO: allow segmenter configuration
        let segmenter = LineSegmenter::new_auto(Default::default());
        let mut break_iter = segmenter.segment_str(text);
        let mut next_break = break_iter.next();

        // First char of the current run with a "real" script (see `is_real`);
        // passed to `push_run` for face selection.
        let mut first_real = None;
        // Emoji sequence tracking: state machine plus candidate range
        let mut emoji_state = EmojiState::None;
        let mut emoji_start = 0;
        let mut emoji_end = 0;

        let mut last_is_control = false;
        let mut last_is_htab = false;
        // Index at which the current trailing sequence of control chars (if
        // any) begins; used as the end of non-emoji runs.
        let mut non_control_end = 0;

        // Each iteration examines the boundary *before* `c`. A sentinel at
        // `text.len()` ensures the final run is pushed (no level exists at
        // that index, which forces a break below).
        for (index, c) in text
            .char_indices()
            .chain(std::iter::once((text.len(), '\0')))
        {
            // Handling for control chars
            if !last_is_control {
                non_control_end = index;
            }
            let is_htab = c == '\t';
            // The run always ends directly after a tab
            let mut require_break = last_is_htab;
            let is_control = c.is_control();

            // Is wrapping allowed at this position?
            let is_break = next_break == Some(index);
            let hard_break = is_break && ends_with_hard_break(&text[..index]);
            if is_break {
                next_break = break_iter.next();
            }

            let script = CodePointMapData::<Script>::new().get(c);

            // Advance the emoji state machine. `emoji_start` is only updated
            // at the end of the iteration since the run pushed below (if any)
            // uses the previous value.
            let emoji_break = emoji_state.advance(c);
            let mut new_emoji_start = emoji_start;
            let mut is_emoji = false;
            let prohibit_break = match emoji_break {
                EmojiBreak::None => false,
                EmojiBreak::Start => {
                    require_break = true;
                    new_emoji_start = index;
                    false
                }
                EmojiBreak::Prohibit => {
                    emoji_end = index;
                    true
                }
                EmojiBreak::End => {
                    require_break = true;
                    emoji_end = index;
                    debug_assert!(emoji_end > emoji_start);
                    is_emoji = true;
                    false
                }
                EmojiBreak::Restart => {
                    require_break = true;
                    emoji_end = index;
                    new_emoji_start = index;
                    debug_assert!(emoji_end > emoji_start);
                    is_emoji = true;
                    false
                }
                EmojiBreak::Error => {
                    // Only a non-empty emoji_start..emoji_end range counts as
                    // an emoji run here.
                    is_emoji = emoji_end > emoji_start;
                    require_break = is_emoji;
                    false
                }
            };

            // Force end of current run?
            require_break |= levels
                .get(index)
                .map(|level| *level != input.level)
                .unwrap_or(true);

            // A font token starting here ends the run; the token itself is
            // applied after the run has been pushed (see below).
            if let Some(fmt) = next_fmt.as_ref()
                && to_usize(fmt.start) == index
            {
                require_break = true;
            }

            // A change between two "real" scripts ends the run. A first real
            // script following only non-real chars is adopted without a break.
            let mut new_script = None;
            if is_real(script) {
                if first_real.is_none() {
                    first_real = Some(c);
                }
                if script != input.script {
                    new_script = Some(script);
                    require_break |= is_real(input.script);
                }
            }

            if !prohibit_break && (hard_break || require_break) {
                let special = match () {
                    _ if hard_break => RunSpecial::HardBreak,
                    _ if last_is_htab => RunSpecial::HTab,
                    _ if last_is_control || is_break => RunSpecial::None,
                    _ => RunSpecial::NoBreak,
                };

                if is_emoji {
                    // Emoji runs are shaped directly with the emoji face
                    let range = (emoji_start..emoji_end).into();
                    let face = emoji_face_id()?;
                    self.runs
                        .push(shaper::shape(input, range, face, breaks, special));
                } else {
                    // NOTE: the range may be empty; we need it anyway (unless
                    // we modify the last run's special property).
                    let range = (start..non_control_end).into();
                    self.push_run(font, input, range, breaks, special, first_real)?;
                };
                first_real = None;

                // Begin the next run at this position
                start = index;
                non_control_end = index;
                if let Some(level) = levels.get(index) {
                    input.level = *level;
                }
                input.script = script;
                breaks = Default::default();
            } else if is_break && !is_control {
                // Soft break opportunity within the current run
                breaks.push(shaper::GlyphBreak::new(to_u32(index)));
            }

            // Apply the font token starting here (to the run just begun)
            if let Some(fmt) = next_fmt.as_ref()
                && to_usize(fmt.start) == index
            {
                font = fmt.font;
                input.dpem = fmt.dpem;
                next_fmt = font_tokens.next();
                // Token start positions are expected to be strictly increasing
                debug_assert!(
                    next_fmt
                        .as_ref()
                        .map(|fmt| to_usize(fmt.start) > index)
                        .unwrap_or(true)
                );
            }

            last_is_control = is_control;
            last_is_htab = is_htab;
            emoji_start = new_emoji_start;
            if let Some(script) = new_script {
                input.script = script;
            }
        }

        let hard_break = ends_with_hard_break(&text);

        // Following a hard break we have an implied empty line.
        if hard_break {
            let range = (text.len()..text.len()).into();
            input.level = default_para_level.unwrap_or(LTR_LEVEL);
            breaks = Default::default();
            self.push_run(font, input, range, breaks, RunSpecial::None, None)?;
        }

        // Disabled debugging aid: prints each run's level, range, text,
        // special property, soft breaks and font face name.
        /*
        println!("text: {}", text);
        let fonts = fonts::library();
        for run in &self.runs {
            let slice = &text[run.range];
            print!(
                "\t{:?}, text[{}..{}]: '{}', ",
                run.level, run.range.start, run.range.end, slice
            );
            match run.special {
                RunSpecial::None => (),
                RunSpecial::HardBreak => print!("HardBreak, "),
                RunSpecial::NoBreak => print!("NoBreak, "),
                RunSpecial::HTab => print!("HTab, "),
            }
            print!("breaks=[");
            let mut iter = run.breaks.iter();
            if let Some(b) = iter.next() {
                print!("{}", b.index);
            }
            for b in iter {
                print!(", {}", b.index);
            }
            print!("]");
            if let Some(name) = fonts.get_face_store(run.face_id).name_full() {
                print!(", {name}");
            }
            println!();
        }
        */
        Ok(())
    }
402}
403
404fn is_real(script: Script) -> bool {
405    !matches!(script, Script::Common | Script::Unknown | Script::Inherited)
406}
407
408fn emoji_face_id() -> Result<FaceId, NoFontMatch> {
409    static ONCE: OnceLock<Result<FaceId, NoFontMatch>> = OnceLock::new();
410    *ONCE.get_or_init(|| {
411        let fonts = fonts::library();
412        let font = fonts.select_font(&FontSelector::EMOJI, Script::Common.into());
413        font.map(|font_id| fonts.first_face_for(font_id).expect("invalid FontId"))
414    })
415}
416
/// Result of advancing the emoji state machine by one char
///
/// Returned by `EmojiState::advance`; describes the boundary before the char
/// just consumed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum EmojiBreak {
    /// Not an Emoji
    None,
    /// Start of an Emoji sequence
    Start,
    /// Mid Emoji sequence, valid
    Prohibit,
    /// End of a valid Emoji sequence
    End,
    /// End of one Emoji and start of another
    Restart,
    /// Error; revert to last known good index
    Error,
}
432
433enum EmojiState {
434    None,
435    RI1,
436    RI2,
437    Emoji,
438    EMod,
439    VarSelector,
440    TagModifier,
441    ZWJ,
442}
443
444impl EmojiState {
445    /// Advance the emoji state machine
446    ///
447    /// Returns whether a break should occur before `c`.
448    fn advance(&mut self, c: char) -> EmojiBreak {
449        // Reference: https://unicode.org/reports/tr51/#EBNF_and_Regex
450        #[allow(non_snake_case)]
451        fn end_unless_ZWJ(c: char, b: &mut EmojiBreak) -> EmojiState {
452            if c == '\u{200D}' {
453                EmojiState::ZWJ
454            } else {
455                *b = EmojiBreak::End;
456                EmojiState::None
457            }
458        }
459        let mut b = EmojiBreak::None;
460        *self = match *self {
461            EmojiState::None => {
462                if CodePointSetData::new::<RegionalIndicator>().contains(c) {
463                    b = EmojiBreak::Start;
464                    EmojiState::RI1
465                } else if CodePointSetData::new::<EmojiPresentation>().contains(c) {
466                    b = EmojiBreak::Start;
467                    EmojiState::Emoji
468                } else {
469                    EmojiState::None
470                }
471            }
472            EmojiState::RI1 => {
473                if CodePointSetData::new::<RegionalIndicator>().contains(c) {
474                    b = EmojiBreak::Prohibit;
475                    EmojiState::RI2
476                } else {
477                    b = EmojiBreak::Error;
478                    EmojiState::None
479                }
480            }
481            EmojiState::RI2 => end_unless_ZWJ(c, &mut b),
482            EmojiState::Emoji => {
483                if CodePointSetData::new::<EmojiModifier>().contains(c) {
484                    EmojiState::EMod
485                } else if c == '\u{FE0F}' {
486                    EmojiState::VarSelector
487                } else if ('\u{E0020}'..='\u{E007E}').contains(&c) {
488                    EmojiState::TagModifier
489                } else if c == '\u{200D}' {
490                    EmojiState::ZWJ
491                } else {
492                    b = EmojiBreak::End;
493                    EmojiState::None
494                }
495            }
496            EmojiState::EMod => end_unless_ZWJ(c, &mut b),
497            EmojiState::VarSelector => {
498                if c == '\u{20E3}' {
499                    end_unless_ZWJ(c, &mut b)
500                } else {
501                    b = EmojiBreak::End;
502                    EmojiState::None
503                }
504            }
505            EmojiState::TagModifier => {
506                if ('\u{E0020}'..='\u{E007E}').contains(&c) {
507                    EmojiState::TagModifier
508                } else if c == '\u{E007F}' {
509                    end_unless_ZWJ(c, &mut b)
510                } else {
511                    b = EmojiBreak::Error;
512                    EmojiState::None
513                }
514            }
515            EmojiState::ZWJ => {
516                if CodePointSetData::new::<RegionalIndicator>().contains(c) {
517                    EmojiState::RI1
518                } else if CodePointSetData::new::<EmojiPresentation>().contains(c) {
519                    EmojiState::Emoji
520                } else {
521                    b = EmojiBreak::Error;
522                    EmojiState::None
523                }
524            }
525        };
526        if b == EmojiBreak::End {
527            *self = if CodePointSetData::new::<RegionalIndicator>().contains(c) {
528                b = EmojiBreak::Restart;
529                EmojiState::RI1
530            } else if CodePointSetData::new::<EmojiPresentation>().contains(c) {
531                b = EmojiBreak::Restart;
532                EmojiState::Emoji
533            } else {
534                EmojiState::None
535            };
536        }
537        b
538    }
539}