Skip to main content

oxitext_shape/
lib.rs

1#![forbid(unsafe_code)]
2#![warn(missing_docs)]
3//! `oxitext-shape` — Swash-based text shaper for OxiText.
4//!
5//! Provides [`SwashShaper`], which wraps swash's [`ShapeContext`] and produces
6//! [`ShapedRun`]s from UTF-8 text + raw font bytes.
7//!
8//! M1: LTR Latin shaping. Bidi (M2) and script-specific itemisation (M3) are
9//! deferred.
10//!
11//! # M3 additions
12//!
13//! - [`backend`]: Swappable [`backend::ShapeBackend`] trait, with the default
14//!   [`backend::SwashShaperBackend`] wrapper and optional
15//!   [`backend::RustybuzzShaper`] (feature `rustybuzz-backend`).
16//!
17//! # M5 additions (Slice 5a)
18//!
19//! - [`cache`]: Bounded LRU shape cache ([`cache::ShapeCache`],
20//!   [`cache::ShapeKey`]) backed by [`lru::LruCache`].
21//! - [`SwashShaper::with_cache`]: creates a `SwashShaper` with an attached
22//!   `ShapeCache`; subsequent `shape()` calls check the cache before invoking
23//!   swash.
24//!
25//! # Feature-aware shaping (Slice 6)
26//!
27//! - [`ShapeFeature`]: an OpenType feature tag-value pair.
28//! - [`ShapeDirection`]: direction enum (Ltr/Rtl/Ttb/Btt).
29//! - [`ShapeRequest`] / [`ShapeRequestBuilder`]: builder pattern for a full
30//!   shaping request including text, font, size, direction, script, language,
31//!   and a list of [`ShapeFeature`]s.
32//! - [`SwashShaper::shape_request`]: shapes a complete [`ShapeRequest`], with
33//!   automatic `vert`/`vrt2` feature injection for top-to-bottom text.
34//! - [`SwashShaper::shape_with_features`]: lower-level entry point that
35//!   accepts a feature slice directly.
36
37pub mod backend;
38pub mod batch;
39pub mod cache;
40pub mod script_detect;
41pub mod variational;
42
43#[cfg(feature = "system-fonts")]
44pub mod system_fonts;
45#[cfg(feature = "system-fonts")]
46pub use system_fonts::{
47    build_system_db, load_best_font_for_text, load_best_font_for_text_from, load_font_for_family,
48    load_font_for_family_from,
49};
50
/// Re-exports for resolving fallback fonts through the operating system.
///
/// Only compiled when the `native-fallback` Cargo feature is enabled.  The
/// module surfaces a handful of functions from
/// [`oxifont_adapter_native::shaper_bridge`] so that shaping code can obtain
/// OS-provided font bytes for codepoints the primary font does not cover.
/// The platform back ends live in that crate (reported as CoreText on macOS,
/// DirectWrite on Windows and a pure filesystem scan on Linux).
///
/// # Example
///
/// ```no_run
/// # #[cfg(feature = "native-fallback")]
/// # {
/// use oxitext_shape::native_fallback;
///
/// let primary = std::fs::read("NotoSans-Regular.ttf").unwrap();
/// // For Arabic/Hebrew/CJK text that NotoSans may not cover:
/// let fallbacks = native_fallback::collect_fallback_fonts_for_text("مرحبا", &primary);
/// println!("{} fallback font(s) provided", fallbacks.len());
/// # }
/// ```
#[cfg(feature = "native-fallback")]
pub mod native_fallback {
    pub use oxifont_adapter_native::shaper_bridge::collect_fallback_fonts_for_text;
    pub use oxifont_adapter_native::shaper_bridge::collect_fonts_for_text;
    pub use oxifont_adapter_native::shaper_bridge::find_native_font_for_codepoint;
    pub use oxifont_adapter_native::shaper_bridge::load_best_native_font_for_text;
    pub use oxifont_adapter_native::shaper_bridge::load_native_font_for_codepoint_with_index;
}
78
79#[cfg(feature = "rustybuzz-backend")]
80pub use backend::RustybuzzShaper;
81pub use backend::ShapeBackend;
82pub use backend::SwashShaperBackend;
83pub use cache::{FontId, ShapeCache, ShapeKey};
84use oxitext_core::{OxiTextError, ShapedGlyph, ShapedRun};
85pub use script_detect::{
86    requires_arabic_shaping, requires_indic_shaping, requires_mark_positioning,
87};
88use smallvec::SmallVec;
89use std::sync::Arc;
90use swash::shape::{Direction, ShapeContext};
91use swash::FontRef;
92// ──────────────────────────────────────────────────────────────────────────────
93// ShapeFeature
94// ──────────────────────────────────────────────────────────────────────────────
95
/// A single OpenType feature setting: a four-byte tag paired with a value.
///
/// Tags are 4-byte ASCII identifiers such as `b"liga"`, `b"kern"` or
/// `b"smcp"`.  A value of `0` switches the feature off, `1` switches it on,
/// and anything above `1` selects an alternate index (used by features such
/// as `salt`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct ShapeFeature {
    /// Four-byte ASCII OpenType feature tag.
    pub tag: [u8; 4],
    /// Setting for the feature: 0 = off, 1 = on, >1 = alternate index.
    pub value: u32,
}

impl ShapeFeature {
    /// Standard ligatures (`liga`), enabled.
    pub const LIGA: Self = Self::new(*b"liga", 1);
    /// Kerning (`kern`), enabled.
    pub const KERN: Self = Self::new(*b"kern", 1);
    /// Small capitals (`smcp`), enabled.
    pub const SMCP: Self = Self::new(*b"smcp", 1);
    /// Contextual alternates (`calt`), enabled.
    pub const CALT: Self = Self::new(*b"calt", 1);
    /// Vertical forms (`vert`): swaps upright CJK glyphs for vertical ones.
    pub const VERT: Self = Self::new(*b"vert", 1);
    /// Vertical rotation (`vrt2`): alternative to `vert` in some CJK contexts.
    pub const VRT2: Self = Self::new(*b"vrt2", 1);

    /// Builds a feature from a tag and an arbitrary value.
    pub const fn new(tag: [u8; 4], value: u32) -> Self {
        ShapeFeature { tag, value }
    }

    /// Builds a feature that is switched on (`value == 1`).
    pub const fn enable(tag: [u8; 4]) -> Self {
        Self::new(tag, 1)
    }

    /// Builds a feature that is switched off (`value == 0`).
    pub const fn disable(tag: [u8; 4]) -> Self {
        Self::new(tag, 0)
    }
}
138
139// ──────────────────────────────────────────────────────────────────────────────
140// ShapeDirection
141// ──────────────────────────────────────────────────────────────────────────────
142
/// Text direction for a shaping request.
///
/// The default is [`ShapeDirection::Ltr`].  `Hash` is derived (alongside
/// `Eq`) so that a direction can take part in hashed keys, matching the
/// traits already available on [`ShapeFeature`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum ShapeDirection {
    /// Left-to-right (default for Latin, Cyrillic, etc.).
    #[default]
    Ltr,
    /// Right-to-left (Arabic, Hebrew, etc.).
    Rtl,
    /// Top-to-bottom (CJK vertical text).
    Ttb,
    /// Bottom-to-top (rare).
    Btt,
}
156
157// ──────────────────────────────────────────────────────────────────────────────
158// ShapeRequest / ShapeRequestBuilder
159// ──────────────────────────────────────────────────────────────────────────────
160
161/// A complete shaping request with all parameters.
162///
163/// Build via [`ShapeRequest::builder`] and then call
164/// [`SwashShaper::shape_request`].
165#[derive(Debug, Clone)]
166pub struct ShapeRequest<'a> {
167    /// UTF-8 text to shape.
168    pub text: &'a str,
169    /// Raw font bytes.
170    pub font_data: &'a [u8],
171    /// Font size in pixels-per-em.
172    pub px_size: f32,
173    /// Shaping direction.
174    pub direction: ShapeDirection,
175    /// OpenType script tag (e.g. `b"latn"`, `b"arab"`), or `None` for
176    /// auto-detection.
177    pub script: Option<[u8; 4]>,
178    /// OpenType language tag (e.g. `b"ENG "`, `b"ARA "`), or `None`.
179    pub language: Option<[u8; 4]>,
180    /// OpenType feature overrides.
181    pub features: Vec<ShapeFeature>,
182}
183
184impl<'a> ShapeRequest<'a> {
185    /// Returns a new [`ShapeRequestBuilder`].
186    pub fn builder() -> ShapeRequestBuilder<'a> {
187        ShapeRequestBuilder::default()
188    }
189}
190
191/// Builder for [`ShapeRequest`].
192#[derive(Debug, Default)]
193pub struct ShapeRequestBuilder<'a> {
194    text: Option<&'a str>,
195    font_data: Option<&'a [u8]>,
196    px_size: f32,
197    direction: ShapeDirection,
198    script: Option<[u8; 4]>,
199    language: Option<[u8; 4]>,
200    features: Vec<ShapeFeature>,
201}
202
/// Errors that can occur when building a [`ShapeRequest`].
///
/// The type is a plain field-less enum, so it also implements `Clone`,
/// `Copy`, `PartialEq` and `Eq`; this lets callers compare the result of
/// [`ShapeRequestBuilder::build`] directly (e.g. in `assert_eq!`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ShapeRequestError {
    /// The `text` field was not provided.
    MissingText,
    /// The `font_data` field was not provided.
    MissingFont,
}

impl std::fmt::Display for ShapeRequestError {
    /// Writes a short, human-readable description of the missing field.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            ShapeRequestError::MissingText => "text not set",
            ShapeRequestError::MissingFont => "font_data not set",
        };
        f.write_str(message)
    }
}

impl std::error::Error for ShapeRequestError {}
222
223impl<'a> ShapeRequestBuilder<'a> {
224    /// Sets the text to shape.
225    pub fn text(mut self, t: &'a str) -> Self {
226        self.text = Some(t);
227        self
228    }
229
230    /// Sets the raw font bytes.
231    pub fn font_data(mut self, d: &'a [u8]) -> Self {
232        self.font_data = Some(d);
233        self
234    }
235
236    /// Sets the font size in pixels-per-em.
237    pub fn px_size(mut self, s: f32) -> Self {
238        self.px_size = s;
239        self
240    }
241
242    /// Sets the shaping direction.
243    pub fn direction(mut self, d: ShapeDirection) -> Self {
244        self.direction = d;
245        self
246    }
247
248    /// Pins the OpenType script tag (overrides swash's auto-detection).
249    pub fn script(mut self, tag: [u8; 4]) -> Self {
250        self.script = Some(tag);
251        self
252    }
253
254    /// Pins the OpenType language tag for language-specific GSUB/GPOS rules.
255    pub fn language(mut self, tag: [u8; 4]) -> Self {
256        self.language = Some(tag);
257        self
258    }
259
260    /// Appends an OpenType feature override.
261    pub fn feature(mut self, f: ShapeFeature) -> Self {
262        self.features.push(f);
263        self
264    }
265
266    /// Builds the [`ShapeRequest`].
267    ///
268    /// # Errors
269    /// Returns [`ShapeRequestError::MissingText`] or
270    /// [`ShapeRequestError::MissingFont`] if the respective fields were not
271    /// provided.
272    pub fn build(self) -> Result<ShapeRequest<'a>, ShapeRequestError> {
273        Ok(ShapeRequest {
274            text: self.text.ok_or(ShapeRequestError::MissingText)?,
275            font_data: self.font_data.ok_or(ShapeRequestError::MissingFont)?,
276            px_size: self.px_size,
277            direction: self.direction,
278            script: self.script,
279            language: self.language,
280            features: self.features,
281        })
282    }
283}
284
285// ──────────────────────────────────────────────────────────────────────────────
286// Internal parameter bundle used by shape_with_features_internal
287// ──────────────────────────────────────────────────────────────────────────────
288
/// Internal parameter bundle for the unified shaping entry point.
///
/// Groups all shaping inputs into a single struct so `shape_with_features_internal`
/// stays under the clippy `too_many_arguments` threshold.
///
/// Every field is borrowed or `Copy`, so building one of these per call does
/// not allocate.
struct ShapeParams<'a> {
    /// Raw font bytes handed to swash.
    font_data: &'a [u8],
    /// UTF-8 text to shape.
    text: &'a str,
    /// Font size in pixels-per-em.
    px_size: f32,
    /// `true` shapes right-to-left, `false` left-to-right.
    rtl: bool,
    /// Optional OpenType script tag.
    script_tag: Option<[u8; 4]>,
    /// Optional OpenType language tag.
    language_tag: Option<[u8; 4]>,
    /// OpenType feature overrides forwarded to the shaper.
    features: &'a [ShapeFeature],
}
302
303// ──────────────────────────────────────────────────────────────────────────────
304// SwashShaper
305// ──────────────────────────────────────────────────────────────────────────────
306
/// Text shaper backed by [swash].
///
/// Keep a single `SwashShaper` alive across multiple layout passes to amortise
/// the cost of the internal LRU caches that swash maintains in [`ShapeContext`].
///
/// Optionally attach a [`ShapeCache`] via [`Self::with_cache`] to skip swash
/// entirely on repeated requests for the same `(font, text, size)` tuple.
///
/// Shaping methods take `&mut self` because building a swash shaper borrows
/// the context mutably.
pub struct SwashShaper {
    /// swash shaping context; every shaping call builds its shaper from it.
    ctx: ShapeContext,
    /// Optional application-level shape cache.
    cache: Option<Arc<ShapeCache>>,
    /// Cached text string for script-run reuse (Item 4).
    ///
    /// NOTE(review): only present with the `icu` feature; the code that reads
    /// and refreshes it is not in the visible part of this file.
    #[cfg(feature = "icu")]
    script_cache_text: String,
    /// Cached script runs for the cached text (Item 4).
    ///
    /// NOTE(review): presumably valid only while `script_cache_text` matches
    /// the text being shaped — confirm against the code that fills it.
    #[cfg(feature = "icu")]
    script_cache_runs: Vec<oxitext_icu::ScriptRun>,
}
325
326impl SwashShaper {
327    /// Creates a new shaper with default cache settings and no shape cache.
328    pub fn new() -> Self {
329        Self {
330            ctx: ShapeContext::new(),
331            cache: None,
332            #[cfg(feature = "icu")]
333            script_cache_text: String::new(),
334            #[cfg(feature = "icu")]
335            script_cache_runs: Vec::new(),
336        }
337    }
338
339    /// Creates a new shaper with an attached [`ShapeCache`] of `capacity` entries.
340    ///
341    /// Repeated calls to [`Self::shape`] with the same `(font_data, text, size)`
342    /// tuple will be served from the cache after the first miss.
343    ///
344    /// # Arguments
345    /// - `capacity`: maximum number of [`ShapedRun`]s to keep in the cache.
346    ///   Passing `0` uses a minimum capacity of 1.
347    pub fn with_cache(capacity: usize) -> Self {
348        Self {
349            ctx: ShapeContext::new(),
350            cache: Some(Arc::new(ShapeCache::new(capacity))),
351            #[cfg(feature = "icu")]
352            script_cache_text: String::new(),
353            #[cfg(feature = "icu")]
354            script_cache_runs: Vec::new(),
355        }
356    }
357
    /// Returns a reference to the attached shape cache, if any.
    ///
    /// Shapers created with [`Self::new`] have no cache and yield `None`;
    /// shapers created with [`Self::with_cache`] yield the shared
    /// `Arc<ShapeCache>`, which callers may clone to keep their own handle.
    pub fn shape_cache(&self) -> Option<&Arc<ShapeCache>> {
        self.cache.as_ref()
    }
362
363    /// Shapes `text` using the font in `font_data` at `size` pixels-per-em.
364    ///
365    /// Returns a [`ShapedRun`] containing one [`ShapedGlyph`] per output glyph.
366    /// The `x_advance` of each glyph is in pixels (already scaled by `size`).
367    ///
368    /// When an attached [`ShapeCache`] is present the result is looked up
369    /// before invoking swash.  Cache keys incorporate `font_data` pointer
370    /// identity, the exact text, and `size`.
371    ///
372    /// # Errors
373    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed by
374    /// swash.
375    pub fn shape(
376        &mut self,
377        text: &str,
378        font_data: Arc<[u8]>,
379        size: f32,
380    ) -> Result<ShapedRun, OxiTextError> {
381        // Build a deterministic axis hash from the size (no variation axes yet).
382        let axis_hash = size.to_bits() as u64;
383
384        // Check cache if attached.
385        if let Some(ref cache) = self.cache {
386            let key = ShapeKey::new(&font_data, text, axis_hash);
387            if let Some(cached) = cache.get(&key) {
388                return Ok((*cached).clone());
389            }
390        }
391
392        // Cache miss — invoke swash.
393        let font = FontRef::from_index(&font_data, 0)
394            .ok_or_else(|| OxiTextError::Shaping("swash could not parse font bytes".into()))?;
395
396        let mut shaper = self.ctx.builder(font).size(size).build();
397        shaper.add_str(text);
398
399        let mut glyphs: SmallVec<[ShapedGlyph; 8]> = SmallVec::new();
400        shaper.shape_with(|cluster| {
401            // A cluster is whitespace if every source char it covers is
402            // whitespace. Most whitespace clusters cover a single space/tab.
403            let cluster_range = cluster.source.start as usize..cluster.source.end as usize;
404            let is_ws = text
405                .get(cluster_range)
406                .map(|slice| !slice.is_empty() && slice.chars().all(|c| c.is_whitespace()))
407                .unwrap_or(false);
408            // More than one glyph in a cluster means inner glyphs are unsafe
409            // to break before (ligature / mark attachment).
410            let multi = cluster.glyphs.len() > 1;
411            for (idx, glyph) in cluster.glyphs.iter().enumerate() {
412                // A glyph is unsafe to break before if it is inside a
413                // multi-glyph cluster (idx > 0) OR if it carries the mark
414                // attachment flag (combining mark attached to a base glyph).
415                let utb = (multi && idx > 0) || glyph.info.is_mark();
416                glyphs.push(ShapedGlyph {
417                    gid: glyph.id,
418                    x_advance: glyph.advance,
419                    y_advance: 0.0,
420                    x_offset: glyph.x,
421                    y_offset: glyph.y,
422                    cluster: cluster.source.start,
423                    is_whitespace: is_ws,
424                    unsafe_to_break: utb,
425                });
426            }
427        });
428
429        let run = ShapedRun {
430            glyphs,
431            font_data: Arc::clone(&font_data),
432        };
433
434        // Populate cache on miss.
435        if let Some(ref cache) = self.cache {
436            let key = ShapeKey::new(&font_data, text, axis_hash);
437            cache.insert(key, Arc::new(run.clone()));
438        }
439
440        Ok(run)
441    }
442
443    /// Shapes `text` with explicit direction control.
444    ///
445    /// When `rtl` is `false` this is identical to [`Self::shape`].
446    ///
447    /// When `rtl` is `true` the shaper signals `Direction::RightToLeft` to
448    /// swash (enabling correct Arabic/Hebrew form selection via OpenType GSUB),
449    /// then **sorts** the resulting glyphs by ascending `cluster` byte offset so
450    /// the output is always in **logical source order** regardless of what swash
451    /// emits.  The caller (bidi engine) is responsible for visual reordering.
452    ///
453    /// # Errors
454    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
455    pub fn shape_with_direction(
456        &mut self,
457        text: &str,
458        font_data: Arc<[u8]>,
459        size: f32,
460        rtl: bool,
461    ) -> Result<ShapedRun, OxiTextError> {
462        if !rtl {
463            return self.shape(text, font_data, size);
464        }
465        // RTL path: shape with the explicit RightToLeft hint so swash can apply
466        // direction-sensitive GSUB lookups, then sort to ascending cluster order
467        // (logical order) to satisfy the architecture contract.
468        let mut run = self.do_shape_rtl(text, font_data, size)?;
469        run.glyphs.sort_by_key(|g| g.cluster);
470        Ok(run)
471    }
472
473    /// Shapes text using all parameters in a [`ShapeRequest`].
474    ///
475    /// When `direction` is [`ShapeDirection::Ttb`] or [`ShapeDirection::Btt`],
476    /// the `vert` and `vrt2` OpenType features are **automatically appended**
477    /// to the feature list (if not already present) so that fonts with a
478    /// vertical substitution table produce the correct glyph variants.
479    ///
480    /// Script and language tags, if provided, are forwarded to swash's
481    /// `ShaperBuilder` for language-specific GSUB/GPOS rule selection.
482    ///
483    /// # Errors
484    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
485    pub fn shape_request(
486        &mut self,
487        req: &ShapeRequest<'_>,
488    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
489        // When the `icu` feature is enabled, normalize text to NFC before shaping
490        // so that precomposed and decomposed spellings produce identical glyph runs.
491        #[cfg(feature = "icu")]
492        let normalized_text: String;
493        #[cfg(feature = "icu")]
494        let req_text: &str = {
495            normalized_text = oxitext_icu::Normalizer::new().nfc(req.text);
496            normalized_text.as_str()
497        };
498        #[cfg(not(feature = "icu"))]
499        let req_text: &str = req.text;
500
501        // When direction is Ltr but the text is Arabic, auto-upgrade to Rtl
502        // so swash can apply the correct Arabic GSUB form-selection lookups.
503        let effective_direction = if req.direction == ShapeDirection::Ltr
504            && requires_arabic_shaping(req_text)
505        {
506            #[cfg(debug_assertions)]
507            eprintln!("[oxitext-shape] Arabic text detected with Ltr direction; upgrading to Rtl");
508            ShapeDirection::Rtl
509        } else {
510            req.direction
511        };
512
513        // Auto-inject vertical OpenType features for vertical directions.
514        let mut features = req.features.clone();
515        if effective_direction == ShapeDirection::Ttb || effective_direction == ShapeDirection::Btt
516        {
517            if !features.iter().any(|f| f.tag == *b"vert") {
518                features.push(ShapeFeature::VERT);
519            }
520            if !features.iter().any(|f| f.tag == *b"vrt2") {
521                features.push(ShapeFeature::VRT2);
522            }
523        }
524
525        let rtl = effective_direction == ShapeDirection::Rtl;
526        self.shape_with_features_internal(ShapeParams {
527            font_data: req.font_data,
528            text: req_text,
529            px_size: req.px_size,
530            rtl,
531            script_tag: req.script,
532            language_tag: req.language,
533            features: &features,
534        })
535    }
536
537    /// Shapes text with an explicit list of OpenType feature overrides.
538    ///
539    /// Unlike [`Self::shape_request`] this entry point does **not** inject
540    /// vertical features automatically; callers are responsible for supplying
541    /// the full feature list.
542    ///
543    /// # Errors
544    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
545    pub fn shape_with_features(
546        &mut self,
547        font_data: &[u8],
548        text: &str,
549        px_size: f32,
550        rtl: bool,
551        features: &[ShapeFeature],
552    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
553        self.shape_with_features_internal(ShapeParams {
554            font_data,
555            text,
556            px_size,
557            rtl,
558            script_tag: None,
559            language_tag: None,
560            features,
561        })
562    }
563
564    /// Internal implementation shared by [`Self::shape_request`] and
565    /// [`Self::shape_with_features`].
566    fn shape_with_features_internal(
567        &mut self,
568        params: ShapeParams<'_>,
569    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
570        use swash::tag_from_bytes;
571        use swash::text::{Language, Script};
572
573        let font = FontRef::from_index(params.font_data, 0)
574            .ok_or_else(|| OxiTextError::Shaping("swash could not parse font bytes".into()))?;
575
576        let direction = if params.rtl {
577            Direction::RightToLeft
578        } else {
579            Direction::LeftToRight
580        };
581
582        // Resolve the optional script tag to a swash Script enum value.
583        let script = params
584            .script_tag
585            .and_then(|t| Script::from_opentype(tag_from_bytes(&t)))
586            .unwrap_or(Script::Latin);
587
588        // Resolve the optional language tag to a swash Language.
589        let language = params.language_tag.and_then(|t| {
590            // swash Language::parse expects a BCP-47 string; for OpenType tags
591            // we convert the raw bytes to a lossy str and try to parse them.
592            let s = std::str::from_utf8(&t).unwrap_or("").trim_end();
593            Language::parse(s)
594        });
595
596        // Convert our ShapeFeature slice to swash-compatible (tag, value) pairs.
597        // swash's `ShaperBuilder::features` accepts any iterator whose items
598        // implement `Into<Setting<u16>>`.  The swash crate provides
599        // `From<&([u8; 4], T)> for Setting<T>`, so we pass an iterator of
600        // references to satisfy the bound.
601        let swash_features: Vec<([u8; 4], u16)> = params
602            .features
603            .iter()
604            .map(|f| (f.tag, f.value.min(u32::from(u16::MAX)) as u16))
605            .collect();
606
607        let mut shaper = self
608            .ctx
609            .builder(font)
610            .size(params.px_size)
611            .direction(direction)
612            .script(script)
613            .language(language)
614            .features(swash_features.iter())
615            .build();
616
617        shaper.add_str(params.text);
618
619        let mut glyphs: Vec<ShapedGlyph> = Vec::new();
620        shaper.shape_with(|cluster| {
621            let cluster_range = cluster.source.start as usize..cluster.source.end as usize;
622            let is_ws = params
623                .text
624                .get(cluster_range)
625                .map(|slice| !slice.is_empty() && slice.chars().all(|c| c.is_whitespace()))
626                .unwrap_or(false);
627            let multi = cluster.glyphs.len() > 1;
628            for (idx, glyph) in cluster.glyphs.iter().enumerate() {
629                let utb = (multi && idx > 0) || glyph.info.is_mark();
630                glyphs.push(ShapedGlyph {
631                    gid: glyph.id,
632                    x_advance: glyph.advance,
633                    y_advance: 0.0,
634                    x_offset: glyph.x,
635                    y_offset: glyph.y,
636                    cluster: cluster.source.start,
637                    is_whitespace: is_ws,
638                    unsafe_to_break: utb,
639                });
640            }
641        });
642
643        if params.rtl {
644            glyphs.sort_by_key(|g| g.cluster);
645        }
646
647        Ok(glyphs)
648    }
649
650    /// Internal RTL shaping path: invokes swash with `Direction::RightToLeft`.
651    ///
652    /// Returns glyphs in whatever order swash produces; the public
653    /// [`Self::shape_with_direction`] sorts them to ascending cluster order.
654    fn do_shape_rtl(
655        &mut self,
656        text: &str,
657        font_data: Arc<[u8]>,
658        size: f32,
659    ) -> Result<ShapedRun, OxiTextError> {
660        let font = FontRef::from_index(&font_data, 0)
661            .ok_or_else(|| OxiTextError::Shaping("swash could not parse font bytes".into()))?;
662
663        let mut shaper = self
664            .ctx
665            .builder(font)
666            .size(size)
667            .direction(Direction::RightToLeft)
668            .build();
669        shaper.add_str(text);
670
671        let mut glyphs: SmallVec<[ShapedGlyph; 8]> = SmallVec::new();
672        shaper.shape_with(|cluster| {
673            let cluster_range = cluster.source.start as usize..cluster.source.end as usize;
674            let is_ws = text
675                .get(cluster_range)
676                .map(|slice| !slice.is_empty() && slice.chars().all(|c| c.is_whitespace()))
677                .unwrap_or(false);
678            let multi = cluster.glyphs.len() > 1;
679            for (idx, glyph) in cluster.glyphs.iter().enumerate() {
680                let utb = (multi && idx > 0) || glyph.info.is_mark();
681                glyphs.push(ShapedGlyph {
682                    gid: glyph.id,
683                    x_advance: glyph.advance,
684                    y_advance: 0.0,
685                    x_offset: glyph.x,
686                    y_offset: glyph.y,
687                    cluster: cluster.source.start,
688                    is_whitespace: is_ws,
689                    unsafe_to_break: utb,
690                });
691            }
692        });
693
694        Ok(ShapedRun {
695            glyphs,
696            font_data: Arc::clone(&font_data),
697        })
698    }
699
700    /// Shapes `text` and returns a rich [`ShapeResult`] with metadata.
701    ///
702    /// The result includes the glyph list, the direction used, and any
703    /// codepoints that could not be mapped (glyph ID 0 / `.notdef`).
704    ///
705    /// # Errors
706    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
707    pub fn shape_full(
708        &mut self,
709        font_data: &[u8],
710        text: &str,
711        px_size: f32,
712    ) -> Result<ShapeResult, OxiTextError> {
713        use unicode_segmentation::UnicodeSegmentation;
714
715        let glyphs = self.shape_with_features_internal(ShapeParams {
716            font_data,
717            text,
718            px_size,
719            rtl: false,
720            script_tag: None,
721            language_tag: None,
722            features: &[],
723        })?;
724        let mut result = ShapeResult::from_glyphs(glyphs, text, ShapeDirection::Ltr);
725        // Populate grapheme cluster boundaries: start offset of each grapheme
726        // plus the end-of-text sentinel.
727        result.cluster_boundaries = text
728            .grapheme_indices(true)
729            .map(|(i, _)| i)
730            .chain(std::iter::once(text.len()))
731            .collect();
732        Ok(result)
733    }
734
735    /// Shapes `text` using raw font bytes supplied as `&[u8]` (LTR).
736    ///
737    /// A convenience wrapper over `Self::shape_with_features_internal` for
738    /// callers that already hold raw font bytes and do not need the `Arc` wrapping
739    /// or cache infrastructure of [`Self::shape`].
740    ///
741    /// # Errors
742    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
743    pub fn shape_slice(
744        &mut self,
745        font_data: &[u8],
746        text: &str,
747        px_size: f32,
748    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
749        self.shape_with_features_internal(ShapeParams {
750            font_data,
751            text,
752            px_size,
753            rtl: false,
754            script_tag: None,
755            language_tag: None,
756            features: &[],
757        })
758    }
759
760    /// Shapes `text` using raw font bytes supplied as `&[u8]` (RTL).
761    ///
762    /// Like [`Self::shape_slice`] but shapes in right-to-left direction and
763    /// returns glyphs in ascending `cluster` (logical source) order.
764    ///
765    /// # Errors
766    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
767    pub fn shape_slice_rtl(
768        &mut self,
769        font_data: &[u8],
770        text: &str,
771        px_size: f32,
772    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
773        self.shape_with_features_internal(ShapeParams {
774            font_data,
775            text,
776            px_size,
777            rtl: true,
778            script_tag: None,
779            language_tag: None,
780            features: &[],
781        })
782    }
783
784    /// Shapes `text` with a font fallback chain.
785    ///
786    /// For each codepoint that produces `glyph_id == 0` (`.notdef`), the
787    /// corresponding text run is re-shaped with each successive fallback font
788    /// in `fonts[1..]`.  If a fallback produces a non-zero glyph ID the
789    /// fallback glyphs replace the `.notdef` glyphs in the result; otherwise
790    /// the `.notdef` glyphs are preserved (best-effort).
791    ///
792    /// `fonts[0]` is the primary font; `fonts[1..]` are tried in order.
793    ///
794    /// # Note on cluster offsets
795    ///
796    /// When a sub-string is re-shaped with a fallback font, swash emits cluster
797    /// byte offsets **relative to that sub-string** (starting at 0).  This
798    /// function adds the original start offset back before merging so all
799    /// returned glyphs carry absolute offsets into `text`.
800    ///
801    /// # Errors
802    /// Returns [`OxiTextError::Shaping`] if the primary font cannot be parsed.
803    pub fn shape_with_fallback(
804        &mut self,
805        fonts: &[&[u8]],
806        text: &str,
807        px_size: f32,
808    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
809        let primary = fonts
810            .first()
811            .ok_or_else(|| OxiTextError::Shaping("font list is empty".into()))?;
812
813        // 1. Shape with the primary font.
814        let mut result = self.shape_with_features_internal(ShapeParams {
815            font_data: primary,
816            text,
817            px_size,
818            rtl: false,
819            script_tag: None,
820            language_tag: None,
821            features: &[],
822        })?;
823
824        if fonts.len() <= 1 {
825            return Ok(result);
826        }
827
828        // 2. Find contiguous runs of .notdef (glyph ID 0) glyphs.
829        let notdef_runs = collect_notdef_runs(&result, text);
830
831        // 3. For each .notdef run try the fallback fonts.
832        for (run_text_start, run_text_end) in notdef_runs {
833            let sub_text = match text.get(run_text_start..run_text_end) {
834                Some(s) if !s.is_empty() => s,
835                _ => continue,
836            };
837
838            // Try each fallback font in order.
839            'fallback: for fallback_font in &fonts[1..] {
840                let fallback_glyphs = match self.shape_with_features_internal(ShapeParams {
841                    font_data: fallback_font,
842                    text: sub_text,
843                    px_size,
844                    rtl: false,
845                    script_tag: None,
846                    language_tag: None,
847                    features: &[],
848                }) {
849                    Ok(g) => g,
850                    Err(_) => continue,
851                };
852
853                // Only use this fallback if it resolved at least one glyph.
854                if fallback_glyphs.iter().all(|g| g.gid == 0) {
855                    continue;
856                }
857
858                // Adjust cluster offsets from sub-string-relative to
859                // text-absolute and replace the .notdef glyphs in result.
860                let start_offset = run_text_start as u32;
861                let adjusted: Vec<ShapedGlyph> = fallback_glyphs
862                    .into_iter()
863                    .map(|mut g| {
864                        g.cluster += start_offset;
865                        g
866                    })
867                    .collect();
868
869                // Replace glyphs in the result whose cluster falls in [run_text_start, run_text_end).
870                result.retain(|g| {
871                    let c = g.cluster as usize;
872                    !(c >= run_text_start && c < run_text_end && g.gid == 0)
873                });
874
875                // Insert adjusted fallback glyphs at the correct position.
876                let insert_pos = result.partition_point(|g| (g.cluster as usize) < run_text_start);
877                for (i, g) in adjusted.into_iter().enumerate() {
878                    result.insert(insert_pos + i, g);
879                }
880
881                break 'fallback;
882            }
883        }
884
885        Ok(result)
886    }
887
888    /// Returns `true` if the given font data contains AAT layout tables.
889    ///
890    /// Checks for the presence of `morx` (extended glyph metamorphosis rules),
891    /// `kerx` (extended kerning data), or `ankr` (anchor point) tables — the
892    /// three primary tables that distinguish Apple Advanced Typography (AAT)
893    /// fonts from pure OpenType fonts.
894    ///
895    /// Swash's [`ShapeContext`] already applies AAT tables transparently when
896    /// present, so this function is informational only; it does not change the
897    /// shaping path.
898    pub fn font_has_aat(font_data: &[u8]) -> bool {
899        ttf_parser::Face::parse(font_data, 0)
900            .map(|face| {
901                face.raw_face()
902                    .table(ttf_parser::Tag::from_bytes(b"morx"))
903                    .is_some()
904                    || face
905                        .raw_face()
906                        .table(ttf_parser::Tag::from_bytes(b"kerx"))
907                        .is_some()
908                    || face
909                        .raw_face()
910                        .table(ttf_parser::Tag::from_bytes(b"ankr"))
911                        .is_some()
912            })
913            .unwrap_or(false)
914    }
915
916    /// Shape using AAT if the font has Morx/Kerx tables, otherwise fall back to
917    /// standard OpenType shaping.
918    ///
919    /// Swash handles both AAT and OpenType tables transparently via its
920    /// `ShapeContext`; this method is informational. It delegates directly to
921    /// `Self::shape_with_features_internal` regardless of table presence.
922    ///
923    /// # Errors
924    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
925    pub fn shape_with_aat_fallback(
926        &mut self,
927        font_data: &[u8],
928        text: &str,
929        px_size: f32,
930    ) -> Result<ShapeResult, OxiTextError> {
931        use unicode_segmentation::UnicodeSegmentation;
932
933        let glyphs = self.shape_with_features_internal(ShapeParams {
934            font_data,
935            text,
936            px_size,
937            rtl: false,
938            script_tag: None,
939            language_tag: None,
940            features: &[],
941        })?;
942        let mut result = ShapeResult::from_glyphs(glyphs, text, ShapeDirection::Ltr);
943        result.cluster_boundaries = text
944            .grapheme_indices(true)
945            .map(|(i, _)| i)
946            .chain(std::iter::once(text.len()))
947            .collect();
948        Ok(result)
949    }
950}
951
952// ──────────────────────────────────────────────────────────────────────────────
953// ShapeResult
954// ──────────────────────────────────────────────────────────────────────────────
955
956/// Extended shaping result with metadata.
957///
958/// Produced by [`SwashShaper::shape_full`]; includes the glyph list, the
959/// direction resolved by the shaper, the OpenType script tag (if known), and a
960/// list of Unicode codepoints that could not be mapped (glyph ID 0 / `.notdef`).
961#[derive(Debug, Clone)]
962pub struct ShapeResult {
963    /// Shaped glyphs in logical cluster order.
964    pub glyphs: Vec<ShapedGlyph>,
965    /// OpenType script tag detected (e.g. `b"latn"`, `b"arab"`), or `None` if
966    /// unknown.  May be set by the caller after construction.
967    pub script_detected: Option<[u8; 4]>,
968    /// Direction resolved by the shaper.
969    pub direction: ShapeDirection,
970    /// Unicode codepoints that produced a `.notdef` glyph (ID 0).
971    pub missing_codepoints: Vec<char>,
972    /// Byte offsets (in the original text) where grapheme cluster boundaries fall.
973    ///
974    /// Populated by [`SwashShaper::shape_full`].  Empty when [`SwashShaper::shape`]
975    /// is called directly.  The first entry is `0` (start of text) and the last
976    /// entry is `text.len()` (end of text).
977    pub cluster_boundaries: Vec<usize>,
978}
979
980impl ShapeResult {
981    /// Constructs a [`ShapeResult`] from a glyph vector, the source text, and
982    /// the shaping direction.
983    ///
984    /// `script_detected` is left as `None`; callers may set it afterwards.
985    pub fn from_glyphs(glyphs: Vec<ShapedGlyph>, text: &str, direction: ShapeDirection) -> Self {
986        let missing: Vec<char> = {
987            let mut seen = std::collections::HashSet::new();
988            let mut missing = Vec::new();
989            for g in &glyphs {
990                if g.gid == 0 {
991                    if let Some(ch) = text
992                        .get(g.cluster as usize..)
993                        .and_then(|s| s.chars().next())
994                    {
995                        if seen.insert(ch) {
996                            missing.push(ch);
997                        }
998                    }
999                }
1000            }
1001            missing
1002        };
1003        Self {
1004            glyphs,
1005            script_detected: None,
1006            direction,
1007            missing_codepoints: missing,
1008            cluster_boundaries: Vec::new(),
1009        }
1010    }
1011}
1012
1013// ──────────────────────────────────────────────────────────────────────────────
1014// Helpers
1015// ──────────────────────────────────────────────────────────────────────────────
1016
1017/// Collect contiguous byte ranges in `text` that are covered exclusively by
1018/// `.notdef` (glyph ID 0) glyphs in `glyphs`.
1019///
1020/// Returns a `Vec` of `(start, end)` byte offset pairs into `text`.
1021fn collect_notdef_runs(glyphs: &[ShapedGlyph], text: &str) -> Vec<(usize, usize)> {
1022    if glyphs.is_empty() {
1023        return Vec::new();
1024    }
1025
1026    // Build a sorted, deduplicated list of cluster byte offsets that are .notdef.
1027    let mut notdef_clusters: Vec<usize> = glyphs
1028        .iter()
1029        .filter(|g| g.gid == 0)
1030        .map(|g| g.cluster as usize)
1031        .collect();
1032    notdef_clusters.sort_unstable();
1033    notdef_clusters.dedup();
1034
1035    // Build a sorted list of all cluster start offsets (regardless of gid).
1036    let mut all_starts: Vec<usize> = glyphs.iter().map(|g| g.cluster as usize).collect();
1037    all_starts.sort_unstable();
1038    all_starts.dedup();
1039
1040    // For each .notdef cluster, determine the end offset: it's the byte offset
1041    // of the next cluster in `all_starts`, or `text.len()` for the last one.
1042    let mut runs: Vec<(usize, usize)> = Vec::new();
1043    for &start in &notdef_clusters {
1044        let end = all_starts
1045            .iter()
1046            .find(|&&s| s > start)
1047            .copied()
1048            .unwrap_or(text.len());
1049        // Merge with the previous run if adjacent.
1050        if let Some(last) = runs.last_mut() {
1051            if last.1 == start {
1052                last.1 = end;
1053                continue;
1054            }
1055        }
1056        runs.push((start, end));
1057    }
1058    runs
1059}
1060
1061impl Default for SwashShaper {
1062    fn default() -> Self {
1063        Self::new()
1064    }
1065}
1066
1067// ──────────────────────────────────────────────────────────────────────────────
1068// System font convenience methods (feature `system-fonts`)
1069// ──────────────────────────────────────────────────────────────────────────────
1070
#[cfg(feature = "system-fonts")]
impl SwashShaper {
    /// Shapes `text` with a system font chosen for its content.
    ///
    /// The font bytes come from [`system_fonts::load_best_font_for_text`];
    /// they are then shaped at `px_size` pixels-per-em through
    /// [`Self::shape_slice`].
    ///
    /// The font lookup is repeated on every call.  Callers shaping many
    /// strings with the same font should load the bytes once with
    /// [`system_fonts::load_best_font_for_text`] and call
    /// [`Self::shape_slice`] themselves.
    ///
    /// # Errors
    /// Returns [`OxiTextError::Shaping`] when no suitable system font is
    /// found, and propagates any error from [`Self::shape_slice`] (e.g. font
    /// bytes that cannot be parsed).
    pub fn shape_with_system_font(
        &mut self,
        text: &str,
        px_size: f32,
    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
        match system_fonts::load_best_font_for_text(text) {
            Some(font_data) => self.shape_slice(&font_data, text, px_size),
            None => Err(OxiTextError::Shaping(
                "no system font found for text".into(),
            )),
        }
    }

    /// Shapes `text` with the system font resolved for `family`.
    ///
    /// `family` is passed unchanged to [`system_fonts::load_font_for_family`];
    /// per that loader's contract it may be a concrete family name (e.g.
    /// `"Arial"`) or a CSS generic alias (e.g. `"sans-serif"`).  The returned
    /// font bytes are shaped at `px_size` through [`Self::shape_slice`].
    ///
    /// # Errors
    /// Returns [`OxiTextError::Shaping`] when no font matching `family` is
    /// found, and propagates any error from [`Self::shape_slice`] (e.g. font
    /// bytes that cannot be parsed).
    pub fn shape_with_family(
        &mut self,
        text: &str,
        family: &str,
        px_size: f32,
    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
        match system_fonts::load_font_for_family(family) {
            Some(font_data) => self.shape_slice(&font_data, text, px_size),
            None => Err(OxiTextError::Shaping(format!(
                "no system font found for family '{family}'"
            ))),
        }
    }
}
1118
1119// ──────────────────────────────────────────────────────────────────────────────
1120// Script-aware itemization (Feature 1, behind `icu` feature gate)
1121// ──────────────────────────────────────────────────────────────────────────────
1122
/// Translates an [`oxitext_icu::TextScript`] into a 4-byte OpenType script
/// tag.
///
/// Hiragana and Katakana share the `kana` tag; every script without an
/// explicit entry maps to `DFLT`.
#[cfg(feature = "icu")]
fn text_script_to_ot_tag(s: oxitext_icu::TextScript) -> [u8; 4] {
    use oxitext_icu::TextScript as Script;

    let tag: &[u8; 4] = match s {
        Script::Arabic => b"arab",
        Script::Cyrillic => b"cyrl",
        Script::Devanagari => b"dev2",
        Script::Greek => b"grek",
        Script::Han => b"hani",
        Script::Hangul => b"hang",
        Script::Hebrew => b"hebr",
        Script::Hiragana | Script::Katakana => b"kana",
        Script::Latin => b"latn",
        Script::Thai => b"thai",
        _ => b"DFLT",
    };
    *tag
}
1141
#[cfg(feature = "icu")]
impl SwashShaper {
    /// Shapes `text` one script run at a time.
    ///
    /// The text is itemized into per-script runs with
    /// `oxitext_icu::CharProperties::itemize`; each run is shaped with the
    /// OpenType script tag and direction derived from its script, using the
    /// given `features`.  The itemization of the most recent `text` is kept
    /// on the shaper and reused when the same text is passed again.
    ///
    /// Returns one [`ShapedRun`] per script run, in the order the runs were
    /// itemized.  Every glyph's `cluster` offset is absolute (relative to
    /// the start of `text`), not relative to its run.
    ///
    /// # Errors
    /// Returns [`OxiTextError::Shaping`] if a script run's byte range is not
    /// a valid slice of `text`, and propagates any shaping error (e.g. font
    /// bytes that cannot be parsed).  The first error aborts the whole call.
    pub fn shape_by_script(
        &mut self,
        font_data: Arc<[u8]>,
        text: &str,
        px_size: f32,
        features: &[ShapeFeature],
    ) -> Result<Vec<ShapedRun>, OxiTextError> {
        // Re-itemize only when the text differs from the cached one.
        if self.script_cache_text != text {
            self.script_cache_runs = oxitext_icu::CharProperties::new().itemize(text);
            self.script_cache_text = text.to_owned();
        }
        // Work on a copy so `self` can be borrowed mutably while shaping.
        let cached_runs = self.script_cache_runs.clone();

        cached_runs
            .iter()
            .map(|run| -> Result<ShapedRun, OxiTextError> {
                let run_text = text.get(run.start..run.end).ok_or_else(|| {
                    OxiTextError::Shaping("invalid script run byte range".into())
                })?;

                let mut glyphs = self.shape_with_features_internal(ShapeParams {
                    font_data: &font_data,
                    text: run_text,
                    px_size,
                    rtl: run.script.is_rtl(),
                    script_tag: Some(text_script_to_ot_tag(run.script)),
                    language_tag: None,
                    features,
                })?;

                // Rebase cluster offsets from run-relative to text-absolute.
                let base = run.start as u32;
                glyphs.iter_mut().for_each(|glyph| glyph.cluster += base);

                Ok(ShapedRun {
                    glyphs: glyphs.into(),
                    font_data: Arc::clone(&font_data),
                })
            })
            .collect()
    }
}
1204
1205// ──────────────────────────────────────────────────────────────────────────────
1206// Kashida insertion opportunities (Feature 2)
1207// ──────────────────────────────────────────────────────────────────────────────
1208
/// Returns `true` when `c` is an Arabic character with the Dual_Joining
/// joining type.
///
/// Dual-joining characters connect to neighbours on both sides and are
/// therefore eligible for kashida (tatweel) stretching.  This approximation
/// covers the mainstream Arabic block only: U+0626..=U+063A and
/// U+0641..=U+064A, minus the right-joining letters inside those ranges
/// (Alef U+0627, Teh Marbuta U+0629, Dal–Zain U+062F..=U+0632 and
/// Waw U+0648), which connect on one side only.
///
/// Every other character, including Tatweel U+0640 itself and all
/// non-Arabic characters, yields `false`.
fn is_arabic_dual_joining(c: char) -> bool {
    match c {
        // Right-joining letters that sit inside the ranges below.
        '\u{0627}' | '\u{0629}' | '\u{062F}'..='\u{0632}' | '\u{0648}' => false,
        // Yeh-with-Hamza through Ghain, and Feh through Yeh.
        '\u{0626}'..='\u{063A}' | '\u{0641}'..='\u{064A}' => true,
        _ => false,
    }
}
1227
1228/// Returns glyph indices (into `glyphs`) after which a kashida stretch can be
1229/// inserted for Arabic justification.
1230///
1231/// A position is a kashida opportunity when the source character at the
1232/// glyph's cluster byte offset is an Arabic dual-joining character (one that
1233/// connects on both sides and can therefore be stretched with tatweel).
1234///
1235/// If `text` does not contain Arabic text, or if no glyph's cluster maps to a
1236/// dual-joining character, the returned `Vec` is empty.
1237pub fn find_kashida_opportunities(text: &str, glyphs: &[ShapedGlyph]) -> Vec<usize> {
1238    let mut result = Vec::new();
1239    for (idx, glyph) in glyphs.iter().enumerate() {
1240        let byte_pos = glyph.cluster as usize;
1241        if let Some(ch) = text.get(byte_pos..).and_then(|s| s.chars().next()) {
1242            if is_arabic_dual_joining(ch) {
1243                result.push(idx);
1244            }
1245        }
1246    }
1247    result
1248}
1249
1250// ──────────────────────────────────────────────────────────────────────────────
1251// Emoji ZWJ sequence detection (Feature 3)
1252// ──────────────────────────────────────────────────────────────────────────────
1253
1254/// Returns byte ranges in `text` that correspond to ZWJ-joined emoji sequences.
1255///
1256/// A ZWJ emoji sequence is a grapheme cluster that:
1257/// 1. Contains U+200D (ZERO WIDTH JOINER), **and**
1258/// 2. Has at least two non-ZWJ codepoints (i.e. it is not a bare ZWJ followed
1259///    by nothing).
1260///
1261/// The returned ranges are contiguous byte spans in `text` covering each such
1262/// cluster. When multiple such clusters are adjacent (share no separator) they
1263/// are reported individually.
1264///
1265/// Uses [`unicode_segmentation::UnicodeSegmentation::grapheme_indices`] for
1266/// grapheme-cluster boundaries so that the detection is consistent with UAX #29.
1267pub fn detect_emoji_zwj_sequences(text: &str) -> Vec<std::ops::Range<usize>> {
1268    use unicode_segmentation::UnicodeSegmentation;
1269
1270    let mut result = Vec::new();
1271    for (start, cluster) in text.grapheme_indices(true) {
1272        // A ZWJ sequence must contain the joiner itself.
1273        if !cluster.contains('\u{200D}') {
1274            continue;
1275        }
1276        // Must also have at least 2 non-ZWJ codepoints.
1277        let non_zwj_count = cluster.chars().filter(|&c| c != '\u{200D}').count();
1278        if non_zwj_count >= 2 {
1279            let end = start + cluster.len();
1280            result.push(start..end);
1281        }
1282    }
1283    result
1284}
1285
1286#[cfg(test)]
1287mod bench_tests;
1288#[cfg(test)]
1289mod tests_inline;