Skip to main content

oxitext_shape/
lib.rs

1#![forbid(unsafe_code)]
2#![warn(missing_docs)]
3//! `oxitext-shape` — Swash-based text shaper for OxiText.
4//!
5//! Provides [`SwashShaper`], which wraps swash's [`ShapeContext`] and produces
6//! [`ShapedRun`]s from UTF-8 text + raw font bytes.
7//!
8//! M1: LTR Latin shaping. Bidi (M2) and script-specific itemisation (M3) are
9//! deferred.
10//!
11//! # M3 additions
12//!
13//! - [`backend`]: Swappable [`backend::ShapeBackend`] trait, with the default
14//!   [`backend::SwashShaperBackend`] wrapper and optional
15//!   [`backend::RustybuzzShaper`] (feature `rustybuzz-backend`).
16//!
17//! # M5 additions (Slice 5a)
18//!
19//! - [`cache`]: Bounded LRU shape cache ([`cache::ShapeCache`],
20//!   [`cache::ShapeKey`]) backed by [`lru::LruCache`].
21//! - [`SwashShaper::with_cache`]: creates a `SwashShaper` with an attached
22//!   `ShapeCache`; subsequent `shape()` calls check the cache before invoking
23//!   swash.
24//!
25//! # Feature-aware shaping (Slice 6)
26//!
27//! - [`ShapeFeature`]: an OpenType feature tag-value pair.
28//! - [`ShapeDirection`]: direction enum (Ltr/Rtl/Ttb/Btt).
29//! - [`ShapeRequest`] / [`ShapeRequestBuilder`]: builder pattern for a full
30//!   shaping request including text, font, size, direction, script, language,
31//!   and a list of [`ShapeFeature`]s.
32//! - [`SwashShaper::shape_request`]: shapes a complete [`ShapeRequest`], with
33//!   automatic `vert`/`vrt2` feature injection for top-to-bottom text.
34//! - [`SwashShaper::shape_with_features`]: lower-level entry point that
35//!   accepts a feature slice directly.
36
37pub mod backend;
38pub mod batch;
39pub mod cache;
40pub mod script_detect;
41pub mod variational;
42
43#[cfg(feature = "system-fonts")]
44pub mod system_fonts;
45#[cfg(feature = "system-fonts")]
46pub use system_fonts::{
47    build_system_db, load_best_font_for_text, load_best_font_for_text_from, load_font_for_family,
48    load_font_for_family_from,
49};
50
51#[cfg(feature = "rustybuzz-backend")]
52pub use backend::RustybuzzShaper;
53pub use backend::ShapeBackend;
54pub use backend::SwashShaperBackend;
55pub use cache::{FontId, ShapeCache, ShapeKey};
56use oxitext_core::{OxiTextError, ShapedGlyph, ShapedRun};
57pub use script_detect::{
58    requires_arabic_shaping, requires_indic_shaping, requires_mark_positioning,
59};
60use smallvec::SmallVec;
61use std::sync::Arc;
62use swash::shape::{Direction, ShapeContext};
63use swash::FontRef;
64// ──────────────────────────────────────────────────────────────────────────────
65// ShapeFeature
66// ──────────────────────────────────────────────────────────────────────────────
67
/// A single OpenType feature setting: a four-byte tag paired with a value.
///
/// Tags are 4-byte ASCII identifiers such as `b"liga"`, `b"kern"` or
/// `b"smcp"`.  The value `0` switches a feature off, `1` switches it on, and
/// anything above `1` picks an alternate index (used by features like `salt`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct ShapeFeature {
    /// 4-byte ASCII OpenType feature tag.
    pub tag: [u8; 4],
    /// Setting for the feature: 0 = off, 1 = on, >1 = alternate index.
    pub value: u32,
}

impl ShapeFeature {
    /// Standard ligatures (`liga`), enabled.
    pub const LIGA: Self = Self::new(*b"liga", 1);
    /// Kerning (`kern`), enabled.
    pub const KERN: Self = Self::new(*b"kern", 1);
    /// Small capitals (`smcp`), enabled.
    pub const SMCP: Self = Self::new(*b"smcp", 1);
    /// Contextual alternates (`calt`), enabled.
    pub const CALT: Self = Self::new(*b"calt", 1);
    /// Vertical forms (`vert`): swaps upright CJK glyphs for vertical ones.
    pub const VERT: Self = Self::new(*b"vert", 1);
    /// Vertical rotation (`vrt2`): alternative to `vert` in some CJK contexts.
    pub const VRT2: Self = Self::new(*b"vrt2", 1);

    /// Builds a feature from a tag and an arbitrary value.
    pub const fn new(tag: [u8; 4], value: u32) -> Self {
        ShapeFeature { tag, value }
    }

    /// Builds a feature that is switched on (`value = 1`).
    pub const fn enable(tag: [u8; 4]) -> Self {
        Self::new(tag, 1)
    }

    /// Builds a feature that is switched off (`value = 0`).
    pub const fn disable(tag: [u8; 4]) -> Self {
        Self::new(tag, 0)
    }
}
110
111// ──────────────────────────────────────────────────────────────────────────────
112// ShapeDirection
113// ──────────────────────────────────────────────────────────────────────────────
114
/// Direction in which a shaping request lays out its text.
///
/// The default is [`ShapeDirection::Ltr`].
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum ShapeDirection {
    /// Left-to-right; the default (Latin, Cyrillic, and similar scripts).
    #[default]
    Ltr,
    /// Right-to-left (Arabic, Hebrew, and similar scripts).
    Rtl,
    /// Top-to-bottom (vertical CJK text).
    Ttb,
    /// Bottom-to-top (rarely used).
    Btt,
}
128
129// ──────────────────────────────────────────────────────────────────────────────
130// ShapeRequest / ShapeRequestBuilder
131// ──────────────────────────────────────────────────────────────────────────────
132
133/// A complete shaping request with all parameters.
134///
135/// Build via [`ShapeRequest::builder`] and then call
136/// [`SwashShaper::shape_request`].
137#[derive(Debug, Clone)]
138pub struct ShapeRequest<'a> {
139    /// UTF-8 text to shape.
140    pub text: &'a str,
141    /// Raw font bytes.
142    pub font_data: &'a [u8],
143    /// Font size in pixels-per-em.
144    pub px_size: f32,
145    /// Shaping direction.
146    pub direction: ShapeDirection,
147    /// OpenType script tag (e.g. `b"latn"`, `b"arab"`), or `None` for
148    /// auto-detection.
149    pub script: Option<[u8; 4]>,
150    /// OpenType language tag (e.g. `b"ENG "`, `b"ARA "`), or `None`.
151    pub language: Option<[u8; 4]>,
152    /// OpenType feature overrides.
153    pub features: Vec<ShapeFeature>,
154}
155
156impl<'a> ShapeRequest<'a> {
157    /// Returns a new [`ShapeRequestBuilder`].
158    pub fn builder() -> ShapeRequestBuilder<'a> {
159        ShapeRequestBuilder::default()
160    }
161}
162
163/// Builder for [`ShapeRequest`].
164#[derive(Debug, Default)]
165pub struct ShapeRequestBuilder<'a> {
166    text: Option<&'a str>,
167    font_data: Option<&'a [u8]>,
168    px_size: f32,
169    direction: ShapeDirection,
170    script: Option<[u8; 4]>,
171    language: Option<[u8; 4]>,
172    features: Vec<ShapeFeature>,
173}
174
/// Reasons why [`ShapeRequestBuilder::build`] can fail.
#[derive(Debug)]
pub enum ShapeRequestError {
    /// No `text` was supplied to the builder.
    MissingText,
    /// No `font_data` was supplied to the builder.
    MissingFont,
}

impl std::fmt::Display for ShapeRequestError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pick the static message first, then write it in one place.
        let message = match self {
            Self::MissingText => "text not set",
            Self::MissingFont => "font_data not set",
        };
        f.write_str(message)
    }
}

impl std::error::Error for ShapeRequestError {}
194
195impl<'a> ShapeRequestBuilder<'a> {
196    /// Sets the text to shape.
197    pub fn text(mut self, t: &'a str) -> Self {
198        self.text = Some(t);
199        self
200    }
201
202    /// Sets the raw font bytes.
203    pub fn font_data(mut self, d: &'a [u8]) -> Self {
204        self.font_data = Some(d);
205        self
206    }
207
208    /// Sets the font size in pixels-per-em.
209    pub fn px_size(mut self, s: f32) -> Self {
210        self.px_size = s;
211        self
212    }
213
214    /// Sets the shaping direction.
215    pub fn direction(mut self, d: ShapeDirection) -> Self {
216        self.direction = d;
217        self
218    }
219
220    /// Pins the OpenType script tag (overrides swash's auto-detection).
221    pub fn script(mut self, tag: [u8; 4]) -> Self {
222        self.script = Some(tag);
223        self
224    }
225
226    /// Pins the OpenType language tag for language-specific GSUB/GPOS rules.
227    pub fn language(mut self, tag: [u8; 4]) -> Self {
228        self.language = Some(tag);
229        self
230    }
231
232    /// Appends an OpenType feature override.
233    pub fn feature(mut self, f: ShapeFeature) -> Self {
234        self.features.push(f);
235        self
236    }
237
238    /// Builds the [`ShapeRequest`].
239    ///
240    /// # Errors
241    /// Returns [`ShapeRequestError::MissingText`] or
242    /// [`ShapeRequestError::MissingFont`] if the respective fields were not
243    /// provided.
244    pub fn build(self) -> Result<ShapeRequest<'a>, ShapeRequestError> {
245        Ok(ShapeRequest {
246            text: self.text.ok_or(ShapeRequestError::MissingText)?,
247            font_data: self.font_data.ok_or(ShapeRequestError::MissingFont)?,
248            px_size: self.px_size,
249            direction: self.direction,
250            script: self.script,
251            language: self.language,
252            features: self.features,
253        })
254    }
255}
256
257// ──────────────────────────────────────────────────────────────────────────────
258// Internal parameter bundle used by shape_with_features_internal
259// ──────────────────────────────────────────────────────────────────────────────
260
261/// Internal parameter bundle for the unified shaping entry point.
262///
263/// Groups all shaping inputs into a single struct so `shape_with_features_internal`
264/// stays under the clippy `too_many_arguments` threshold.
265struct ShapeParams<'a> {
266    font_data: &'a [u8],
267    text: &'a str,
268    px_size: f32,
269    rtl: bool,
270    script_tag: Option<[u8; 4]>,
271    language_tag: Option<[u8; 4]>,
272    features: &'a [ShapeFeature],
273}
274
275// ──────────────────────────────────────────────────────────────────────────────
276// SwashShaper
277// ──────────────────────────────────────────────────────────────────────────────
278
279/// Text shaper backed by [swash].
280///
281/// Keep a single `SwashShaper` alive across multiple layout passes to amortise
282/// the cost of the internal LRU caches that swash maintains in [`ShapeContext`].
283///
284/// Optionally attach a [`ShapeCache`] via [`Self::with_cache`] to skip swash
285/// entirely on repeated requests for the same `(font, text, size)` tuple.
286pub struct SwashShaper {
287    ctx: ShapeContext,
288    /// Optional application-level shape cache.
289    cache: Option<Arc<ShapeCache>>,
290    /// Cached text string for script-run reuse (Item 4).
291    #[cfg(feature = "icu")]
292    script_cache_text: String,
293    /// Cached script runs for the cached text (Item 4).
294    #[cfg(feature = "icu")]
295    script_cache_runs: Vec<oxitext_icu::ScriptRun>,
296}
297
298impl SwashShaper {
299    /// Creates a new shaper with default cache settings and no shape cache.
300    pub fn new() -> Self {
301        Self {
302            ctx: ShapeContext::new(),
303            cache: None,
304            #[cfg(feature = "icu")]
305            script_cache_text: String::new(),
306            #[cfg(feature = "icu")]
307            script_cache_runs: Vec::new(),
308        }
309    }
310
311    /// Creates a new shaper with an attached [`ShapeCache`] of `capacity` entries.
312    ///
313    /// Repeated calls to [`Self::shape`] with the same `(font_data, text, size)`
314    /// tuple will be served from the cache after the first miss.
315    ///
316    /// # Arguments
317    /// - `capacity`: maximum number of [`ShapedRun`]s to keep in the cache.
318    ///   Passing `0` uses a minimum capacity of 1.
319    pub fn with_cache(capacity: usize) -> Self {
320        Self {
321            ctx: ShapeContext::new(),
322            cache: Some(Arc::new(ShapeCache::new(capacity))),
323            #[cfg(feature = "icu")]
324            script_cache_text: String::new(),
325            #[cfg(feature = "icu")]
326            script_cache_runs: Vec::new(),
327        }
328    }
329
330    /// Returns a reference to the attached shape cache, if any.
331    pub fn shape_cache(&self) -> Option<&Arc<ShapeCache>> {
332        self.cache.as_ref()
333    }
334
335    /// Shapes `text` using the font in `font_data` at `size` pixels-per-em.
336    ///
337    /// Returns a [`ShapedRun`] containing one [`ShapedGlyph`] per output glyph.
338    /// The `x_advance` of each glyph is in pixels (already scaled by `size`).
339    ///
340    /// When an attached [`ShapeCache`] is present the result is looked up
341    /// before invoking swash.  Cache keys incorporate `font_data` pointer
342    /// identity, the exact text, and `size`.
343    ///
344    /// # Errors
345    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed by
346    /// swash.
347    pub fn shape(
348        &mut self,
349        text: &str,
350        font_data: Arc<[u8]>,
351        size: f32,
352    ) -> Result<ShapedRun, OxiTextError> {
353        // Build a deterministic axis hash from the size (no variation axes yet).
354        let axis_hash = size.to_bits() as u64;
355
356        // Check cache if attached.
357        if let Some(ref cache) = self.cache {
358            let key = ShapeKey::new(&font_data, text, axis_hash);
359            if let Some(cached) = cache.get(&key) {
360                return Ok((*cached).clone());
361            }
362        }
363
364        // Cache miss — invoke swash.
365        let font = FontRef::from_index(&font_data, 0)
366            .ok_or_else(|| OxiTextError::Shaping("swash could not parse font bytes".into()))?;
367
368        let mut shaper = self.ctx.builder(font).size(size).build();
369        shaper.add_str(text);
370
371        let mut glyphs: SmallVec<[ShapedGlyph; 8]> = SmallVec::new();
372        shaper.shape_with(|cluster| {
373            // A cluster is whitespace if every source char it covers is
374            // whitespace. Most whitespace clusters cover a single space/tab.
375            let cluster_range = cluster.source.start as usize..cluster.source.end as usize;
376            let is_ws = text
377                .get(cluster_range)
378                .map(|slice| !slice.is_empty() && slice.chars().all(|c| c.is_whitespace()))
379                .unwrap_or(false);
380            // More than one glyph in a cluster means inner glyphs are unsafe
381            // to break before (ligature / mark attachment).
382            let multi = cluster.glyphs.len() > 1;
383            for (idx, glyph) in cluster.glyphs.iter().enumerate() {
384                // A glyph is unsafe to break before if it is inside a
385                // multi-glyph cluster (idx > 0) OR if it carries the mark
386                // attachment flag (combining mark attached to a base glyph).
387                let utb = (multi && idx > 0) || glyph.info.is_mark();
388                glyphs.push(ShapedGlyph {
389                    gid: glyph.id,
390                    x_advance: glyph.advance,
391                    y_advance: 0.0,
392                    x_offset: glyph.x,
393                    y_offset: glyph.y,
394                    cluster: cluster.source.start,
395                    is_whitespace: is_ws,
396                    unsafe_to_break: utb,
397                });
398            }
399        });
400
401        let run = ShapedRun {
402            glyphs,
403            font_data: Arc::clone(&font_data),
404        };
405
406        // Populate cache on miss.
407        if let Some(ref cache) = self.cache {
408            let key = ShapeKey::new(&font_data, text, axis_hash);
409            cache.insert(key, Arc::new(run.clone()));
410        }
411
412        Ok(run)
413    }
414
415    /// Shapes `text` with explicit direction control.
416    ///
417    /// When `rtl` is `false` this is identical to [`Self::shape`].
418    ///
419    /// When `rtl` is `true` the shaper signals `Direction::RightToLeft` to
420    /// swash (enabling correct Arabic/Hebrew form selection via OpenType GSUB),
421    /// then **sorts** the resulting glyphs by ascending `cluster` byte offset so
422    /// the output is always in **logical source order** regardless of what swash
423    /// emits.  The caller (bidi engine) is responsible for visual reordering.
424    ///
425    /// # Errors
426    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
427    pub fn shape_with_direction(
428        &mut self,
429        text: &str,
430        font_data: Arc<[u8]>,
431        size: f32,
432        rtl: bool,
433    ) -> Result<ShapedRun, OxiTextError> {
434        if !rtl {
435            return self.shape(text, font_data, size);
436        }
437        // RTL path: shape with the explicit RightToLeft hint so swash can apply
438        // direction-sensitive GSUB lookups, then sort to ascending cluster order
439        // (logical order) to satisfy the architecture contract.
440        let mut run = self.do_shape_rtl(text, font_data, size)?;
441        run.glyphs.sort_by_key(|g| g.cluster);
442        Ok(run)
443    }
444
445    /// Shapes text using all parameters in a [`ShapeRequest`].
446    ///
447    /// When `direction` is [`ShapeDirection::Ttb`] or [`ShapeDirection::Btt`],
448    /// the `vert` and `vrt2` OpenType features are **automatically appended**
449    /// to the feature list (if not already present) so that fonts with a
450    /// vertical substitution table produce the correct glyph variants.
451    ///
452    /// Script and language tags, if provided, are forwarded to swash's
453    /// `ShaperBuilder` for language-specific GSUB/GPOS rule selection.
454    ///
455    /// # Errors
456    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
457    pub fn shape_request(
458        &mut self,
459        req: &ShapeRequest<'_>,
460    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
461        // When the `icu` feature is enabled, normalize text to NFC before shaping
462        // so that precomposed and decomposed spellings produce identical glyph runs.
463        #[cfg(feature = "icu")]
464        let normalized_text: String;
465        #[cfg(feature = "icu")]
466        let req_text: &str = {
467            normalized_text = oxitext_icu::Normalizer::new().nfc(req.text);
468            normalized_text.as_str()
469        };
470        #[cfg(not(feature = "icu"))]
471        let req_text: &str = req.text;
472
473        // When direction is Ltr but the text is Arabic, auto-upgrade to Rtl
474        // so swash can apply the correct Arabic GSUB form-selection lookups.
475        let effective_direction = if req.direction == ShapeDirection::Ltr
476            && requires_arabic_shaping(req_text)
477        {
478            #[cfg(debug_assertions)]
479            eprintln!("[oxitext-shape] Arabic text detected with Ltr direction; upgrading to Rtl");
480            ShapeDirection::Rtl
481        } else {
482            req.direction
483        };
484
485        // Auto-inject vertical OpenType features for vertical directions.
486        let mut features = req.features.clone();
487        if effective_direction == ShapeDirection::Ttb || effective_direction == ShapeDirection::Btt
488        {
489            if !features.iter().any(|f| f.tag == *b"vert") {
490                features.push(ShapeFeature::VERT);
491            }
492            if !features.iter().any(|f| f.tag == *b"vrt2") {
493                features.push(ShapeFeature::VRT2);
494            }
495        }
496
497        let rtl = effective_direction == ShapeDirection::Rtl;
498        self.shape_with_features_internal(ShapeParams {
499            font_data: req.font_data,
500            text: req_text,
501            px_size: req.px_size,
502            rtl,
503            script_tag: req.script,
504            language_tag: req.language,
505            features: &features,
506        })
507    }
508
509    /// Shapes text with an explicit list of OpenType feature overrides.
510    ///
511    /// Unlike [`Self::shape_request`] this entry point does **not** inject
512    /// vertical features automatically; callers are responsible for supplying
513    /// the full feature list.
514    ///
515    /// # Errors
516    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
517    pub fn shape_with_features(
518        &mut self,
519        font_data: &[u8],
520        text: &str,
521        px_size: f32,
522        rtl: bool,
523        features: &[ShapeFeature],
524    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
525        self.shape_with_features_internal(ShapeParams {
526            font_data,
527            text,
528            px_size,
529            rtl,
530            script_tag: None,
531            language_tag: None,
532            features,
533        })
534    }
535
536    /// Internal implementation shared by [`Self::shape_request`] and
537    /// [`Self::shape_with_features`].
538    fn shape_with_features_internal(
539        &mut self,
540        params: ShapeParams<'_>,
541    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
542        use swash::tag_from_bytes;
543        use swash::text::{Language, Script};
544
545        let font = FontRef::from_index(params.font_data, 0)
546            .ok_or_else(|| OxiTextError::Shaping("swash could not parse font bytes".into()))?;
547
548        let direction = if params.rtl {
549            Direction::RightToLeft
550        } else {
551            Direction::LeftToRight
552        };
553
554        // Resolve the optional script tag to a swash Script enum value.
555        let script = params
556            .script_tag
557            .and_then(|t| Script::from_opentype(tag_from_bytes(&t)))
558            .unwrap_or(Script::Latin);
559
560        // Resolve the optional language tag to a swash Language.
561        let language = params.language_tag.and_then(|t| {
562            // swash Language::parse expects a BCP-47 string; for OpenType tags
563            // we convert the raw bytes to a lossy str and try to parse them.
564            let s = std::str::from_utf8(&t).unwrap_or("").trim_end();
565            Language::parse(s)
566        });
567
568        // Convert our ShapeFeature slice to swash-compatible (tag, value) pairs.
569        // swash's `ShaperBuilder::features` accepts any iterator whose items
570        // implement `Into<Setting<u16>>`.  The swash crate provides
571        // `From<&([u8; 4], T)> for Setting<T>`, so we pass an iterator of
572        // references to satisfy the bound.
573        let swash_features: Vec<([u8; 4], u16)> = params
574            .features
575            .iter()
576            .map(|f| (f.tag, f.value.min(u32::from(u16::MAX)) as u16))
577            .collect();
578
579        let mut shaper = self
580            .ctx
581            .builder(font)
582            .size(params.px_size)
583            .direction(direction)
584            .script(script)
585            .language(language)
586            .features(swash_features.iter())
587            .build();
588
589        shaper.add_str(params.text);
590
591        let mut glyphs: Vec<ShapedGlyph> = Vec::new();
592        shaper.shape_with(|cluster| {
593            let cluster_range = cluster.source.start as usize..cluster.source.end as usize;
594            let is_ws = params
595                .text
596                .get(cluster_range)
597                .map(|slice| !slice.is_empty() && slice.chars().all(|c| c.is_whitespace()))
598                .unwrap_or(false);
599            let multi = cluster.glyphs.len() > 1;
600            for (idx, glyph) in cluster.glyphs.iter().enumerate() {
601                let utb = (multi && idx > 0) || glyph.info.is_mark();
602                glyphs.push(ShapedGlyph {
603                    gid: glyph.id,
604                    x_advance: glyph.advance,
605                    y_advance: 0.0,
606                    x_offset: glyph.x,
607                    y_offset: glyph.y,
608                    cluster: cluster.source.start,
609                    is_whitespace: is_ws,
610                    unsafe_to_break: utb,
611                });
612            }
613        });
614
615        if params.rtl {
616            glyphs.sort_by_key(|g| g.cluster);
617        }
618
619        Ok(glyphs)
620    }
621
622    /// Internal RTL shaping path: invokes swash with `Direction::RightToLeft`.
623    ///
624    /// Returns glyphs in whatever order swash produces; the public
625    /// [`Self::shape_with_direction`] sorts them to ascending cluster order.
626    fn do_shape_rtl(
627        &mut self,
628        text: &str,
629        font_data: Arc<[u8]>,
630        size: f32,
631    ) -> Result<ShapedRun, OxiTextError> {
632        let font = FontRef::from_index(&font_data, 0)
633            .ok_or_else(|| OxiTextError::Shaping("swash could not parse font bytes".into()))?;
634
635        let mut shaper = self
636            .ctx
637            .builder(font)
638            .size(size)
639            .direction(Direction::RightToLeft)
640            .build();
641        shaper.add_str(text);
642
643        let mut glyphs: SmallVec<[ShapedGlyph; 8]> = SmallVec::new();
644        shaper.shape_with(|cluster| {
645            let cluster_range = cluster.source.start as usize..cluster.source.end as usize;
646            let is_ws = text
647                .get(cluster_range)
648                .map(|slice| !slice.is_empty() && slice.chars().all(|c| c.is_whitespace()))
649                .unwrap_or(false);
650            let multi = cluster.glyphs.len() > 1;
651            for (idx, glyph) in cluster.glyphs.iter().enumerate() {
652                let utb = (multi && idx > 0) || glyph.info.is_mark();
653                glyphs.push(ShapedGlyph {
654                    gid: glyph.id,
655                    x_advance: glyph.advance,
656                    y_advance: 0.0,
657                    x_offset: glyph.x,
658                    y_offset: glyph.y,
659                    cluster: cluster.source.start,
660                    is_whitespace: is_ws,
661                    unsafe_to_break: utb,
662                });
663            }
664        });
665
666        Ok(ShapedRun {
667            glyphs,
668            font_data: Arc::clone(&font_data),
669        })
670    }
671
672    /// Shapes `text` and returns a rich [`ShapeResult`] with metadata.
673    ///
674    /// The result includes the glyph list, the direction used, and any
675    /// codepoints that could not be mapped (glyph ID 0 / `.notdef`).
676    ///
677    /// # Errors
678    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
679    pub fn shape_full(
680        &mut self,
681        font_data: &[u8],
682        text: &str,
683        px_size: f32,
684    ) -> Result<ShapeResult, OxiTextError> {
685        use unicode_segmentation::UnicodeSegmentation;
686
687        let glyphs = self.shape_with_features_internal(ShapeParams {
688            font_data,
689            text,
690            px_size,
691            rtl: false,
692            script_tag: None,
693            language_tag: None,
694            features: &[],
695        })?;
696        let mut result = ShapeResult::from_glyphs(glyphs, text, ShapeDirection::Ltr);
697        // Populate grapheme cluster boundaries: start offset of each grapheme
698        // plus the end-of-text sentinel.
699        result.cluster_boundaries = text
700            .grapheme_indices(true)
701            .map(|(i, _)| i)
702            .chain(std::iter::once(text.len()))
703            .collect();
704        Ok(result)
705    }
706
707    /// Shapes `text` using raw font bytes supplied as `&[u8]` (LTR).
708    ///
709    /// A convenience wrapper over `Self::shape_with_features_internal` for
710    /// callers that already hold raw font bytes and do not need the `Arc` wrapping
711    /// or cache infrastructure of [`Self::shape`].
712    ///
713    /// # Errors
714    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
715    pub fn shape_slice(
716        &mut self,
717        font_data: &[u8],
718        text: &str,
719        px_size: f32,
720    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
721        self.shape_with_features_internal(ShapeParams {
722            font_data,
723            text,
724            px_size,
725            rtl: false,
726            script_tag: None,
727            language_tag: None,
728            features: &[],
729        })
730    }
731
732    /// Shapes `text` using raw font bytes supplied as `&[u8]` (RTL).
733    ///
734    /// Like [`Self::shape_slice`] but shapes in right-to-left direction and
735    /// returns glyphs in ascending `cluster` (logical source) order.
736    ///
737    /// # Errors
738    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
739    pub fn shape_slice_rtl(
740        &mut self,
741        font_data: &[u8],
742        text: &str,
743        px_size: f32,
744    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
745        self.shape_with_features_internal(ShapeParams {
746            font_data,
747            text,
748            px_size,
749            rtl: true,
750            script_tag: None,
751            language_tag: None,
752            features: &[],
753        })
754    }
755
756    /// Shapes `text` with a font fallback chain.
757    ///
758    /// For each codepoint that produces `glyph_id == 0` (`.notdef`), the
759    /// corresponding text run is re-shaped with each successive fallback font
760    /// in `fonts[1..]`.  If a fallback produces a non-zero glyph ID the
761    /// fallback glyphs replace the `.notdef` glyphs in the result; otherwise
762    /// the `.notdef` glyphs are preserved (best-effort).
763    ///
764    /// `fonts[0]` is the primary font; `fonts[1..]` are tried in order.
765    ///
766    /// # Note on cluster offsets
767    ///
768    /// When a sub-string is re-shaped with a fallback font, swash emits cluster
769    /// byte offsets **relative to that sub-string** (starting at 0).  This
770    /// function adds the original start offset back before merging so all
771    /// returned glyphs carry absolute offsets into `text`.
772    ///
773    /// # Errors
774    /// Returns [`OxiTextError::Shaping`] if the primary font cannot be parsed.
775    pub fn shape_with_fallback(
776        &mut self,
777        fonts: &[&[u8]],
778        text: &str,
779        px_size: f32,
780    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
781        let primary = fonts
782            .first()
783            .ok_or_else(|| OxiTextError::Shaping("font list is empty".into()))?;
784
785        // 1. Shape with the primary font.
786        let mut result = self.shape_with_features_internal(ShapeParams {
787            font_data: primary,
788            text,
789            px_size,
790            rtl: false,
791            script_tag: None,
792            language_tag: None,
793            features: &[],
794        })?;
795
796        if fonts.len() <= 1 {
797            return Ok(result);
798        }
799
800        // 2. Find contiguous runs of .notdef (glyph ID 0) glyphs.
801        let notdef_runs = collect_notdef_runs(&result, text);
802
803        // 3. For each .notdef run try the fallback fonts.
804        for (run_text_start, run_text_end) in notdef_runs {
805            let sub_text = match text.get(run_text_start..run_text_end) {
806                Some(s) if !s.is_empty() => s,
807                _ => continue,
808            };
809
810            // Try each fallback font in order.
811            'fallback: for fallback_font in &fonts[1..] {
812                let fallback_glyphs = match self.shape_with_features_internal(ShapeParams {
813                    font_data: fallback_font,
814                    text: sub_text,
815                    px_size,
816                    rtl: false,
817                    script_tag: None,
818                    language_tag: None,
819                    features: &[],
820                }) {
821                    Ok(g) => g,
822                    Err(_) => continue,
823                };
824
825                // Only use this fallback if it resolved at least one glyph.
826                if fallback_glyphs.iter().all(|g| g.gid == 0) {
827                    continue;
828                }
829
830                // Adjust cluster offsets from sub-string-relative to
831                // text-absolute and replace the .notdef glyphs in result.
832                let start_offset = run_text_start as u32;
833                let adjusted: Vec<ShapedGlyph> = fallback_glyphs
834                    .into_iter()
835                    .map(|mut g| {
836                        g.cluster += start_offset;
837                        g
838                    })
839                    .collect();
840
841                // Replace glyphs in the result whose cluster falls in [run_text_start, run_text_end).
842                result.retain(|g| {
843                    let c = g.cluster as usize;
844                    !(c >= run_text_start && c < run_text_end && g.gid == 0)
845                });
846
847                // Insert adjusted fallback glyphs at the correct position.
848                let insert_pos = result.partition_point(|g| (g.cluster as usize) < run_text_start);
849                for (i, g) in adjusted.into_iter().enumerate() {
850                    result.insert(insert_pos + i, g);
851                }
852
853                break 'fallback;
854            }
855        }
856
857        Ok(result)
858    }
859
860    /// Returns `true` if the given font data contains AAT layout tables.
861    ///
862    /// Checks for the presence of `morx` (extended glyph metamorphosis rules),
863    /// `kerx` (extended kerning data), or `ankr` (anchor point) tables — the
864    /// three primary tables that distinguish Apple Advanced Typography (AAT)
865    /// fonts from pure OpenType fonts.
866    ///
867    /// Swash's [`ShapeContext`] already applies AAT tables transparently when
868    /// present, so this function is informational only; it does not change the
869    /// shaping path.
870    pub fn font_has_aat(font_data: &[u8]) -> bool {
871        ttf_parser::Face::parse(font_data, 0)
872            .map(|face| {
873                face.raw_face()
874                    .table(ttf_parser::Tag::from_bytes(b"morx"))
875                    .is_some()
876                    || face
877                        .raw_face()
878                        .table(ttf_parser::Tag::from_bytes(b"kerx"))
879                        .is_some()
880                    || face
881                        .raw_face()
882                        .table(ttf_parser::Tag::from_bytes(b"ankr"))
883                        .is_some()
884            })
885            .unwrap_or(false)
886    }
887
888    /// Shape using AAT if the font has Morx/Kerx tables, otherwise fall back to
889    /// standard OpenType shaping.
890    ///
891    /// Swash handles both AAT and OpenType tables transparently via its
892    /// `ShapeContext`; this method is informational. It delegates directly to
893    /// `Self::shape_with_features_internal` regardless of table presence.
894    ///
895    /// # Errors
896    /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
897    pub fn shape_with_aat_fallback(
898        &mut self,
899        font_data: &[u8],
900        text: &str,
901        px_size: f32,
902    ) -> Result<ShapeResult, OxiTextError> {
903        use unicode_segmentation::UnicodeSegmentation;
904
905        let glyphs = self.shape_with_features_internal(ShapeParams {
906            font_data,
907            text,
908            px_size,
909            rtl: false,
910            script_tag: None,
911            language_tag: None,
912            features: &[],
913        })?;
914        let mut result = ShapeResult::from_glyphs(glyphs, text, ShapeDirection::Ltr);
915        result.cluster_boundaries = text
916            .grapheme_indices(true)
917            .map(|(i, _)| i)
918            .chain(std::iter::once(text.len()))
919            .collect();
920        Ok(result)
921    }
922}
923
924// ──────────────────────────────────────────────────────────────────────────────
925// ShapeResult
926// ──────────────────────────────────────────────────────────────────────────────
927
928/// Extended shaping result with metadata.
929///
930/// Produced by [`SwashShaper::shape_full`]; includes the glyph list, the
931/// direction resolved by the shaper, the OpenType script tag (if known), and a
932/// list of Unicode codepoints that could not be mapped (glyph ID 0 / `.notdef`).
933#[derive(Debug, Clone)]
934pub struct ShapeResult {
935    /// Shaped glyphs in logical cluster order.
936    pub glyphs: Vec<ShapedGlyph>,
937    /// OpenType script tag detected (e.g. `b"latn"`, `b"arab"`), or `None` if
938    /// unknown.  May be set by the caller after construction.
939    pub script_detected: Option<[u8; 4]>,
940    /// Direction resolved by the shaper.
941    pub direction: ShapeDirection,
942    /// Unicode codepoints that produced a `.notdef` glyph (ID 0).
943    pub missing_codepoints: Vec<char>,
944    /// Byte offsets (in the original text) where grapheme cluster boundaries fall.
945    ///
946    /// Populated by [`SwashShaper::shape_full`].  Empty when [`SwashShaper::shape`]
947    /// is called directly.  The first entry is `0` (start of text) and the last
948    /// entry is `text.len()` (end of text).
949    pub cluster_boundaries: Vec<usize>,
950}
951
952impl ShapeResult {
953    /// Constructs a [`ShapeResult`] from a glyph vector, the source text, and
954    /// the shaping direction.
955    ///
956    /// `script_detected` is left as `None`; callers may set it afterwards.
957    pub fn from_glyphs(glyphs: Vec<ShapedGlyph>, text: &str, direction: ShapeDirection) -> Self {
958        let missing: Vec<char> = {
959            let mut seen = std::collections::HashSet::new();
960            let mut missing = Vec::new();
961            for g in &glyphs {
962                if g.gid == 0 {
963                    if let Some(ch) = text
964                        .get(g.cluster as usize..)
965                        .and_then(|s| s.chars().next())
966                    {
967                        if seen.insert(ch) {
968                            missing.push(ch);
969                        }
970                    }
971                }
972            }
973            missing
974        };
975        Self {
976            glyphs,
977            script_detected: None,
978            direction,
979            missing_codepoints: missing,
980            cluster_boundaries: Vec::new(),
981        }
982    }
983}
984
985// ──────────────────────────────────────────────────────────────────────────────
986// Helpers
987// ──────────────────────────────────────────────────────────────────────────────
988
989/// Collect contiguous byte ranges in `text` that are covered exclusively by
990/// `.notdef` (glyph ID 0) glyphs in `glyphs`.
991///
992/// Returns a `Vec` of `(start, end)` byte offset pairs into `text`.
993fn collect_notdef_runs(glyphs: &[ShapedGlyph], text: &str) -> Vec<(usize, usize)> {
994    if glyphs.is_empty() {
995        return Vec::new();
996    }
997
998    // Build a sorted, deduplicated list of cluster byte offsets that are .notdef.
999    let mut notdef_clusters: Vec<usize> = glyphs
1000        .iter()
1001        .filter(|g| g.gid == 0)
1002        .map(|g| g.cluster as usize)
1003        .collect();
1004    notdef_clusters.sort_unstable();
1005    notdef_clusters.dedup();
1006
1007    // Build a sorted list of all cluster start offsets (regardless of gid).
1008    let mut all_starts: Vec<usize> = glyphs.iter().map(|g| g.cluster as usize).collect();
1009    all_starts.sort_unstable();
1010    all_starts.dedup();
1011
1012    // For each .notdef cluster, determine the end offset: it's the byte offset
1013    // of the next cluster in `all_starts`, or `text.len()` for the last one.
1014    let mut runs: Vec<(usize, usize)> = Vec::new();
1015    for &start in &notdef_clusters {
1016        let end = all_starts
1017            .iter()
1018            .find(|&&s| s > start)
1019            .copied()
1020            .unwrap_or(text.len());
1021        // Merge with the previous run if adjacent.
1022        if let Some(last) = runs.last_mut() {
1023            if last.1 == start {
1024                last.1 = end;
1025                continue;
1026            }
1027        }
1028        runs.push((start, end));
1029    }
1030    runs
1031}
1032
1033impl Default for SwashShaper {
1034    fn default() -> Self {
1035        Self::new()
1036    }
1037}
1038
1039// ──────────────────────────────────────────────────────────────────────────────
1040// System font convenience methods (feature `system-fonts`)
1041// ──────────────────────────────────────────────────────────────────────────────
1042
#[cfg(feature = "system-fonts")]
impl SwashShaper {
    /// Shapes `text` with the system font that best covers its characters.
    ///
    /// The font is looked up with [`system_fonts::load_best_font_for_text`]
    /// and `text` is then shaped with it at `px_size` pixels-per-em.
    ///
    /// The lookup runs on every call. Callers that shape many strings with
    /// the same font should load the font bytes once with
    /// [`system_fonts::load_best_font_for_text`] and call
    /// [`Self::shape_slice`] themselves.
    ///
    /// # Errors
    /// Returns [`OxiTextError::Shaping`] when no suitable system font is
    /// found, or when shaping with the discovered font fails.
    pub fn shape_with_system_font(
        &mut self,
        text: &str,
        px_size: f32,
    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
        match system_fonts::load_best_font_for_text(text) {
            Some(font_data) => self.shape_slice(&font_data, text, px_size),
            None => Err(OxiTextError::Shaping(
                "no system font found for text".into(),
            )),
        }
    }

    /// Shapes `text` with the system font that best matches `family`.
    ///
    /// `family` may be a concrete family name (e.g. `"Arial"`) or a CSS
    /// generic alias (e.g. `"sans-serif"`); it is resolved with
    /// [`system_fonts::load_font_for_family`].
    ///
    /// # Errors
    /// Returns [`OxiTextError::Shaping`] when no font matching `family` is
    /// found, or when shaping with the discovered font fails.
    pub fn shape_with_family(
        &mut self,
        text: &str,
        family: &str,
        px_size: f32,
    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
        match system_fonts::load_font_for_family(family) {
            Some(font_data) => self.shape_slice(&font_data, text, px_size),
            None => Err(OxiTextError::Shaping(format!(
                "no system font found for family '{family}'"
            ))),
        }
    }
}
1090
1091// ──────────────────────────────────────────────────────────────────────────────
1092// Script-aware itemization (Feature 1, behind `icu` feature gate)
1093// ──────────────────────────────────────────────────────────────────────────────
1094
#[cfg(feature = "icu")]
/// Converts a [`oxitext_icu::TextScript`] into the 4-byte OpenType script tag
/// used for shaping.
///
/// Hiragana and Katakana share the `kana` tag and Devanagari maps to `dev2`.
/// Any script without an explicit mapping yields the default tag `DFLT`.
fn text_script_to_ot_tag(s: oxitext_icu::TextScript) -> [u8; 4] {
    use oxitext_icu::TextScript as Script;

    let tag: &[u8; 4] = match s {
        Script::Arabic => b"arab",
        Script::Cyrillic => b"cyrl",
        Script::Devanagari => b"dev2",
        Script::Greek => b"grek",
        Script::Han => b"hani",
        Script::Hangul => b"hang",
        Script::Hebrew => b"hebr",
        Script::Hiragana | Script::Katakana => b"kana",
        Script::Latin => b"latn",
        Script::Thai => b"thai",
        _ => b"DFLT",
    };
    *tag
}
1113
#[cfg(feature = "icu")]
impl SwashShaper {
    /// Shapes `text` one script run at a time.
    ///
    /// The text is split into per-script runs with ICU4X script itemization;
    /// each run is shaped with the OpenType script tag for its script and
    /// with right-to-left direction when the script reports itself as RTL.
    /// The itemization of the most recent text is cached on the shaper and
    /// reused when the same text is passed again.
    ///
    /// Returns one [`ShapedRun`] per script run, in logical (source) order.
    /// Glyph `cluster` offsets are absolute, i.e. relative to the start of
    /// `text` rather than to the run. Every run shares `font_data`.
    ///
    /// # Errors
    /// Returns [`OxiTextError::Shaping`] if a script run does not denote a
    /// valid byte range of `text`, or if shaping a run fails (for example
    /// because the font bytes cannot be parsed).
    pub fn shape_by_script(
        &mut self,
        font_data: Arc<[u8]>,
        text: &str,
        px_size: f32,
        features: &[ShapeFeature],
    ) -> Result<Vec<ShapedRun>, OxiTextError> {
        // Itemize again only when the text differs from the cached one.
        if self.script_cache_text != text {
            self.script_cache_runs = oxitext_icu::CharProperties::new().itemize(text);
            self.script_cache_text = text.to_owned();
        }
        // Work on a copy so the loop below may borrow `self` mutably.
        let script_runs = self.script_cache_runs.clone();

        let mut shaped_runs: Vec<ShapedRun> = Vec::with_capacity(script_runs.len());

        for run in &script_runs {
            let run_text = match text.get(run.start..run.end) {
                Some(slice) => slice,
                None => {
                    return Err(OxiTextError::Shaping(
                        "invalid script run byte range".into(),
                    ))
                }
            };

            let params = ShapeParams {
                font_data: &font_data,
                text: run_text,
                px_size,
                rtl: run.script.is_rtl(),
                script_tag: Some(text_script_to_ot_tag(run.script)),
                language_tag: None,
                features,
            };
            let mut glyphs = self.shape_with_features_internal(params)?;

            // Rebase cluster offsets from run-relative to text-absolute.
            let base = run.start as u32;
            glyphs.iter_mut().for_each(|g| g.cluster += base);

            shaped_runs.push(ShapedRun {
                glyphs: glyphs.into(),
                font_data: Arc::clone(&font_data),
            });
        }

        Ok(shaped_runs)
    }
}
1176
1177// ──────────────────────────────────────────────────────────────────────────────
1178// Kashida insertion opportunities (Feature 2)
1179// ──────────────────────────────────────────────────────────────────────────────
1180
/// Returns `true` when `c` is an Arabic letter with Dual_Joining type.
///
/// Dual-joining letters connect to neighbours on both sides and are
/// therefore eligible for kashida (tatweel) stretching. This approximation
/// only looks at the mainstream Arabic block, U+0626..=U+063A and
/// U+0641..=U+064A, and excludes the letters in those ranges that join on
/// the right side only: Alef (U+0627), Teh Marbuta (U+0629), Dal through
/// Zain (U+062F..=U+0632) and Waw (U+0648).
///
/// Every character outside the two ranges yields `false`.
fn is_arabic_dual_joining(c: char) -> bool {
    match u32::from(c) {
        // Right-joining letters inside the covered ranges. This arm must
        // come first so that it wins over the range arm below.
        0x0627 | 0x0629 | 0x062F..=0x0632 | 0x0648 => false,
        // Yeh-with-hamza through Ghain, and Feh through Yeh.
        0x0626..=0x063A | 0x0641..=0x064A => true,
        _ => false,
    }
}
1199
1200/// Returns glyph indices (into `glyphs`) after which a kashida stretch can be
1201/// inserted for Arabic justification.
1202///
1203/// A position is a kashida opportunity when the source character at the
1204/// glyph's cluster byte offset is an Arabic dual-joining character (one that
1205/// connects on both sides and can therefore be stretched with tatweel).
1206///
1207/// If `text` does not contain Arabic text, or if no glyph's cluster maps to a
1208/// dual-joining character, the returned `Vec` is empty.
1209pub fn find_kashida_opportunities(text: &str, glyphs: &[ShapedGlyph]) -> Vec<usize> {
1210    let mut result = Vec::new();
1211    for (idx, glyph) in glyphs.iter().enumerate() {
1212        let byte_pos = glyph.cluster as usize;
1213        if let Some(ch) = text.get(byte_pos..).and_then(|s| s.chars().next()) {
1214            if is_arabic_dual_joining(ch) {
1215                result.push(idx);
1216            }
1217        }
1218    }
1219    result
1220}
1221
1222// ──────────────────────────────────────────────────────────────────────────────
1223// Emoji ZWJ sequence detection (Feature 3)
1224// ──────────────────────────────────────────────────────────────────────────────
1225
1226/// Returns byte ranges in `text` that correspond to ZWJ-joined emoji sequences.
1227///
1228/// A ZWJ emoji sequence is a grapheme cluster that:
1229/// 1. Contains U+200D (ZERO WIDTH JOINER), **and**
1230/// 2. Has at least two non-ZWJ codepoints (i.e. it is not a bare ZWJ followed
1231///    by nothing).
1232///
1233/// The returned ranges are contiguous byte spans in `text` covering each such
1234/// cluster. When multiple such clusters are adjacent (share no separator) they
1235/// are reported individually.
1236///
1237/// Uses [`unicode_segmentation::UnicodeSegmentation::grapheme_indices`] for
1238/// grapheme-cluster boundaries so that the detection is consistent with UAX #29.
1239pub fn detect_emoji_zwj_sequences(text: &str) -> Vec<std::ops::Range<usize>> {
1240    use unicode_segmentation::UnicodeSegmentation;
1241
1242    let mut result = Vec::new();
1243    for (start, cluster) in text.grapheme_indices(true) {
1244        // A ZWJ sequence must contain the joiner itself.
1245        if !cluster.contains('\u{200D}') {
1246            continue;
1247        }
1248        // Must also have at least 2 non-ZWJ codepoints.
1249        let non_zwj_count = cluster.chars().filter(|&c| c != '\u{200D}').count();
1250        if non_zwj_count >= 2 {
1251            let end = start + cluster.len();
1252            result.push(start..end);
1253        }
1254    }
1255    result
1256}
1257
1258#[cfg(test)]
1259mod bench_tests;
1260#[cfg(test)]
1261mod tests_inline;