oxitext_shape/lib.rs
1#![forbid(unsafe_code)]
2#![warn(missing_docs)]
3//! `oxitext-shape` — Swash-based text shaper for OxiText.
4//!
5//! Provides [`SwashShaper`], which wraps swash's [`ShapeContext`] and produces
6//! [`ShapedRun`]s from UTF-8 text + raw font bytes.
7//!
8//! M1: LTR Latin shaping. Bidi (M2) and script-specific itemisation (M3) are
9//! deferred.
10//!
11//! # M3 additions
12//!
13//! - [`backend`]: Swappable [`backend::ShapeBackend`] trait, with the default
14//! [`backend::SwashShaperBackend`] wrapper and optional
15//! [`backend::RustybuzzShaper`] (feature `rustybuzz-backend`).
16//!
17//! # M5 additions (Slice 5a)
18//!
19//! - [`cache`]: Bounded LRU shape cache ([`cache::ShapeCache`],
20//! [`cache::ShapeKey`]) backed by [`lru::LruCache`].
21//! - [`SwashShaper::with_cache`]: creates a `SwashShaper` with an attached
22//! `ShapeCache`; subsequent `shape()` calls check the cache before invoking
23//! swash.
24//!
25//! # Feature-aware shaping (Slice 6)
26//!
27//! - [`ShapeFeature`]: an OpenType feature tag-value pair.
28//! - [`ShapeDirection`]: direction enum (Ltr/Rtl/Ttb/Btt).
29//! - [`ShapeRequest`] / [`ShapeRequestBuilder`]: builder pattern for a full
30//! shaping request including text, font, size, direction, script, language,
31//! and a list of [`ShapeFeature`]s.
32//! - [`SwashShaper::shape_request`]: shapes a complete [`ShapeRequest`], with
33//! automatic `vert`/`vrt2` feature injection for top-to-bottom text.
34//! - [`SwashShaper::shape_with_features`]: lower-level entry point that
35//! accepts a feature slice directly.
36
37pub mod backend;
38pub mod batch;
39pub mod cache;
40pub mod script_detect;
41pub mod variational;
42
43#[cfg(feature = "system-fonts")]
44pub mod system_fonts;
45#[cfg(feature = "system-fonts")]
46pub use system_fonts::{
47 build_system_db, load_best_font_for_text, load_best_font_for_text_from, load_font_for_family,
48 load_font_for_family_from,
49};
50
51#[cfg(feature = "rustybuzz-backend")]
52pub use backend::RustybuzzShaper;
53pub use backend::ShapeBackend;
54pub use backend::SwashShaperBackend;
55pub use cache::{FontId, ShapeCache, ShapeKey};
56use oxitext_core::{OxiTextError, ShapedGlyph, ShapedRun};
57pub use script_detect::{
58 requires_arabic_shaping, requires_indic_shaping, requires_mark_positioning,
59};
60use smallvec::SmallVec;
61use std::sync::Arc;
62use swash::shape::{Direction, ShapeContext};
63use swash::FontRef;
64// ──────────────────────────────────────────────────────────────────────────────
65// ShapeFeature
66// ──────────────────────────────────────────────────────────────────────────────
67
/// A single OpenType feature setting: a tag paired with a value.
///
/// `tag` holds the four ASCII bytes naming the feature (for example
/// `b"liga"`, `b"kern"` or `b"smcp"`). `value` follows the usual OpenType
/// convention: `0` switches the feature off, `1` switches it on, and anything
/// above `1` picks an alternate index for features such as `salt`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct ShapeFeature {
    /// Four-byte ASCII OpenType feature tag.
    pub tag: [u8; 4],
    /// Feature value: 0 = off, 1 = on, >1 = alternate index.
    pub value: u32,
}

impl ShapeFeature {
    /// Standard ligatures (`liga`), enabled.
    pub const LIGA: Self = Self::enable(*b"liga");
    /// Kerning (`kern`), enabled.
    pub const KERN: Self = Self::enable(*b"kern");
    /// Small capitals (`smcp`), enabled.
    pub const SMCP: Self = Self::enable(*b"smcp");
    /// Contextual alternates (`calt`), enabled.
    pub const CALT: Self = Self::enable(*b"calt");
    /// Vertical forms (`vert`): substitutes upright CJK glyphs with vertical ones.
    pub const VERT: Self = Self::enable(*b"vert");
    /// Vertical rotation (`vrt2`): alternative to `vert` for some CJK contexts.
    pub const VRT2: Self = Self::enable(*b"vrt2");

    /// Builds a feature from a tag and an arbitrary value.
    pub const fn new(tag: [u8; 4], value: u32) -> Self {
        Self { tag, value }
    }

    /// Builds a feature that is switched on (`value = 1`).
    pub const fn enable(tag: [u8; 4]) -> Self {
        Self::new(tag, 1)
    }

    /// Builds a feature that is switched off (`value = 0`).
    pub const fn disable(tag: [u8; 4]) -> Self {
        Self::new(tag, 0)
    }
}
110
111// ──────────────────────────────────────────────────────────────────────────────
112// ShapeDirection
113// ──────────────────────────────────────────────────────────────────────────────
114
/// Text direction for a shaping request.
///
/// `Ltr` is the [`Default`] value. In [`SwashShaper::shape_request`] only
/// `Rtl` changes the direction handed to swash; `Ttb` and `Btt` are shaped
/// with the non-RTL direction and instead trigger automatic injection of the
/// `vert`/`vrt2` features.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ShapeDirection {
    /// Left-to-right (default for Latin, Cyrillic, etc.).
    #[default]
    Ltr,
    /// Right-to-left (Arabic, Hebrew, etc.).
    Rtl,
    /// Top-to-bottom (CJK vertical text).
    Ttb,
    /// Bottom-to-top (rare).
    Btt,
}
128
129// ──────────────────────────────────────────────────────────────────────────────
130// ShapeRequest / ShapeRequestBuilder
131// ──────────────────────────────────────────────────────────────────────────────
132
/// A complete shaping request with all parameters.
///
/// Build via [`ShapeRequest::builder`] and then call
/// [`SwashShaper::shape_request`]. The request borrows both the text and the
/// font bytes for `'a`; only the feature list is owned.
#[derive(Debug, Clone)]
pub struct ShapeRequest<'a> {
    /// UTF-8 text to shape.
    pub text: &'a str,
    /// Raw font bytes.
    pub font_data: &'a [u8],
    /// Font size in pixels-per-em.
    pub px_size: f32,
    /// Shaping direction.
    pub direction: ShapeDirection,
    /// OpenType script tag (e.g. `b"latn"`, `b"arab"`), or `None` for
    /// auto-detection.
    // NOTE(review): the shared shaping routine falls back to `Script::Latin`
    // when this is `None` or unrecognised — confirm that matches the
    // "auto-detection" wording above.
    pub script: Option<[u8; 4]>,
    /// OpenType language tag (e.g. `b"ENG "`, `b"ARA "`), or `None`.
    pub language: Option<[u8; 4]>,
    /// OpenType feature overrides.
    pub features: Vec<ShapeFeature>,
}
155
156impl<'a> ShapeRequest<'a> {
157 /// Returns a new [`ShapeRequestBuilder`].
158 pub fn builder() -> ShapeRequestBuilder<'a> {
159 ShapeRequestBuilder::default()
160 }
161}
162
/// Builder for [`ShapeRequest`].
///
/// All fields start at their `Default` values (the struct derives
/// [`Default`]): `text` and `font_data` are unset, `px_size` is `0.0`,
/// `direction` is [`ShapeDirection::Ltr`], no script or language is pinned,
/// and the feature list is empty.
#[derive(Debug, Default)]
pub struct ShapeRequestBuilder<'a> {
    // Required; `build` fails with `MissingText` while this is `None`.
    text: Option<&'a str>,
    // Required; `build` fails with `MissingFont` while this is `None`.
    font_data: Option<&'a [u8]>,
    // Pixels-per-em; stays at 0.0 unless `px_size` is called.
    px_size: f32,
    direction: ShapeDirection,
    script: Option<[u8; 4]>,
    language: Option<[u8; 4]>,
    features: Vec<ShapeFeature>,
}
174
/// Errors that can occur when building a [`ShapeRequest`].
///
/// The enum carries no payload, so it is cheap to copy and can be compared
/// directly (for example `assert_eq!(err, ShapeRequestError::MissingText)`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ShapeRequestError {
    /// The `text` field was not provided.
    MissingText,
    /// The `font_data` field was not provided.
    MissingFont,
}

impl std::fmt::Display for ShapeRequestError {
    /// Writes a short, lower-case description of the missing field.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::MissingText => "text not set",
            Self::MissingFont => "font_data not set",
        };
        f.write_str(message)
    }
}

impl std::error::Error for ShapeRequestError {}
194
195impl<'a> ShapeRequestBuilder<'a> {
196 /// Sets the text to shape.
197 pub fn text(mut self, t: &'a str) -> Self {
198 self.text = Some(t);
199 self
200 }
201
202 /// Sets the raw font bytes.
203 pub fn font_data(mut self, d: &'a [u8]) -> Self {
204 self.font_data = Some(d);
205 self
206 }
207
208 /// Sets the font size in pixels-per-em.
209 pub fn px_size(mut self, s: f32) -> Self {
210 self.px_size = s;
211 self
212 }
213
214 /// Sets the shaping direction.
215 pub fn direction(mut self, d: ShapeDirection) -> Self {
216 self.direction = d;
217 self
218 }
219
220 /// Pins the OpenType script tag (overrides swash's auto-detection).
221 pub fn script(mut self, tag: [u8; 4]) -> Self {
222 self.script = Some(tag);
223 self
224 }
225
226 /// Pins the OpenType language tag for language-specific GSUB/GPOS rules.
227 pub fn language(mut self, tag: [u8; 4]) -> Self {
228 self.language = Some(tag);
229 self
230 }
231
232 /// Appends an OpenType feature override.
233 pub fn feature(mut self, f: ShapeFeature) -> Self {
234 self.features.push(f);
235 self
236 }
237
238 /// Builds the [`ShapeRequest`].
239 ///
240 /// # Errors
241 /// Returns [`ShapeRequestError::MissingText`] or
242 /// [`ShapeRequestError::MissingFont`] if the respective fields were not
243 /// provided.
244 pub fn build(self) -> Result<ShapeRequest<'a>, ShapeRequestError> {
245 Ok(ShapeRequest {
246 text: self.text.ok_or(ShapeRequestError::MissingText)?,
247 font_data: self.font_data.ok_or(ShapeRequestError::MissingFont)?,
248 px_size: self.px_size,
249 direction: self.direction,
250 script: self.script,
251 language: self.language,
252 features: self.features,
253 })
254 }
255}
256
257// ──────────────────────────────────────────────────────────────────────────────
258// Internal parameter bundle used by shape_with_features_internal
259// ──────────────────────────────────────────────────────────────────────────────
260
/// Internal parameter bundle for the unified shaping entry point.
///
/// Groups all shaping inputs into a single struct so `shape_with_features_internal`
/// stays under the clippy `too_many_arguments` threshold.
struct ShapeParams<'a> {
    // Raw font bytes; parsed with `FontRef::from_index(.., 0)`.
    font_data: &'a [u8],
    // UTF-8 text to shape; cluster offsets in the output index into this.
    text: &'a str,
    // Font size in pixels-per-em, forwarded to the swash builder.
    px_size: f32,
    // `true` selects `Direction::RightToLeft` and sorts the output by cluster.
    rtl: bool,
    // Optional OpenType script tag; `Script::Latin` is used when absent or
    // unrecognised.
    script_tag: Option<[u8; 4]>,
    // Optional OpenType language tag, parsed via `Language::parse`.
    language_tag: Option<[u8; 4]>,
    // Feature overrides; values above `u16::MAX` are clamped before use.
    features: &'a [ShapeFeature],
}
274
275// ──────────────────────────────────────────────────────────────────────────────
276// SwashShaper
277// ──────────────────────────────────────────────────────────────────────────────
278
/// Text shaper backed by [swash].
///
/// Keep a single `SwashShaper` alive across multiple layout passes to amortise
/// the cost of the internal LRU caches that swash maintains in [`ShapeContext`].
///
/// Optionally attach a [`ShapeCache`] via [`Self::with_cache`] to skip swash
/// entirely on repeated requests for the same `(font, text, size)` tuple.
/// Only [`Self::shape`] consults that cache in the code visible here.
pub struct SwashShaper {
    /// Reusable swash shaping context; every shaping call builds its shaper
    /// from this.
    ctx: ShapeContext,
    /// Optional application-level shape cache.
    cache: Option<Arc<ShapeCache>>,
    /// Cached text string for script-run reuse (Item 4).
    // NOTE(review): not read by the methods visible in this part of the file;
    // presumably used by `icu`-gated code elsewhere — confirm.
    #[cfg(feature = "icu")]
    script_cache_text: String,
    /// Cached script runs for the cached text (Item 4).
    #[cfg(feature = "icu")]
    script_cache_runs: Vec<oxitext_icu::ScriptRun>,
}
297
298impl SwashShaper {
299 /// Creates a new shaper with default cache settings and no shape cache.
300 pub fn new() -> Self {
301 Self {
302 ctx: ShapeContext::new(),
303 cache: None,
304 #[cfg(feature = "icu")]
305 script_cache_text: String::new(),
306 #[cfg(feature = "icu")]
307 script_cache_runs: Vec::new(),
308 }
309 }
310
311 /// Creates a new shaper with an attached [`ShapeCache`] of `capacity` entries.
312 ///
313 /// Repeated calls to [`Self::shape`] with the same `(font_data, text, size)`
314 /// tuple will be served from the cache after the first miss.
315 ///
316 /// # Arguments
317 /// - `capacity`: maximum number of [`ShapedRun`]s to keep in the cache.
318 /// Passing `0` uses a minimum capacity of 1.
319 pub fn with_cache(capacity: usize) -> Self {
320 Self {
321 ctx: ShapeContext::new(),
322 cache: Some(Arc::new(ShapeCache::new(capacity))),
323 #[cfg(feature = "icu")]
324 script_cache_text: String::new(),
325 #[cfg(feature = "icu")]
326 script_cache_runs: Vec::new(),
327 }
328 }
329
330 /// Returns a reference to the attached shape cache, if any.
331 pub fn shape_cache(&self) -> Option<&Arc<ShapeCache>> {
332 self.cache.as_ref()
333 }
334
335 /// Shapes `text` using the font in `font_data` at `size` pixels-per-em.
336 ///
337 /// Returns a [`ShapedRun`] containing one [`ShapedGlyph`] per output glyph.
338 /// The `x_advance` of each glyph is in pixels (already scaled by `size`).
339 ///
340 /// When an attached [`ShapeCache`] is present the result is looked up
341 /// before invoking swash. Cache keys incorporate `font_data` pointer
342 /// identity, the exact text, and `size`.
343 ///
344 /// # Errors
345 /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed by
346 /// swash.
347 pub fn shape(
348 &mut self,
349 text: &str,
350 font_data: Arc<[u8]>,
351 size: f32,
352 ) -> Result<ShapedRun, OxiTextError> {
353 // Build a deterministic axis hash from the size (no variation axes yet).
354 let axis_hash = size.to_bits() as u64;
355
356 // Check cache if attached.
357 if let Some(ref cache) = self.cache {
358 let key = ShapeKey::new(&font_data, text, axis_hash);
359 if let Some(cached) = cache.get(&key) {
360 return Ok((*cached).clone());
361 }
362 }
363
364 // Cache miss — invoke swash.
365 let font = FontRef::from_index(&font_data, 0)
366 .ok_or_else(|| OxiTextError::Shaping("swash could not parse font bytes".into()))?;
367
368 let mut shaper = self.ctx.builder(font).size(size).build();
369 shaper.add_str(text);
370
371 let mut glyphs: SmallVec<[ShapedGlyph; 8]> = SmallVec::new();
372 shaper.shape_with(|cluster| {
373 // A cluster is whitespace if every source char it covers is
374 // whitespace. Most whitespace clusters cover a single space/tab.
375 let cluster_range = cluster.source.start as usize..cluster.source.end as usize;
376 let is_ws = text
377 .get(cluster_range)
378 .map(|slice| !slice.is_empty() && slice.chars().all(|c| c.is_whitespace()))
379 .unwrap_or(false);
380 // More than one glyph in a cluster means inner glyphs are unsafe
381 // to break before (ligature / mark attachment).
382 let multi = cluster.glyphs.len() > 1;
383 for (idx, glyph) in cluster.glyphs.iter().enumerate() {
384 // A glyph is unsafe to break before if it is inside a
385 // multi-glyph cluster (idx > 0) OR if it carries the mark
386 // attachment flag (combining mark attached to a base glyph).
387 let utb = (multi && idx > 0) || glyph.info.is_mark();
388 glyphs.push(ShapedGlyph {
389 gid: glyph.id,
390 x_advance: glyph.advance,
391 y_advance: 0.0,
392 x_offset: glyph.x,
393 y_offset: glyph.y,
394 cluster: cluster.source.start,
395 is_whitespace: is_ws,
396 unsafe_to_break: utb,
397 });
398 }
399 });
400
401 let run = ShapedRun {
402 glyphs,
403 font_data: Arc::clone(&font_data),
404 };
405
406 // Populate cache on miss.
407 if let Some(ref cache) = self.cache {
408 let key = ShapeKey::new(&font_data, text, axis_hash);
409 cache.insert(key, Arc::new(run.clone()));
410 }
411
412 Ok(run)
413 }
414
415 /// Shapes `text` with explicit direction control.
416 ///
417 /// When `rtl` is `false` this is identical to [`Self::shape`].
418 ///
419 /// When `rtl` is `true` the shaper signals `Direction::RightToLeft` to
420 /// swash (enabling correct Arabic/Hebrew form selection via OpenType GSUB),
421 /// then **sorts** the resulting glyphs by ascending `cluster` byte offset so
422 /// the output is always in **logical source order** regardless of what swash
423 /// emits. The caller (bidi engine) is responsible for visual reordering.
424 ///
425 /// # Errors
426 /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
427 pub fn shape_with_direction(
428 &mut self,
429 text: &str,
430 font_data: Arc<[u8]>,
431 size: f32,
432 rtl: bool,
433 ) -> Result<ShapedRun, OxiTextError> {
434 if !rtl {
435 return self.shape(text, font_data, size);
436 }
437 // RTL path: shape with the explicit RightToLeft hint so swash can apply
438 // direction-sensitive GSUB lookups, then sort to ascending cluster order
439 // (logical order) to satisfy the architecture contract.
440 let mut run = self.do_shape_rtl(text, font_data, size)?;
441 run.glyphs.sort_by_key(|g| g.cluster);
442 Ok(run)
443 }
444
445 /// Shapes text using all parameters in a [`ShapeRequest`].
446 ///
447 /// When `direction` is [`ShapeDirection::Ttb`] or [`ShapeDirection::Btt`],
448 /// the `vert` and `vrt2` OpenType features are **automatically appended**
449 /// to the feature list (if not already present) so that fonts with a
450 /// vertical substitution table produce the correct glyph variants.
451 ///
452 /// Script and language tags, if provided, are forwarded to swash's
453 /// `ShaperBuilder` for language-specific GSUB/GPOS rule selection.
454 ///
455 /// # Errors
456 /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
457 pub fn shape_request(
458 &mut self,
459 req: &ShapeRequest<'_>,
460 ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
461 // When the `icu` feature is enabled, normalize text to NFC before shaping
462 // so that precomposed and decomposed spellings produce identical glyph runs.
463 #[cfg(feature = "icu")]
464 let normalized_text: String;
465 #[cfg(feature = "icu")]
466 let req_text: &str = {
467 normalized_text = oxitext_icu::Normalizer::new().nfc(req.text);
468 normalized_text.as_str()
469 };
470 #[cfg(not(feature = "icu"))]
471 let req_text: &str = req.text;
472
473 // When direction is Ltr but the text is Arabic, auto-upgrade to Rtl
474 // so swash can apply the correct Arabic GSUB form-selection lookups.
475 let effective_direction = if req.direction == ShapeDirection::Ltr
476 && requires_arabic_shaping(req_text)
477 {
478 #[cfg(debug_assertions)]
479 eprintln!("[oxitext-shape] Arabic text detected with Ltr direction; upgrading to Rtl");
480 ShapeDirection::Rtl
481 } else {
482 req.direction
483 };
484
485 // Auto-inject vertical OpenType features for vertical directions.
486 let mut features = req.features.clone();
487 if effective_direction == ShapeDirection::Ttb || effective_direction == ShapeDirection::Btt
488 {
489 if !features.iter().any(|f| f.tag == *b"vert") {
490 features.push(ShapeFeature::VERT);
491 }
492 if !features.iter().any(|f| f.tag == *b"vrt2") {
493 features.push(ShapeFeature::VRT2);
494 }
495 }
496
497 let rtl = effective_direction == ShapeDirection::Rtl;
498 self.shape_with_features_internal(ShapeParams {
499 font_data: req.font_data,
500 text: req_text,
501 px_size: req.px_size,
502 rtl,
503 script_tag: req.script,
504 language_tag: req.language,
505 features: &features,
506 })
507 }
508
509 /// Shapes text with an explicit list of OpenType feature overrides.
510 ///
511 /// Unlike [`Self::shape_request`] this entry point does **not** inject
512 /// vertical features automatically; callers are responsible for supplying
513 /// the full feature list.
514 ///
515 /// # Errors
516 /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
517 pub fn shape_with_features(
518 &mut self,
519 font_data: &[u8],
520 text: &str,
521 px_size: f32,
522 rtl: bool,
523 features: &[ShapeFeature],
524 ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
525 self.shape_with_features_internal(ShapeParams {
526 font_data,
527 text,
528 px_size,
529 rtl,
530 script_tag: None,
531 language_tag: None,
532 features,
533 })
534 }
535
536 /// Internal implementation shared by [`Self::shape_request`] and
537 /// [`Self::shape_with_features`].
538 fn shape_with_features_internal(
539 &mut self,
540 params: ShapeParams<'_>,
541 ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
542 use swash::tag_from_bytes;
543 use swash::text::{Language, Script};
544
545 let font = FontRef::from_index(params.font_data, 0)
546 .ok_or_else(|| OxiTextError::Shaping("swash could not parse font bytes".into()))?;
547
548 let direction = if params.rtl {
549 Direction::RightToLeft
550 } else {
551 Direction::LeftToRight
552 };
553
554 // Resolve the optional script tag to a swash Script enum value.
555 let script = params
556 .script_tag
557 .and_then(|t| Script::from_opentype(tag_from_bytes(&t)))
558 .unwrap_or(Script::Latin);
559
560 // Resolve the optional language tag to a swash Language.
561 let language = params.language_tag.and_then(|t| {
562 // swash Language::parse expects a BCP-47 string; for OpenType tags
563 // we convert the raw bytes to a lossy str and try to parse them.
564 let s = std::str::from_utf8(&t).unwrap_or("").trim_end();
565 Language::parse(s)
566 });
567
568 // Convert our ShapeFeature slice to swash-compatible (tag, value) pairs.
569 // swash's `ShaperBuilder::features` accepts any iterator whose items
570 // implement `Into<Setting<u16>>`. The swash crate provides
571 // `From<&([u8; 4], T)> for Setting<T>`, so we pass an iterator of
572 // references to satisfy the bound.
573 let swash_features: Vec<([u8; 4], u16)> = params
574 .features
575 .iter()
576 .map(|f| (f.tag, f.value.min(u32::from(u16::MAX)) as u16))
577 .collect();
578
579 let mut shaper = self
580 .ctx
581 .builder(font)
582 .size(params.px_size)
583 .direction(direction)
584 .script(script)
585 .language(language)
586 .features(swash_features.iter())
587 .build();
588
589 shaper.add_str(params.text);
590
591 let mut glyphs: Vec<ShapedGlyph> = Vec::new();
592 shaper.shape_with(|cluster| {
593 let cluster_range = cluster.source.start as usize..cluster.source.end as usize;
594 let is_ws = params
595 .text
596 .get(cluster_range)
597 .map(|slice| !slice.is_empty() && slice.chars().all(|c| c.is_whitespace()))
598 .unwrap_or(false);
599 let multi = cluster.glyphs.len() > 1;
600 for (idx, glyph) in cluster.glyphs.iter().enumerate() {
601 let utb = (multi && idx > 0) || glyph.info.is_mark();
602 glyphs.push(ShapedGlyph {
603 gid: glyph.id,
604 x_advance: glyph.advance,
605 y_advance: 0.0,
606 x_offset: glyph.x,
607 y_offset: glyph.y,
608 cluster: cluster.source.start,
609 is_whitespace: is_ws,
610 unsafe_to_break: utb,
611 });
612 }
613 });
614
615 if params.rtl {
616 glyphs.sort_by_key(|g| g.cluster);
617 }
618
619 Ok(glyphs)
620 }
621
622 /// Internal RTL shaping path: invokes swash with `Direction::RightToLeft`.
623 ///
624 /// Returns glyphs in whatever order swash produces; the public
625 /// [`Self::shape_with_direction`] sorts them to ascending cluster order.
626 fn do_shape_rtl(
627 &mut self,
628 text: &str,
629 font_data: Arc<[u8]>,
630 size: f32,
631 ) -> Result<ShapedRun, OxiTextError> {
632 let font = FontRef::from_index(&font_data, 0)
633 .ok_or_else(|| OxiTextError::Shaping("swash could not parse font bytes".into()))?;
634
635 let mut shaper = self
636 .ctx
637 .builder(font)
638 .size(size)
639 .direction(Direction::RightToLeft)
640 .build();
641 shaper.add_str(text);
642
643 let mut glyphs: SmallVec<[ShapedGlyph; 8]> = SmallVec::new();
644 shaper.shape_with(|cluster| {
645 let cluster_range = cluster.source.start as usize..cluster.source.end as usize;
646 let is_ws = text
647 .get(cluster_range)
648 .map(|slice| !slice.is_empty() && slice.chars().all(|c| c.is_whitespace()))
649 .unwrap_or(false);
650 let multi = cluster.glyphs.len() > 1;
651 for (idx, glyph) in cluster.glyphs.iter().enumerate() {
652 let utb = (multi && idx > 0) || glyph.info.is_mark();
653 glyphs.push(ShapedGlyph {
654 gid: glyph.id,
655 x_advance: glyph.advance,
656 y_advance: 0.0,
657 x_offset: glyph.x,
658 y_offset: glyph.y,
659 cluster: cluster.source.start,
660 is_whitespace: is_ws,
661 unsafe_to_break: utb,
662 });
663 }
664 });
665
666 Ok(ShapedRun {
667 glyphs,
668 font_data: Arc::clone(&font_data),
669 })
670 }
671
672 /// Shapes `text` and returns a rich [`ShapeResult`] with metadata.
673 ///
674 /// The result includes the glyph list, the direction used, and any
675 /// codepoints that could not be mapped (glyph ID 0 / `.notdef`).
676 ///
677 /// # Errors
678 /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
679 pub fn shape_full(
680 &mut self,
681 font_data: &[u8],
682 text: &str,
683 px_size: f32,
684 ) -> Result<ShapeResult, OxiTextError> {
685 use unicode_segmentation::UnicodeSegmentation;
686
687 let glyphs = self.shape_with_features_internal(ShapeParams {
688 font_data,
689 text,
690 px_size,
691 rtl: false,
692 script_tag: None,
693 language_tag: None,
694 features: &[],
695 })?;
696 let mut result = ShapeResult::from_glyphs(glyphs, text, ShapeDirection::Ltr);
697 // Populate grapheme cluster boundaries: start offset of each grapheme
698 // plus the end-of-text sentinel.
699 result.cluster_boundaries = text
700 .grapheme_indices(true)
701 .map(|(i, _)| i)
702 .chain(std::iter::once(text.len()))
703 .collect();
704 Ok(result)
705 }
706
707 /// Shapes `text` using raw font bytes supplied as `&[u8]` (LTR).
708 ///
709 /// A convenience wrapper over `Self::shape_with_features_internal` for
710 /// callers that already hold raw font bytes and do not need the `Arc` wrapping
711 /// or cache infrastructure of [`Self::shape`].
712 ///
713 /// # Errors
714 /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
715 pub fn shape_slice(
716 &mut self,
717 font_data: &[u8],
718 text: &str,
719 px_size: f32,
720 ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
721 self.shape_with_features_internal(ShapeParams {
722 font_data,
723 text,
724 px_size,
725 rtl: false,
726 script_tag: None,
727 language_tag: None,
728 features: &[],
729 })
730 }
731
732 /// Shapes `text` using raw font bytes supplied as `&[u8]` (RTL).
733 ///
734 /// Like [`Self::shape_slice`] but shapes in right-to-left direction and
735 /// returns glyphs in ascending `cluster` (logical source) order.
736 ///
737 /// # Errors
738 /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
739 pub fn shape_slice_rtl(
740 &mut self,
741 font_data: &[u8],
742 text: &str,
743 px_size: f32,
744 ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
745 self.shape_with_features_internal(ShapeParams {
746 font_data,
747 text,
748 px_size,
749 rtl: true,
750 script_tag: None,
751 language_tag: None,
752 features: &[],
753 })
754 }
755
756 /// Shapes `text` with a font fallback chain.
757 ///
758 /// For each codepoint that produces `glyph_id == 0` (`.notdef`), the
759 /// corresponding text run is re-shaped with each successive fallback font
760 /// in `fonts[1..]`. If a fallback produces a non-zero glyph ID the
761 /// fallback glyphs replace the `.notdef` glyphs in the result; otherwise
762 /// the `.notdef` glyphs are preserved (best-effort).
763 ///
764 /// `fonts[0]` is the primary font; `fonts[1..]` are tried in order.
765 ///
766 /// # Note on cluster offsets
767 ///
768 /// When a sub-string is re-shaped with a fallback font, swash emits cluster
769 /// byte offsets **relative to that sub-string** (starting at 0). This
770 /// function adds the original start offset back before merging so all
771 /// returned glyphs carry absolute offsets into `text`.
772 ///
773 /// # Errors
774 /// Returns [`OxiTextError::Shaping`] if the primary font cannot be parsed.
775 pub fn shape_with_fallback(
776 &mut self,
777 fonts: &[&[u8]],
778 text: &str,
779 px_size: f32,
780 ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
781 let primary = fonts
782 .first()
783 .ok_or_else(|| OxiTextError::Shaping("font list is empty".into()))?;
784
785 // 1. Shape with the primary font.
786 let mut result = self.shape_with_features_internal(ShapeParams {
787 font_data: primary,
788 text,
789 px_size,
790 rtl: false,
791 script_tag: None,
792 language_tag: None,
793 features: &[],
794 })?;
795
796 if fonts.len() <= 1 {
797 return Ok(result);
798 }
799
800 // 2. Find contiguous runs of .notdef (glyph ID 0) glyphs.
801 let notdef_runs = collect_notdef_runs(&result, text);
802
803 // 3. For each .notdef run try the fallback fonts.
804 for (run_text_start, run_text_end) in notdef_runs {
805 let sub_text = match text.get(run_text_start..run_text_end) {
806 Some(s) if !s.is_empty() => s,
807 _ => continue,
808 };
809
810 // Try each fallback font in order.
811 'fallback: for fallback_font in &fonts[1..] {
812 let fallback_glyphs = match self.shape_with_features_internal(ShapeParams {
813 font_data: fallback_font,
814 text: sub_text,
815 px_size,
816 rtl: false,
817 script_tag: None,
818 language_tag: None,
819 features: &[],
820 }) {
821 Ok(g) => g,
822 Err(_) => continue,
823 };
824
825 // Only use this fallback if it resolved at least one glyph.
826 if fallback_glyphs.iter().all(|g| g.gid == 0) {
827 continue;
828 }
829
830 // Adjust cluster offsets from sub-string-relative to
831 // text-absolute and replace the .notdef glyphs in result.
832 let start_offset = run_text_start as u32;
833 let adjusted: Vec<ShapedGlyph> = fallback_glyphs
834 .into_iter()
835 .map(|mut g| {
836 g.cluster += start_offset;
837 g
838 })
839 .collect();
840
841 // Replace glyphs in the result whose cluster falls in [run_text_start, run_text_end).
842 result.retain(|g| {
843 let c = g.cluster as usize;
844 !(c >= run_text_start && c < run_text_end && g.gid == 0)
845 });
846
847 // Insert adjusted fallback glyphs at the correct position.
848 let insert_pos = result.partition_point(|g| (g.cluster as usize) < run_text_start);
849 for (i, g) in adjusted.into_iter().enumerate() {
850 result.insert(insert_pos + i, g);
851 }
852
853 break 'fallback;
854 }
855 }
856
857 Ok(result)
858 }
859
860 /// Returns `true` if the given font data contains AAT layout tables.
861 ///
862 /// Checks for the presence of `morx` (extended glyph metamorphosis rules),
863 /// `kerx` (extended kerning data), or `ankr` (anchor point) tables — the
864 /// three primary tables that distinguish Apple Advanced Typography (AAT)
865 /// fonts from pure OpenType fonts.
866 ///
867 /// Swash's [`ShapeContext`] already applies AAT tables transparently when
868 /// present, so this function is informational only; it does not change the
869 /// shaping path.
870 pub fn font_has_aat(font_data: &[u8]) -> bool {
871 ttf_parser::Face::parse(font_data, 0)
872 .map(|face| {
873 face.raw_face()
874 .table(ttf_parser::Tag::from_bytes(b"morx"))
875 .is_some()
876 || face
877 .raw_face()
878 .table(ttf_parser::Tag::from_bytes(b"kerx"))
879 .is_some()
880 || face
881 .raw_face()
882 .table(ttf_parser::Tag::from_bytes(b"ankr"))
883 .is_some()
884 })
885 .unwrap_or(false)
886 }
887
888 /// Shape using AAT if the font has Morx/Kerx tables, otherwise fall back to
889 /// standard OpenType shaping.
890 ///
891 /// Swash handles both AAT and OpenType tables transparently via its
892 /// `ShapeContext`; this method is informational. It delegates directly to
893 /// `Self::shape_with_features_internal` regardless of table presence.
894 ///
895 /// # Errors
896 /// Returns [`OxiTextError::Shaping`] if the font bytes cannot be parsed.
897 pub fn shape_with_aat_fallback(
898 &mut self,
899 font_data: &[u8],
900 text: &str,
901 px_size: f32,
902 ) -> Result<ShapeResult, OxiTextError> {
903 use unicode_segmentation::UnicodeSegmentation;
904
905 let glyphs = self.shape_with_features_internal(ShapeParams {
906 font_data,
907 text,
908 px_size,
909 rtl: false,
910 script_tag: None,
911 language_tag: None,
912 features: &[],
913 })?;
914 let mut result = ShapeResult::from_glyphs(glyphs, text, ShapeDirection::Ltr);
915 result.cluster_boundaries = text
916 .grapheme_indices(true)
917 .map(|(i, _)| i)
918 .chain(std::iter::once(text.len()))
919 .collect();
920 Ok(result)
921 }
922}
923
924// ──────────────────────────────────────────────────────────────────────────────
925// ShapeResult
926// ──────────────────────────────────────────────────────────────────────────────
927
/// Extended shaping result with metadata.
///
/// Produced by [`SwashShaper::shape_full`] and
/// [`SwashShaper::shape_with_aat_fallback`]; includes the glyph list, the
/// direction resolved by the shaper, the OpenType script tag (if known), and a
/// list of Unicode codepoints that could not be mapped (glyph ID 0 / `.notdef`).
#[derive(Debug, Clone)]
pub struct ShapeResult {
    /// Shaped glyphs in logical cluster order.
    pub glyphs: Vec<ShapedGlyph>,
    /// OpenType script tag detected (e.g. `b"latn"`, `b"arab"`), or `None` if
    /// unknown. [`ShapeResult::from_glyphs`] always leaves this as `None`; it
    /// may be set by the caller after construction.
    pub script_detected: Option<[u8; 4]>,
    /// Direction resolved by the shaper.
    pub direction: ShapeDirection,
    /// Unicode codepoints that produced a `.notdef` glyph (ID 0), each listed
    /// once in order of first appearance.
    pub missing_codepoints: Vec<char>,
    /// Byte offsets (in the original text) where grapheme cluster boundaries fall.
    ///
    /// Populated by [`SwashShaper::shape_full`] and
    /// [`SwashShaper::shape_with_aat_fallback`]. Empty when the value comes
    /// straight from [`ShapeResult::from_glyphs`]. When populated, the first
    /// entry is `0` (start of text) and the last entry is `text.len()` (end of
    /// text); for empty text these are the same single entry.
    pub cluster_boundaries: Vec<usize>,
}
951
952impl ShapeResult {
953 /// Constructs a [`ShapeResult`] from a glyph vector, the source text, and
954 /// the shaping direction.
955 ///
956 /// `script_detected` is left as `None`; callers may set it afterwards.
957 pub fn from_glyphs(glyphs: Vec<ShapedGlyph>, text: &str, direction: ShapeDirection) -> Self {
958 let missing: Vec<char> = {
959 let mut seen = std::collections::HashSet::new();
960 let mut missing = Vec::new();
961 for g in &glyphs {
962 if g.gid == 0 {
963 if let Some(ch) = text
964 .get(g.cluster as usize..)
965 .and_then(|s| s.chars().next())
966 {
967 if seen.insert(ch) {
968 missing.push(ch);
969 }
970 }
971 }
972 }
973 missing
974 };
975 Self {
976 glyphs,
977 script_detected: None,
978 direction,
979 missing_codepoints: missing,
980 cluster_boundaries: Vec::new(),
981 }
982 }
983}
984
985// ──────────────────────────────────────────────────────────────────────────────
986// Helpers
987// ──────────────────────────────────────────────────────────────────────────────
988
989/// Collect contiguous byte ranges in `text` that are covered exclusively by
990/// `.notdef` (glyph ID 0) glyphs in `glyphs`.
991///
992/// Returns a `Vec` of `(start, end)` byte offset pairs into `text`.
993fn collect_notdef_runs(glyphs: &[ShapedGlyph], text: &str) -> Vec<(usize, usize)> {
994 if glyphs.is_empty() {
995 return Vec::new();
996 }
997
998 // Build a sorted, deduplicated list of cluster byte offsets that are .notdef.
999 let mut notdef_clusters: Vec<usize> = glyphs
1000 .iter()
1001 .filter(|g| g.gid == 0)
1002 .map(|g| g.cluster as usize)
1003 .collect();
1004 notdef_clusters.sort_unstable();
1005 notdef_clusters.dedup();
1006
1007 // Build a sorted list of all cluster start offsets (regardless of gid).
1008 let mut all_starts: Vec<usize> = glyphs.iter().map(|g| g.cluster as usize).collect();
1009 all_starts.sort_unstable();
1010 all_starts.dedup();
1011
1012 // For each .notdef cluster, determine the end offset: it's the byte offset
1013 // of the next cluster in `all_starts`, or `text.len()` for the last one.
1014 let mut runs: Vec<(usize, usize)> = Vec::new();
1015 for &start in ¬def_clusters {
1016 let end = all_starts
1017 .iter()
1018 .find(|&&s| s > start)
1019 .copied()
1020 .unwrap_or(text.len());
1021 // Merge with the previous run if adjacent.
1022 if let Some(last) = runs.last_mut() {
1023 if last.1 == start {
1024 last.1 = end;
1025 continue;
1026 }
1027 }
1028 runs.push((start, end));
1029 }
1030 runs
1031}
1032
1033impl Default for SwashShaper {
1034 fn default() -> Self {
1035 Self::new()
1036 }
1037}
1038
1039// ──────────────────────────────────────────────────────────────────────────────
1040// System font convenience methods (feature `system-fonts`)
1041// ──────────────────────────────────────────────────────────────────────────────
1042
#[cfg(feature = "system-fonts")]
impl SwashShaper {
    /// Shapes `text` with a system font chosen for its content.
    ///
    /// The font is obtained from [`system_fonts::load_best_font_for_text`];
    /// its bytes are then handed to [`Self::shape_slice`] together with
    /// `text` and `px_size`.
    ///
    /// This is a one-shot convenience: the font lookup is repeated on every
    /// call. Callers shaping many strings should load the font bytes once
    /// with [`system_fonts::load_best_font_for_text`] and call
    /// [`Self::shape_slice`] themselves.
    ///
    /// # Errors
    /// Returns [`OxiTextError::Shaping`] when the lookup yields no font, and
    /// propagates any error reported by [`Self::shape_slice`].
    pub fn shape_with_system_font(
        &mut self,
        text: &str,
        px_size: f32,
    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
        match system_fonts::load_best_font_for_text(text) {
            Some(font_bytes) => self.shape_slice(&font_bytes, text, px_size),
            None => Err(OxiTextError::Shaping(
                "no system font found for text".into(),
            )),
        }
    }

    /// Shapes `text` with the system font matching `family`.
    ///
    /// `family` is passed unchanged to [`system_fonts::load_font_for_family`]
    /// (e.g. a concrete name such as `"Arial"` or a generic alias such as
    /// `"sans-serif"`); the bytes it returns are then shaped through
    /// [`Self::shape_slice`] at `px_size`.
    ///
    /// # Errors
    /// Returns [`OxiTextError::Shaping`] when no font is found for `family`,
    /// and propagates any error reported by [`Self::shape_slice`].
    pub fn shape_with_family(
        &mut self,
        text: &str,
        family: &str,
        px_size: f32,
    ) -> Result<Vec<ShapedGlyph>, OxiTextError> {
        match system_fonts::load_font_for_family(family) {
            Some(font_bytes) => self.shape_slice(&font_bytes, text, px_size),
            None => Err(OxiTextError::Shaping(format!(
                "no system font found for family '{family}'"
            ))),
        }
    }
}
1090
1091// ──────────────────────────────────────────────────────────────────────────────
1092// Script-aware itemization (Feature 1, behind `icu` feature gate)
1093// ──────────────────────────────────────────────────────────────────────────────
1094
/// Translates an [`oxitext_icu::TextScript`] into its 4-byte OpenType script
/// tag.
///
/// Hiragana and Katakana share the `kana` tag. Any script without an explicit
/// entry falls back to the OpenType default tag `DFLT`.
#[cfg(feature = "icu")]
fn text_script_to_ot_tag(s: oxitext_icu::TextScript) -> [u8; 4] {
    use oxitext_icu::TextScript as Script;

    let tag: &[u8; 4] = match s {
        Script::Latin => b"latn",
        Script::Arabic => b"arab",
        Script::Devanagari => b"dev2",
        Script::Han => b"hani",
        Script::Hangul => b"hang",
        Script::Hiragana | Script::Katakana => b"kana",
        Script::Hebrew => b"hebr",
        Script::Thai => b"thai",
        Script::Greek => b"grek",
        Script::Cyrillic => b"cyrl",
        _ => b"DFLT",
    };
    *tag
}
1113
#[cfg(feature = "icu")]
impl SwashShaper {
    /// Shapes `text` one script run at a time.
    ///
    /// The text is split into per-script runs with ICU4X itemization
    /// ([`oxitext_icu::CharProperties::itemize`]). Each run is shaped with
    /// the OpenType script tag from `text_script_to_ot_tag` and with the
    /// direction reported by the run's script. The itemization of the most
    /// recently shaped text is cached on `self`, so repeated calls with the
    /// same `text` skip that step.
    ///
    /// Returns one [`ShapedRun`] per script run, in logical (source) order.
    /// Every glyph's `cluster` is an absolute byte offset into `text`, not an
    /// offset into its sub-run.
    ///
    /// # Errors
    /// Returns [`OxiTextError::Shaping`] when a script run does not describe
    /// a valid byte range of `text`, and propagates any error from the
    /// underlying shaping call.
    pub fn shape_by_script(
        &mut self,
        font_data: Arc<[u8]>,
        text: &str,
        px_size: f32,
        features: &[ShapeFeature],
    ) -> Result<Vec<ShapedRun>, OxiTextError> {
        // Re-itemize only when the text differs from the previous call.
        if self.script_cache_text != text {
            self.script_cache_runs = oxitext_icu::CharProperties::new().itemize(text);
            self.script_cache_text = text.to_owned();
        }
        // Iterate over a copy: the shaping call below needs `&mut self`,
        // which rules out holding a borrow of the cached runs.
        let cached_runs = self.script_cache_runs.clone();

        cached_runs
            .iter()
            .map(|script_run| -> Result<ShapedRun, OxiTextError> {
                let segment = match text.get(script_run.start..script_run.end) {
                    Some(segment) => segment,
                    None => {
                        return Err(OxiTextError::Shaping(
                            "invalid script run byte range".into(),
                        ))
                    }
                };

                let script_tag = text_script_to_ot_tag(script_run.script);
                let rtl = script_run.script.is_rtl();

                let mut glyphs = self.shape_with_features_internal(ShapeParams {
                    font_data: &font_data,
                    text: segment,
                    px_size,
                    rtl,
                    script_tag: Some(script_tag),
                    language_tag: None,
                    features,
                })?;

                // Rebase clusters from run-relative to text-absolute offsets.
                // Clusters are `u32`, so the run start is narrowed to match
                // (offsets past `u32::MAX` would wrap).
                let base = script_run.start as u32;
                glyphs.iter_mut().for_each(|glyph| glyph.cluster += base);

                Ok(ShapedRun {
                    glyphs: glyphs.into(),
                    font_data: Arc::clone(&font_data),
                })
            })
            .collect()
    }
}
1176
1177// ──────────────────────────────────────────────────────────────────────────────
1178// Kashida insertion opportunities (Feature 2)
1179// ──────────────────────────────────────────────────────────────────────────────
1180
/// Returns `true` when `c` is an Arabic letter with the `Dual_Joining` type.
///
/// Dual-joining letters connect to neighbours on both sides and are
/// therefore eligible for kashida (tatweel) stretching. This approximation
/// covers the mainstream Arabic block only: U+0626..=U+063A and
/// U+0641..=U+064A, minus the letters in those ranges that are
/// right-joining (they connect to the preceding letter only):
///
/// - Alef U+0627
/// - Teh Marbuta U+0629
/// - Dal, Thal, Reh, Zain U+062F..=U+0632
/// - Waw U+0648
///
/// Everything else (Hamza, Tatweel, diacritics, extended Arabic letters,
/// non-Arabic characters) yields `false`.
fn is_arabic_dual_joining(c: char) -> bool {
    match c {
        // Right-joining letters that sit inside the dual-joining ranges; they
        // must be rejected before the range arms below get a chance to match.
        '\u{0627}' | '\u{0629}' | '\u{062F}'..='\u{0632}' | '\u{0648}' => false,
        // Yeh-with-Hamza through Ghain, and Feh through Yeh.
        '\u{0626}'..='\u{063A}' | '\u{0641}'..='\u{064A}' => true,
        _ => false,
    }
}
1199
1200/// Returns glyph indices (into `glyphs`) after which a kashida stretch can be
1201/// inserted for Arabic justification.
1202///
1203/// A position is a kashida opportunity when the source character at the
1204/// glyph's cluster byte offset is an Arabic dual-joining character (one that
1205/// connects on both sides and can therefore be stretched with tatweel).
1206///
1207/// If `text` does not contain Arabic text, or if no glyph's cluster maps to a
1208/// dual-joining character, the returned `Vec` is empty.
1209pub fn find_kashida_opportunities(text: &str, glyphs: &[ShapedGlyph]) -> Vec<usize> {
1210 let mut result = Vec::new();
1211 for (idx, glyph) in glyphs.iter().enumerate() {
1212 let byte_pos = glyph.cluster as usize;
1213 if let Some(ch) = text.get(byte_pos..).and_then(|s| s.chars().next()) {
1214 if is_arabic_dual_joining(ch) {
1215 result.push(idx);
1216 }
1217 }
1218 }
1219 result
1220}
1221
1222// ──────────────────────────────────────────────────────────────────────────────
1223// Emoji ZWJ sequence detection (Feature 3)
1224// ──────────────────────────────────────────────────────────────────────────────
1225
1226/// Returns byte ranges in `text` that correspond to ZWJ-joined emoji sequences.
1227///
1228/// A ZWJ emoji sequence is a grapheme cluster that:
1229/// 1. Contains U+200D (ZERO WIDTH JOINER), **and**
1230/// 2. Has at least two non-ZWJ codepoints (i.e. it is not a bare ZWJ followed
1231/// by nothing).
1232///
1233/// The returned ranges are contiguous byte spans in `text` covering each such
1234/// cluster. When multiple such clusters are adjacent (share no separator) they
1235/// are reported individually.
1236///
1237/// Uses [`unicode_segmentation::UnicodeSegmentation::grapheme_indices`] for
1238/// grapheme-cluster boundaries so that the detection is consistent with UAX #29.
1239pub fn detect_emoji_zwj_sequences(text: &str) -> Vec<std::ops::Range<usize>> {
1240 use unicode_segmentation::UnicodeSegmentation;
1241
1242 let mut result = Vec::new();
1243 for (start, cluster) in text.grapheme_indices(true) {
1244 // A ZWJ sequence must contain the joiner itself.
1245 if !cluster.contains('\u{200D}') {
1246 continue;
1247 }
1248 // Must also have at least 2 non-ZWJ codepoints.
1249 let non_zwj_count = cluster.chars().filter(|&c| c != '\u{200D}').count();
1250 if non_zwj_count >= 2 {
1251 let end = start + cluster.len();
1252 result.push(start..end);
1253 }
1254 }
1255 result
1256}
1257
1258#[cfg(test)]
1259mod bench_tests;
1260#[cfg(test)]
1261mod tests_inline;