// par_term_fonts/text_shaper.rs

//! Text shaping module using HarfBuzz via rustybuzz
//!
//! This module provides text shaping capabilities for:
//! - Ligatures (fi, fl, ffi, ffl, etc.)
//! - Complex emoji sequences (flags, skin tones, ZWJ sequences)
//! - Complex scripts (Arabic, Devanagari, etc.)
//! - Bidirectional text (RTL languages)
//! - Kerning and contextual alternates
//!
//! # Architecture
//!
//! The text shaping pipeline:
//! 1. Grapheme cluster detection (unicode-segmentation)
//! 2. Script and direction detection (unicode-bidi)
//! 3. Font feature selection (based on script/language)
//! 4. Text shaping (rustybuzz)
//! 5. Glyph positioning and advances
//! 6. Result caching for performance
//!
//! # Usage
//!
//! ```ignore
//! let mut shaper = TextShaper::new();
//! let shaped = shaper.shape_text(
//!     "Hello 🇺🇸 world",
//!     &font_data,
//!     0, // font index, used as part of the cache key
//!     ShapingOptions::default(),
//! );
//! ```
use lru::LruCache;
use rustybuzz::{Face, Feature, GlyphBuffer, Language, Script, UnicodeBuffer};
use std::num::NonZeroUsize;
use std::str::FromStr;
use std::sync::Arc;
use unicode_segmentation::UnicodeSegmentation;
36
/// A single shaped glyph together with its positioning information.
///
/// All positional fields are the shaper's integer values converted to `f32`
/// without any scaling.
///
/// NOTE(review): these fields were originally documented as pixels, but no
/// pixel scaling is applied when they are filled in; rustybuzz is understood
/// to report positions in font design units — confirm what unit callers expect.
#[derive(Debug, Clone, Copy)]
pub struct ShapedGlyph {
    /// Glyph ID from the font
    #[allow(dead_code)]
    pub glyph_id: u32,

    /// Cluster index (which input character(s) this glyph represents)
    #[allow(dead_code)]
    pub cluster: u32,

    /// Horizontal advance width, as reported by the shaper
    pub x_advance: f32,

    /// Vertical advance (usually 0 for horizontal text)
    #[allow(dead_code)]
    pub y_advance: f32,

    /// Horizontal offset from the current pen position
    #[allow(dead_code)]
    pub x_offset: f32,

    /// Vertical offset from the baseline
    #[allow(dead_code)]
    pub y_offset: f32,
}
63
/// Options controlling how a text run is shaped.
///
/// The `Default` value requests ligatures, kerning and contextual alternates,
/// gives no script or language hint, and shapes left-to-right.
#[derive(Debug, Clone)]
pub struct ShapingOptions {
    /// Request standard ligatures (fi, fl, etc.)
    pub enable_ligatures: bool,

    /// Request kerning adjustments
    pub enable_kerning: bool,

    /// Request contextual alternates
    pub enable_contextual_alternates: bool,

    /// Script hint (e.g., "arab" for Arabic, "deva" for Devanagari)
    pub script: Option<String>,

    /// Language hint (e.g., "en" for English, "ar" for Arabic)
    pub language: Option<String>,

    /// Text direction (true = RTL, false = LTR)
    pub rtl: bool,
}

impl Default for ShapingOptions {
    /// All optional typographic features on, no hints, left-to-right.
    fn default() -> Self {
        Self {
            enable_ligatures: true,
            enable_kerning: true,
            enable_contextual_alternates: true,
            script: None,
            language: None,
            rtl: false,
        }
    }
}
98
99/// Result of shaping a text run
100#[derive(Debug, Clone)]
101pub struct ShapedRun {
102    /// The input text that was shaped
103    #[allow(dead_code)]
104    pub text: String,
105
106    /// The shaped glyphs
107    #[allow(dead_code)]
108    pub glyphs: Vec<ShapedGlyph>,
109
110    /// Total advance width in pixels
111    #[allow(dead_code)]
112    pub total_advance: f32,
113
114    /// Grapheme cluster boundaries (indices into the text)
115    #[allow(dead_code)]
116    pub cluster_boundaries: Vec<usize>,
117}
118
119/// Cache key for shaped text runs
120#[derive(Debug, Clone, Hash, Eq, PartialEq)]
121struct ShapeCacheKey {
122    text: String,
123    font_index: usize,
124    enable_ligatures: bool,
125    enable_kerning: bool,
126    script: Option<String>,
127    language: Option<String>,
128    rtl: bool,
129}
130
131/// Text shaper using HarfBuzz via rustybuzz
132pub struct TextShaper {
133    /// LRU cache of shaped text runs
134    shape_cache: LruCache<ShapeCacheKey, Arc<ShapedRun>>,
135}
136
137impl TextShaper {
138    /// Create a new text shaper with default settings
139    pub fn new() -> Self {
140        Self::with_cache_size(1000)
141    }
142
143    /// Create a new text shaper with a specific cache size
144    pub fn with_cache_size(max_cache_size: usize) -> Self {
145        Self {
146            shape_cache: LruCache::new(
147                NonZeroUsize::new(max_cache_size).unwrap_or(NonZeroUsize::new(1000).unwrap()),
148            ),
149        }
150    }
151
152    /// Detect grapheme clusters in the input text
153    ///
154    /// This is crucial for:
155    /// - Regional indicator pairs (flag emoji like πŸ‡ΊπŸ‡Έ)
156    /// - ZWJ sequences (emoji like πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦)
157    /// - Combining characters (diacritics like Γ©)
158    /// - Emoji with skin tone modifiers (πŸ‘‹πŸ½)
159    pub fn detect_grapheme_clusters<'a>(&self, text: &'a str) -> Vec<(usize, &'a str)> {
160        text.grapheme_indices(true).collect()
161    }
162
163    /// Detect regional indicator pairs (flag emoji)
164    ///
165    /// Regional indicators are pairs of characters U+1F1E6-U+1F1FF
166    /// that combine to form flag emoji (e.g., πŸ‡ΊπŸ‡Έ = U+1F1FA + U+1F1F8)
167    #[allow(dead_code)]
168    pub fn is_regional_indicator_pair(&self, grapheme: &str) -> bool {
169        let chars: Vec<char> = grapheme.chars().collect();
170        if chars.len() == 2 {
171            let is_ri = |c: char| {
172                let code = c as u32;
173                (0x1F1E6..=0x1F1FF).contains(&code)
174            };
175            is_ri(chars[0]) && is_ri(chars[1])
176        } else {
177            false
178        }
179    }
180
181    /// Check if a grapheme contains a Zero Width Joiner (ZWJ)
182    ///
183    /// ZWJ sequences are used for complex emoji like family emoji (πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦)
184    #[allow(dead_code)]
185    pub fn contains_zwj(&self, grapheme: &str) -> bool {
186        grapheme.contains('\u{200D}')
187    }
188
189    /// Shape a text run using rustybuzz
190    ///
191    /// This performs the actual text shaping, applying OpenType features
192    /// like ligatures, kerning, and contextual alternates.
193    ///
194    /// # Arguments
195    /// * `text` - The text to shape
196    /// * `font_data` - The font data (TrueType/OpenType)
197    /// * `font_index` - Font index for cache key
198    /// * `options` - Shaping options
199    ///
200    /// # Returns
201    /// A `ShapedRun` containing the shaped glyphs and metadata
202    pub fn shape_text(
203        &mut self,
204        text: &str,
205        font_data: &[u8],
206        font_index: usize,
207        options: ShapingOptions,
208    ) -> Arc<ShapedRun> {
209        // Check cache first
210        let cache_key = ShapeCacheKey {
211            text: text.to_string(),
212            font_index,
213            enable_ligatures: options.enable_ligatures,
214            enable_kerning: options.enable_kerning,
215            script: options.script.clone(),
216            language: options.language.clone(),
217            rtl: options.rtl,
218        };
219
220        if let Some(cached) = self.shape_cache.get(&cache_key) {
221            return Arc::clone(cached);
222        }
223
224        // Detect grapheme clusters
225        let clusters = self.detect_grapheme_clusters(text);
226        let cluster_boundaries: Vec<usize> = clusters.iter().map(|(idx, _)| *idx).collect();
227
228        // Create rustybuzz Face from font data
229        let face = match Face::from_slice(font_data, 0) {
230            Some(face) => face,
231            None => {
232                // If font parsing fails, return empty shaped run
233                let run = Arc::new(ShapedRun {
234                    text: text.to_string(),
235                    glyphs: vec![],
236                    total_advance: 0.0,
237                    cluster_boundaries,
238                });
239                return run;
240            }
241        };
242
243        // Create Unicode buffer and add text
244        let mut unicode_buffer = UnicodeBuffer::new();
245        unicode_buffer.push_str(text);
246
247        // Set direction
248        unicode_buffer.set_direction(if options.rtl {
249            rustybuzz::Direction::RightToLeft
250        } else {
251            rustybuzz::Direction::LeftToRight
252        });
253
254        // Set script hint if provided
255        if let Some(ref script_str) = options.script {
256            // Convert 4-letter script code to Script (e.g., "arab", "latn", "deva")
257            if let Ok(script) = Script::from_str(script_str) {
258                unicode_buffer.set_script(script);
259            }
260        }
261
262        // Set language hint if provided
263        if let Some(ref lang_str) = options.language {
264            // Convert language code to Language (e.g., "en", "ar", "zh")
265            if let Ok(lang) = Language::from_str(lang_str) {
266                unicode_buffer.set_language(lang);
267            }
268        }
269
270        // Build OpenType feature list based on options
271        // Use Feature::from_str() which parses standard feature notation
272        let mut features = Vec::new();
273
274        // Standard ligatures (liga): fi, fl, ffi, ffl
275        if options.enable_ligatures {
276            if let Ok(feat) = Feature::from_str("liga") {
277                features.push(feat);
278            }
279            // Contextual ligatures (clig) - often includes programming ligatures like ->, =>
280            if let Ok(feat) = Feature::from_str("clig") {
281                features.push(feat);
282            }
283            // Discretionary ligatures (dlig) - programming ligatures in many fonts
284            if let Ok(feat) = Feature::from_str("dlig") {
285                features.push(feat);
286            }
287        }
288
289        // Kerning adjustments (kern)
290        if options.enable_kerning
291            && let Ok(feat) = Feature::from_str("kern")
292        {
293            features.push(feat);
294        }
295
296        // Contextual alternates (calt) - enables context-sensitive glyph substitution
297        if options.enable_contextual_alternates
298            && let Ok(feat) = Feature::from_str("calt")
299        {
300            features.push(feat);
301        }
302
303        // Glyph composition/decomposition (ccmp) - required for proper emoji and complex scripts
304        if let Ok(feat) = Feature::from_str("ccmp") {
305            features.push(feat);
306        }
307
308        // Localized forms (locl) - language-specific glyph variants
309        if let Ok(feat) = Feature::from_str("locl") {
310            features.push(feat);
311        }
312
313        // Shape the text with OpenType features
314        let glyph_buffer = rustybuzz::shape(&face, &features, unicode_buffer);
315
316        // Extract shaped glyphs
317        let glyphs = self.extract_shaped_glyphs(&glyph_buffer);
318
319        // Calculate total advance
320        let total_advance = glyphs.iter().map(|g| g.x_advance).sum();
321
322        // Create shaped run
323        let shaped_run = Arc::new(ShapedRun {
324            text: text.to_string(),
325            glyphs,
326            total_advance,
327            cluster_boundaries,
328        });
329
330        // Cache the result (LRU eviction is automatic)
331        self.shape_cache.put(cache_key, Arc::clone(&shaped_run));
332
333        shaped_run
334    }
335
336    /// Extract shaped glyphs from HarfBuzz glyph buffer
337    fn extract_shaped_glyphs(&self, buffer: &GlyphBuffer) -> Vec<ShapedGlyph> {
338        let glyph_infos = buffer.glyph_infos();
339        let glyph_positions = buffer.glyph_positions();
340
341        glyph_infos
342            .iter()
343            .zip(glyph_positions.iter())
344            .map(|(info, pos)| ShapedGlyph {
345                glyph_id: info.glyph_id,
346                cluster: info.cluster,
347                x_advance: pos.x_advance as f32,
348                y_advance: pos.y_advance as f32,
349                x_offset: pos.x_offset as f32,
350                y_offset: pos.y_offset as f32,
351            })
352            .collect()
353    }
354
355    /// Clear the shape cache
356    #[allow(dead_code)]
357    pub fn clear_cache(&mut self) {
358        self.shape_cache.clear();
359    }
360
361    /// Get the current cache size
362    #[allow(dead_code)]
363    pub fn cache_size(&self) -> usize {
364        self.shape_cache.len()
365    }
366}
367
368impl Default for TextShaper {
369    fn default() -> Self {
370        Self::new()
371    }
372}
373
#[cfg(test)]
mod tests {
    use super::*;

    // Non-ASCII inputs are written as `\u{...}` escapes so the tests cannot be
    // corrupted by a file-encoding round trip.

    /// 👋🏽 — waving hand + medium skin tone modifier
    const WAVE_WITH_SKIN_TONE: &str = "\u{1F44B}\u{1F3FD}";
    /// 👋 — waving hand on its own
    const WAVE: &str = "\u{1F44B}";
    /// 🇺🇸 — regional indicators U + S
    const US_FLAG: &str = "\u{1F1FA}\u{1F1F8}";
    /// 👨‍👩‍👧‍👦 — family emoji joined with ZWJ (U+200D)
    const FAMILY: &str = "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}";

    #[test]
    fn test_grapheme_cluster_detection() {
        let shaper = TextShaper::new();

        // Simple ASCII: one cluster per character
        let clusters = shaper.detect_grapheme_clusters("hello");
        assert_eq!(clusters.len(), 5);

        // Emoji with skin tone is a single grapheme
        let clusters = shaper.detect_grapheme_clusters(WAVE_WITH_SKIN_TONE);
        assert_eq!(clusters.len(), 1);

        // Flag emoji is a single grapheme
        let clusters = shaper.detect_grapheme_clusters(US_FLAG);
        assert_eq!(clusters.len(), 1);
    }

    #[test]
    fn test_regional_indicator_detection() {
        let shaper = TextShaper::new();

        // US flag
        assert!(shaper.is_regional_indicator_pair(US_FLAG));

        // Regular text
        assert!(!shaper.is_regional_indicator_pair("US"));

        // Single character
        assert!(!shaper.is_regional_indicator_pair("A"));

        // Three regional indicators are not a pair
        assert!(!shaper.is_regional_indicator_pair("\u{1F1FA}\u{1F1F8}\u{1F1E6}"));

        // Empty input
        assert!(!shaper.is_regional_indicator_pair(""));
    }

    #[test]
    fn test_zwj_detection() {
        let shaper = TextShaper::new();

        // Family emoji (contains ZWJ)
        assert!(shaper.contains_zwj(FAMILY));

        // Regular emoji (no ZWJ)
        assert!(!shaper.contains_zwj(WAVE));

        // Regular text (no ZWJ)
        assert!(!shaper.contains_zwj("hello"));
    }

    #[test]
    fn test_unparsable_font_yields_empty_uncached_run() {
        let mut shaper = TextShaper::new();

        // Empty font data cannot be parsed into a face.
        let run = shaper.shape_text("abc", &[], 0, ShapingOptions::default());

        assert_eq!(run.text, "abc");
        assert!(run.glyphs.is_empty());
        assert_eq!(run.total_advance, 0.0);
        assert_eq!(run.cluster_boundaries, vec![0, 1, 2]);

        // Failed shaping attempts are not cached.
        assert_eq!(shaper.cache_size(), 0);
    }
}