par_term/
text_shaper.rs

1/// Text shaping module using HarfBuzz via rustybuzz
2///
3/// This module provides text shaping capabilities for:
4/// - Ligatures (fi, fl, ffi, ffl, etc.)
5/// - Complex emoji sequences (flags, skin tones, ZWJ sequences)
6/// - Complex scripts (Arabic, Devanagari, etc.)
7/// - Bidirectional text (RTL languages)
8/// - Kerning and contextual alternates
9///
10/// # Architecture
11///
12/// The text shaping pipeline:
13/// 1. Grapheme cluster detection (unicode-segmentation)
14/// 2. Script and direction detection (unicode-bidi)
15/// 3. Font feature selection (based on script/language)
16/// 4. Text shaping (rustybuzz)
17/// 5. Glyph positioning and advances
18/// 6. Result caching for performance
19///
20/// # Usage
21///
22/// ```ignore
/// let mut shaper = TextShaper::new();
/// let shaped = shaper.shape_text(
///     "Hello 🇺🇸 world",
///     &font_data,
///     0, // font_index: identifies this font in the shape cache
///     ShapingOptions::default(),
/// );
29/// ```
30use rustybuzz::{Face, Feature, GlyphBuffer, Language, Script, UnicodeBuffer};
31use std::collections::HashMap;
32use std::str::FromStr;
33use std::sync::Arc;
34use unicode_segmentation::UnicodeSegmentation;
35
/// A single shaped glyph with positioning information
///
/// Values are copied verbatim from the shaper's glyph info/position records;
/// this module applies no scaling to them.
/// NOTE(review): the positional fields are therefore presumably expressed in
/// font design units rather than pixels — confirm against the renderer before
/// relying on either unit.
///
/// `PartialEq` is derived so that shaped output can be compared directly
/// (for example with `assert_eq!` in tests).
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ShapedGlyph {
    /// Glyph ID from the font
    #[allow(dead_code)]
    pub glyph_id: u32,

    /// Cluster index (which input character(s) this glyph represents)
    #[allow(dead_code)]
    pub cluster: u32,

    /// Horizontal advance width, as reported by the shaper
    pub x_advance: f32,

    /// Vertical advance (usually 0 for horizontal text)
    #[allow(dead_code)]
    pub y_advance: f32,

    /// Horizontal offset from the current position
    #[allow(dead_code)]
    pub x_offset: f32,

    /// Vertical offset from the baseline
    #[allow(dead_code)]
    pub y_offset: f32,
}
62
/// Caller-supplied settings that steer a single shaping request
#[derive(Debug, Clone)]
pub struct ShapingOptions {
    /// Request ligature features (`liga`, `clig` and `dlig`), e.g. fi, fl
    pub enable_ligatures: bool,

    /// Request the kerning feature (`kern`)
    pub enable_kerning: bool,

    /// Request the contextual alternates feature (`calt`)
    pub enable_contextual_alternates: bool,

    /// Optional script hint such as "arab" (Arabic) or "deva" (Devanagari);
    /// a value the shaper cannot parse is ignored
    pub script: Option<String>,

    /// Optional language hint such as "en" (English) or "ar" (Arabic);
    /// a value the shaper cannot parse is ignored
    pub language: Option<String>,

    /// Text direction: `true` shapes right-to-left, `false` left-to-right
    pub rtl: bool,
}

impl Default for ShapingOptions {
    /// Left-to-right text with ligatures, kerning and contextual alternates
    /// all requested, and no script or language hint.
    fn default() -> Self {
        ShapingOptions {
            rtl: false,
            script: None,
            language: None,
            enable_ligatures: true,
            enable_kerning: true,
            enable_contextual_alternates: true,
        }
    }
}
97
/// Result of shaping a text run
///
/// Produced by `TextShaper::shape_text` and handed out behind an `Arc`, so a
/// cached run can be shared without copying its glyph list.
#[derive(Debug, Clone)]
pub struct ShapedRun {
    /// The input text that was shaped
    #[allow(dead_code)]
    pub text: String,

    /// The shaped glyphs, in the order the shaper returned them.
    /// Empty when the font data could not be parsed.
    #[allow(dead_code)]
    pub glyphs: Vec<ShapedGlyph>,

    /// Sum of `x_advance` over all glyphs (0.0 when there are no glyphs).
    /// NOTE(review): carries the same unit as `ShapedGlyph::x_advance`; no
    /// scaling is applied when glyphs are extracted, so this is presumably
    /// font units rather than pixels — confirm.
    #[allow(dead_code)]
    pub total_advance: f32,

    /// Start offset of every grapheme cluster in `text`, as yielded by
    /// `grapheme_indices` (byte indices into the string)
    #[allow(dead_code)]
    pub cluster_boundaries: Vec<usize>,
}
117
118/// Cache key for shaped text runs
119#[derive(Debug, Clone, Hash, Eq, PartialEq)]
120struct ShapeCacheKey {
121    text: String,
122    font_index: usize,
123    enable_ligatures: bool,
124    enable_kerning: bool,
125    script: Option<String>,
126    language: Option<String>,
127    rtl: bool,
128}
129
130/// Text shaper using HarfBuzz via rustybuzz
131pub struct TextShaper {
132    /// Cache of shaped text runs
133    shape_cache: HashMap<ShapeCacheKey, Arc<ShapedRun>>,
134
135    /// Maximum cache size (number of entries)
136    max_cache_size: usize,
137}
138
139impl TextShaper {
140    /// Create a new text shaper with default settings
141    pub fn new() -> Self {
142        Self::with_cache_size(1000)
143    }
144
145    /// Create a new text shaper with a specific cache size
146    pub fn with_cache_size(max_cache_size: usize) -> Self {
147        Self {
148            shape_cache: HashMap::new(),
149            max_cache_size,
150        }
151    }
152
153    /// Detect grapheme clusters in the input text
154    ///
155    /// This is crucial for:
156    /// - Regional indicator pairs (flag emoji like πŸ‡ΊπŸ‡Έ)
157    /// - ZWJ sequences (emoji like πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦)
158    /// - Combining characters (diacritics like Γ©)
159    /// - Emoji with skin tone modifiers (πŸ‘‹πŸ½)
160    pub fn detect_grapheme_clusters<'a>(&self, text: &'a str) -> Vec<(usize, &'a str)> {
161        text.grapheme_indices(true).collect()
162    }
163
164    /// Detect regional indicator pairs (flag emoji)
165    ///
166    /// Regional indicators are pairs of characters U+1F1E6-U+1F1FF
167    /// that combine to form flag emoji (e.g., πŸ‡ΊπŸ‡Έ = U+1F1FA + U+1F1F8)
168    #[allow(dead_code)]
169    pub fn is_regional_indicator_pair(&self, grapheme: &str) -> bool {
170        let chars: Vec<char> = grapheme.chars().collect();
171        if chars.len() == 2 {
172            let is_ri = |c: char| {
173                let code = c as u32;
174                (0x1F1E6..=0x1F1FF).contains(&code)
175            };
176            is_ri(chars[0]) && is_ri(chars[1])
177        } else {
178            false
179        }
180    }
181
182    /// Check if a grapheme contains a Zero Width Joiner (ZWJ)
183    ///
184    /// ZWJ sequences are used for complex emoji like family emoji (πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦)
185    #[allow(dead_code)]
186    pub fn contains_zwj(&self, grapheme: &str) -> bool {
187        grapheme.contains('\u{200D}')
188    }
189
190    /// Shape a text run using rustybuzz
191    ///
192    /// This performs the actual text shaping, applying OpenType features
193    /// like ligatures, kerning, and contextual alternates.
194    ///
195    /// # Arguments
196    /// * `text` - The text to shape
197    /// * `font_data` - The font data (TrueType/OpenType)
198    /// * `font_index` - Font index for cache key
199    /// * `options` - Shaping options
200    ///
201    /// # Returns
202    /// A `ShapedRun` containing the shaped glyphs and metadata
203    pub fn shape_text(
204        &mut self,
205        text: &str,
206        font_data: &[u8],
207        font_index: usize,
208        options: ShapingOptions,
209    ) -> Arc<ShapedRun> {
210        // Check cache first
211        let cache_key = ShapeCacheKey {
212            text: text.to_string(),
213            font_index,
214            enable_ligatures: options.enable_ligatures,
215            enable_kerning: options.enable_kerning,
216            script: options.script.clone(),
217            language: options.language.clone(),
218            rtl: options.rtl,
219        };
220
221        if let Some(cached) = self.shape_cache.get(&cache_key) {
222            return Arc::clone(cached);
223        }
224
225        // Detect grapheme clusters
226        let clusters = self.detect_grapheme_clusters(text);
227        let cluster_boundaries: Vec<usize> = clusters.iter().map(|(idx, _)| *idx).collect();
228
229        // Create rustybuzz Face from font data
230        let face = match Face::from_slice(font_data, 0) {
231            Some(face) => face,
232            None => {
233                // If font parsing fails, return empty shaped run
234                let run = Arc::new(ShapedRun {
235                    text: text.to_string(),
236                    glyphs: vec![],
237                    total_advance: 0.0,
238                    cluster_boundaries,
239                });
240                return run;
241            }
242        };
243
244        // Create Unicode buffer and add text
245        let mut unicode_buffer = UnicodeBuffer::new();
246        unicode_buffer.push_str(text);
247
248        // Set direction
249        unicode_buffer.set_direction(if options.rtl {
250            rustybuzz::Direction::RightToLeft
251        } else {
252            rustybuzz::Direction::LeftToRight
253        });
254
255        // Set script hint if provided
256        if let Some(ref script_str) = options.script {
257            // Convert 4-letter script code to Script (e.g., "arab", "latn", "deva")
258            if let Ok(script) = Script::from_str(script_str) {
259                unicode_buffer.set_script(script);
260            }
261        }
262
263        // Set language hint if provided
264        if let Some(ref lang_str) = options.language {
265            // Convert language code to Language (e.g., "en", "ar", "zh")
266            if let Ok(lang) = Language::from_str(lang_str) {
267                unicode_buffer.set_language(lang);
268            }
269        }
270
271        // Build OpenType feature list based on options
272        // Use Feature::from_str() which parses standard feature notation
273        let mut features = Vec::new();
274
275        // Standard ligatures (liga): fi, fl, ffi, ffl
276        if options.enable_ligatures {
277            if let Ok(feat) = Feature::from_str("liga") {
278                features.push(feat);
279            }
280            // Contextual ligatures (clig) - often includes programming ligatures like ->, =>
281            if let Ok(feat) = Feature::from_str("clig") {
282                features.push(feat);
283            }
284            // Discretionary ligatures (dlig) - programming ligatures in many fonts
285            if let Ok(feat) = Feature::from_str("dlig") {
286                features.push(feat);
287            }
288        }
289
290        // Kerning adjustments (kern)
291        if options.enable_kerning
292            && let Ok(feat) = Feature::from_str("kern")
293        {
294            features.push(feat);
295        }
296
297        // Contextual alternates (calt) - enables context-sensitive glyph substitution
298        if options.enable_contextual_alternates
299            && let Ok(feat) = Feature::from_str("calt")
300        {
301            features.push(feat);
302        }
303
304        // Glyph composition/decomposition (ccmp) - required for proper emoji and complex scripts
305        if let Ok(feat) = Feature::from_str("ccmp") {
306            features.push(feat);
307        }
308
309        // Localized forms (locl) - language-specific glyph variants
310        if let Ok(feat) = Feature::from_str("locl") {
311            features.push(feat);
312        }
313
314        // Shape the text with OpenType features
315        let glyph_buffer = rustybuzz::shape(&face, &features, unicode_buffer);
316
317        // Extract shaped glyphs
318        let glyphs = self.extract_shaped_glyphs(&glyph_buffer);
319
320        // Calculate total advance
321        let total_advance = glyphs.iter().map(|g| g.x_advance).sum();
322
323        // Create shaped run
324        let shaped_run = Arc::new(ShapedRun {
325            text: text.to_string(),
326            glyphs,
327            total_advance,
328            cluster_boundaries,
329        });
330
331        // Cache the result (with LRU eviction if needed)
332        if self.shape_cache.len() >= self.max_cache_size {
333            // Simple eviction: remove first entry
334            // TODO: Implement proper LRU eviction
335            if let Some(key) = self.shape_cache.keys().next().cloned() {
336                self.shape_cache.remove(&key);
337            }
338        }
339
340        self.shape_cache.insert(cache_key, Arc::clone(&shaped_run));
341
342        shaped_run
343    }
344
345    /// Extract shaped glyphs from HarfBuzz glyph buffer
346    fn extract_shaped_glyphs(&self, buffer: &GlyphBuffer) -> Vec<ShapedGlyph> {
347        let glyph_infos = buffer.glyph_infos();
348        let glyph_positions = buffer.glyph_positions();
349
350        glyph_infos
351            .iter()
352            .zip(glyph_positions.iter())
353            .map(|(info, pos)| ShapedGlyph {
354                glyph_id: info.glyph_id,
355                cluster: info.cluster,
356                x_advance: pos.x_advance as f32,
357                y_advance: pos.y_advance as f32,
358                x_offset: pos.x_offset as f32,
359                y_offset: pos.y_offset as f32,
360            })
361            .collect()
362    }
363
364    /// Clear the shape cache
365    #[allow(dead_code)]
366    pub fn clear_cache(&mut self) {
367        self.shape_cache.clear();
368    }
369
370    /// Get the current cache size
371    #[allow(dead_code)]
372    pub fn cache_size(&self) -> usize {
373        self.shape_cache.len()
374    }
375}
376
377impl Default for TextShaper {
378    fn default() -> Self {
379        Self::new()
380    }
381}
382
#[cfg(test)]
mod tests {
    use super::*;

    // Non-ASCII inputs are written as `\u{...}` escapes so the tests do not
    // depend on how this source file is encoded or transcoded.

    /// Waving hand (U+1F44B)
    const WAVING_HAND: &str = "\u{1F44B}";
    /// Waving hand followed by the medium skin tone modifier (U+1F3FD)
    const WAVING_HAND_MEDIUM_SKIN: &str = "\u{1F44B}\u{1F3FD}";
    /// US flag: regional indicators U (U+1F1FA) and S (U+1F1F8)
    const FLAG_US: &str = "\u{1F1FA}\u{1F1F8}";
    /// Family emoji: man, woman, girl, boy joined with ZWJ (U+200D)
    const FAMILY: &str = "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}";

    #[test]
    fn test_grapheme_cluster_detection() {
        let shaper = TextShaper::new();

        // Test simple ASCII
        let clusters = shaper.detect_grapheme_clusters("hello");
        assert_eq!(clusters.len(), 5);

        // Test emoji with skin tone
        let clusters = shaper.detect_grapheme_clusters(WAVING_HAND_MEDIUM_SKIN);
        assert_eq!(clusters.len(), 1); // Should be one grapheme

        // Test flag emoji
        let clusters = shaper.detect_grapheme_clusters(FLAG_US);
        assert_eq!(clusters.len(), 1); // Should be one grapheme
    }

    #[test]
    fn test_regional_indicator_detection() {
        let shaper = TextShaper::new();

        // US flag
        assert!(shaper.is_regional_indicator_pair(FLAG_US));

        // Regular text
        assert!(!shaper.is_regional_indicator_pair("US"));

        // Single character
        assert!(!shaper.is_regional_indicator_pair("A"));
    }

    #[test]
    fn test_zwj_detection() {
        let shaper = TextShaper::new();

        // Family emoji (contains ZWJ)
        assert!(shaper.contains_zwj(FAMILY));

        // Regular emoji (no ZWJ)
        assert!(!shaper.contains_zwj(WAVING_HAND));

        // Regular text (no ZWJ)
        assert!(!shaper.contains_zwj("hello"));
    }
}
431}