Skip to main content

oxidize_pdf/fonts/
embedder.rs

1//! Font embedding functionality for PDF generation
2
3use super::Font;
4use crate::objects::{Dictionary, Object, ObjectId};
5use crate::text::fonts::embedding::CjkFontType;
6use crate::Result;
7
8/// Font embedding options
9#[derive(Debug, Clone)]
10pub struct EmbeddingOptions {
11    /// Whether to subset the font (only include used glyphs)
12    pub subset: bool,
13    /// Whether to compress the font data
14    pub compress: bool,
15    /// Font encoding to use
16    pub encoding: FontEncoding,
17}
18
19impl Default for EmbeddingOptions {
20    fn default() -> Self {
21        EmbeddingOptions {
22            subset: true,
23            compress: true,
24            encoding: FontEncoding::WinAnsiEncoding,
25        }
26    }
27}
28
/// Encodings that can be declared for an embedded font.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum FontEncoding {
    /// Windows ANSI encoding (CP1252).
    WinAnsiEncoding,
    /// Mac Roman encoding.
    MacRomanEncoding,
    /// Standard PDF encoding.
    StandardEncoding,
    /// Identity encoding, used with CID fonts.
    IdentityH,
}

impl FontEncoding {
    /// Returns the name under which this encoding is written into the PDF.
    pub fn name(&self) -> &'static str {
        match *self {
            Self::IdentityH => "Identity-H",
            Self::StandardEncoding => "StandardEncoding",
            Self::MacRomanEncoding => "MacRomanEncoding",
            Self::WinAnsiEncoding => "WinAnsiEncoding",
        }
    }
}
53
54/// Font embedder for creating PDF font objects
55pub struct FontEmbedder<'a> {
56    font: &'a Font,
57    options: EmbeddingOptions,
58    used_chars: Vec<char>,
59}
60
61impl<'a> FontEmbedder<'a> {
62    /// Create a new font embedder
63    pub fn new(font: &'a Font, options: EmbeddingOptions) -> Self {
64        FontEmbedder {
65            font,
66            options,
67            used_chars: Vec::new(),
68        }
69    }
70
71    /// Add characters that will be used with this font
72    pub fn add_used_chars(&mut self, text: &str) {
73        for ch in text.chars() {
74            if !self.used_chars.contains(&ch) {
75                self.used_chars.push(ch);
76            }
77        }
78    }
79
80    /// Create the font dictionary for embedding
81    pub fn create_font_dict(
82        &self,
83        descriptor_id: ObjectId,
84        to_unicode_id: Option<ObjectId>,
85    ) -> Dictionary {
86        let mut dict = Dictionary::new();
87
88        // Type and Subtype
89        dict.set("Type", Object::Name("Font".into()));
90
91        // Determine font type based on encoding
92        if self.options.encoding == FontEncoding::IdentityH {
93            // Type 0 (composite) font for Unicode support
94            self.create_type0_font_dict(&mut dict, descriptor_id, to_unicode_id);
95        } else {
96            // Type 1 or TrueType font
97            self.create_simple_font_dict(&mut dict, descriptor_id, to_unicode_id);
98        }
99
100        dict
101    }
102
103    /// Create a Type 0 (composite) font dictionary
104    fn create_type0_font_dict(
105        &self,
106        dict: &mut Dictionary,
107        descriptor_id: ObjectId,
108        to_unicode_id: Option<ObjectId>,
109    ) {
110        dict.set("Subtype", Object::Name("Type0".into()));
111        dict.set("BaseFont", Object::Name(self.font.postscript_name().into()));
112        dict.set(
113            "Encoding",
114            Object::Name(self.options.encoding.name().into()),
115        );
116
117        // DescendantFonts array with CIDFont
118        let cid_font_dict = self.create_cid_font_dict(descriptor_id);
119        dict.set(
120            "DescendantFonts",
121            Object::Array(vec![Object::Dictionary(cid_font_dict)]),
122        );
123
124        if let Some(to_unicode) = to_unicode_id {
125            dict.set("ToUnicode", Object::Reference(to_unicode));
126        }
127    }
128
129    /// Create a CIDFont dictionary
130    fn create_cid_font_dict(&self, descriptor_id: ObjectId) -> Dictionary {
131        let mut dict = Dictionary::new();
132
133        dict.set("Type", Object::Name("Font".into()));
134
135        let font_name = self.font.postscript_name();
136        let cid_font_subtype =
137            if CjkFontType::should_use_cidfonttype2_for_preview_compatibility(font_name) {
138                tracing::debug!(
139                    "Using CIDFontType2 for CJK font {} (Preview.app compatibility)",
140                    font_name
141                );
142                "CIDFontType2" // Force CIDFontType2 for CJK fonts to fix Preview.app rendering
143            } else {
144                "CIDFontType2" // Default for this embedder
145            };
146
147        dict.set("Subtype", Object::Name(cid_font_subtype.into()));
148        dict.set("BaseFont", Object::Name(font_name.into()));
149
150        // CIDSystemInfo - Use appropriate values for CJK fonts
151        let mut cid_system_info = Dictionary::new();
152        let font_name = self.font.postscript_name();
153        let (registry, ordering, supplement) =
154            if let Some(cjk_type) = CjkFontType::detect_from_name(font_name) {
155                cjk_type.cid_system_info()
156            } else {
157                ("Adobe", "Identity", 0)
158            };
159
160        cid_system_info.set("Registry", Object::String(registry.into()));
161        cid_system_info.set("Ordering", Object::String(ordering.into()));
162        cid_system_info.set("Supplement", Object::Integer(supplement as i64));
163        dict.set("CIDSystemInfo", Object::Dictionary(cid_system_info));
164
165        dict.set("FontDescriptor", Object::Reference(descriptor_id));
166
167        // Default width
168        dict.set("DW", Object::Integer(1000));
169
170        // Width array with actual glyph widths
171        let widths_array = self.create_cid_widths_array();
172        dict.set("W", Object::Array(widths_array));
173
174        dict
175    }
176
177    /// Create a simple font dictionary (Type1/TrueType)
178    fn create_simple_font_dict(
179        &self,
180        dict: &mut Dictionary,
181        descriptor_id: ObjectId,
182        to_unicode_id: Option<ObjectId>,
183    ) {
184        dict.set("Subtype", Object::Name("TrueType".into()));
185        dict.set("BaseFont", Object::Name(self.font.postscript_name().into()));
186        dict.set(
187            "Encoding",
188            Object::Name(self.options.encoding.name().into()),
189        );
190
191        dict.set("FontDescriptor", Object::Reference(descriptor_id));
192
193        // FirstChar and LastChar
194        let (first_char, last_char) = self.get_char_range();
195        dict.set("FirstChar", Object::Integer(first_char as i64));
196        dict.set("LastChar", Object::Integer(last_char as i64));
197
198        // Widths array
199        let widths = self.create_widths_array(first_char, last_char);
200        dict.set("Widths", Object::Array(widths));
201
202        if let Some(to_unicode) = to_unicode_id {
203            dict.set("ToUnicode", Object::Reference(to_unicode));
204        }
205    }
206
207    /// Get the range of characters used
208    fn get_char_range(&self) -> (u8, u8) {
209        if self.used_chars.is_empty() {
210            return (32, 126); // Default ASCII range
211        }
212
213        let mut min = 255;
214        let mut max = 0;
215
216        for &ch in &self.used_chars {
217            if ch as u32 <= 255 {
218                let byte = ch as u8;
219                if byte < min {
220                    min = byte;
221                }
222                if byte > max {
223                    max = byte;
224                }
225            }
226        }
227
228        (min, max)
229    }
230
231    /// Create widths array for the font
232    fn create_widths_array(&self, first_char: u8, last_char: u8) -> Vec<Object> {
233        let mut widths = Vec::new();
234
235        for ch in first_char..=last_char {
236            if let Some(width) = self.font.glyph_mapping.get_char_width(char::from(ch)) {
237                // Convert from font units to PDF units (1/1000)
238                let pdf_width = (width as f64 * 1000.0) / self.font.metrics.units_per_em as f64;
239                widths.push(Object::Integer(pdf_width as i64));
240            } else {
241                // Default width for missing glyphs
242                widths.push(Object::Integer(600));
243            }
244        }
245
246        widths
247    }
248
249    /// Create CID widths array for CID fonts
250    fn create_cid_widths_array(&self) -> Vec<Object> {
251        let mut width_array = Vec::new();
252
253        // Create a map of character widths
254        let mut char_widths = std::collections::HashMap::new();
255
256        // For each used character, get its width
257        for &ch in &self.used_chars {
258            if let Some(width) = self.font.glyph_mapping.get_char_width(ch) {
259                // Convert from font units to PDF units (1/1000)
260                let pdf_width = (width as f64 * 1000.0) / self.font.metrics.units_per_em as f64;
261                char_widths.insert(ch as u32, pdf_width as i64);
262            }
263        }
264
265        // Group consecutive characters with same width for efficiency
266        let mut sorted_chars: Vec<_> = char_widths.iter().collect();
267        sorted_chars.sort_by_key(|(code, _)| *code);
268
269        let mut current_range_start = None;
270        let mut current_range_end = None; // Track the end of the current range
271        let mut current_width = None;
272
273        for (&code, &width) in sorted_chars {
274            match (current_range_start, current_range_end) {
275                (None, _) => {
276                    // Start a new range
277                    current_range_start = Some(code);
278                    current_range_end = Some(code);
279                    current_width = Some(width);
280                }
281                (Some(_start), Some(end)) => {
282                    // Check if we can continue the current range
283                    // (consecutive code AND same width)
284                    if current_width == Some(width) && code == end + 1 {
285                        // Continue the range
286                        current_range_end = Some(code);
287                    } else {
288                        // End current range and add to array
289                        if let (Some(start_code), Some(w)) = (current_range_start, current_width) {
290                            width_array.push(Object::Integer(start_code as i64));
291                            width_array.push(Object::Array(vec![Object::Integer(w)]));
292                        }
293
294                        // Start new range
295                        current_range_start = Some(code);
296                        current_range_end = Some(code);
297                        current_width = Some(width);
298                    }
299                }
300                (Some(_), None) => {
301                    // This should never happen (invariant: if start is Some, end must be Some)
302                    unreachable!("Range start without range end (this is a bug)")
303                }
304            }
305        }
306
307        // Don't forget the last range
308        if let (Some(start_code), Some(w)) = (current_range_start, current_width) {
309            width_array.push(Object::Integer(start_code as i64));
310            width_array.push(Object::Array(vec![Object::Integer(w)]));
311        }
312
313        width_array
314    }
315
316    /// Create ToUnicode CMap for text extraction
317    pub fn create_to_unicode_cmap(&self) -> Vec<u8> {
318        let mut cmap = String::new();
319
320        // CMap header
321        cmap.push_str("/CIDInit /ProcSet findresource begin\n");
322        cmap.push_str("12 dict begin\n");
323        cmap.push_str("begincmap\n");
324        cmap.push_str("/CIDSystemInfo\n");
325        cmap.push_str("<< /Registry (Adobe)\n");
326        cmap.push_str("   /Ordering (UCS)\n");
327        cmap.push_str("   /Supplement 0\n");
328        cmap.push_str(">> def\n");
329        cmap.push_str("/CMapName /Adobe-Identity-UCS def\n");
330        cmap.push_str("/CMapType 2 def\n");
331        cmap.push_str("1 begincodespacerange\n");
332        cmap.push_str("<0000> <FFFF>\n");
333        cmap.push_str("endcodespacerange\n");
334
335        // Character mappings
336        let mut mappings = Vec::new();
337        for &ch in &self.used_chars {
338            if let Some(glyph) = self.font.glyph_mapping.char_to_glyph(ch) {
339                mappings.push((glyph, ch));
340            }
341        }
342
343        if !mappings.is_empty() {
344            cmap.push_str(&format!("{} beginbfchar\n", mappings.len()));
345            for (glyph, ch) in mappings {
346                cmap.push_str(&format!("<{:04X}> <{:04X}>\n", glyph, ch as u32));
347            }
348            cmap.push_str("endbfchar\n");
349        }
350
351        // CMap footer
352        cmap.push_str("endcmap\n");
353        cmap.push_str("CMapName currentdict /CMap defineresource pop\n");
354        cmap.push_str("end\n");
355        cmap.push_str("end\n");
356
357        cmap.into_bytes()
358    }
359
360    /// Get the font data for embedding
361    pub fn get_font_data(&self) -> Result<Vec<u8>> {
362        if self.options.subset {
363            // Basic subsetting: currently returns full font
364            // Full TrueType subsetting requires complex table manipulation:
365            // - Reordering glyphs in glyf/loca tables
366            // - Updating glyph indices in cmap table
367            // - Recalculating table checksums
368            // - Updating cross-references between tables
369            //
370            // For now, return the full font data but track used characters
371            // for proper width array generation
372            if !self.used_chars.is_empty() {
373                // Font will be optimized with proper width arrays
374                Ok(self.font.data.clone())
375            } else {
376                // No characters used - return minimal font
377                Ok(self.font.data.clone())
378            }
379        } else {
380            Ok(self.font.data.clone())
381        }
382    }
383}
384
/// Unit tests for the font embedder.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::fonts::{Font, FontDescriptor, FontFormat, FontMetrics, GlyphMapping};

    /// Builds a font whose printable ASCII characters (32..=126) each map to a
    /// glyph of width 600, with 1000 units per em.
    fn create_test_font() -> Font {
        let mut glyph_mapping = GlyphMapping::default();
        for ch in 32..127 {
            glyph_mapping.add_mapping(char::from(ch), ch as u16);
            glyph_mapping.set_glyph_width(ch as u16, 600);
        }

        Font {
            name: "TestFont".to_string(),
            data: vec![0; 1000],
            format: FontFormat::TrueType,
            metrics: FontMetrics {
                units_per_em: 1000,
                ascent: 800,
                descent: -200,
                line_gap: 200,
                cap_height: 700,
                x_height: 500,
            },
            descriptor: FontDescriptor::new("TestFont"),
            glyph_mapping,
        }
    }

    #[test]
    fn test_font_embedder_creation() {
        let font = create_test_font();
        let options = EmbeddingOptions::default();
        let embedder = FontEmbedder::new(&font, options);

        assert_eq!(embedder.used_chars.len(), 0);
    }

    #[test]
    fn test_add_used_chars() {
        let font = create_test_font();
        let options = EmbeddingOptions::default();
        let mut embedder = FontEmbedder::new(&font, options);

        embedder.add_used_chars("Hello");
        assert_eq!(embedder.used_chars.len(), 4); // H, e, l, o (l appears twice but is deduplicated)

        embedder.add_used_chars("World");
        assert_eq!(embedder.used_chars.len(), 7); // H,e,l,o,W,r,d (o and l overlap between Hello and World)
    }

    #[test]
    fn test_char_range() {
        let font = create_test_font();
        let options = EmbeddingOptions::default();
        let mut embedder = FontEmbedder::new(&font, options);

        embedder.add_used_chars("AZ");
        let (first, last) = embedder.get_char_range();
        assert_eq!(first, b'A');
        assert_eq!(last, b'Z');
    }

    #[test]
    fn test_font_encoding_names() {
        assert_eq!(FontEncoding::WinAnsiEncoding.name(), "WinAnsiEncoding");
        assert_eq!(FontEncoding::MacRomanEncoding.name(), "MacRomanEncoding");
        assert_eq!(FontEncoding::StandardEncoding.name(), "StandardEncoding");
        assert_eq!(FontEncoding::IdentityH.name(), "Identity-H");
    }

    #[test]
    fn test_create_simple_font_dict() {
        let font = create_test_font();
        let options = EmbeddingOptions {
            subset: false,
            compress: false,
            encoding: FontEncoding::WinAnsiEncoding,
        };
        let mut embedder = FontEmbedder::new(&font, options);
        embedder.add_used_chars("ABC");

        let font_dict = embedder.create_font_dict(ObjectId::new(10, 0), Some(ObjectId::new(11, 0)));

        assert_eq!(font_dict.get("Type").unwrap(), &Object::Name("Font".into()));
        assert_eq!(
            font_dict.get("Subtype").unwrap(),
            &Object::Name("TrueType".into())
        );
        assert!(font_dict.get("FirstChar").is_some());
        assert!(font_dict.get("LastChar").is_some());
        assert!(font_dict.get("Widths").is_some());
    }

    #[test]
    fn test_create_type0_font_dict() {
        let font = create_test_font();
        let options = EmbeddingOptions {
            subset: false,
            compress: false,
            encoding: FontEncoding::IdentityH,
        };
        let embedder = FontEmbedder::new(&font, options);

        let font_dict = embedder.create_font_dict(ObjectId::new(10, 0), Some(ObjectId::new(11, 0)));

        assert_eq!(font_dict.get("Type").unwrap(), &Object::Name("Font".into()));
        assert_eq!(
            font_dict.get("Subtype").unwrap(),
            &Object::Name("Type0".into())
        );
        assert_eq!(
            font_dict.get("Encoding").unwrap(),
            &Object::Name("Identity-H".into())
        );
        assert!(font_dict.get("DescendantFonts").is_some());
    }

    #[test]
    fn test_create_widths_array() {
        let font = create_test_font();
        let options = EmbeddingOptions::default();
        let embedder = FontEmbedder::new(&font, options);

        let widths = embedder.create_widths_array(65, 67); // A, B, C
        assert_eq!(widths.len(), 3);
        for width in &widths {
            if let Object::Integer(w) = width {
                assert_eq!(*w, 600); // All test glyphs have width 600
            } else {
                panic!("Expected Integer object");
            }
        }
    }

    #[test]
    fn test_create_to_unicode_cmap() {
        let font = create_test_font();
        let options = EmbeddingOptions::default();
        let mut embedder = FontEmbedder::new(&font, options);
        embedder.add_used_chars("Hello");

        let cmap = embedder.create_to_unicode_cmap();
        let cmap_str = String::from_utf8(cmap).unwrap();

        assert!(cmap_str.contains("begincmap"));
        assert!(cmap_str.contains("endcmap"));
        assert!(cmap_str.contains("beginbfchar"));
        assert!(cmap_str.contains("endbfchar"));
    }

    #[test]
    fn test_get_font_data() {
        let font = create_test_font();
        let options = EmbeddingOptions {
            subset: false,
            compress: false,
            encoding: FontEncoding::WinAnsiEncoding,
        };
        let embedder = FontEmbedder::new(&font, options);

        let font_data = embedder.get_font_data().unwrap();
        assert_eq!(font_data.len(), 1000);
    }

    #[test]
    fn test_embedding_options_default() {
        let options = EmbeddingOptions::default();
        assert!(options.subset);
        assert!(options.compress);
        assert_eq!(options.encoding, FontEncoding::WinAnsiEncoding);
    }

    #[test]
    fn test_char_range_empty() {
        let font = create_test_font();
        let options = EmbeddingOptions::default();
        let embedder = FontEmbedder::new(&font, options);

        let (first, last) = embedder.get_char_range();
        assert_eq!(first, 32); // Default ASCII range
        assert_eq!(last, 126);
    }

    #[test]
    fn test_char_range_with_unicode() {
        let font = create_test_font();
        let options = EmbeddingOptions::default();
        let mut embedder = FontEmbedder::new(&font, options);

        // Add characters including non-ASCII
        embedder.add_used_chars("A€B"); // Euro sign is > 255
        let (first, last) = embedder.get_char_range();

        // Should only consider ASCII characters
        assert_eq!(first, b'A');
        assert_eq!(last, b'B');
    }

    #[test]
    fn test_cid_font_dict_creation() {
        let font = create_test_font();
        let options = EmbeddingOptions {
            subset: false,
            compress: false,
            encoding: FontEncoding::IdentityH,
        };
        let embedder = FontEmbedder::new(&font, options);

        let cid_dict = embedder.create_cid_font_dict(ObjectId::new(10, 0));

        assert_eq!(cid_dict.get("Type").unwrap(), &Object::Name("Font".into()));
        assert_eq!(
            cid_dict.get("Subtype").unwrap(),
            &Object::Name("CIDFontType2".into())
        );
        assert!(cid_dict.get("CIDSystemInfo").is_some());
        assert_eq!(cid_dict.get("DW").unwrap(), &Object::Integer(1000));

        // Check CIDSystemInfo
        if let Object::Dictionary(sys_info) = cid_dict.get("CIDSystemInfo").unwrap() {
            assert_eq!(
                sys_info.get("Registry").unwrap(),
                &Object::String("Adobe".into())
            );
            assert_eq!(
                sys_info.get("Ordering").unwrap(),
                &Object::String("Identity".into())
            );
            assert_eq!(sys_info.get("Supplement").unwrap(), &Object::Integer(0));
        } else {
            panic!("Expected Dictionary for CIDSystemInfo");
        }
    }

    #[test]
    fn test_font_encoding_equality() {
        assert_eq!(FontEncoding::WinAnsiEncoding, FontEncoding::WinAnsiEncoding);
        assert_ne!(
            FontEncoding::WinAnsiEncoding,
            FontEncoding::MacRomanEncoding
        );
        assert_ne!(FontEncoding::StandardEncoding, FontEncoding::IdentityH);
    }

    #[test]
    fn test_add_duplicate_chars() {
        let font = create_test_font();
        let options = EmbeddingOptions::default();
        let mut embedder = FontEmbedder::new(&font, options);

        embedder.add_used_chars("AAA");
        assert_eq!(embedder.used_chars.len(), 1); // Only one 'A' should be stored

        embedder.add_used_chars("ABBA");
        assert_eq!(embedder.used_chars.len(), 2); // 'A' and 'B'
    }

    #[test]
    fn test_widths_array_missing_glyphs() {
        let mut font = create_test_font();
        // Clear all glyph mappings to test missing glyph handling
        font.glyph_mapping = GlyphMapping::default();

        let options = EmbeddingOptions::default();
        let embedder = FontEmbedder::new(&font, options);

        let widths = embedder.create_widths_array(65, 67); // A, B, C
        assert_eq!(widths.len(), 3);

        // Should use default width of 600 for missing glyphs
        for width in &widths {
            if let Object::Integer(w) = width {
                assert_eq!(*w, 600);
            } else {
                // Fail loudly instead of silently skipping a non-integer entry.
                panic!("Expected Integer object");
            }
        }
    }
}