Skip to main content

fop_render/pdf/
cidfont.rs

1//! CID-keyed font support for Unicode (Japanese/CJK) text in PDF
2//!
3//! This module implements Type 0 composite fonts with CIDFont descendants,
4//! which are required for proper Unicode text rendering in PDF.
5
6use std::collections::HashMap;
7
/// Build the dictionary text for a Type 0 (composite) font object.
///
/// The dictionary names `Identity-H` as its encoding, references one
/// descendant CIDFont object, and references a ToUnicode CMap stream.
/// The `/BaseFont` entry is `base_font_name` with `-Identity-H` appended.
///
/// # Arguments
/// * `base_font_name` - Font name written (unescaped) into `/BaseFont`
/// * `cid_font_obj_id` - Object number of the descendant CIDFont dictionary
/// * `to_unicode_obj_id` - Object number of the ToUnicode CMap stream
///
/// # Returns
/// The dictionary, from `<<` to `>>`, with one entry per line and no
/// trailing newline.
pub fn generate_type0_font_dict(
    base_font_name: &str,
    cid_font_obj_id: usize,
    to_unicode_obj_id: usize,
) -> String {
    // One dictionary entry per element; joined with newlines at the end.
    let entries = [
        String::from("<<"),
        String::from("/Type /Font"),
        String::from("/Subtype /Type0"),
        format!("/BaseFont /{}-Identity-H", base_font_name),
        String::from("/Encoding /Identity-H"),
        format!("/DescendantFonts [{} 0 R]", cid_font_obj_id),
        format!("/ToUnicode {} 0 R", to_unicode_obj_id),
        String::from(">>"),
    ];

    entries.join("\n")
}
29
/// Build the dictionary text for a `CIDFontType2` descendant font.
///
/// The dictionary declares the `Adobe`/`Identity` CID system, references
/// the font descriptor object, and carries both a default width (`/DW`)
/// and an explicit width list (`/W`).
///
/// # Arguments
/// * `base_font_name` - Font name written (unescaped) into `/BaseFont`
/// * `descriptor_obj_id` - Object number of the font descriptor
/// * `widths` - Widths written, in order, into a single `/W` run that
///   starts at CID 0
/// * `default_width` - Value written to `/DW`
///
/// # Returns
/// The dictionary, from `<<` to `>>`, with one entry per line and no
/// trailing newline.
pub fn generate_cidfont_dict(
    base_font_name: &str,
    descriptor_obj_id: usize,
    widths: &[u16],
    default_width: u16,
) -> String {
    // Space-separated widths for the single `/W` run beginning at CID 0.
    // An empty slice yields an empty inner array (`/W [0 [ ]]`).
    let width_run = widths
        .iter()
        .map(u16::to_string)
        .collect::<Vec<String>>()
        .join(" ");

    let mut dict = String::from("<<\n/Type /Font\n/Subtype /CIDFontType2\n");
    dict.push_str(&format!("/BaseFont /{}\n", base_font_name));
    dict.push_str("/CIDSystemInfo <<\n/Registry (Adobe)\n/Ordering (Identity)\n/Supplement 0\n>>\n");
    dict.push_str(&format!("/FontDescriptor {} 0 R\n", descriptor_obj_id));
    dict.push_str(&format!("/DW {}\n", default_width));
    dict.push_str(&format!("/W [0 [{} ]]\n", width_run));
    dict.push_str(">>");
    dict
}
64
/// Generate the ToUnicode CMap program for a CID-keyed font.
///
/// The CMap declares a two-byte code space (`<0000>`..`<FFFF>`) and maps
/// every key of `char_map` to the Unicode character stored under it.
///
/// Output properties:
/// * Entries are emitted in ascending key order, so the result is
///   deterministic regardless of `HashMap` iteration order.
/// * Destination values are UTF-16BE; characters above U+FFFF are written
///   as a surrogate pair (e.g. U+1F600 becomes `<D83DDE00>`).
/// * Entries are split into `beginbfchar` blocks of at most 100 mappings,
///   the per-block limit of the CMap format.
/// * An empty map produces a structurally complete CMap with no
///   `beginbfchar` block.
///
/// # Arguments
/// * `char_map` - Mapping from 16-bit character code to Unicode character
///
/// # Returns
/// The CMap source text, ready to be embedded as a PDF stream body.
pub fn generate_cidfont_tounicode_cmap(char_map: &HashMap<u16, char>) -> String {
    // Maximum number of mappings allowed in one beginbfchar/endbfchar block.
    const MAX_BFCHAR_ENTRIES: usize = 100;

    let mut cmap = String::from(
        "/CIDInit /ProcSet findresource begin\n\
         12 dict begin\n\
         begincmap\n\
         /CIDSystemInfo <<\n\
           /Registry (Adobe)\n\
           /Ordering (UCS)\n\
           /Supplement 0\n\
         >> def\n\
         /CMapName /Adobe-Identity-UCS def\n\
         /CMapType 2 def\n\
         1 begincodespacerange\n\
         <0000> <FFFF>\n\
         endcodespacerange\n",
    );

    // Sort by source code so repeated runs yield byte-identical output.
    let mut entries: Vec<(u16, char)> = char_map.iter().map(|(&code, &ch)| (code, ch)).collect();
    entries.sort_unstable_by_key(|&(code, _)| code);

    // `chunks` yields nothing for an empty list, so no block is written then.
    for block in entries.chunks(MAX_BFCHAR_ENTRIES) {
        cmap.push_str(&format!("{} beginbfchar\n", block.len()));

        for &(code, ch) in block {
            cmap.push_str(&format!("<{:04X}> <", code));
            // One code unit for BMP characters, a surrogate pair otherwise.
            let mut units = [0u16; 2];
            for unit in ch.encode_utf16(&mut units).iter() {
                cmap.push_str(&format!("{:04X}", unit));
            }
            cmap.push_str(">\n");
        }

        cmap.push_str("endbfchar\n");
    }

    cmap.push_str(
        "endcmap\n\
         CMapName currentdict /CMap defineresource pop\n\
         end\n\
         end\n",
    );

    cmap
}
104
/// Hex-encode `text` as UTF-16BE, prefixed with a byte-order mark.
///
/// The result is a PDF hexadecimal string: `<`, the BOM `FEFF`, four
/// uppercase hex digits per UTF-16 code unit, then `>`. Characters outside
/// the Basic Multilingual Plane contribute two code units (a surrogate
/// pair). An empty input yields `<FEFF>`.
pub fn encode_text_utf16be(text: &str) -> String {
    // `encode_utf16` yields one unit per BMP character and a high/low
    // surrogate pair for everything above U+FFFF.
    let hex_units: String = text
        .encode_utf16()
        .map(|unit| format!("{:04X}", unit))
        .collect();

    format!("<FEFF{}>", hex_units)
}
129
/// Generate a CIDToGIDMap stream mapping CIDs to glyph IDs.
///
/// Each used character's Unicode code point is treated as its CID. The
/// stream is a flat table: the two bytes at offset `CID * 2` hold that
/// CID's glyph ID as a big-endian `u16`. Entries for CIDs that are not in
/// `used_chars`, or that have no entry in `char_to_glyph`, are left as
/// zero.
///
/// The table covers CIDs `0..=max`, where `max` is the largest used code
/// point that fits in 16 bits. Characters above U+FFFF are skipped: a
/// two-byte character code (as used by the `Identity-H` encoding) cannot
/// select such a CID, so including them would only pad the stream with up
/// to ~2 MB of unreachable entries.
///
/// # Arguments
/// * `char_to_glyph` - Mapping from characters to their glyph IDs in the font
/// * `used_chars` - Set of characters actually used in the document
///
/// # Returns
/// Binary data suitable for embedding as a PDF stream. Always at least two
/// bytes long (the entry for CID 0), even when `used_chars` is empty.
pub fn generate_cidtogidmap_stream(
    char_to_glyph: &std::collections::HashMap<char, u16>,
    used_chars: &std::collections::BTreeSet<char>,
) -> Vec<u8> {
    // Largest CID a two-byte character code can address.
    const MAX_CID: u32 = 0xFFFF;

    // Highest addressable CID in use; falls back to 0 so the table always
    // contains at least the entry for CID 0.
    let max_cid = used_chars
        .iter()
        .map(|&ch| ch as u32)
        .filter(|&cid| cid <= MAX_CID)
        .max()
        .unwrap_or(0) as usize;

    // Two bytes per CID, zero-initialised (glyph 0 for unmapped CIDs).
    let mut map = vec![0u8; (max_cid + 1) * 2];

    for &ch in used_chars {
        let cid = ch as u32;
        if cid > MAX_CID {
            // Not addressable with a two-byte code; no table slot exists.
            continue;
        }
        if let Some(&gid) = char_to_glyph.get(&ch) {
            let offset = cid as usize * 2;
            map[offset..offset + 2].copy_from_slice(&gid.to_be_bytes());
        }
    }

    map
}
167
#[cfg(test)]
mod tests {
    use super::*;

    /// ASCII input: BOM prefix, closing bracket, and the first two letters.
    #[test]
    fn test_encode_text_utf16be_ascii() {
        let hex = encode_text_utf16be("Hello");
        assert!(hex.starts_with("<FEFF"));
        assert!(hex.ends_with('>'));
        // H=0048, e=0065 (l=006C and o=006F are not checked here)
        for unit in &["0048", "0065"] {
            assert!(hex.contains(*unit));
        }
    }

    /// Japanese input: BOM prefix plus one code unit per kanji.
    #[test]
    fn test_encode_text_utf16be_japanese() {
        let hex = encode_text_utf16be("請求書");
        assert!(hex.starts_with("<FEFF"));
        // 請=8ACB, 求=6C42, 書=66F8
        for unit in &["8ACB", "6C42", "66F8"] {
            assert!(hex.contains(*unit));
        }
    }

    /// Mixed ASCII and Japanese input in a single string.
    #[test]
    fn test_encode_text_utf16be_mixed() {
        let hex = encode_text_utf16be("Hello世界");
        assert!(hex.starts_with("<FEFF"));
        // H=0048, 世=4E16, 界=754C
        for unit in &["0048", "4E16", "754C"] {
            assert!(hex.contains(*unit));
        }
    }

    /// A two-entry map produces the CMap skeleton and both bfchar lines.
    #[test]
    fn test_tounicode_cmap_generation() {
        let char_map: HashMap<u16, char> =
            [(100, 'A'), (200, '請')].iter().copied().collect();

        let cmap = generate_cidfont_tounicode_cmap(&char_map);

        let expected_fragments = [
            "begincmap",
            "endbfchar",
            "<0064> <0041>", // 100 -> 'A'
            "<00C8> <8ACB>", // 200 -> '請'
        ];
        for fragment in &expected_fragments {
            assert!(cmap.contains(*fragment));
        }
    }
}
216
#[cfg(test)]
mod tests_extended {
    use super::*;

    /// The Type 0 dictionary carries its fixed keys, the font name, and
    /// both indirect references.
    #[test]
    fn test_type0_font_dict_structure() {
        let dict = generate_type0_font_dict("NotoSans", 5, 6);
        let expected_fragments = [
            "/Type /Font",
            "/Subtype /Type0",
            "/Encoding /Identity-H",
            "NotoSans",
            "5 0 R", // cid_font_obj_id
            "6 0 R", // to_unicode_obj_id
        ];
        for fragment in &expected_fragments {
            assert!(dict.contains(*fragment));
        }
    }

    /// BaseFont is the font name followed by the `-Identity-H` suffix.
    #[test]
    fn test_type0_font_dict_base_font_name_format() {
        let dict = generate_type0_font_dict("MyFont", 10, 11);
        assert!(dict.contains("MyFont-Identity-H"));
    }

    /// The CIDFont dictionary carries its fixed keys and the font name.
    #[test]
    fn test_cidfont_dict_structure() {
        let widths = [500u16; 10];
        let dict = generate_cidfont_dict("NotoSans", 3, &widths, 500);
        let expected_fragments = [
            "/Type /Font",
            "/Subtype /CIDFontType2",
            "/Registry (Adobe)",
            "/Ordering (Identity)",
            "NotoSans",
        ];
        for fragment in &expected_fragments {
            assert!(dict.contains(*fragment));
        }
    }

    /// The font descriptor is referenced by its object number.
    #[test]
    fn test_cidfont_dict_contains_descriptor_ref() {
        let widths = [600u16; 5];
        let dict = generate_cidfont_dict("TestFont", 42, &widths, 600);
        assert!(dict.contains("42 0 R"));
    }

    /// The default width is written to `/DW` even with no explicit widths.
    #[test]
    fn test_cidfont_dict_default_width() {
        let no_widths: &[u16] = &[];
        let dict = generate_cidfont_dict("Font", 1, no_widths, 1000);
        assert!(dict.contains("/DW 1000"));
    }

    /// An empty map still yields the opening and closing CMap operators.
    #[test]
    fn test_tounicode_cmap_empty_map() {
        let cmap = generate_cidfont_tounicode_cmap(&HashMap::new());
        assert!(cmap.contains("begincmap"));
        assert!(cmap.contains("endcmap"));
    }

    /// Empty text still yields the BOM prefix and the closing bracket.
    #[test]
    fn test_encode_text_utf16be_empty() {
        let hex = encode_text_utf16be("");
        assert!(hex.starts_with("<FEFF"));
        assert!(hex.ends_with('>'));
    }

    /// No used characters: the table holds only the zeroed entry for CID 0.
    #[test]
    fn test_generate_cidtogidmap_stream_empty() {
        use std::collections::{BTreeSet, HashMap};
        let no_glyphs: HashMap<char, u16> = HashMap::new();
        let no_chars: BTreeSet<char> = BTreeSet::new();
        let table = generate_cidtogidmap_stream(&no_glyphs, &no_chars);
        // max_cid falls back to 0, so the table is (0 + 1) * 2 = 2 bytes
        assert_eq!(table, vec![0u8, 0u8]);
    }

    /// One used character: the table runs up to its code point and stores
    /// the glyph ID big-endian in the final entry.
    #[test]
    fn test_generate_cidtogidmap_stream_single_char() {
        use std::collections::{BTreeSet, HashMap};
        let glyphs: HashMap<char, u16> = [('A', 36u16)].iter().copied().collect();
        let chars: BTreeSet<char> = ['A'].iter().copied().collect();
        let table = generate_cidtogidmap_stream(&glyphs, &chars);
        // 'A' is U+0041 = 65, so the table has 2 * (65 + 1) = 132 bytes
        assert_eq!(table.len(), 132);
        // Entry for CID 65 sits at offset 130: 36 = 0x0024 big-endian
        assert_eq!(table[130], 0x00);
        assert_eq!(table[131], 0x24);
    }
}
305}