1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
//! Type0 (Composite) font support for full Unicode handling
//!
//! This module implements Type0 fonts with CID support according to ISO 32000-1:2008
//! Section 9.7 (Composite Fonts). Type0 fonts use CIDs (Character IDs) to support
//! large character sets including full Unicode.
use crate::fonts::Font;
use crate::objects::{Dictionary, Object, ObjectId};
use crate::text::cmap::ToUnicodeCMapBuilder;
use crate::text::fonts::embedding::CjkFontType;
use std::collections::{HashMap, HashSet};
/// Composite (Type0) font wrapper that addresses glyphs through 2-byte CIDs.
///
/// A value is built by [`Type0Font::new`], which fills the two lookup tables. Text is then
/// run through [`Type0Font::encode_text`] (or [`Type0Font::mark_chars_used`]) so that
/// `used_cids` records which CIDs the document references; that set later drives the
/// ToUnicode CMap and the `W` array of the descendant CIDFont dictionary.
#[derive(Debug, Clone)]
pub struct Type0Font {
/// Wrapped font. Its PostScript name is written as `BaseFont` in both font dictionaries,
/// and its glyph mapping receives every char -> CID pair assigned at construction time.
pub base_font: Font,
/// Reverse table (CID -> character), read when generating the ToUnicode CMap.
pub cid_to_unicode: HashMap<u16, char>,
/// Forward table (character -> CID), used by `get_cid`.
pub unicode_to_cid: HashMap<char, u16>,
/// CIDs referenced so far; filled by `mark_chars_used` and `encode_text`.
pub used_cids: HashSet<u16>,
/// `Registry` entry of `CIDSystemInfo`: taken from the detected CJK font type, or
/// `"Adobe"` when the font name is not recognised as CJK.
pub registry: String,
/// `Ordering` entry of `CIDSystemInfo`: taken from the detected CJK font type, or
/// `"Identity"` when the font name is not recognised as CJK.
pub ordering: String,
/// `Supplement` entry of `CIDSystemInfo`: taken from the detected CJK font type, or `0`
/// when the font name is not recognised as CJK.
pub supplement: i32,
/// Bytes of the most recently generated ToUnicode CMap; `None` until
/// `generate_to_unicode_cmap` has been called.
pub to_unicode_cmap: Option<Vec<u8>>,
}
impl Type0Font {
/// Wrap `base_font` in a Type0 font and pre-populate its CID lookup tables.
///
/// The `CIDSystemInfo` triple comes from the CJK font type detected from the font's
/// PostScript name; when no CJK type is detected it falls back to `Adobe` / `Identity` / `0`.
pub fn new(base_font: Font) -> Self {
    let (registry, ordering, supplement) =
        match CjkFontType::detect_from_name(base_font.postscript_name()) {
            Some(cjk_kind) => cjk_kind.cid_system_info(),
            None => ("Adobe", "Identity", 0),
        };

    let mut type0 = Self {
        base_font,
        cid_to_unicode: HashMap::new(),
        unicode_to_cid: HashMap::new(),
        used_cids: HashSet::new(),
        registry: registry.to_owned(),
        ordering: ordering.to_owned(),
        supplement,
        to_unicode_cmap: None,
    };

    // The tables start empty; this assigns a CID to every supported character.
    type0.build_cid_mappings();
    type0
}
/// Populate the Unicode <-> CID lookup tables.
///
/// CIDs are handed out sequentially, starting at 1 (CID 0 stays reserved for `.notdef`), in
/// the order characters are visited: first every code point of the Unicode blocks listed
/// in `UNICODE_BLOCKS`, then any character of `SYMBOL_FALLBACKS` that is still unmapped.
/// Each assignment is recorded in `cid_to_unicode`, `unicode_to_cid` and the base font's
/// glyph mapping.
///
/// A CID is therefore *not* the character's code point; it is the character's position in
/// that visiting order.
///
/// NOTE(review): `create_cid_font_dict` writes `/CIDToGIDMap /Identity`, so these
/// sequential CIDs are presumably also interpreted as glyph indices. Confirm that the
/// embedded font program is laid out to match.
fn build_cid_mappings(&mut self) {
    /// Inclusive `(first, last)` code-point bounds of the supported Unicode blocks.
    /// The blocks do not overlap, so each code point is visited once.
    const UNICODE_BLOCKS: &[(u32, u32)] = &[
        (0x0020, 0x007E), // Basic Latin
        (0x00A0, 0x00FF), // Latin-1 Supplement
        (0x0100, 0x017F), // Latin Extended-A
        (0x0180, 0x024F), // Latin Extended-B
        (0x2000, 0x206F), // General Punctuation
        (0x2070, 0x209F), // Superscripts and Subscripts
        (0x20A0, 0x20CF), // Currency Symbols
        (0x2100, 0x214F), // Letterlike Symbols
        (0x2150, 0x218F), // Number Forms
        (0x2190, 0x21FF), // Arrows
        (0x2200, 0x22FF), // Mathematical Operators
        (0x2300, 0x23FF), // Miscellaneous Technical
        (0x2400, 0x243F), // Control Pictures
        (0x2500, 0x257F), // Box Drawing
        (0x2580, 0x259F), // Block Elements
        (0x25A0, 0x25FF), // Geometric Shapes
        (0x2600, 0x26FF), // Miscellaneous Symbols
        (0x2700, 0x27BF), // Dingbats
        (0x2800, 0x28FF), // Braille Patterns
    ];

    /// Symbols that must always be encodable. Every entry currently falls inside one of
    /// the blocks above, so this list only takes effect if the block table is trimmed.
    const SYMBOL_FALLBACKS: &[char] = &[
        '☐', // U+2610 Ballot box
        '☑', // U+2611 Ballot box with check
        '☒', // U+2612 Ballot box with X
        '✓', // U+2713 Check mark
        '✗', // U+2717 Ballot X
        '✔', // U+2714 Heavy check mark
        '✘', // U+2718 Heavy ballot X
        '•', // U+2022 Bullet
        '◦', // U+25E6 White bullet
        '▪', // U+25AA Black small square
        '▫', // U+25AB White small square
        '→', // U+2192 Rightwards arrow
        '←', // U+2190 Leftwards arrow
        '↑', // U+2191 Upwards arrow
        '↓', // U+2193 Downwards arrow
        '∑', // U+2211 N-ary summation
        '∏', // U+220F N-ary product
        '∫', // U+222B Integral
        '√', // U+221A Square root
        '∞', // U+221E Infinity
        '±', // U+00B1 Plus-minus sign
        '×', // U+00D7 Multiplication sign
        '÷', // U+00F7 Division sign
        '≈', // U+2248 Almost equal to
        '≠', // U+2260 Not equal to
        '≤', // U+2264 Less than or equal to
        '≥', // U+2265 Greater than or equal to
    ];

    let block_chars = UNICODE_BLOCKS
        .iter()
        .flat_map(|&(first, last)| first..=last)
        .filter_map(char::from_u32);
    let candidates = block_chars.chain(SYMBOL_FALLBACKS.iter().copied());

    // CID 0 is reserved for .notdef, so real characters start at 1.
    let mut next_cid = 1u16;
    for ch in candidates {
        // Never hand a second CID to a character that already has one.
        if self.unicode_to_cid.contains_key(&ch) {
            continue;
        }
        self.cid_to_unicode.insert(next_cid, ch);
        self.unicode_to_cid.insert(ch, next_cid);
        // Keep the base font's glyph mapping in step with the CID tables.
        self.base_font.glyph_mapping.add_mapping(ch, next_cid);
        next_cid += 1;
    }
}
/// Look up the CID assigned to `ch`.
///
/// Returns `None` when the character has no entry in `unicode_to_cid`.
pub fn get_cid(&self, ch: char) -> Option<u16> {
    let cid = self.unicode_to_cid.get(&ch)?;
    Some(*cid)
}
/// Record the CID of every mapped character in `text` as used.
///
/// Characters without a CID are skipped silently.
pub fn mark_chars_used(&mut self, text: &str) {
    // Borrow the lookup table on its own so `used_cids` can be extended at the same time.
    let lookup = &self.unicode_to_cid;
    self.used_cids
        .extend(text.chars().filter_map(|ch| lookup.get(&ch).copied()));
}
/// Build the ToUnicode CMap covering every CID in `used_cids`.
///
/// Each used CID that has an entry in `cid_to_unicode` is registered with the builder as a
/// 2-byte big-endian code mapped to its character. The resulting bytes are cached in
/// `to_unicode_cmap` and also returned.
///
/// CIDs are fed to the builder in ascending order. `used_cids` is a `HashSet`, whose
/// iteration order is unspecified and can differ between runs, so sorting first keeps the
/// sequence of builder calls reproducible.
pub fn generate_to_unicode_cmap(&mut self) -> Vec<u8> {
    let mut builder = ToUnicodeCMapBuilder::new(2); // 2-byte CIDs

    let mut ordered_cids: Vec<u16> = self.used_cids.iter().copied().collect();
    ordered_cids.sort_unstable();

    for cid in ordered_cids {
        if let Some(&ch) = self.cid_to_unicode.get(&cid) {
            // Source code is the CID as two bytes, high byte first.
            builder.add_mapping(cid.to_be_bytes().to_vec(), &ch.to_string());
        }
    }

    let cmap = builder.build();
    self.to_unicode_cmap = Some(cmap.clone());
    cmap
}
/// Assemble the top-level Type0 font dictionary.
///
/// * `descendant_font_id` - object that holds the CIDFont dictionary; it becomes the single
///   element of `DescendantFonts`.
/// * `to_unicode_id` - object that holds the ToUnicode CMap stream; when `None`, the
///   `ToUnicode` entry is left out.
///
/// The encoding is always `Identity-H`.
pub fn create_font_dict(
    &self,
    descendant_font_id: ObjectId,
    to_unicode_id: Option<ObjectId>,
) -> Dictionary {
    let name = |value: &str| Object::Name(value.to_string());
    let descendants = vec![Object::Reference(descendant_font_id)];

    let mut font_dict = Dictionary::new();
    font_dict.set("Type", name("Font"));
    font_dict.set("Subtype", name("Type0"));
    font_dict.set(
        "BaseFont",
        Object::Name(self.base_font.postscript_name().to_string()),
    );
    font_dict.set("Encoding", name("Identity-H"));
    font_dict.set("DescendantFonts", Object::Array(descendants));

    if let Some(cmap_id) = to_unicode_id {
        font_dict.set("ToUnicode", Object::Reference(cmap_id));
    }

    font_dict
}
/// Create CIDFont dictionary (descendant font)
pub fn create_cid_font_dict(&self, descriptor_id: ObjectId) -> Dictionary {
let mut dict = Dictionary::new();
dict.set("Type", Object::Name("Font".to_string()));
dict.set("Subtype", Object::Name("CIDFontType2".to_string())); // For TrueType
dict.set(
"BaseFont",
Object::Name(self.base_font.postscript_name().to_string()),
);
// CIDSystemInfo
let mut cid_system_info = Dictionary::new();
cid_system_info.set("Registry", Object::String(self.registry.clone()));
cid_system_info.set("Ordering", Object::String(self.ordering.clone()));
cid_system_info.set("Supplement", Object::Integer(self.supplement as i64));
dict.set("CIDSystemInfo", Object::Dictionary(cid_system_info));
// FontDescriptor
dict.set("FontDescriptor", Object::Reference(descriptor_id));
// Default width
dict.set("DW", Object::Integer(1000));
// Width array (W)
// For simplicity, using default width for all glyphs
// In production, this should contain actual glyph widths
let mut w_array = Vec::new();
// Add width information for used CIDs
if !self.used_cids.is_empty() {
let mut sorted_cids: Vec<_> = self.used_cids.iter().copied().collect();
sorted_cids.sort_unstable();
// Group consecutive CIDs
let mut current_start = sorted_cids[0];
let mut current_end = sorted_cids[0];
let mut current_widths: Vec<i64> = Vec::new();
for &cid in &sorted_cids[1..] {
if cid == current_end + 1 {
current_end = cid;
} else {
// Flush current range
if current_start == current_end {
// Single CID
w_array.push(Object::Integer(current_start as i64));
w_array.push(Object::Array(vec![Object::Integer(600)]));
// Default width
} else {
// Range of CIDs
w_array.push(Object::Integer(current_start as i64));
w_array.push(Object::Integer(current_end as i64));
w_array.push(Object::Integer(600)); // Default width for range
}
current_start = cid;
current_end = cid;
current_widths.clear();
}
}
// Flush last range
if current_start == current_end {
w_array.push(Object::Integer(current_start as i64));
w_array.push(Object::Array(vec![Object::Integer(600)]));
} else {
w_array.push(Object::Integer(current_start as i64));
w_array.push(Object::Integer(current_end as i64));
w_array.push(Object::Integer(600));
}
}
dict.set("W", Object::Array(w_array));
// CIDToGIDMap (for TrueType fonts)
dict.set("CIDToGIDMap", Object::Name("Identity".to_string()));
dict
}
/// Encode `text` as a sequence of 2-byte big-endian CIDs.
///
/// Every character yields exactly two bytes. A mapped character contributes its CID and
/// that CID is added to `used_cids`; an unmapped character contributes CID 0 (`.notdef`)
/// and leaves `used_cids` untouched.
pub fn encode_text(&mut self, text: &str) -> Vec<u8> {
    const NOTDEF_CID: u16 = 0;

    let mut bytes = Vec::new();
    for ch in text.chars() {
        let cid = match self.unicode_to_cid.get(&ch) {
            Some(&mapped) => {
                self.used_cids.insert(mapped);
                mapped
            }
            None => NOTDEF_CID,
        };
        bytes.extend_from_slice(&cid.to_be_bytes());
    }
    bytes
}
}
/// Report whether `text` contains at least one character outside Latin-1 (ISO-8859-1),
/// i.e. a code point above U+00FF.
///
/// Returns `false` for the empty string.
pub fn needs_type0_font(text: &str) -> bool {
    const LATIN1_MAX: u32 = 0xFF;
    !text.chars().all(|ch| u32::from(ch) <= LATIN1_MAX)
}
/// Convenience wrapper that builds a [`Type0Font`] from `font`.
///
/// This only forwards to [`Type0Font::new`], taking ownership of `font`.
pub fn create_type0_from_font(font: Font) -> Type0Font {
Type0Font::new(font)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Type0 font wrapping a base font named `TestFont`, shared by the tests below.
    fn sample_font() -> Type0Font {
        Type0Font::new(Font::new("TestFont"))
    }

    /// A font named `TestFont` gets the Adobe / Identity / 0 CIDSystemInfo triple.
    #[test]
    fn test_type0_font_creation() {
        let font = sample_font();

        assert_eq!(
            (font.registry.as_str(), font.ordering.as_str(), font.supplement),
            ("Adobe", "Identity", 0)
        );
    }

    /// Basic Latin, extended Latin and symbol characters all receive a CID.
    #[test]
    fn test_cid_mappings() {
        let font = sample_font();

        // Basic Latin, extended Latin, then special symbols.
        for ch in ['A', 'z', 'á', 'ñ', '☑', '→', '√'] {
            assert!(font.get_cid(ch).is_some(), "no CID for {:?}", ch);
        }
    }

    /// Encoding emits two bytes per character and records the CIDs it used.
    #[test]
    fn test_text_encoding() {
        let mut font = sample_font();
        let text = "Hello ☑ Math: ∑";

        let encoded = font.encode_text(text);

        let expected_len = 2 * text.chars().count();
        assert_eq!(encoded.len(), expected_len);
        assert!(!font.used_cids.is_empty());
    }

    /// Only text with characters beyond Latin-1 asks for a Type0 font.
    #[test]
    fn test_needs_type0_font() {
        let cases = [
            ("Hello World", false), // Basic ASCII
            ("Café", false),        // Latin-1
            ("Hello ☑", true),      // Unicode checkbox
            ("Math: ∑", true),      // Math symbol
            ("Arrow →", true),      // Arrow
        ];

        for (text, expected) in cases {
            assert_eq!(needs_type0_font(text), expected, "text: {:?}", text);
        }
    }

    /// The generated CMap is non-empty and contains the expected structural markers.
    #[test]
    fn test_to_unicode_cmap_generation() {
        let mut font = sample_font();
        font.mark_chars_used("Hello ☑");

        let cmap = font.generate_to_unicode_cmap();
        assert!(!cmap.is_empty());

        let cmap_text = String::from_utf8_lossy(&cmap);
        let required = [
            "/CMapName /Adobe-Identity-UCS def",
            "begincodespacerange",
            "beginbfchar",
        ];
        for marker in required {
            assert!(cmap_text.contains(marker), "missing {:?}", marker);
        }
    }
}