pdfplumber_parse/
cid_font.rs

1//! CID font support for CJK text extraction.
2//!
3//! Handles Type0 (composite) fonts with CIDFontType0 and CIDFontType2
4//! descendant fonts. Provides CID-to-GID mapping, /W (width) array parsing,
5//! and /DW (default width) handling for CID fonts.
6
7use std::collections::HashMap;
8
9use crate::error::BackendError;
10
/// Fallback glyph width, in glyph-space units, used when a CID font has no
/// /DW entry (1000 units correspond to one full em).
const DEFAULT_CID_WIDTH: f64 = 1_000.0;

/// Fallback ascent, in glyph-space units, used when the font supplies none.
const DEFAULT_CID_ASCENT: f64 = 880.0;

/// Fallback descent, in glyph-space units, used when the font supplies none.
const DEFAULT_CID_DESCENT: f64 = -120.0;
19
/// Subtype of a CIDFont (the descendant of a Type0 composite font).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CidFontType {
    /// `CIDFontType0`: a CID-keyed font whose glyphs are Type 1 outlines.
    Type0,
    /// `CIDFontType2`: a CID-keyed font whose glyphs are TrueType outlines.
    Type2,
}
28
/// How a CID is translated into a glyph ID (GID).
#[derive(Clone, Debug, PartialEq)]
pub enum CidToGidMap {
    /// The CID is used as the GID unchanged.
    Identity,
    /// Lookup table indexed by CID: entry `n` holds the GID for CID `n`.
    ///
    /// In the raw stream form the GID for CID `n` occupies bytes `2n` and
    /// `2n + 1` as a big-endian `u16`; [`CidToGidMap::from_stream`] decodes
    /// that layout into this vector.
    Explicit(Vec<u16>),
}

impl CidToGidMap {
    /// Build an explicit map from raw CIDToGIDMap stream bytes.
    ///
    /// The bytes are read as consecutive big-endian `u16` values. A trailing
    /// odd byte, if any, is ignored.
    pub fn from_stream(data: &[u8]) -> Self {
        let table = data
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        CidToGidMap::Explicit(table)
    }

    /// Translate `cid` into a GID.
    ///
    /// For an explicit table, a CID beyond the end of the table is returned
    /// unchanged, exactly as with the identity mapping.
    pub fn map(&self, cid: u32) -> u32 {
        let CidToGidMap::Explicit(table) = self else {
            return cid;
        };
        table.get(cid as usize).map_or(cid, |&gid| u32::from(gid))
    }
}
65
/// Contents of a /CIDSystemInfo dictionary.
#[derive(Clone, Debug, PartialEq)]
pub struct CidSystemInfo {
    /// Issuer of the character collection (e.g., "Adobe").
    pub registry: String,
    /// Name of the character collection (e.g., "Japan1", "GB1", "CNS1", "Korea1").
    pub ordering: String,
    /// Supplement number of the character collection.
    pub supplement: i64,
}

impl CidSystemInfo {
    /// Report whether this describes one of the Adobe CJK collections.
    ///
    /// True only when the registry is exactly "Adobe" and the ordering is
    /// exactly one of "Japan1", "GB1", "CNS1" or "Korea1". The supplement is
    /// not considered.
    pub fn is_adobe_cjk(&self) -> bool {
        const CJK_ORDERINGS: [&str; 4] = ["Japan1", "GB1", "CNS1", "Korea1"];

        if self.registry != "Adobe" {
            return false;
        }
        CJK_ORDERINGS.contains(&self.ordering.as_str())
    }
}
84
85/// Font metrics for a CID font, handling the /W array and /DW default width.
86///
87/// CID fonts use a different width specification than simple fonts:
88/// - /DW: default width for all CIDs (default 1000)
89/// - /W: array of width overrides in the format:
90///   `[CID [w1 w2 ...] CIDstart CIDend w ...]`
91#[derive(Debug, Clone)]
92pub struct CidFontMetrics {
93    /// Per-CID width overrides (from /W array).
94    widths: HashMap<u32, f64>,
95    /// Default width for CIDs not in the widths map (from /DW).
96    default_width: f64,
97    /// Font ascent in glyph space units.
98    ascent: f64,
99    /// Font descent in glyph space units.
100    descent: f64,
101    /// Font bounding box.
102    font_bbox: Option<[f64; 4]>,
103    /// CID font subtype.
104    font_type: CidFontType,
105    /// CID-to-GID mapping.
106    cid_to_gid: CidToGidMap,
107    /// CID system information.
108    system_info: Option<CidSystemInfo>,
109}
110
111impl CidFontMetrics {
112    /// Create CidFontMetrics from parsed values.
113    #[allow(clippy::too_many_arguments)]
114    pub fn new(
115        widths: HashMap<u32, f64>,
116        default_width: f64,
117        ascent: f64,
118        descent: f64,
119        font_bbox: Option<[f64; 4]>,
120        font_type: CidFontType,
121        cid_to_gid: CidToGidMap,
122        system_info: Option<CidSystemInfo>,
123    ) -> Self {
124        Self {
125            widths,
126            default_width,
127            ascent,
128            descent,
129            font_bbox,
130            font_type,
131            cid_to_gid,
132            system_info,
133        }
134    }
135
136    /// Create default CidFontMetrics.
137    pub fn default_metrics() -> Self {
138        Self {
139            widths: HashMap::new(),
140            default_width: DEFAULT_CID_WIDTH,
141            ascent: DEFAULT_CID_ASCENT,
142            descent: DEFAULT_CID_DESCENT,
143            font_bbox: None,
144            font_type: CidFontType::Type2,
145            cid_to_gid: CidToGidMap::Identity,
146            system_info: None,
147        }
148    }
149
150    /// Get the width for a CID in glyph space (1/1000 of text space).
151    pub fn get_width(&self, cid: u32) -> f64 {
152        self.widths.get(&cid).copied().unwrap_or(self.default_width)
153    }
154
155    /// Font ascent in glyph space units.
156    pub fn ascent(&self) -> f64 {
157        self.ascent
158    }
159
160    /// Font descent in glyph space units.
161    pub fn descent(&self) -> f64 {
162        self.descent
163    }
164
165    /// Font bounding box.
166    pub fn font_bbox(&self) -> Option<[f64; 4]> {
167        self.font_bbox
168    }
169
170    /// Default width for CIDs not in the width overrides.
171    pub fn default_width(&self) -> f64 {
172        self.default_width
173    }
174
175    /// CID font subtype.
176    pub fn font_type(&self) -> CidFontType {
177        self.font_type
178    }
179
180    /// CID-to-GID mapping.
181    pub fn cid_to_gid(&self) -> &CidToGidMap {
182        &self.cid_to_gid
183    }
184
185    /// Map a CID to a GID.
186    pub fn map_cid_to_gid(&self, cid: u32) -> u32 {
187        self.cid_to_gid.map(cid)
188    }
189
190    /// CID system information.
191    pub fn system_info(&self) -> Option<&CidSystemInfo> {
192        self.system_info.as_ref()
193    }
194}
195
196/// Parse a /W (width) array from a CID font dictionary.
197///
198/// The /W array has the format:
199/// ```text
200/// [ c [w1 w2 ...] c_first c_last w ... ]
201/// ```
202/// Where:
203/// - `c [w1 w2 ...]` assigns widths w1, w2, ... to CIDs c, c+1, c+2, ...
204/// - `c_first c_last w` assigns width w to all CIDs from c_first to c_last
205pub fn parse_w_array(objects: &[lopdf::Object], doc: &lopdf::Document) -> HashMap<u32, f64> {
206    let mut widths = HashMap::new();
207    let mut i = 0;
208
209    while i < objects.len() {
210        let cid_start = match object_to_u32(resolve_object(doc, &objects[i])) {
211            Some(v) => v,
212            None => {
213                i += 1;
214                continue;
215            }
216        };
217        i += 1;
218
219        if i >= objects.len() {
220            break;
221        }
222
223        let next = resolve_object(doc, &objects[i]);
224        if let Ok(arr) = next.as_array() {
225            // Format: CID [w1 w2 w3 ...]
226            for (j, obj) in arr.iter().enumerate() {
227                let obj = resolve_object(doc, obj);
228                if let Some(w) = object_to_f64(obj) {
229                    widths.insert(cid_start + j as u32, w);
230                }
231            }
232            i += 1;
233        } else if let Some(cid_end) = object_to_u32(next) {
234            // Format: CID_start CID_end w
235            i += 1;
236            if i < objects.len() {
237                let w_obj = resolve_object(doc, &objects[i]);
238                if let Some(w) = object_to_f64(w_obj) {
239                    for cid in cid_start..=cid_end {
240                        widths.insert(cid, w);
241                    }
242                }
243                i += 1;
244            }
245        } else {
246            i += 1;
247        }
248    }
249
250    widths
251}
252
253/// Extract CID font metrics from a CIDFont dictionary (descendant of Type0).
254pub fn extract_cid_font_metrics(
255    doc: &lopdf::Document,
256    cid_font_dict: &lopdf::Dictionary,
257) -> Result<CidFontMetrics, BackendError> {
258    // Determine CIDFont subtype
259    let font_type = cid_font_dict
260        .get(b"Subtype")
261        .ok()
262        .and_then(|o| o.as_name_str().ok())
263        .map(|s| match s {
264            "CIDFontType0" => CidFontType::Type0,
265            _ => CidFontType::Type2,
266        })
267        .unwrap_or(CidFontType::Type2);
268
269    // Parse /DW (default width)
270    let default_width = cid_font_dict
271        .get(b"DW")
272        .ok()
273        .and_then(|o| object_to_f64(resolve_object(doc, o)))
274        .unwrap_or(DEFAULT_CID_WIDTH);
275
276    // Parse /W (width array)
277    let widths = cid_font_dict
278        .get(b"W")
279        .ok()
280        .map(|o| resolve_object(doc, o))
281        .and_then(|o| o.as_array().ok())
282        .map(|arr| parse_w_array(arr, doc))
283        .unwrap_or_default();
284
285    // Parse /CIDToGIDMap
286    let cid_to_gid = parse_cid_to_gid_map(doc, cid_font_dict);
287
288    // Parse /CIDSystemInfo
289    let system_info = parse_cid_system_info(doc, cid_font_dict);
290
291    // Parse /FontDescriptor for ascent, descent, bbox
292    let (ascent, descent, font_bbox) = parse_cid_font_descriptor(doc, cid_font_dict);
293
294    Ok(CidFontMetrics::new(
295        widths,
296        default_width,
297        ascent,
298        descent,
299        font_bbox,
300        font_type,
301        cid_to_gid,
302        system_info,
303    ))
304}
305
306/// Parse the /CIDToGIDMap entry from a CIDFont dictionary.
307fn parse_cid_to_gid_map(doc: &lopdf::Document, dict: &lopdf::Dictionary) -> CidToGidMap {
308    match dict.get(b"CIDToGIDMap") {
309        Ok(obj) => {
310            let obj = resolve_object(doc, obj);
311            if let Ok(name) = obj.as_name_str() {
312                if name == "Identity" {
313                    return CidToGidMap::Identity;
314                }
315            }
316            if let Ok(stream) = obj.as_stream() {
317                let data = if stream.dict.get(b"Filter").is_ok() {
318                    stream.decompressed_content().unwrap_or_default()
319                } else {
320                    stream.content.clone()
321                };
322                return CidToGidMap::from_stream(&data);
323            }
324            CidToGidMap::Identity
325        }
326        Err(_) => CidToGidMap::Identity,
327    }
328}
329
330/// Parse /CIDSystemInfo from a CIDFont dictionary.
331fn parse_cid_system_info(doc: &lopdf::Document, dict: &lopdf::Dictionary) -> Option<CidSystemInfo> {
332    let info_obj = dict.get(b"CIDSystemInfo").ok()?;
333    let info_obj = resolve_object(doc, info_obj);
334    let info_dict = info_obj.as_dict().ok()?;
335
336    let registry = info_dict
337        .get(b"Registry")
338        .ok()
339        .and_then(|o| match o {
340            lopdf::Object::String(s, _) => String::from_utf8(s.clone()).ok(),
341            _ => None,
342        })
343        .unwrap_or_default();
344
345    let ordering = info_dict
346        .get(b"Ordering")
347        .ok()
348        .and_then(|o| match o {
349            lopdf::Object::String(s, _) => String::from_utf8(s.clone()).ok(),
350            _ => None,
351        })
352        .unwrap_or_default();
353
354    let supplement = info_dict
355        .get(b"Supplement")
356        .ok()
357        .and_then(|o| o.as_i64().ok())
358        .unwrap_or(0);
359
360    Some(CidSystemInfo {
361        registry,
362        ordering,
363        supplement,
364    })
365}
366
367/// Parse /FontDescriptor from a CIDFont dictionary for ascent, descent, bbox.
368fn parse_cid_font_descriptor(
369    doc: &lopdf::Document,
370    dict: &lopdf::Dictionary,
371) -> (f64, f64, Option<[f64; 4]>) {
372    let desc = match dict
373        .get(b"FontDescriptor")
374        .ok()
375        .map(|o| resolve_object(doc, o))
376        .and_then(|o| o.as_dict().ok())
377    {
378        Some(d) => d,
379        None => return (DEFAULT_CID_ASCENT, DEFAULT_CID_DESCENT, None),
380    };
381
382    let ascent = desc
383        .get(b"Ascent")
384        .ok()
385        .and_then(object_to_f64)
386        .unwrap_or(DEFAULT_CID_ASCENT);
387
388    let descent = desc
389        .get(b"Descent")
390        .ok()
391        .and_then(object_to_f64)
392        .unwrap_or(DEFAULT_CID_DESCENT);
393
394    let font_bbox = desc
395        .get(b"FontBBox")
396        .ok()
397        .and_then(|o| {
398            let o = resolve_object(doc, o);
399            o.as_array().ok()
400        })
401        .and_then(|arr| {
402            if arr.len() == 4 {
403                let vals: Vec<f64> = arr.iter().filter_map(object_to_f64).collect();
404                if vals.len() == 4 {
405                    Some([vals[0], vals[1], vals[2], vals[3]])
406                } else {
407                    None
408                }
409            } else {
410                None
411            }
412        });
413
414    (ascent, descent, font_bbox)
415}
416
417/// Resolve an indirect reference to the actual object.
418fn resolve_object<'a>(doc: &'a lopdf::Document, obj: &'a lopdf::Object) -> &'a lopdf::Object {
419    match obj {
420        lopdf::Object::Reference(id) => doc.get_object(*id).unwrap_or(obj),
421        _ => obj,
422    }
423}
424
425/// Convert a lopdf object to f64.
426fn object_to_f64(obj: &lopdf::Object) -> Option<f64> {
427    match obj {
428        lopdf::Object::Integer(i) => Some(*i as f64),
429        lopdf::Object::Real(f) => Some(*f as f64),
430        _ => None,
431    }
432}
433
434/// Convert a lopdf object to u32.
435fn object_to_u32(obj: &lopdf::Object) -> Option<u32> {
436    match obj {
437        lopdf::Object::Integer(i) => Some(*i as u32),
438        lopdf::Object::Real(f) => Some(*f as u32),
439        _ => None,
440    }
441}
442
/// Information about a predefined CMap encoding.
#[derive(Debug, Clone, PartialEq)]
pub struct PredefinedCMapInfo {
    /// The full CMap name exactly as passed in (e.g., "UniJIS-UTF16-H").
    pub name: String,
    /// Registry (e.g., "Adobe").
    pub registry: String,
    /// Ordering (e.g., "Japan1").
    pub ordering: String,
    /// Writing mode: 0 = horizontal, 1 = vertical.
    pub writing_mode: u8,
    /// Whether this is an Identity CMap.
    pub is_identity: bool,
}

/// Parse a predefined CMap name and extract its information.
///
/// Recognizes:
/// - `Identity-H` / `Identity-V` (ordering "Identity", `is_identity` set);
/// - the bare Japanese CMaps `H` and `V`;
/// - character-collection names `Adobe-<ordering>-<supplement>` where the
///   ordering is `Japan1`, `GB1`, `CNS1` or `Korea1` and the supplement is a
///   number (reported with horizontal writing mode);
/// - encoding CMaps ending in `-H` (horizontal) or `-V` (vertical), such as
///   `UniJIS-UTF16-H`, `90msp-RKSJ-H`, `EUC-H`, `GBK-EUC-H`, `B5pc-H`,
///   `ETen-B5-V` or `KSCms-UHC-H`, classified by their base name.
///
/// The registry is always reported as "Adobe".
///
/// Returns `None` when the name matches none of the above. In particular, a
/// name that starts with `Adobe-` but is not followed by one of the four
/// orderings and a dash is rejected outright.
pub fn parse_predefined_cmap_name(name: &str) -> Option<PredefinedCMapInfo> {
    // Base names (without the -H/-V suffix) that are Japan1 but cannot be
    // recognised by a distinctive substring.
    const JAPAN1_NAMES: &[&str] = &[
        "78-RKSJ",
        "83pv-RKSJ",
        "90pv-RKSJ",
        "90ms-RKSJ",
        "90msp-RKSJ",
        "Add-RKSJ",
        "Ext-RKSJ",
        "EUC",
        "Hankaku",
        "Hiragana",
        "Katakana",
        "Roman",
        "WP-Symbol",
    ];
    const ADOBE_ORDERINGS: [&str; 4] = ["Japan1", "GB1", "CNS1", "Korea1"];

    let info = |ordering: &str, writing_mode: u8, is_identity: bool| PredefinedCMapInfo {
        name: name.to_string(),
        registry: "Adobe".to_string(),
        ordering: ordering.to_string(),
        writing_mode,
        is_identity,
    };

    // Names that are matched whole.
    match name {
        "Identity-H" => return Some(info("Identity", 0, true)),
        "Identity-V" => return Some(info("Identity", 1, true)),
        // The JIS X 0208 / ISO-2022-JP CMaps are literally named "H" and "V".
        "H" => return Some(info("Japan1", 0, false)),
        "V" => return Some(info("Japan1", 1, false)),
        _ => {}
    }

    // Adobe character-collection names (e.g., "Adobe-Japan1-6").
    if let Some(rest) = name.strip_prefix("Adobe-") {
        let (ordering, supplement) = ADOBE_ORDERINGS.iter().find_map(|&ordering| {
            let tail = rest.strip_prefix(ordering)?.strip_prefix('-')?;
            Some((ordering, tail))
        })?;

        if supplement.parse::<i32>().is_ok() {
            return Some(info(ordering, 0, false));
        }
        // A non-numeric tail (e.g., "Adobe-Japan1-H") falls through to the
        // suffix-based classification below.
    }

    // Everything else must carry a -H/-V writing-mode suffix.
    let (base, writing_mode) = match (name.strip_suffix("-H"), name.strip_suffix("-V")) {
        (Some(base), _) => (base, 0u8),
        (None, Some(base)) => (base, 1u8),
        (None, None) => return None,
    };

    let is_japan1 = base.contains("JIS")
        || base.contains("Japan")
        || base.contains("EUC-JP")
        || JAPAN1_NAMES.contains(&base);

    let ordering = if is_japan1 {
        "Japan1"
    } else if base.starts_with("UniCNS") {
        // Checked before the generic "GB" test so UniCNS names are always CNS1.
        "CNS1"
    } else if base.contains("GB") {
        "GB1"
    } else if base == "B5pc"
        || base.contains("CNS")
        || base.contains("ETen")
        || base.contains("HKscs")
    {
        "CNS1"
    } else if base.contains("KSC") || base.contains("UniKS") || base.contains("Korea") {
        "Korea1"
    } else {
        return None;
    };

    Some(info(ordering, writing_mode, false))
}
568
569/// Detect whether a font dictionary represents a Type0 (composite/CID) font.
570pub fn is_type0_font(font_dict: &lopdf::Dictionary) -> bool {
571    font_dict
572        .get(b"Subtype")
573        .ok()
574        .and_then(|o| o.as_name_str().ok())
575        .is_some_and(|s| s == "Type0")
576}
577
578/// Extract the descendant CIDFont dictionary from a Type0 font.
579pub fn get_descendant_font<'a>(
580    doc: &'a lopdf::Document,
581    type0_dict: &'a lopdf::Dictionary,
582) -> Option<&'a lopdf::Dictionary> {
583    let descendants = type0_dict.get(b"DescendantFonts").ok()?;
584    let descendants = resolve_object(doc, descendants);
585    let arr = descendants.as_array().ok()?;
586    let first = arr.first()?;
587    let first = resolve_object(doc, first);
588    first.as_dict().ok()
589}
590
591/// Get the encoding name from a Type0 font dictionary.
592pub fn get_type0_encoding(font_dict: &lopdf::Dictionary) -> Option<String> {
593    let encoding = font_dict.get(b"Encoding").ok()?;
594    encoding.as_name_str().ok().map(|s| s.to_string())
595}
596
/// Report whether a font name carries a subset prefix.
///
/// A subset font name is six uppercase ASCII letters, a `+`, and then the
/// real font name, e.g. `ABCDEF+ArialMT`. Names shorter than eight bytes
/// (tag, `+`, and at least one further byte) never match.
pub fn is_subset_font(font_name: &str) -> bool {
    let bytes = font_name.as_bytes();
    bytes.len() >= 8 && bytes[6] == b'+' && bytes[..6].iter().all(u8::is_ascii_uppercase)
}
616
617/// Strip the subset prefix from a font name.
618///
619/// If the font name has the pattern `ABCDEF+RealName`, returns `RealName`.
620/// Otherwise returns the original name unchanged.
621pub fn strip_subset_prefix(font_name: &str) -> &str {
622    if is_subset_font(font_name) {
623        &font_name[7..]
624    } else {
625        font_name
626    }
627}
628
629#[cfg(test)]
630mod tests {
631    use super::*;
632    use lopdf::{Document, Object, Stream, dictionary};
633
634    // ========== CidToGidMap tests ==========
635
636    #[test]
637    fn identity_map_returns_same_cid() {
638        let map = CidToGidMap::Identity;
639        assert_eq!(map.map(0), 0);
640        assert_eq!(map.map(100), 100);
641        assert_eq!(map.map(65535), 65535);
642    }
643
644    #[test]
645    fn explicit_map_looks_up_table() {
646        let table = vec![10, 20, 30, 40, 50];
647        let map = CidToGidMap::Explicit(table);
648        assert_eq!(map.map(0), 10);
649        assert_eq!(map.map(1), 20);
650        assert_eq!(map.map(4), 50);
651    }
652
653    #[test]
654    fn explicit_map_out_of_range_returns_cid() {
655        let table = vec![10, 20, 30];
656        let map = CidToGidMap::Explicit(table);
657        assert_eq!(map.map(5), 5); // out of range → fallback to CID
658    }
659
660    #[test]
661    fn from_stream_parses_big_endian_u16() {
662        // CID 0 → GID 5, CID 1 → GID 10
663        let data = vec![0x00, 0x05, 0x00, 0x0A];
664        let map = CidToGidMap::from_stream(&data);
665        assert_eq!(map.map(0), 5);
666        assert_eq!(map.map(1), 10);
667    }
668
669    #[test]
670    fn from_stream_handles_odd_length() {
671        // Only one complete pair, last byte ignored
672        let data = vec![0x00, 0x05, 0x00];
673        let map = CidToGidMap::from_stream(&data);
674        assert_eq!(map.map(0), 5);
675        assert_eq!(map.map(1), 1); // out of range
676    }
677
678    #[test]
679    fn from_stream_empty() {
680        let map = CidToGidMap::from_stream(&[]);
681        assert_eq!(map.map(0), 0); // out of range, falls back to CID
682    }
683
684    // ========== CidSystemInfo tests ==========
685
686    #[test]
687    fn cid_system_info_adobe_japan1() {
688        let info = CidSystemInfo {
689            registry: "Adobe".to_string(),
690            ordering: "Japan1".to_string(),
691            supplement: 6,
692        };
693        assert!(info.is_adobe_cjk());
694    }
695
696    #[test]
697    fn cid_system_info_adobe_gb1() {
698        let info = CidSystemInfo {
699            registry: "Adobe".to_string(),
700            ordering: "GB1".to_string(),
701            supplement: 5,
702        };
703        assert!(info.is_adobe_cjk());
704    }
705
706    #[test]
707    fn cid_system_info_adobe_cns1() {
708        let info = CidSystemInfo {
709            registry: "Adobe".to_string(),
710            ordering: "CNS1".to_string(),
711            supplement: 7,
712        };
713        assert!(info.is_adobe_cjk());
714    }
715
716    #[test]
717    fn cid_system_info_adobe_korea1() {
718        let info = CidSystemInfo {
719            registry: "Adobe".to_string(),
720            ordering: "Korea1".to_string(),
721            supplement: 2,
722        };
723        assert!(info.is_adobe_cjk());
724    }
725
726    #[test]
727    fn cid_system_info_non_adobe_not_cjk() {
728        let info = CidSystemInfo {
729            registry: "Custom".to_string(),
730            ordering: "Japan1".to_string(),
731            supplement: 0,
732        };
733        assert!(!info.is_adobe_cjk());
734    }
735
736    #[test]
737    fn cid_system_info_adobe_non_cjk_ordering() {
738        let info = CidSystemInfo {
739            registry: "Adobe".to_string(),
740            ordering: "Identity".to_string(),
741            supplement: 0,
742        };
743        assert!(!info.is_adobe_cjk());
744    }
745
746    // ========== CidFontMetrics tests ==========
747
748    #[test]
749    fn cid_font_metrics_get_width_from_map() {
750        let mut widths = HashMap::new();
751        widths.insert(1, 500.0);
752        widths.insert(2, 600.0);
753        widths.insert(100, 250.0);
754
755        let metrics = CidFontMetrics::new(
756            widths,
757            1000.0,
758            880.0,
759            -120.0,
760            None,
761            CidFontType::Type2,
762            CidToGidMap::Identity,
763            None,
764        );
765
766        assert_eq!(metrics.get_width(1), 500.0);
767        assert_eq!(metrics.get_width(2), 600.0);
768        assert_eq!(metrics.get_width(100), 250.0);
769    }
770
771    #[test]
772    fn cid_font_metrics_get_width_returns_default() {
773        let metrics = CidFontMetrics::new(
774            HashMap::new(),
775            1000.0,
776            880.0,
777            -120.0,
778            None,
779            CidFontType::Type2,
780            CidToGidMap::Identity,
781            None,
782        );
783
784        assert_eq!(metrics.get_width(0), 1000.0);
785        assert_eq!(metrics.get_width(999), 1000.0);
786    }
787
788    #[test]
789    fn cid_font_metrics_custom_default_width() {
790        let metrics = CidFontMetrics::new(
791            HashMap::new(),
792            500.0,
793            880.0,
794            -120.0,
795            None,
796            CidFontType::Type0,
797            CidToGidMap::Identity,
798            None,
799        );
800
801        assert_eq!(metrics.get_width(0), 500.0);
802        assert_eq!(metrics.default_width(), 500.0);
803    }
804
805    #[test]
806    fn cid_font_metrics_accessors() {
807        let info = CidSystemInfo {
808            registry: "Adobe".to_string(),
809            ordering: "Japan1".to_string(),
810            supplement: 6,
811        };
812        let metrics = CidFontMetrics::new(
813            HashMap::new(),
814            1000.0,
815            880.0,
816            -120.0,
817            Some([-100.0, -200.0, 1100.0, 900.0]),
818            CidFontType::Type0,
819            CidToGidMap::Identity,
820            Some(info),
821        );
822
823        assert_eq!(metrics.ascent(), 880.0);
824        assert_eq!(metrics.descent(), -120.0);
825        assert_eq!(metrics.font_bbox(), Some([-100.0, -200.0, 1100.0, 900.0]));
826        assert_eq!(metrics.font_type(), CidFontType::Type0);
827        assert_eq!(metrics.cid_to_gid(), &CidToGidMap::Identity);
828        assert!(metrics.system_info().unwrap().is_adobe_cjk());
829    }
830
831    #[test]
832    fn cid_font_metrics_map_cid_to_gid() {
833        let table = vec![10, 20, 30];
834        let metrics = CidFontMetrics::new(
835            HashMap::new(),
836            1000.0,
837            880.0,
838            -120.0,
839            None,
840            CidFontType::Type2,
841            CidToGidMap::Explicit(table),
842            None,
843        );
844
845        assert_eq!(metrics.map_cid_to_gid(0), 10);
846        assert_eq!(metrics.map_cid_to_gid(1), 20);
847        assert_eq!(metrics.map_cid_to_gid(2), 30);
848        assert_eq!(metrics.map_cid_to_gid(5), 5); // fallback
849    }
850
851    #[test]
852    fn cid_font_metrics_default() {
853        let metrics = CidFontMetrics::default_metrics();
854        assert_eq!(metrics.default_width(), DEFAULT_CID_WIDTH);
855        assert_eq!(metrics.ascent(), DEFAULT_CID_ASCENT);
856        assert_eq!(metrics.descent(), DEFAULT_CID_DESCENT);
857        assert_eq!(metrics.font_bbox(), None);
858        assert_eq!(metrics.font_type(), CidFontType::Type2);
859        assert_eq!(metrics.cid_to_gid(), &CidToGidMap::Identity);
860        assert!(metrics.system_info().is_none());
861    }
862
863    // ========== parse_w_array tests ==========
864
865    #[test]
866    fn parse_w_array_individual_widths() {
867        // [1 [500 600 700]] → CID 1=500, CID 2=600, CID 3=700
868        let doc = Document::with_version("1.5");
869        let objects = vec![
870            Object::Integer(1),
871            Object::Array(vec![
872                Object::Integer(500),
873                Object::Integer(600),
874                Object::Integer(700),
875            ]),
876        ];
877
878        let widths = parse_w_array(&objects, &doc);
879        assert_eq!(widths.get(&1), Some(&500.0));
880        assert_eq!(widths.get(&2), Some(&600.0));
881        assert_eq!(widths.get(&3), Some(&700.0));
882        assert_eq!(widths.get(&0), None);
883        assert_eq!(widths.get(&4), None);
884    }
885
886    #[test]
887    fn parse_w_array_range_format() {
888        // [10 20 500] → CIDs 10-20 all have width 500
889        let doc = Document::with_version("1.5");
890        let objects = vec![
891            Object::Integer(10),
892            Object::Integer(20),
893            Object::Integer(500),
894        ];
895
896        let widths = parse_w_array(&objects, &doc);
897        for cid in 10..=20 {
898            assert_eq!(widths.get(&cid), Some(&500.0), "CID {} should be 500", cid);
899        }
900        assert_eq!(widths.get(&9), None);
901        assert_eq!(widths.get(&21), None);
902    }
903
904    #[test]
905    fn parse_w_array_mixed_formats() {
906        // [1 [250 300] 10 20 500]
907        let doc = Document::with_version("1.5");
908        let objects = vec![
909            Object::Integer(1),
910            Object::Array(vec![Object::Integer(250), Object::Integer(300)]),
911            Object::Integer(10),
912            Object::Integer(20),
913            Object::Integer(500),
914        ];
915
916        let widths = parse_w_array(&objects, &doc);
917        assert_eq!(widths.get(&1), Some(&250.0));
918        assert_eq!(widths.get(&2), Some(&300.0));
919        for cid in 10..=20 {
920            assert_eq!(widths.get(&cid), Some(&500.0));
921        }
922    }
923
924    #[test]
925    fn parse_w_array_empty() {
926        let doc = Document::with_version("1.5");
927        let widths = parse_w_array(&[], &doc);
928        assert!(widths.is_empty());
929    }
930
931    #[test]
932    fn parse_w_array_real_values() {
933        let doc = Document::with_version("1.5");
934        let objects = vec![
935            Object::Integer(1),
936            Object::Array(vec![Object::Real(500.5), Object::Real(600.5)]),
937        ];
938
939        let widths = parse_w_array(&objects, &doc);
940        assert!((widths[&1] - 500.5).abs() < 0.1);
941        assert!((widths[&2] - 600.5).abs() < 0.1);
942    }
943
944    #[test]
945    fn parse_w_array_single_cid_range() {
946        // [5 5 700] → CID 5 = 700
947        let doc = Document::with_version("1.5");
948        let objects = vec![Object::Integer(5), Object::Integer(5), Object::Integer(700)];
949
950        let widths = parse_w_array(&objects, &doc);
951        assert_eq!(widths.get(&5), Some(&700.0));
952        assert_eq!(widths.len(), 1);
953    }
954
955    // ========== extract_cid_font_metrics tests ==========
956
/// A CIDFontType2 dictionary with `/DW`, an indirect `/W` array and an
/// identity `/CIDToGIDMap` is turned into matching metrics.
#[test]
fn extract_cid_font_metrics_basic() {
    let mut pdf = Document::with_version("1.5");

    // The /W array is stored as its own object: [1 [500 600]].
    let explicit_widths = vec![Object::Integer(500), Object::Integer(600)];
    let w_ref = pdf.add_object(Object::Array(vec![
        Object::Integer(1),
        Object::Array(explicit_widths),
    ]));

    let descendant = dictionary! {
        "Type" => "Font",
        "Subtype" => "CIDFontType2",
        "BaseFont" => "MSGothic",
        "DW" => Object::Integer(1000),
        "W" => w_ref,
        "CIDToGIDMap" => "Identity",
    };

    let parsed = extract_cid_font_metrics(&pdf, &descendant).unwrap();

    assert_eq!(parsed.font_type(), CidFontType::Type2);
    assert_eq!(parsed.default_width(), 1000.0);

    let expected_widths = [
        (1, 500.0),
        (2, 600.0),
        (3, 1000.0), // default
    ];
    for &(cid, width) in &expected_widths {
        assert_eq!(parsed.get_width(cid), width);
    }

    assert_eq!(parsed.cid_to_gid(), &CidToGidMap::Identity);
}
985
/// A CIDFontType0 dictionary without `/DW` reports `Type0` and the default width constant.
#[test]
fn extract_cid_font_metrics_type0() {
    let descendant = dictionary! {
        "Type" => "Font",
        "Subtype" => "CIDFontType0",
        "BaseFont" => "KozMinPro-Regular",
    };
    let pdf = Document::with_version("1.5");

    let parsed = extract_cid_font_metrics(&pdf, &descendant).unwrap();

    assert_eq!(parsed.font_type(), CidFontType::Type0);
    assert_eq!(parsed.default_width(), DEFAULT_CID_WIDTH);
}
1000
/// Ascent, descent and bounding box are read from a referenced `/FontDescriptor`.
#[test]
fn extract_cid_font_metrics_with_descriptor() {
    let mut pdf = Document::with_version("1.5");

    let bbox = vec![
        Object::Integer(0),
        Object::Integer(-137),
        Object::Integer(1000),
        Object::Integer(859),
    ];
    let descriptor = dictionary! {
        "Type" => "FontDescriptor",
        "FontName" => "MSGothic",
        "Ascent" => Object::Integer(859),
        "Descent" => Object::Integer(-140),
        "FontBBox" => Object::Array(bbox),
    };
    // The descriptor is an indirect object; the font dictionary holds only its id.
    let descriptor_ref = pdf.add_object(Object::Dictionary(descriptor));

    let descendant = dictionary! {
        "Type" => "Font",
        "Subtype" => "CIDFontType2",
        "BaseFont" => "MSGothic",
        "FontDescriptor" => descriptor_ref,
    };

    let parsed = extract_cid_font_metrics(&pdf, &descendant).unwrap();

    assert_eq!(parsed.ascent(), 859.0);
    assert_eq!(parsed.descent(), -140.0);
    assert!(parsed.font_bbox().is_some());
}
1030
/// An inline `/CIDSystemInfo` dictionary is exposed through `system_info()`.
#[test]
fn extract_cid_font_metrics_with_system_info() {
    // Builds a literal PDF string object from UTF-8 text.
    let literal = |text: &str| {
        Object::String(text.as_bytes().to_vec(), lopdf::StringFormat::Literal)
    };

    let collection = dictionary! {
        "Registry" => literal("Adobe"),
        "Ordering" => literal("Japan1"),
        "Supplement" => Object::Integer(6),
    };
    let descendant = dictionary! {
        "Type" => "Font",
        "Subtype" => "CIDFontType2",
        "BaseFont" => "MSGothic",
        "CIDSystemInfo" => Object::Dictionary(collection),
    };
    let pdf = Document::with_version("1.5");

    let parsed = extract_cid_font_metrics(&pdf, &descendant).unwrap();
    let system_info = parsed.system_info().unwrap();

    assert_eq!(system_info.registry, "Adobe");
    assert_eq!(system_info.ordering, "Japan1");
    assert_eq!(system_info.supplement, 6);
    assert!(system_info.is_adobe_cjk());
}
1053
/// A `/CIDToGIDMap` given as a stream is decoded into an explicit CID→GID table.
#[test]
fn extract_cid_font_metrics_explicit_gid_map() {
    let mut pdf = Document::with_version("1.5");

    // Two big-endian u16 entries: CID 0→GID 5, CID 1→GID 10
    let packed_gids: Vec<u8> = vec![0, 5, 0, 10];
    let map_ref = pdf.add_object(Object::Stream(Stream::new(dictionary! {}, packed_gids)));

    let descendant = dictionary! {
        "Type" => "Font",
        "Subtype" => "CIDFontType2",
        "BaseFont" => "CustomFont",
        "CIDToGIDMap" => map_ref,
    };

    let parsed = extract_cid_font_metrics(&pdf, &descendant).unwrap();

    for &(cid, gid) in &[(0, 5), (1, 10)] {
        assert_eq!(parsed.map_cid_to_gid(cid), gid);
    }
}
1074
1075    // ========== Predefined CMap name parsing tests ==========
1076
/// `Identity-H` is recognised as an identity CMap with writing mode 0.
#[test]
fn parse_identity_h() {
    let cmap_name = "Identity-H";
    let cmap = parse_predefined_cmap_name(cmap_name).unwrap();

    assert_eq!(cmap.name, cmap_name);
    assert_eq!(cmap.writing_mode, 0);
    assert!(cmap.is_identity);
}
1084
/// `Identity-V` is recognised as an identity CMap with writing mode 1.
#[test]
fn parse_identity_v() {
    let cmap_name = "Identity-V";
    let cmap = parse_predefined_cmap_name(cmap_name).unwrap();

    assert_eq!(cmap.name, cmap_name);
    assert_eq!(cmap.writing_mode, 1);
    assert!(cmap.is_identity);
}
1092
/// `Adobe-Japan1-6` splits into registry `Adobe` and ordering `Japan1`, and is not identity.
#[test]
fn parse_adobe_japan1() {
    let cmap = parse_predefined_cmap_name("Adobe-Japan1-6").unwrap();

    assert_eq!(cmap.registry, "Adobe");
    assert_eq!(cmap.ordering, "Japan1");
    assert!(!cmap.is_identity);
}
1100
/// `Adobe-GB1-5` reports the `GB1` ordering.
#[test]
fn parse_adobe_gb1() {
    let cmap = parse_predefined_cmap_name("Adobe-GB1-5").unwrap();
    let expected_ordering = "GB1";
    assert_eq!(cmap.ordering, expected_ordering);
}
1106
/// `Adobe-CNS1-7` reports the `CNS1` ordering.
#[test]
fn parse_adobe_cns1() {
    let cmap = parse_predefined_cmap_name("Adobe-CNS1-7").unwrap();
    let expected_ordering = "CNS1";
    assert_eq!(cmap.ordering, expected_ordering);
}
1112
/// `Adobe-Korea1-2` reports the `Korea1` ordering.
#[test]
fn parse_adobe_korea1() {
    let cmap = parse_predefined_cmap_name("Adobe-Korea1-2").unwrap();
    let expected_ordering = "Korea1";
    assert_eq!(cmap.ordering, expected_ordering);
}
1118
/// `UniJIS-UTF16-H` maps to the `Japan1` ordering with writing mode 0.
#[test]
fn parse_unijis_utf16_h() {
    let cmap = parse_predefined_cmap_name("UniJIS-UTF16-H").unwrap();

    let expected_ordering = "Japan1";
    assert_eq!(cmap.ordering, expected_ordering);
    assert_eq!(cmap.writing_mode, 0);
}
1125
/// `UniJIS-UTF16-V` maps to the `Japan1` ordering with writing mode 1.
#[test]
fn parse_unijis_utf16_v() {
    let cmap = parse_predefined_cmap_name("UniJIS-UTF16-V").unwrap();

    let expected_ordering = "Japan1";
    assert_eq!(cmap.ordering, expected_ordering);
    assert_eq!(cmap.writing_mode, 1);
}
1132
/// `UniGB-UTF16-H` maps to the `GB1` ordering.
#[test]
fn parse_unigb_utf16_h() {
    let cmap = parse_predefined_cmap_name("UniGB-UTF16-H").unwrap();
    let expected_ordering = "GB1";
    assert_eq!(cmap.ordering, expected_ordering);
}
1138
/// `UniKS-UTF16-H` maps to the `Korea1` ordering.
#[test]
fn parse_uniksc_utf16_h() {
    let cmap = parse_predefined_cmap_name("UniKS-UTF16-H").unwrap();
    let expected_ordering = "Korea1";
    assert_eq!(cmap.ordering, expected_ordering);
}
1144
/// `90ms-RKSJ-H` maps to the `Japan1` ordering with writing mode 0.
#[test]
fn parse_90ms_rksj_h() {
    let cmap = parse_predefined_cmap_name("90ms-RKSJ-H").unwrap();

    let expected_ordering = "Japan1";
    assert_eq!(cmap.ordering, expected_ordering);
    assert_eq!(cmap.writing_mode, 0);
}
1151
/// A name that is not a predefined CMap yields `None`.
#[test]
fn parse_unknown_cmap_returns_none() {
    let parsed = parse_predefined_cmap_name("UnknownCMap");
    assert!(parsed.is_none());
}
1156
/// The empty string is not a predefined CMap name and yields `None`.
#[test]
fn parse_empty_cmap_returns_none() {
    let parsed = parse_predefined_cmap_name("");
    assert!(parsed.is_none());
}
1161
1162    // ========== Type0 font detection tests ==========
1163
/// A font dictionary with `/Subtype /Type0` is detected as a Type0 font.
#[test]
fn detect_type0_font() {
    let composite = dictionary! {
        "Type" => "Font",
        "Subtype" => "Type0",
        "BaseFont" => "SomeFont",
    };

    let detected = is_type0_font(&composite);
    assert!(detected);
}
1173
/// A `/Subtype /Type1` font dictionary is not reported as Type0.
#[test]
fn detect_non_type0_font() {
    let simple_font = dictionary! {
        "Type" => "Font",
        "Subtype" => "Type1",
        "BaseFont" => "Helvetica",
    };

    let detected = is_type0_font(&simple_font);
    assert!(!detected);
}
1183
/// A `/Subtype /TrueType` font dictionary is not reported as Type0.
#[test]
fn detect_truetype_font() {
    let truetype_font = dictionary! {
        "Type" => "Font",
        "Subtype" => "TrueType",
        "BaseFont" => "Arial",
    };

    let detected = is_type0_font(&truetype_font);
    assert!(!detected);
}
1193
1194    // ========== get_descendant_font tests ==========
1195
/// A Type0 font whose `/DescendantFonts` array references a CIDFont object
/// resolves to that CIDFont dictionary.
#[test]
fn get_descendant_font_basic() {
    let mut pdf = Document::with_version("1.5");

    // The descendant is an indirect object, reached through a reference.
    let descendant_ref = pdf.add_object(Object::Dictionary(dictionary! {
        "Type" => "Font",
        "Subtype" => "CIDFontType2",
        "BaseFont" => "MSGothic",
    }));

    let composite = dictionary! {
        "Type" => "Font",
        "Subtype" => "Type0",
        "BaseFont" => "MSGothic",
        "DescendantFonts" => Object::Array(vec![Object::Reference(descendant_ref)]),
    };

    let lookup = get_descendant_font(&pdf, &composite);
    assert!(lookup.is_some());

    let descendant = lookup.unwrap();
    let subtype = descendant.get(b"Subtype").unwrap().as_name_str().unwrap();
    assert_eq!(subtype, "CIDFontType2");
}
1222
/// A Type0 dictionary with no `/DescendantFonts` entry has no descendant font.
#[test]
fn get_descendant_font_missing() {
    let composite = dictionary! {
        "Type" => "Font",
        "Subtype" => "Type0",
        "BaseFont" => "MSGothic",
    };
    let pdf = Document::with_version("1.5");

    let lookup = get_descendant_font(&pdf, &composite);
    assert!(lookup.is_none());
}
1234
1235    // ========== get_type0_encoding tests ==========
1236
/// The `/Encoding` name of a Type0 dictionary is returned as an owned string.
#[test]
fn get_encoding_identity_h() {
    let composite = dictionary! {
        "Subtype" => "Type0",
        "Encoding" => "Identity-H",
    };

    let encoding = get_type0_encoding(&composite);
    assert_eq!(encoding, Some(String::from("Identity-H")));
}
1245
/// A Type0 dictionary without `/Encoding` yields `None`.
#[test]
fn get_encoding_missing() {
    let composite = dictionary! {
        "Subtype" => "Type0",
    };

    let encoding = get_type0_encoding(&composite);
    assert_eq!(encoding, None);
}
1253
1254    // ========== Subset font detection tests ==========
1255
/// Names carrying a six-uppercase-letter tag followed by `+` and a non-empty
/// name are accepted as subset fonts.
#[test]
fn is_subset_font_valid() {
    let tagged = [
        "ABCDEF+ArialMT",
        "XYZABC+TimesNewRoman",
        "AAAAAA+A", // minimal real name
    ];

    // `find` stops at the first name wrongly rejected, so a failure reports it.
    let wrongly_rejected = tagged.iter().copied().find(|&name| !is_subset_font(name));
    assert_eq!(wrongly_rejected, None);
}
1262
/// Names whose prefix is not a well-formed subset tag are rejected.
#[test]
fn is_subset_font_invalid() {
    let untagged = [
        "ArialMT",        // no prefix
        "abcdef+ArialMT", // lowercase
        "ABCDE+ArialMT",  // only 5 uppercase
        "ABCDEF-ArialMT", // dash not plus
        "ABC1EF+ArialMT", // digit in prefix
        "",               // empty
        "ABCDEF+",        // nothing after +
    ];

    // `find` stops at the first name wrongly accepted, so a failure reports it.
    let wrongly_accepted = untagged.iter().copied().find(|&name| is_subset_font(name));
    assert_eq!(wrongly_accepted, None);
}
1273
/// A subset tag and its `+` separator are removed, leaving the real font name.
#[test]
fn strip_subset_prefix_with_prefix() {
    let cases = [
        ("ABCDEF+ArialMT", "ArialMT"),
        ("XYZABC+TimesNewRoman", "TimesNewRoman"),
    ];

    for &(tagged, bare) in &cases {
        assert_eq!(strip_subset_prefix(tagged), bare);
    }
}
1279
/// Names without a subset tag, including the empty string, come back unchanged.
#[test]
fn strip_subset_prefix_without_prefix() {
    let untagged = ["ArialMT", "Helvetica", ""];

    for &name in &untagged {
        assert_eq!(strip_subset_prefix(name), name);
    }
}
1286
1287    // ========== Identity-H/V encoding behavior tests ==========
1288
/// End to end: the `/Encoding` read from a Type0 dictionary parses as the
/// horizontal identity CMap.
#[test]
fn identity_h_encoding_detected() {
    let composite = dictionary! {
        "Subtype" => "Type0",
        "Encoding" => "Identity-H",
    };

    let encoding_name = get_type0_encoding(&composite).unwrap();
    let cmap = parse_predefined_cmap_name(&encoding_name).unwrap();

    assert!(cmap.is_identity);
    assert_eq!(cmap.writing_mode, 0); // horizontal
}
1300
/// End to end: the `/Encoding` read from a Type0 dictionary parses as the
/// vertical identity CMap.
#[test]
fn identity_v_encoding_detected() {
    let composite = dictionary! {
        "Subtype" => "Type0",
        "Encoding" => "Identity-V",
    };

    let encoding_name = get_type0_encoding(&composite).unwrap();
    let cmap = parse_predefined_cmap_name(&encoding_name).unwrap();

    assert!(cmap.is_identity);
    assert_eq!(cmap.writing_mode, 1); // vertical
}
1312}
pdfplumber_parse/cid_font.rs

pdfplumber_parse/
cid_font.rs