oxidize-pdf 2.5.4

A pure Rust PDF generation and manipulation library with zero external dependencies
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
//! CFF/OpenType font subsetting
//!
//! This module implements subsetting for CFF (Compact Font Format) fonts embedded
//! inside OpenType files (signature OTTO). It extracts only the glyphs actually
//! needed, rebuilding the CFF table and reconstructing the OTF wrapper.
//!
//! ## CFF Table Structure
//!
//! ```text
//! CFF Table:
//!   Header (4 bytes): major, minor, hdrSize, offSize
//!   Name INDEX
//!   Top DICT INDEX  ← contains offsets to CharStrings, Charset, Private DICT
//!   String INDEX
//!   Global Subr INDEX
//!   --- data section (offsets from Top DICT point here) ---
//!   Charset
//!   CharStrings INDEX
//!   Private DICT + Local Subr INDEX
//! ```
//!
//! ## Subsetting Strategy
//!
//! - Name INDEX, String INDEX, Global Subr INDEX: copied verbatim (conservative)
//! - CharStrings INDEX: only the needed GIDs are kept; GID 0 (.notdef) is always included
//! - Charset: rebuilt as format 0 for the subset GIDs
//! - Private DICT + Local Subr INDEX: copied verbatim (conservative)
//! - Top DICT: rebuilt with updated offsets
//! - CIDFonts (FDArray/FDSelect): fall back to returning the full font

use crate::fonts::cmap_utils::parse_cmap_format_12_filtered;
use crate::parser::{ParseError, ParseResult};
use crate::text::fonts::cff::charstring::desubroutinize;
use crate::text::fonts::cff::dict::{
    build_cid_top_dict, build_minimal_fd_dict, parse_fd_private, parse_fd_select,
    parse_local_subrs_offset, parse_top_dict, rebuild_fd_dict, strip_private_subrs_op, FdData,
    TopDictOffsets,
};
use crate::text::fonts::cff::index::{
    build_cff_index, parse_cff_index, usize_to_cff_offset, CffIndex,
};
use crate::text::fonts::cff::types::{read_i16, read_u16, read_u32};
use std::collections::{HashMap, HashSet};

// =============================================================================
// Public API
// =============================================================================

/// Result of CFF font subsetting
pub struct CffSubsetResult {
    /// Font data: raw CFF bytes for CID-keyed fonts, OTF for non-CID fonts
    pub font_data: Vec<u8>,
    /// Unicode codepoint → new GID mapping
    pub glyph_mapping: HashMap<u32, u16>,
    /// True if font_data is raw CFF (embed with /CIDFontType0C),
    /// false if it's a full OTF (embed with /OpenType)
    pub is_raw_cff: bool,
}

/// Subset a CFF/OpenType font to include only the specified characters.
///
/// `font_data` is the complete OTF file (with OTTO signature).
/// `used_chars` is the set of Unicode characters to keep.
///
/// On success, returns the subsetted OTF file and the updated glyph mapping.
/// On CIDFont detection, returns the full font (conservative fallback).
pub fn subset_cff_font(
    font_data: &[u8],
    used_chars: &HashSet<char>,
) -> ParseResult<CffSubsetResult> {
    let otf = OtfFile::parse(font_data)?;
    let cff_entry = otf.find_table(b"CFF ")?;
    let cff_start = cff_entry.offset as usize;
    let cff_end = (cff_entry.offset as usize)
        .checked_add(cff_entry.length as usize)
        .ok_or_else(|| ParseError::SyntaxError {
            position: cff_start,
            message: "CFF table range overflow (offset + length wraps usize)".to_string(),
        })?;

    if cff_end > font_data.len() {
        return Err(ParseError::SyntaxError {
            position: cff_start,
            message: "CFF table extends beyond font data".to_string(),
        });
    }
    let cff_data = &font_data[cff_start..cff_end];

    // Parse cmap to determine which GIDs to keep
    let cmap_entry = otf.find_table(b"cmap")?;
    let cmap_start = cmap_entry.offset as usize;
    let cmap_end = (cmap_entry.offset as usize)
        .checked_add(cmap_entry.length as usize)
        .ok_or_else(|| ParseError::SyntaxError {
            position: cmap_start,
            message: "cmap table range overflow (offset + length wraps usize)".to_string(),
        })?;
    if cmap_end > font_data.len() {
        return Err(ParseError::SyntaxError {
            position: cmap_start,
            message: "cmap table extends beyond font data".to_string(),
        });
    }
    let cmap_data = &font_data[cmap_start..cmap_end];
    // Build a u32 codepoint filter so that parse_cmap (Format 12 path) skips
    // the 70 000+ CJK entries that are not needed.
    let codepoint_filter: HashSet<u32> = used_chars.iter().map(|c| *c as u32).collect();
    let unicode_to_gid = parse_cmap(cmap_data, Some(&codepoint_filter))?;

    // Determine needed GIDs
    let mut needed_gids: Vec<u16> = vec![0]; // .notdef always included
    let mut new_glyph_mapping: HashMap<u32, u16> = HashMap::new();

    for ch in used_chars {
        let codepoint = *ch as u32;
        if let Some(&gid) = unicode_to_gid.get(&codepoint) {
            if gid != 0 {
                needed_gids.push(gid);
            }
            // Will update to new GID after remapping
        }
    }

    // Sort and deduplicate
    needed_gids.sort();
    needed_gids.dedup();

    // Build old_gid → new_gid mapping
    let mut gid_remap: HashMap<u16, u16> = HashMap::new();
    for (new_gid, &old_gid) in needed_gids.iter().enumerate() {
        gid_remap.insert(old_gid, new_gid as u16);
    }

    // Build final glyph mapping: unicode → new GID
    for ch in used_chars {
        let codepoint = *ch as u32;
        if let Some(&old_gid) = unicode_to_gid.get(&codepoint) {
            if let Some(&new_gid) = gid_remap.get(&old_gid) {
                new_glyph_mapping.insert(codepoint, new_gid);
            }
        }
    }

    // Build old_gid→unicode mapping for CID charset (Identity-H requires CID = Unicode codepoint)
    let mut old_gid_to_unicode: HashMap<u16, u32> = HashMap::new();
    for ch in used_chars {
        let codepoint = *ch as u32;
        if let Some(&gid) = unicode_to_gid.get(&codepoint) {
            old_gid_to_unicode.entry(gid).or_insert(codepoint);
        }
    }

    // Subset the CFF table
    let new_cff = match subset_cff_table(cff_data, &needed_gids, &gid_remap, &old_gid_to_unicode) {
        Ok(data) => data,
        Err(e) => {
            tracing::debug!(
                "CFF table subsetting failed: {:?}; falling back to full font",
                e
            );
            // Fallback: return full font with mapping filtered to used_chars only
            // (consistent with the success path which only maps used_chars)
            let filtered_mapping: HashMap<u32, u16> = used_chars
                .iter()
                .filter_map(|ch| {
                    let cp = *ch as u32;
                    unicode_to_gid.get(&cp).map(|&gid| (cp, gid))
                })
                .collect();
            return Ok(CffSubsetResult {
                font_data: font_data.to_vec(),
                glyph_mapping: filtered_mapping,
                is_raw_cff: false,
            });
        }
    };

    // Both the CID path and the SID→CID conversion emit CID-keyed raw CFF,
    // which the PDF writer embeds with /Subtype /CIDFontType0C. `otf` is no
    // longer needed once the input has been parsed — the output is never
    // wrapped in an OTF container, and the binding drops naturally at
    // end-of-scope.
    Ok(CffSubsetResult {
        font_data: new_cff,
        glyph_mapping: new_glyph_mapping,
        is_raw_cff: true,
    })
}

// =============================================================================
// OTF file structure
// =============================================================================

struct OtfTableEntry {
    tag: [u8; 4],
    offset: u32,
    length: u32,
}

struct OtfFile {
    tables: Vec<OtfTableEntry>,
}

impl OtfFile {
    /// Parse the OTF table directory. We only need the entries for CFF and cmap;
    /// rebuilding the OTF wrapper is no longer required since subsetted output
    /// is always emitted as raw CFF.
    fn parse(data: &[u8]) -> ParseResult<Self> {
        if data.len() < 12 {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: "OTF file too small".to_string(),
            });
        }

        let num_tables = read_u16(data, 4)? as usize;

        if data.len() < 12 + num_tables * 16 {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: "OTF table directory truncated".to_string(),
            });
        }

        let mut tables = Vec::with_capacity(num_tables);
        for i in 0..num_tables {
            let base = 12 + i * 16;
            let tag = [data[base], data[base + 1], data[base + 2], data[base + 3]];
            let offset = read_u32(data, base + 8)?;
            let length = read_u32(data, base + 12)?;
            tables.push(OtfTableEntry {
                tag,
                offset,
                length,
            });
        }

        Ok(Self { tables })
    }

    fn find_table(&self, tag: &[u8; 4]) -> ParseResult<&OtfTableEntry> {
        self.tables
            .iter()
            .find(|e| &e.tag == tag)
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!("Table {} not found", String::from_utf8_lossy(tag)),
            })
    }
}

// =============================================================================
// CID-keyed CFF subsetting
// =============================================================================

/// Subset a CID-keyed CFF table.
/// This handles fonts with FDArray and FDSelect operators.
fn subset_cid_cff_table(
    cff: &[u8],
    needed_gids: &[u16],
    gid_remap: &HashMap<u16, u16>,
    old_gid_to_unicode: &HashMap<u16, u32>,
    top_dict_bytes: &[u8],
    top_dict_offsets: &TopDictOffsets,
    name_index: &CffIndex,
    _top_dict_index: &CffIndex,
    string_index: &CffIndex,
    global_subr_index: &CffIndex,
) -> ParseResult<Vec<u8>> {
    let hdr_size = cff[2] as usize;
    let header_bytes = &cff[0..hdr_size];

    let fd_array_off = top_dict_offsets
        .fd_array_offset
        .ok_or_else(|| ParseError::SyntaxError {
            position: 0,
            message: "CIDFont missing FDArray offset".to_string(),
        })?;

    let fd_select_off =
        top_dict_offsets
            .fd_select_offset
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: "CIDFont missing FDSelect offset".to_string(),
            })?;

    let charstrings_off =
        top_dict_offsets
            .charstrings_offset
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: "CIDFont missing CharStrings offset".to_string(),
            })?;

    // Parse CharStrings INDEX
    if charstrings_off <= 0 || charstrings_off as usize >= cff.len() {
        return Err(ParseError::SyntaxError {
            position: 0,
            message: format!("CharStrings offset out of range: {}", charstrings_off),
        });
    }
    let charstrings_index = parse_cff_index(cff, charstrings_off as usize)?;
    let total_glyphs = charstrings_index.count();

    tracing::debug!(
        "CID CFF subsetting: {} total glyphs, {} needed",
        total_glyphs,
        needed_gids.len()
    );

    // Validate needed_gids
    for &gid in needed_gids {
        if gid as usize >= total_glyphs {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "GID {} out of range (font has {} glyphs)",
                    gid, total_glyphs
                ),
            });
        }
    }

    // Parse FDSelect — maps GID → FD index
    if fd_select_off <= 0 || fd_select_off as usize >= cff.len() {
        return Err(ParseError::SyntaxError {
            position: 0,
            message: format!("FDSelect offset out of range: {}", fd_select_off),
        });
    }
    let fd_select = parse_fd_select(cff, fd_select_off as usize, total_glyphs)?;

    // Parse FDArray INDEX
    if fd_array_off <= 0 || fd_array_off as usize >= cff.len() {
        return Err(ParseError::SyntaxError {
            position: 0,
            message: format!("FDArray offset out of range: {}", fd_array_off),
        });
    }
    let fd_array_index = parse_cff_index(cff, fd_array_off as usize)?;
    let num_fds = fd_array_index.count();

    tracing::debug!("CID CFF: {} FDs in FDArray", num_fds);

    // Determine which FDs are needed for the subset
    let mut needed_fd_set: std::collections::BTreeSet<u8> = std::collections::BTreeSet::new();
    for &gid in needed_gids {
        let fd = if (gid as usize) < fd_select.len() {
            fd_select[gid as usize]
        } else {
            0
        };
        needed_fd_set.insert(fd);
    }
    let needed_fds: Vec<u8> = needed_fd_set.into_iter().collect();

    tracing::debug!("CID CFF: needed FDs: {:?}", needed_fds);

    // Build old FD index → new FD index mapping
    let fd_remap: HashMap<u8, u8> = needed_fds
        .iter()
        .enumerate()
        .map(|(new_fd, &old_fd)| (old_fd, new_fd as u8))
        .collect();

    // Extract CharStrings in new-GID order (same as non-CID path)
    let sorted_old_gids: Vec<u16> = {
        let mut pairs: Vec<(u16, u16)> = needed_gids
            .iter()
            .filter_map(|&old_gid| gid_remap.get(&old_gid).map(|&new_gid| (new_gid, old_gid)))
            .collect();
        pairs.sort_by_key(|&(new, _)| new);
        let mut gids: Vec<u16> = pairs.iter().map(|&(_, old)| old).collect();
        // Ensure .notdef is first
        if gids.first().copied() != Some(0) {
            gids.retain(|&g| g != 0);
            gids.insert(0, 0);
        }
        gids
    };

    // Extract each needed FD's Private DICT and parse its Local Subr INDEX.
    // Charstrings will be desubroutinized, so the Subrs op (19) is stripped
    // from each Private DICT and no Local Subr INDEX is written.
    let mut fd_data_list: Vec<FdData> = Vec::new();
    let mut fd_local_subrs: HashMap<u8, CffIndex> = HashMap::new();
    for &old_fd in &needed_fds {
        let fd_dict = fd_array_index
            .get_item(old_fd as usize, cff)
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!("FD {} not found in FDArray", old_fd),
            })?;

        let (mut private_bytes, local_subrs) =
            if let Some((priv_size, priv_off)) = parse_fd_private(fd_dict) {
                if priv_off > 0 && priv_size > 0 {
                    let start = priv_off as usize;
                    let end = (start + priv_size as usize).min(cff.len());
                    let pb = cff[start..end].to_vec();
                    // Per CFF spec §5.1, Local Subrs exist only when op 19 (Subrs)
                    // is present in the Private DICT; no speculative probing.
                    let ls = if let Some(subrs_rel) = parse_local_subrs_offset(&pb) {
                        parse_cff_index(cff, start + subrs_rel)?
                    } else {
                        CffIndex::empty()
                    };
                    (pb, ls)
                } else {
                    (vec![], CffIndex::empty())
                }
            } else {
                (vec![], CffIndex::empty())
            };

        strip_private_subrs_op(&mut private_bytes);

        fd_local_subrs.insert(old_fd, local_subrs);
        fd_data_list.push(FdData {
            fd_dict_bytes: fd_dict.to_vec(),
            private_bytes,
            local_subr_bytes: Vec::new(),
        });
    }

    // Desubroutinize every kept charstring so the output is self-contained.
    // The result owns its bytes; rebuild CharStrings INDEX from references.
    let empty_index = CffIndex::empty();
    let desub_charstrings: Vec<Vec<u8>> = sorted_old_gids
        .iter()
        .map(|&old_gid| {
            let cs = charstrings_index
                .get_item(old_gid as usize, cff)
                .unwrap_or(&[0x0E]);
            let old_fd = if (old_gid as usize) < fd_select.len() {
                fd_select[old_gid as usize]
            } else {
                0
            };
            let local = fd_local_subrs.get(&old_fd).unwrap_or(&empty_index);
            desubroutinize(cs, global_subr_index, cff, local, cff)
        })
        .collect::<ParseResult<Vec<_>>>()?;
    let new_charstrings: Vec<&[u8]> = desub_charstrings.iter().map(|v| v.as_slice()).collect();
    let new_charstrings_index = build_cff_index(&new_charstrings);

    // Build new FDSelect (Format 0: one byte per new GID)
    let mut new_fd_select: Vec<u8> = Vec::new();
    new_fd_select.push(0); // format 0
    for &old_gid in &sorted_old_gids {
        let old_fd = if (old_gid as usize) < fd_select.len() {
            fd_select[old_gid as usize]
        } else {
            0
        };
        let new_fd = fd_remap.get(&old_fd).copied().unwrap_or(0);
        new_fd_select.push(new_fd);
    }

    // Build new CID Charset (format 0: format byte + CID for each new GID >= 1)
    // With Identity-H encoding, the content stream sends Unicode codepoints as CIDs.
    // The charset must map each new GID to its Unicode codepoint so that
    // CID (= Unicode codepoint) resolves to the correct GID in the subset font.
    let mut new_charset: Vec<u8> = Vec::new();
    new_charset.push(0); // format 0
    for (new_gid_idx, &old_gid) in sorted_old_gids.iter().enumerate() {
        if new_gid_idx == 0 {
            continue; // GID 0 = .notdef, not listed
        }
        // Use Unicode codepoint as CID for Identity-H compatibility.
        // CFF CIDs are 16-bit (CFF spec §7), so SMP codepoints (>= U+10000)
        // cannot be used directly. Fall back to old_gid which preserves the
        // glyph identity within the subset even though copy/search won't work
        // for that specific character.
        let cid = old_gid_to_unicode
            .get(&old_gid)
            .copied()
            .unwrap_or(old_gid as u32);
        let cid16 = if cid > 0xFFFF {
            tracing::debug!(
                "CFF subsetter: U+{:X} exceeds 16-bit CID range for GID {}, using GID as CID",
                cid,
                old_gid
            );
            old_gid
        } else {
            cid as u16
        };
        new_charset.extend_from_slice(&cid16.to_be_bytes());
    }

    // --- Two-pass offset assembly ---
    // Layout:
    //   [0] Header
    //   [1] Name INDEX
    //   [2] Top DICT INDEX (rebuilt)
    //   [3] String INDEX
    //   [4] Global Subr INDEX (empty — charstrings are desubroutinized)
    //   [5] Charset
    //   [6] FDSelect
    //   [7] CharStrings INDEX
    //   [8] FDArray INDEX (with rebuilt FD dicts)
    //   [9..] Private DICTs (one per needed FD, Subrs op stripped)

    let name_bytes = name_index.raw_bytes(cff);
    // Emit an empty String INDEX. Our rebuilt Top DICT references only standard
    // SIDs (Adobe=138, Identity=139), and our minimal FD dicts reference no
    // strings, so the original font's String INDEX is entirely unreachable.
    // Preserving it wasted ~22 KB for Latin fonts and ~5 KB for CJK fonts.
    // `string_index` remains in scope for signature compatibility but is
    // intentionally unused; silence the unused-binding warning:
    let _ = &string_index;
    let string_bytes = build_cff_index(&[]);

    // Desubroutinized charstrings make the Global Subr INDEX obsolete.
    let global_subr_bytes = build_cff_index(&[]);

    let placeholder_offset = 100_000i32;
    let num_glyphs = usize_to_cff_offset(needed_gids.len())?;

    // Pass 1: build placeholder Top DICT INDEX to determine its size.
    // `build_cid_top_dict` (vs `rebuild_cid_top_dict`) emits a minimal CID
    // Top DICT with only ROS + FontBBox + CIDCount + essential offsets,
    // dropping cosmetic operators (Notice, Copyright, FullName, …) that
    // reference SIDs in the now-empty String INDEX.
    let placeholder_top_dict = build_cid_top_dict(
        top_dict_bytes,
        num_glyphs,
        placeholder_offset,
        placeholder_offset,
        placeholder_offset,
        placeholder_offset,
    );
    let placeholder_top_dict_ref: &[u8] = &placeholder_top_dict;
    let placeholder_top_dict_index = build_cff_index(&[placeholder_top_dict_ref]);

    // Pass 1: build placeholder FDArray to determine its size
    // Each FD dict gets Private at placeholder offset — compute per-FD sizes
    let placeholder_fd_dicts: Vec<Vec<u8>> = fd_data_list
        .iter()
        .map(|fd| {
            let private_size = usize_to_cff_offset(fd.private_bytes.len())?;
            Ok(rebuild_fd_dict(
                &fd.fd_dict_bytes,
                private_size,
                placeholder_offset,
            ))
        })
        .collect::<ParseResult<Vec<_>>>()?;
    let placeholder_fd_refs: Vec<&[u8]> =
        placeholder_fd_dicts.iter().map(|v| v.as_slice()).collect();
    let placeholder_fd_array_index = build_cff_index(&placeholder_fd_refs);

    // Compute actual offsets
    let after_header = header_bytes.len();
    let after_name = after_header + name_bytes.len();
    let after_top_dict = after_name + placeholder_top_dict_index.len();
    let after_string = after_top_dict + string_bytes.len();
    let after_global_subr = after_string + global_subr_bytes.len();

    let new_charset_offset = usize_to_cff_offset(after_global_subr)?;
    let new_fd_select_offset = usize_to_cff_offset(after_global_subr + new_charset.len())?;
    let new_charstrings_offset =
        usize_to_cff_offset(new_fd_select_offset as usize + new_fd_select.len())?;
    let new_fd_array_offset =
        usize_to_cff_offset(new_charstrings_offset as usize + new_charstrings_index.len())?;

    // After FDArray comes the Private DICTs
    // Compute Private DICT offsets relative to start of CFF
    let after_fd_array = new_fd_array_offset as usize + placeholder_fd_array_index.len();
    let mut private_offsets: Vec<i32> = Vec::new();
    let mut cursor = after_fd_array;
    for fd in &fd_data_list {
        private_offsets.push(usize_to_cff_offset(cursor)?);
        cursor += fd.private_bytes.len() + fd.local_subr_bytes.len();
    }

    // Pass 2: build real FD dicts with correct private offsets
    let real_fd_dicts: Vec<Vec<u8>> = fd_data_list
        .iter()
        .zip(private_offsets.iter())
        .map(|(fd, &priv_off)| {
            let private_size = usize_to_cff_offset(fd.private_bytes.len())?;
            Ok(rebuild_fd_dict(&fd.fd_dict_bytes, private_size, priv_off))
        })
        .collect::<ParseResult<Vec<_>>>()?;
    let real_fd_refs: Vec<&[u8]> = real_fd_dicts.iter().map(|v| v.as_slice()).collect();
    let real_fd_array_index = build_cff_index(&real_fd_refs);

    // Verify FDArray size is stable between passes
    if real_fd_array_index.len() != placeholder_fd_array_index.len() {
        return Err(ParseError::SyntaxError {
            position: 0,
            message: format!(
                "FDArray size changed between passes ({} vs {})",
                real_fd_array_index.len(),
                placeholder_fd_array_index.len()
            ),
        });
    }

    // Pass 2: build real Top DICT
    let real_top_dict = build_cid_top_dict(
        top_dict_bytes,
        num_glyphs,
        new_charset_offset,
        new_charstrings_offset,
        new_fd_array_offset,
        new_fd_select_offset,
    );
    let real_top_dict_ref: &[u8] = &real_top_dict;
    let real_top_dict_index = build_cff_index(&[real_top_dict_ref]);

    // Verify Top DICT size is stable
    if real_top_dict_index.len() != placeholder_top_dict_index.len() {
        return Err(ParseError::SyntaxError {
            position: 0,
            message: format!(
                "CID Top DICT size changed between passes ({} vs {})",
                real_top_dict_index.len(),
                placeholder_top_dict_index.len()
            ),
        });
    }

    // Assemble new CFF
    let mut new_cff: Vec<u8> = Vec::new();
    new_cff.extend_from_slice(header_bytes);
    new_cff.extend_from_slice(name_bytes);
    new_cff.extend_from_slice(&real_top_dict_index);
    new_cff.extend_from_slice(&string_bytes);
    new_cff.extend_from_slice(&global_subr_bytes);
    new_cff.extend_from_slice(&new_charset);
    new_cff.extend_from_slice(&new_fd_select);
    new_cff.extend_from_slice(&new_charstrings_index);
    new_cff.extend_from_slice(&real_fd_array_index);
    for fd in &fd_data_list {
        new_cff.extend_from_slice(&fd.private_bytes);
        new_cff.extend_from_slice(&fd.local_subr_bytes);
    }

    tracing::debug!(
        "CID CFF subset: {} bytes → {} bytes ({} glyphs, {} FDs)",
        cff.len(),
        new_cff.len(),
        sorted_old_gids.len(),
        needed_fds.len()
    );

    Ok(new_cff)
}

// =============================================================================
// CFF table subsetting core
// =============================================================================

/// Subset the CFF table to include only the glyphs in `needed_gids`.
///
/// `needed_gids` must be sorted and include GID 0 (.notdef).
/// `gid_remap` maps old GID → new GID.
/// `old_gid_to_unicode` maps old GID → Unicode codepoint (for CID charset with Identity-H).
fn subset_cff_table(
    cff: &[u8],
    needed_gids: &[u16],
    gid_remap: &HashMap<u16, u16>,
    old_gid_to_unicode: &HashMap<u16, u32>,
) -> ParseResult<Vec<u8>> {
    if cff.len() < 4 {
        return Err(ParseError::SyntaxError {
            position: 0,
            message: "CFF table too small".to_string(),
        });
    }

    // Parse CFF header
    let _major = cff[0];
    let _minor = cff[1];
    let hdr_size = cff[2] as usize;
    // cff[3] is offSize — global offSize for top-level structures (not used after header)

    if hdr_size > cff.len() {
        return Err(ParseError::SyntaxError {
            position: 2,
            message: "CFF hdrSize larger than table".to_string(),
        });
    }

    let header_bytes = &cff[0..hdr_size];

    // Parse Name INDEX
    let name_index = parse_cff_index(cff, hdr_size)?;

    // Parse Top DICT INDEX
    let top_dict_index = parse_cff_index(cff, name_index.end_offset())?;

    if top_dict_index.count() == 0 {
        return Err(ParseError::SyntaxError {
            position: name_index.end_offset(),
            message: "CFF Top DICT INDEX is empty".to_string(),
        });
    }

    // Get Top DICT bytes (first entry; we only support single-font CFF)
    let top_dict_bytes =
        top_dict_index
            .get_item(0, cff)
            .ok_or_else(|| ParseError::SyntaxError {
                position: top_dict_index.start_offset,
                message: "Cannot read Top DICT item 0".to_string(),
            })?;

    let top_dict_offsets = parse_top_dict(top_dict_bytes);

    // Parse String INDEX and Global Subr INDEX (needed for both CID and non-CID paths)
    let string_index = parse_cff_index(cff, top_dict_index.end_offset())?;

    // Parse Global Subr INDEX
    let global_subr_index = parse_cff_index(cff, string_index.end_offset())?;

    // CID-keyed font: delegate to dedicated CID subsetting path
    if top_dict_offsets.fd_array_offset.is_some() || top_dict_offsets.fd_select_offset.is_some() {
        return subset_cid_cff_table(
            cff,
            needed_gids,
            gid_remap,
            old_gid_to_unicode,
            top_dict_bytes,
            &top_dict_offsets,
            &name_index,
            &top_dict_index,
            &string_index,
            &global_subr_index,
        );
    }

    // Locate CharStrings INDEX using offset from Top DICT
    let charstrings_offset =
        top_dict_offsets
            .charstrings_offset
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: "Top DICT has no CharStrings offset".to_string(),
            })?;

    if charstrings_offset <= 0 || charstrings_offset as usize >= cff.len() {
        return Err(ParseError::SyntaxError {
            position: 0,
            message: format!("CharStrings offset out of range: {}", charstrings_offset),
        });
    }

    let charstrings_index = parse_cff_index(cff, charstrings_offset as usize)?;
    let total_glyphs = charstrings_index.count();

    // Validate needed_gids against total_glyphs
    for &gid in needed_gids {
        if gid as usize >= total_glyphs {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "GID {} out of range (font has {} glyphs)",
                    gid, total_glyphs
                ),
            });
        }
    }

    // Extract needed CharStrings in new GID order
    let mut sorted_new: Vec<u16> = {
        let mut pairs: Vec<(u16, u16)> = needed_gids
            .iter()
            .filter_map(|&old_gid| gid_remap.get(&old_gid).map(|&new_gid| (new_gid, old_gid)))
            .collect();
        pairs.sort_by_key(|&(new, _)| new);
        pairs.iter().map(|&(_, old)| old).collect()
    };

    // Ensure .notdef (GID 0) is first
    if sorted_new.first().copied() != Some(0) {
        sorted_new.retain(|&g| g != 0);
        sorted_new.insert(0, 0);
    }

    // Parse Private DICT and Local Subr INDEX (used only to desubroutinize).
    let (mut private_bytes, private_orig_offset) =
        if let Some((size, offset)) = top_dict_offsets.private_dict {
            if offset > 0 && size > 0 {
                let start = offset as usize;
                let end = (start + size as usize).min(cff.len());
                (cff[start..end].to_vec(), offset)
            } else {
                (vec![], 0)
            }
        } else {
            (vec![], 0)
        };

    let local_subrs = if !private_bytes.is_empty() && private_orig_offset > 0 {
        let priv_start = private_orig_offset as usize;
        // Per CFF spec §5.1, Local Subrs exist only when op 19 (Subrs) is
        // present in the Private DICT; no speculative probing of bytes
        // that happen to follow the DICT.
        if let Some(subrs_rel) = parse_local_subrs_offset(&private_bytes) {
            parse_cff_index(cff, priv_start + subrs_rel)?
        } else {
            CffIndex::empty()
        }
    } else {
        CffIndex::empty()
    };

    // Strip the Subrs op — desubroutinized charstrings no longer reference Local Subrs.
    strip_private_subrs_op(&mut private_bytes);

    // Desubroutinize every kept charstring so the output is self-contained.
    let desub_charstrings: Vec<Vec<u8>> = sorted_new
        .iter()
        .map(|&old_gid| {
            let cs = charstrings_index
                .get_item(old_gid as usize, cff)
                .unwrap_or(&[0x0E]);
            desubroutinize(cs, &global_subr_index, cff, &local_subrs, cff)
        })
        .collect::<ParseResult<Vec<_>>>()?;
    let new_charstrings: Vec<&[u8]> = desub_charstrings.iter().map(|v| v.as_slice()).collect();
    let new_charstrings_index = build_cff_index(&new_charstrings);

    // Build a CID-keyed Charset (Format 0) with CID = Unicode codepoint for
    // each kept glyph — matches the existing CID path so the PDF content
    // stream can emit Unicode codepoints as CIDs with Identity-H encoding.
    let mut new_charset: Vec<u8> = vec![0]; // format 0
    for (new_gid_idx, &old_gid) in sorted_new.iter().enumerate() {
        if new_gid_idx == 0 {
            continue; // GID 0 = .notdef is implicit in charset Format 0
        }
        let cid = old_gid_to_unicode
            .get(&old_gid)
            .copied()
            .unwrap_or(old_gid as u32);
        // CFF CIDs are 16-bit; fall back to old_gid for SMP codepoints.
        let cid16 = if cid > 0xFFFF { old_gid } else { cid as u16 };
        new_charset.extend_from_slice(&cid16.to_be_bytes());
    }

    // FDSelect (Format 0): every GID maps to FD 0 (single-FD font).
    let num_glyphs = usize_to_cff_offset(sorted_new.len())?;
    let new_fd_select: Vec<u8> = vec![0u8; 1 + sorted_new.len()]; // format byte + N FD entries

    // Assembly: the CID-keyed layout matches subset_cid_cff_table, since a
    // converted SID font is effectively a CID font with exactly one FD.
    //
    //   [0] Header
    //   [1] Name INDEX (verbatim)
    //   [2] Top DICT INDEX (rebuilt as CID Top DICT)
    //   [3] String INDEX (verbatim)
    //   [4] Global Subr INDEX (empty — charstrings are desubroutinized)
    //   [5] Charset (Unicode-keyed CIDs)
    //   [6] FDSelect
    //   [7] CharStrings INDEX
    //   [8] FDArray INDEX (single FD dict)
    //   [9] Private DICT (Subrs op stripped)

    let name_bytes = name_index.raw_bytes(cff);
    // Emit an empty String INDEX — `build_cid_top_dict` produces a Top DICT
    // that only references standard SIDs (Adobe=138, Identity=139), and our
    // minimal FD dict references no strings.
    let _ = &string_index;
    let string_bytes = build_cff_index(&[]);
    let global_subr_bytes = build_cff_index(&[]);

    let placeholder_offset = 100_000i32;
    let private_size = usize_to_cff_offset(private_bytes.len())?;

    // Pass 1: size estimation with placeholder offsets (5-byte fixed encoding
    // keeps the Top DICT / FD dict / FDArray sizes stable between passes).
    let placeholder_top_dict = build_cid_top_dict(
        top_dict_bytes,
        num_glyphs,
        placeholder_offset,
        placeholder_offset,
        placeholder_offset,
        placeholder_offset,
    );
    let placeholder_top_dict_ref: &[u8] = &placeholder_top_dict;
    let placeholder_top_dict_index = build_cff_index(&[placeholder_top_dict_ref]);

    let placeholder_fd_dict = build_minimal_fd_dict(private_size, placeholder_offset);
    let placeholder_fd_dict_ref: &[u8] = &placeholder_fd_dict;
    let placeholder_fd_array_index = build_cff_index(&[placeholder_fd_dict_ref]);

    // Compute final offsets
    let after_header = header_bytes.len();
    let after_name = after_header + name_bytes.len();
    let after_top_dict = after_name + placeholder_top_dict_index.len();
    let after_string = after_top_dict + string_bytes.len();
    let after_global_subr = after_string + global_subr_bytes.len();

    let new_charset_offset = usize_to_cff_offset(after_global_subr)?;
    let new_fd_select_offset = usize_to_cff_offset(after_global_subr + new_charset.len())?;
    let new_charstrings_offset =
        usize_to_cff_offset(new_fd_select_offset as usize + new_fd_select.len())?;
    let new_fd_array_offset =
        usize_to_cff_offset(new_charstrings_offset as usize + new_charstrings_index.len())?;
    let new_private_offset =
        usize_to_cff_offset(new_fd_array_offset as usize + placeholder_fd_array_index.len())?;

    // Pass 2: build real Top DICT and FD dict with correct offsets.
    let real_top_dict = build_cid_top_dict(
        top_dict_bytes,
        num_glyphs,
        new_charset_offset,
        new_charstrings_offset,
        new_fd_array_offset,
        new_fd_select_offset,
    );
    let real_top_dict_ref: &[u8] = &real_top_dict;
    let real_top_dict_index = build_cff_index(&[real_top_dict_ref]);

    let real_fd_dict = build_minimal_fd_dict(private_size, new_private_offset);
    let real_fd_dict_ref: &[u8] = &real_fd_dict;
    let real_fd_array_index = build_cff_index(&[real_fd_dict_ref]);

    // Size stability checks — 5-byte integer encoding should keep these equal.
    if real_top_dict_index.len() != placeholder_top_dict_index.len() {
        return Err(ParseError::SyntaxError {
            position: 0,
            message: format!(
                "Top DICT size changed between passes ({} vs {})",
                real_top_dict_index.len(),
                placeholder_top_dict_index.len()
            ),
        });
    }
    if real_fd_array_index.len() != placeholder_fd_array_index.len() {
        return Err(ParseError::SyntaxError {
            position: 0,
            message: format!(
                "FDArray size changed between passes ({} vs {})",
                real_fd_array_index.len(),
                placeholder_fd_array_index.len()
            ),
        });
    }

    // Assemble CID-keyed CFF
    let mut new_cff = Vec::new();
    new_cff.extend_from_slice(header_bytes);
    new_cff.extend_from_slice(name_bytes);
    new_cff.extend_from_slice(&real_top_dict_index);
    new_cff.extend_from_slice(&string_bytes);
    new_cff.extend_from_slice(&global_subr_bytes);
    new_cff.extend_from_slice(&new_charset);
    new_cff.extend_from_slice(&new_fd_select);
    new_cff.extend_from_slice(&new_charstrings_index);
    new_cff.extend_from_slice(&real_fd_array_index);
    new_cff.extend_from_slice(&private_bytes);

    Ok(new_cff)
}

// =============================================================================
// cmap parsing
// =============================================================================

/// Parse a cmap table to produce a unicode → GID mapping.
/// Supports Format 4 (most common for Latin/CJK) and Format 12 (full Unicode).
///
/// `used_codepoints` is an optional filter: when `Some(filter)`, Format 12
/// parsing only inserts codepoints present in the filter, which avoids
/// allocating 70 000+ entries when only a handful of CJK characters are needed.
fn parse_cmap(
    cmap: &[u8],
    used_codepoints: Option<&HashSet<u32>>,
) -> ParseResult<HashMap<u32, u16>> {
    if cmap.len() < 4 {
        return Err(ParseError::SyntaxError {
            position: 0,
            message: "cmap table too small".to_string(),
        });
    }

    let num_subtables = read_u16(cmap, 2)? as usize;

    // Prefer: platform 3 encoding 1 (Windows Unicode BMP) or
    //         platform 3 encoding 10 (Windows Unicode full)
    //         platform 0 (Unicode)
    let mut best_offset: Option<u32> = None;
    let mut best_priority = 0u8;

    for i in 0..num_subtables {
        let base = 4 + i * 8;
        if base + 8 > cmap.len() {
            break;
        }
        let platform = read_u16(cmap, base)?;
        let encoding = read_u16(cmap, base + 2)?;
        let offset = read_u32(cmap, base + 4)?;

        // Priority: 3 (platform 3, enc 10) > 2 (platform 3, enc 1) > 1 (platform 0)
        let priority = match (platform, encoding) {
            (3, 10) => 3,
            (3, 1) => 2,
            (0, _) => 1,
            _ => 0,
        };
        if priority > best_priority {
            best_priority = priority;
            best_offset = Some(offset);
        }
    }

    let subtable_offset = best_offset.ok_or_else(|| ParseError::SyntaxError {
        position: 0,
        message: "No usable cmap subtable found".to_string(),
    })? as usize;

    if subtable_offset + 2 > cmap.len() {
        return Err(ParseError::SyntaxError {
            position: subtable_offset,
            message: "cmap subtable offset out of range".to_string(),
        });
    }

    let format = read_u16(cmap, subtable_offset)?;
    match format {
        4 => parse_cmap_format_4(cmap, subtable_offset),
        12 => parse_cmap_format_12_filtered(cmap, subtable_offset, used_codepoints).map_err(|e| {
            ParseError::SyntaxError {
                position: subtable_offset,
                message: e.to_string(),
            }
        }),
        _ => {
            // Unsupported format — return empty mapping rather than hard-failing
            Ok(HashMap::new())
        }
    }
}

/// Parse cmap Format 4 subtable.
fn parse_cmap_format_4(cmap: &[u8], offset: usize) -> ParseResult<HashMap<u32, u16>> {
    if offset + 14 > cmap.len() {
        return Err(ParseError::SyntaxError {
            position: offset,
            message: "cmap Format 4 header truncated".to_string(),
        });
    }

    let seg_count = (read_u16(cmap, offset + 6)? / 2) as usize;
    let seg_arrays_start = offset + 14;

    // Arrays: endCode[segCount], reservedPad, startCode[segCount], idDelta[segCount], idRangeOffset[segCount]
    let end_code_start = seg_arrays_start;
    let start_code_start = seg_arrays_start + seg_count * 2 + 2; // +2 for reservedPad
    let id_delta_start = start_code_start + seg_count * 2;
    let id_range_offset_start = id_delta_start + seg_count * 2;
    let glyph_id_array_start = id_range_offset_start + seg_count * 2;

    let needed = glyph_id_array_start;
    if needed > cmap.len() {
        return Err(ParseError::SyntaxError {
            position: offset,
            message: "cmap Format 4 data truncated".to_string(),
        });
    }

    let mut map = HashMap::new();

    for i in 0..seg_count {
        let end_code = read_u16(cmap, end_code_start + i * 2)? as u32;
        if end_code == 0xFFFF {
            break; // terminal segment
        }
        let start_code = read_u16(cmap, start_code_start + i * 2)? as u32;
        let id_delta = read_i16(cmap, id_delta_start + i * 2)? as i32;
        let id_range_offset = read_u16(cmap, id_range_offset_start + i * 2)? as usize;

        if start_code > end_code {
            continue;
        }
        for code in start_code..=end_code {
            let gid = if id_range_offset == 0 {
                ((code as i32 + id_delta) & 0xFFFF) as u16
            } else {
                // Indirect lookup via glyphIdArray
                let range_offset_pos = id_range_offset_start + i * 2;
                let glyph_idx =
                    range_offset_pos + id_range_offset + (code as usize - start_code as usize) * 2;
                if glyph_idx + 2 > cmap.len() {
                    continue;
                }
                let raw_gid = read_u16(cmap, glyph_idx)?;
                if raw_gid == 0 {
                    0
                } else {
                    ((raw_gid as i32 + id_delta) & 0xFFFF) as u16
                }
            };
            if gid != 0 {
                map.insert(code, gid);
            }
        }
    }

    Ok(map)
}