1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
//! Issue #272 scope-B — end-to-end coverage for the embedded-stream `/Encoding`
//! branch of `extract_font_info`.
//!
//! The wiring `PdfObject::Reference → stream.decode → EncodingCMap::parse →
//! CidEncoding::Cmap → decode_via_encoding_cmap` was previously exercised only
//! by a unit test that built the `EncodingCMap` in memory; no test drove a real
//! embedded CMapType-1 `/Encoding` stream through document loading. The corpus
//! lacks a PDF that combines an embedded encoding CMap with a recognised Adobe
//! ordering (GB1/Japan1/Korea1/CNS1), so we synthesise one here.
//!
//! Construction: a Type0 font whose `/Encoding` is a *reference* to a stream
//! holding a minimal CMapType-1 CMap. The CMap maps three 2-byte codes to three
//! Adobe-GB1 CIDs whose Unicode values are well-known (verified against the
//! GB1 slice of `cid_to_unicode.rs`):
//! - code <0001> → CID 4559 → 中 (U+4E2D)
//! - code <0002> → CID 3809 → 我 (U+6211)
//! - code <0003> → CID 1875 → 国 (U+56FD)
//!
//! The descendant CIDFont declares `/CIDSystemInfo << /Ordering (GB1) ... >>`,
//! which selects the Adobe-GB1 CID→Unicode collection.
//!
//! The codes 0x0001/0x0002/0x0003 are deliberately small: under the Identity
//! fallback they would be treated as CIDs 1/2/3 (U+00A0/!/"), which are *not*
//! CJK. Extracting 中我国 therefore proves the embedded CMap remapped the codes,
//! not the Identity path.
use oxidize_pdf::parser::{PdfDocument, PdfReader};
use oxidize_pdf::text::{ExtractionOptions, TextExtractor};
use std::io::Cursor;
/// Body of the embedded `/Encoding` stream: a small CMapType-1 CMap with one
/// 2-byte codespace range (`<0000>`–`<ffff>`) and three `cidchar` entries that
/// send codes 1, 2 and 3 to CIDs 4559, 3809 and 1875.
///
/// The surrounding `/CIDSystemInfo` dictionary and the `defineresource`
/// epilogue are kept so the parser also has to cope with literal strings such
/// as `(Adobe)` and `(GB1)`, not just hex tokens.
///
/// The literal is written with real line breaks (each one is a single `\n`
/// byte), so every continuation line must stay at column 0 with no trailing
/// whitespace — any extra character would end up inside the stream.
const EMBEDDED_CMAP: &[u8] = b"/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/CIDSystemInfo 3 dict dup begin
/Registry (Adobe) def
/Ordering (GB1) def
/Supplement 0 def
end def
/CMapName /Test-Embedded-H def
/CMapType 1 def
1 begincodespacerange
<0000> <ffff>
endcodespacerange
3 begincidchar
<0001> 4559
<0002> 3809
<0003> 1875
endcidchar
endcmap
CMapName currentdict /CMap defineresource pop
end
end
";
/// Page content stream: selects font `/F0` at 12pt, moves to (100, 700) and
/// shows one hex string holding the three 2-byte codes 0001, 0002 and 0003.
///
/// Written with real line breaks (each is one `\n` byte); keep the
/// continuation lines at column 0 with no trailing whitespace.
const CONTENT: &[u8] = b"BT
/F0 12 Tf
100 700 Td
<000100020003> Tj
ET
";
/// Serialise `objects` into a complete PDF 1.4 byte buffer.
///
/// `objects[i]` is written as indirect object `i + 1`, generation 0, wrapped
/// in `N 0 obj` / `endobj`. The byte offset of each object is recorded while
/// writing and then emitted as a classic `xref` table (one free entry for
/// object 0 followed by one in-use entry per object). The trailer always names
/// object 1 as `/Root`, so callers are expected to put the catalog first.
fn assemble_pdf(objects: &[Vec<u8>]) -> Vec<u8> {
    // The xref table and `/Size` both count the implicit free object 0.
    let entry_count = objects.len() + 1;

    let mut pdf: Vec<u8> = Vec::with_capacity(1024);
    // Version header plus a comment line of high-bit bytes.
    pdf.extend_from_slice(b"%PDF-1.4\n%\xE2\xE3\xCF\xD3\n");

    // Write every object, remembering where each one starts.
    let mut object_offsets: Vec<usize> = Vec::with_capacity(objects.len());
    for (number, body) in (1..).zip(objects) {
        object_offsets.push(pdf.len());
        pdf.extend_from_slice(format!("{} 0 obj\n", number).as_bytes());
        pdf.extend_from_slice(body);
        pdf.extend_from_slice(b"\nendobj\n");
    }

    // Everything after the objects is plain ASCII, so build it as one string.
    let xref_start = pdf.len();
    let mut tail = format!("xref\n0 {}\n0000000000 65535 f \n", entry_count);
    for offset in &object_offsets {
        // Each entry is exactly 20 bytes: 10-digit offset, generation, flag.
        tail.push_str(&format!("{:010} 00000 n \n", offset));
    }
    tail.push_str(&format!(
        "trailer\n<< /Size {} /Root 1 0 R >>\nstartxref\n{}\n%%EOF\n",
        entry_count, xref_start
    ));

    pdf.extend_from_slice(tail.as_bytes());
    pdf
}
/// Produce the body of a stream object (without the `N 0 obj` wrapper).
///
/// `dict` is spliced verbatim into the stream dictionary ahead of a `/Length`
/// entry computed from `data.len()`; `data` is then placed between the
/// `stream` and `endstream` keywords. The payload is copied as-is — no filter
/// is applied.
fn stream_obj(dict: &str, data: &[u8]) -> Vec<u8> {
    let header = format!("<< {} /Length {} >>\nstream\n", dict, data.len());
    // The newline before `endstream` is not counted in `/Length`.
    let parts: [&[u8]; 3] = [header.as_bytes(), data, b"\nendstream"];
    parts.concat()
}
/// Assemble the seven-object fixture PDF around a single Type0 font.
///
/// `encoding_entry` is inserted verbatim as the value of the Type0 font's
/// `/Encoding` key — `"6 0 R"` points at the embedded CMap stream, while a
/// name such as `"/Identity-H"` bypasses it. The CMap stream is emitted as
/// object 6 in every case, so the object layout does not depend on the
/// argument.
///
/// Object numbering: 1 catalog, 2 page tree, 3 page, 4 content stream,
/// 5 Type0 font, 6 embedded CMap stream, 7 descendant CIDFont.
fn build_type0_pdf(encoding_entry: &str) -> Vec<u8> {
    let catalog = b"<< /Type /Catalog /Pages 2 0 R >>".to_vec();
    let page_tree = b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>".to_vec();
    let page = b"<< /Type /Page /Parent 2 0 R /Resources << /Font << /F0 5 0 R >> >> \
        /Contents 4 0 R /MediaBox [0 0 612 792] >>"
        .to_vec();
    let content = stream_obj("", CONTENT);

    // Only this object varies between the embedded-CMap and contrast cases.
    let type0_font = format!(
        "<< /Type /Font /Subtype /Type0 /BaseFont /Test-GB1 \
         /Encoding {} /DescendantFonts [7 0 R] >>",
        encoding_entry
    )
    .into_bytes();

    let encoding_cmap = stream_obj(
        "/Type /CMap /CMapName /Test-Embedded-H /CMapType 1",
        EMBEDDED_CMAP,
    );

    // The descendant declares the Adobe-GB1 character collection.
    let descendant = b"<< /Type /Font /Subtype /CIDFontType2 /BaseFont /Test-GB1 \
        /CIDSystemInfo << /Registry (Adobe) /Ordering (GB1) /Supplement 0 >> \
        /CIDToGIDMap /Identity >>"
        .to_vec();

    assemble_pdf(&[
        catalog,
        page_tree,
        page,
        content,
        type0_font,
        encoding_cmap,
        descendant,
    ])
}
/// Parse `pdf` from memory and return the text extracted from its first page
/// (index 0) using default extraction options.
///
/// Panics if the bytes cannot be opened as a PDF or if extraction of page 0
/// fails; both indicate a broken fixture or a regression in the library.
fn extract(pdf: Vec<u8>) -> String {
    let document = PdfDocument::new(
        PdfReader::new(Cursor::new(pdf)).expect("fixture must be a readable PDF"),
    );
    let options = ExtractionOptions::default();
    let mut extractor = TextExtractor::with_options(options);
    let page = extractor
        .extract_from_page(&document, 0)
        .expect("extract page 0");
    page.text
}
/// With `/Encoding 6 0 R` the font's encoding is the embedded CMap stream, so
/// the three codes in the content stream must go code → CID (via the stream's
/// `cidchar` entries) → Unicode (via the GB1 ordering declared on the
/// descendant font) and come out as 中我国.
#[test]
fn embedded_stream_encoding_maps_codes_to_cid_to_unicode() {
    let text = extract(build_type0_pdf("6 0 R"));

    // Keep only CJK Unified Ideographs so whitespace or layout characters in
    // the extracted text do not affect the comparison.
    let mut cjk = String::new();
    for ch in text.chars() {
        if matches!(ch, '\u{4E00}'..='\u{9FFF}') {
            cjk.push(ch);
        }
    }

    assert_eq!(
        cjk, "中我国",
        "embedded-stream encoding CMap must map codes <0001><0002><0003> to GB1 \
         CIDs 4559/3809/1875 and on to 中我国; got full text {:?}",
        text
    );

    // Every code has a mapping, so nothing should fall back to U+FFFD.
    let has_replacement = text.contains('\u{FFFD}');
    assert!(
        !has_replacement,
        "no replacement characters expected when every code is mapped; got {:?}",
        text
    );
}
/// Control case: the same document with `/Encoding /Identity-H`. The embedded
/// CMap is still present as object 6 but is not referenced by the font, so the
/// codes 1/2/3 must not be remapped to the CJK CIDs and no CJK ideograph may
/// appear in the output. This shows the CJK result of the embedded-stream test
/// is caused by the stream itself rather than by a default.
#[test]
fn identity_h_without_embedded_cmap_does_not_produce_cjk() {
    let text = extract(build_type0_pdf("/Identity-H"));

    // Collect any CJK Unified Ideographs that slipped through.
    let mut cjk = String::new();
    cjk.extend(
        text.chars()
            .filter(|ch| ('\u{4E00}'..='\u{9FFF}').contains(ch)),
    );

    assert!(
        cjk.is_empty(),
        "Identity-H must treat the bytes as raw CIDs 1/2/3 (non-CJK), not remap \
         them via the embedded CMap; got CJK {:?} in {:?}",
        cjk,
        text
    );
}