rpdfium_core/bytestring.rs
1// Derived from PDFium's core/fxcrt/bytestring.cpp
2// Original: Copyright 2014 The PDFium Authors
3// Licensed under BSD-3-Clause / Apache-2.0
4// See pdfium-upstream/LICENSE for the original license.
5
6//! Encoding-aware PDF string type.
7//!
8//! PDF strings use three encodings (ISO 32000-2 §7.9):
9//!
10//! - **PDFDocEncoding**: A superset of ISO Latin-1. Used for most string values
11//! (metadata, bookmark titles, form field values). The default when no BOM is present.
12//! - **UTF-16BE**: Indicated by a byte-order mark (`0xFE 0xFF`) at the start.
13//! Used for Unicode strings that cannot be represented in PDFDocEncoding.
14//! - **UTF-8**: Indicated by a UTF-8 BOM (`0xEF 0xBB 0xBF`) at the start.
15//! Less common; treated identically to UTF-16BE after decoding.
16
17use std::fmt;
18
/// A PDF string with encoding-aware conversion.
///
/// Stores the raw bytes as they appear in the PDF file. The encoding is not
/// stored separately; it is detected on demand from the leading bytes (see
/// [`PdfString::encoding`]):
///
/// - `0xFE 0xFF` (BOM) → UTF-16BE
/// - `0xEF 0xBB 0xBF` (BOM) → UTF-8
/// - anything else → PDFDocEncoding
///
/// Equality and hashing are derived from the raw bytes, so two strings that
/// decode to the same text but use different encodings compare unequal.
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct PdfString {
    /// Raw, undecoded string bytes, including any byte-order mark.
    bytes: Vec<u8>,
}
28
/// The encoding used by a [`PdfString`].
///
/// The variant is determined purely by the byte-order mark (or its absence)
/// at the start of the raw string bytes.
///
/// `Hash` is derived alongside `Eq` so the encoding can be used as a key in
/// hashed collections (for example, when tallying strings per encoding).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PdfStringEncoding {
    /// PDFDocEncoding — a superset of ISO Latin-1 (ISO 32000-2 Annex D).
    PdfDocEncoding,
    /// UTF-16BE with byte-order mark (`0xFE 0xFF`).
    Utf16Be,
    /// UTF-8 with byte-order mark (`0xEF 0xBB 0xBF`).
    Utf8Bom,
}
39
40impl PdfString {
41 /// Create a `PdfString` from raw bytes (as parsed from the PDF).
42 pub fn from_bytes(bytes: Vec<u8>) -> Self {
43 Self { bytes }
44 }
45
46 /// Encode a UTF-8 string as a PDF string.
47 ///
48 /// Uses PDFDocEncoding if every character is representable; otherwise uses
49 /// UTF-16BE with a `0xFE 0xFF` byte-order mark. This matches the logic of
50 /// `PDF_EncodeText()` in PDFium upstream.
51 ///
52 /// # Examples
53 /// ```
54 /// # use rpdfium_core::{PdfString, PdfStringEncoding};
55 /// let ascii = PdfString::from_unicode("hello");
56 /// assert_eq!(ascii.encoding(), PdfStringEncoding::PdfDocEncoding);
57 ///
58 /// let unicode = PdfString::from_unicode("日本語");
59 /// assert_eq!(unicode.encoding(), PdfStringEncoding::Utf16Be);
60 /// ```
61 pub fn from_unicode(s: &str) -> Self {
62 if s.is_empty() {
63 return Self { bytes: Vec::new() };
64 }
65 // Try PDFDocEncoding first (PDF_EncodeText logic from upstream).
66 let mut bytes = Vec::with_capacity(s.len());
67 for ch in s.chars() {
68 match char_to_pdfdoc(ch) {
69 Some(byte) => bytes.push(byte),
70 None => return Self::encode_utf16be(s),
71 }
72 }
73 Self { bytes }
74 }
75
76 /// Encode as UTF-16BE with BOM (internal helper).
77 fn encode_utf16be(s: &str) -> Self {
78 let mut bytes = Vec::with_capacity(2 + s.len() * 2);
79 bytes.push(0xFE);
80 bytes.push(0xFF);
81 for unit in s.encode_utf16() {
82 bytes.push((unit >> 8) as u8);
83 bytes.push(unit as u8);
84 }
85 Self { bytes }
86 }
87
88 /// Raw bytes (for binary operations, stream `/Length`, etc.).
89 #[inline]
90 pub fn as_bytes(&self) -> &[u8] {
91 &self.bytes
92 }
93
94 /// Detect encoding from the byte-order mark.
95 ///
96 /// - `0xFE 0xFF` → [`PdfStringEncoding::Utf16Be`]
97 /// - `0xEF 0xBB 0xBF` → [`PdfStringEncoding::Utf8Bom`]
98 /// - Otherwise → [`PdfStringEncoding::PdfDocEncoding`]
99 pub fn encoding(&self) -> PdfStringEncoding {
100 if self.bytes.starts_with(&[0xFE, 0xFF]) {
101 PdfStringEncoding::Utf16Be
102 } else if self.bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
103 PdfStringEncoding::Utf8Bom
104 } else {
105 PdfStringEncoding::PdfDocEncoding
106 }
107 }
108
109 /// Decode to a Rust [`String`] (UTF-8), handling all PDF string encodings.
110 ///
111 /// - UTF-16BE: decoded with surrogate-pair support; invalid pairs → U+FFFD.
112 /// - UTF-8 BOM: decoded as UTF-8 after stripping the BOM.
113 /// - PDFDocEncoding: each byte mapped to Unicode per ISO 32000-2 Annex D.
114 ///
115 /// ISO 2022 language-tag escape sequences (`U+001B…U+001B`) present in
116 /// UTF-16BE and UTF-8 BOM strings are stripped, matching the behaviour of
117 /// `StripLanguageCodes()` / `PDF_DecodeText()` in PDFium upstream.
118 pub fn to_string_lossy(&self) -> String {
119 match self.encoding() {
120 PdfStringEncoding::Utf16Be => {
121 let u16s: Vec<u16> = self.bytes[2..]
122 .chunks_exact(2)
123 .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
124 .collect();
125 strip_language_codes(String::from_utf16_lossy(&u16s))
126 }
127 PdfStringEncoding::Utf8Bom => {
128 let utf8 = std::str::from_utf8(&self.bytes[3..]).unwrap_or("");
129 strip_language_codes(utf8.to_owned())
130 }
131 PdfStringEncoding::PdfDocEncoding => {
132 self.bytes.iter().map(|&b| pdfdoc_to_char(b)).collect()
133 }
134 }
135 }
136
137 /// Returns `true` if the string has no bytes.
138 pub fn is_empty(&self) -> bool {
139 self.bytes.is_empty()
140 }
141
142 /// Returns the length of the raw byte representation.
143 pub fn len(&self) -> usize {
144 self.bytes.len()
145 }
146
147 /// Decode to a Rust [`String`] (UTF-8), handling both PDF encodings.
148 ///
149 /// Deprecated; use [`to_string_lossy`](Self::to_string_lossy) instead.
150 #[deprecated(note = "use `to_string_lossy()` instead")]
151 #[inline]
152 pub fn unicode_data(&self) -> String {
153 self.to_string_lossy()
154 }
155
156 /// Upstream-aligned alias for [`to_string_lossy`](Self::to_string_lossy).
157 ///
158 /// Corresponds to `ByteString::GetUnicodeData()` in PDFium upstream.
159 #[inline]
160 pub fn get_unicode_data(&self) -> String {
161 self.to_string_lossy()
162 }
163
164 /// Upstream-aligned alias for [`as_bytes`](Self::as_bytes).
165 ///
166 /// Corresponds to `ByteString::GetRawString()` in PDFium upstream.
167 #[inline]
168 pub fn get_raw_string(&self) -> &[u8] {
169 self.as_bytes()
170 }
171}
172
173impl fmt::Debug for PdfString {
174 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
175 f.debug_struct("PdfString")
176 .field("encoding", &self.encoding())
177 .field("text", &self.to_string_lossy())
178 .field("len", &self.bytes.len())
179 .finish()
180 }
181}
182
183impl fmt::Display for PdfString {
184 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
185 f.write_str(&self.to_string_lossy())
186 }
187}
188
189/// Map a single byte to its Unicode codepoint under PDFDocEncoding.
190///
191/// This is the complete PDFDocEncoding table as defined in ISO 32000-2 Annex D.
192/// Bytes 0x00–0x17 map to U+0000–U+0017 (control characters, straight mapping).
193/// Bytes 0x18–0x1F map to typographic Unicode codepoints per ISO 32000-2 Annex D.
194/// Bytes 0x7F, 0x9F, and 0xAD are undefined (map to U+0000).
195pub fn pdfdoc_to_char(byte: u8) -> char {
196 PDFDOC_ENCODING_TABLE[byte as usize]
197}
198
199/// Reverse-map a Unicode character to a PDFDocEncoding byte.
200///
201/// Returns `None` if the character is not representable in PDFDocEncoding
202/// (characters that have no PDFDocEncoding byte, e.g. CJK, emoji).
203///
204/// Corresponds to the inner loop of `PDF_EncodeText()` in PDFium upstream.
205pub fn char_to_pdfdoc(ch: char) -> Option<u8> {
206 PDFDOC_ENCODING_TABLE
207 .iter()
208 .position(|&c| c == ch)
209 .map(|i| i as u8)
210}
211
/// Remove ISO 2022 language-tag escape sequences from decoded text.
///
/// A tag starts at U+001B (ESCAPE) and runs up to and including the next
/// U+001B; everything in between is dropped. An opening ESC without a closing
/// one swallows the rest of the string. Input without any ESC is returned
/// unchanged (and without reallocating).
///
/// Corresponds to `StripLanguageCodes()` in PDFium upstream
/// (`core/fpdfapi/parser/fpdf_parser_decode.cpp`).
fn strip_language_codes(s: String) -> String {
    const ESC: char = '\u{001B}';

    if !s.contains(ESC) {
        return s;
    }
    // Splitting on ESC alternates between text outside a tag (even pieces)
    // and tag content (odd pieces); keep only the even ones.
    s.split(ESC).step_by(2).collect()
}
239
/// Complete PDFDocEncoding → Unicode mapping table (ISO 32000-2 Annex D).
///
/// 256 entries, one for each possible byte value 0x00–0xFF, indexed by the
/// byte itself.
///
/// The three bytes that PDFDocEncoding leaves undefined (0x7F, 0x9F, 0xAD)
/// hold U+0000 as a placeholder. As a consequence U+0000 appears four times
/// in the table, and a first-match reverse lookup of U+0000 yields byte 0x00.
///
/// `rustfmt::skip` keeps the one-entry-per-line layout with its byte/name
/// comments intact.
#[rustfmt::skip]
const PDFDOC_ENCODING_TABLE: [char; 256] = [
    // 0x00–0x07: control characters (mapped to Unicode control chars)
    '\u{0000}', '\u{0001}', '\u{0002}', '\u{0003}',
    '\u{0004}', '\u{0005}', '\u{0006}', '\u{0007}',
    // 0x08–0x17: control characters (straight mapping, per ISO 32000-2 Annex D)
    '\u{0008}', // 0x08 → BS  (BACKSPACE)
    '\u{0009}', // 0x09 → HT  (HORIZONTAL TAB)
    '\u{000A}', // 0x0A → LF  (LINE FEED)
    '\u{000B}', // 0x0B → VT  (VERTICAL TAB)
    '\u{000C}', // 0x0C → FF  (FORM FEED)
    '\u{000D}', // 0x0D → CR  (CARRIAGE RETURN)
    '\u{000E}', // 0x0E → SO  (SHIFT OUT)
    '\u{000F}', // 0x0F → SI  (SHIFT IN)
    '\u{0010}', // 0x10 → DLE
    '\u{0011}', // 0x11 → DC1
    '\u{0012}', // 0x12 → DC2
    '\u{0013}', // 0x13 → DC3
    '\u{0014}', // 0x14 → DC4
    '\u{0015}', // 0x15 → NAK
    '\u{0016}', // 0x16 → SYN
    '\u{0017}', // 0x17 → ETB
    // 0x18–0x1F: typographic characters (ISO 32000-2 Annex D §D.2)
    '\u{02D8}', // 0x18 → BREVE
    '\u{02C7}', // 0x19 → CARON
    '\u{02C6}', // 0x1A → MODIFIER LETTER CIRCUMFLEX ACCENT
    '\u{02D9}', // 0x1B → DOT ABOVE
    '\u{02DD}', // 0x1C → DOUBLE ACUTE ACCENT
    '\u{02DB}', // 0x1D → OGONEK
    '\u{02DA}', // 0x1E → RING ABOVE
    '\u{02DC}', // 0x1F → SMALL TILDE
    // 0x20–0x7E: ASCII printable range (identical to Unicode)
    ' ', '!', '"', '#', '$', '%', '&', '\'',
    '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', ':', ';', '<', '=', '>', '?',
    '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
    'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
    '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    'x', 'y', 'z', '{', '|', '}', '~',
    // 0x7F: undefined in PDFDocEncoding (ISO 32000-2 Annex D); U+0000 placeholder
    '\u{0000}',
    // 0x80–0x8F
    '\u{2022}', // 0x80 → BULLET
    '\u{2020}', // 0x81 → DAGGER
    '\u{2021}', // 0x82 → DOUBLE DAGGER
    '\u{2026}', // 0x83 → HORIZONTAL ELLIPSIS
    '\u{2014}', // 0x84 → EM DASH
    '\u{2013}', // 0x85 → EN DASH
    '\u{0192}', // 0x86 → LATIN SMALL LETTER F WITH HOOK
    '\u{2044}', // 0x87 → FRACTION SLASH
    '\u{2039}', // 0x88 → SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    '\u{203A}', // 0x89 → SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    '\u{2212}', // 0x8A → MINUS SIGN
    '\u{2030}', // 0x8B → PER MILLE SIGN
    '\u{201E}', // 0x8C → DOUBLE LOW-9 QUOTATION MARK
    '\u{201C}', // 0x8D → LEFT DOUBLE QUOTATION MARK
    '\u{201D}', // 0x8E → RIGHT DOUBLE QUOTATION MARK
    '\u{2018}', // 0x8F → LEFT SINGLE QUOTATION MARK
    // 0x90–0x9F
    '\u{2019}', // 0x90 → RIGHT SINGLE QUOTATION MARK
    '\u{201A}', // 0x91 → SINGLE LOW-9 QUOTATION MARK
    '\u{2122}', // 0x92 → TRADE MARK SIGN
    '\u{FB01}', // 0x93 → LATIN SMALL LIGATURE FI
    '\u{FB02}', // 0x94 → LATIN SMALL LIGATURE FL
    '\u{0141}', // 0x95 → LATIN CAPITAL LETTER L WITH STROKE
    '\u{0152}', // 0x96 → LATIN CAPITAL LIGATURE OE
    '\u{0160}', // 0x97 → LATIN CAPITAL LETTER S WITH CARON
    '\u{0178}', // 0x98 → LATIN CAPITAL LETTER Y WITH DIAERESIS
    '\u{017D}', // 0x99 → LATIN CAPITAL LETTER Z WITH CARON
    '\u{0131}', // 0x9A → LATIN SMALL LETTER DOTLESS I
    '\u{0142}', // 0x9B → LATIN SMALL LETTER L WITH STROKE
    '\u{0153}', // 0x9C → LATIN SMALL LIGATURE OE
    '\u{0161}', // 0x9D → LATIN SMALL LETTER S WITH CARON
    '\u{017E}', // 0x9E → LATIN SMALL LETTER Z WITH CARON
    '\u{0000}', // 0x9F → UNDEFINED (U+0000 placeholder)
    // 0xA0–0xAF
    '\u{20AC}', // 0xA0 → EURO SIGN (not NO-BREAK SPACE as in Latin-1)
    '\u{00A1}', // 0xA1 → INVERTED EXCLAMATION MARK
    '\u{00A2}', // 0xA2 → CENT SIGN
    '\u{00A3}', // 0xA3 → POUND SIGN
    '\u{00A4}', // 0xA4 → CURRENCY SIGN
    '\u{00A5}', // 0xA5 → YEN SIGN
    '\u{00A6}', // 0xA6 → BROKEN BAR
    '\u{00A7}', // 0xA7 → SECTION SIGN
    '\u{00A8}', // 0xA8 → DIAERESIS
    '\u{00A9}', // 0xA9 → COPYRIGHT SIGN
    '\u{00AA}', // 0xAA → FEMININE ORDINAL INDICATOR
    '\u{00AB}', // 0xAB → LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    '\u{00AC}', // 0xAC → NOT SIGN
    '\u{0000}', // 0xAD → UNDEFINED (soft hyphen in Latin-1, undefined in PDFDocEncoding)
    '\u{00AE}', // 0xAE → REGISTERED SIGN
    '\u{00AF}', // 0xAF → MACRON
    // 0xB0–0xBF
    '\u{00B0}', // 0xB0 → DEGREE SIGN
    '\u{00B1}', // 0xB1 → PLUS-MINUS SIGN
    '\u{00B2}', // 0xB2 → SUPERSCRIPT TWO
    '\u{00B3}', // 0xB3 → SUPERSCRIPT THREE
    '\u{00B4}', // 0xB4 → ACUTE ACCENT
    '\u{00B5}', // 0xB5 → MICRO SIGN
    '\u{00B6}', // 0xB6 → PILCROW SIGN
    '\u{00B7}', // 0xB7 → MIDDLE DOT
    '\u{00B8}', // 0xB8 → CEDILLA
    '\u{00B9}', // 0xB9 → SUPERSCRIPT ONE
    '\u{00BA}', // 0xBA → MASCULINE ORDINAL INDICATOR
    '\u{00BB}', // 0xBB → RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    '\u{00BC}', // 0xBC → VULGAR FRACTION ONE QUARTER
    '\u{00BD}', // 0xBD → VULGAR FRACTION ONE HALF
    '\u{00BE}', // 0xBE → VULGAR FRACTION THREE QUARTERS
    '\u{00BF}', // 0xBF → INVERTED QUESTION MARK
    // 0xC0–0xCF
    '\u{00C0}', // 0xC0 → LATIN CAPITAL LETTER A WITH GRAVE
    '\u{00C1}', // 0xC1 → LATIN CAPITAL LETTER A WITH ACUTE
    '\u{00C2}', // 0xC2 → LATIN CAPITAL LETTER A WITH CIRCUMFLEX
    '\u{00C3}', // 0xC3 → LATIN CAPITAL LETTER A WITH TILDE
    '\u{00C4}', // 0xC4 → LATIN CAPITAL LETTER A WITH DIAERESIS
    '\u{00C5}', // 0xC5 → LATIN CAPITAL LETTER A WITH RING ABOVE
    '\u{00C6}', // 0xC6 → LATIN CAPITAL LETTER AE
    '\u{00C7}', // 0xC7 → LATIN CAPITAL LETTER C WITH CEDILLA
    '\u{00C8}', // 0xC8 → LATIN CAPITAL LETTER E WITH GRAVE
    '\u{00C9}', // 0xC9 → LATIN CAPITAL LETTER E WITH ACUTE
    '\u{00CA}', // 0xCA → LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    '\u{00CB}', // 0xCB → LATIN CAPITAL LETTER E WITH DIAERESIS
    '\u{00CC}', // 0xCC → LATIN CAPITAL LETTER I WITH GRAVE
    '\u{00CD}', // 0xCD → LATIN CAPITAL LETTER I WITH ACUTE
    '\u{00CE}', // 0xCE → LATIN CAPITAL LETTER I WITH CIRCUMFLEX
    '\u{00CF}', // 0xCF → LATIN CAPITAL LETTER I WITH DIAERESIS
    // 0xD0–0xDF
    '\u{00D0}', // 0xD0 → LATIN CAPITAL LETTER ETH
    '\u{00D1}', // 0xD1 → LATIN CAPITAL LETTER N WITH TILDE
    '\u{00D2}', // 0xD2 → LATIN CAPITAL LETTER O WITH GRAVE
    '\u{00D3}', // 0xD3 → LATIN CAPITAL LETTER O WITH ACUTE
    '\u{00D4}', // 0xD4 → LATIN CAPITAL LETTER O WITH CIRCUMFLEX
    '\u{00D5}', // 0xD5 → LATIN CAPITAL LETTER O WITH TILDE
    '\u{00D6}', // 0xD6 → LATIN CAPITAL LETTER O WITH DIAERESIS
    '\u{00D7}', // 0xD7 → MULTIPLICATION SIGN
    '\u{00D8}', // 0xD8 → LATIN CAPITAL LETTER O WITH STROKE
    '\u{00D9}', // 0xD9 → LATIN CAPITAL LETTER U WITH GRAVE
    '\u{00DA}', // 0xDA → LATIN CAPITAL LETTER U WITH ACUTE
    '\u{00DB}', // 0xDB → LATIN CAPITAL LETTER U WITH CIRCUMFLEX
    '\u{00DC}', // 0xDC → LATIN CAPITAL LETTER U WITH DIAERESIS
    '\u{00DD}', // 0xDD → LATIN CAPITAL LETTER Y WITH ACUTE
    '\u{00DE}', // 0xDE → LATIN CAPITAL LETTER THORN
    '\u{00DF}', // 0xDF → LATIN SMALL LETTER SHARP S
    // 0xE0–0xEF
    '\u{00E0}', // 0xE0 → LATIN SMALL LETTER A WITH GRAVE
    '\u{00E1}', // 0xE1 → LATIN SMALL LETTER A WITH ACUTE
    '\u{00E2}', // 0xE2 → LATIN SMALL LETTER A WITH CIRCUMFLEX
    '\u{00E3}', // 0xE3 → LATIN SMALL LETTER A WITH TILDE
    '\u{00E4}', // 0xE4 → LATIN SMALL LETTER A WITH DIAERESIS
    '\u{00E5}', // 0xE5 → LATIN SMALL LETTER A WITH RING ABOVE
    '\u{00E6}', // 0xE6 → LATIN SMALL LETTER AE
    '\u{00E7}', // 0xE7 → LATIN SMALL LETTER C WITH CEDILLA
    '\u{00E8}', // 0xE8 → LATIN SMALL LETTER E WITH GRAVE
    '\u{00E9}', // 0xE9 → LATIN SMALL LETTER E WITH ACUTE
    '\u{00EA}', // 0xEA → LATIN SMALL LETTER E WITH CIRCUMFLEX
    '\u{00EB}', // 0xEB → LATIN SMALL LETTER E WITH DIAERESIS
    '\u{00EC}', // 0xEC → LATIN SMALL LETTER I WITH GRAVE
    '\u{00ED}', // 0xED → LATIN SMALL LETTER I WITH ACUTE
    '\u{00EE}', // 0xEE → LATIN SMALL LETTER I WITH CIRCUMFLEX
    '\u{00EF}', // 0xEF → LATIN SMALL LETTER I WITH DIAERESIS
    // 0xF0–0xFF
    '\u{00F0}', // 0xF0 → LATIN SMALL LETTER ETH
    '\u{00F1}', // 0xF1 → LATIN SMALL LETTER N WITH TILDE
    '\u{00F2}', // 0xF2 → LATIN SMALL LETTER O WITH GRAVE
    '\u{00F3}', // 0xF3 → LATIN SMALL LETTER O WITH ACUTE
    '\u{00F4}', // 0xF4 → LATIN SMALL LETTER O WITH CIRCUMFLEX
    '\u{00F5}', // 0xF5 → LATIN SMALL LETTER O WITH TILDE
    '\u{00F6}', // 0xF6 → LATIN SMALL LETTER O WITH DIAERESIS
    '\u{00F7}', // 0xF7 → DIVISION SIGN
    '\u{00F8}', // 0xF8 → LATIN SMALL LETTER O WITH STROKE
    '\u{00F9}', // 0xF9 → LATIN SMALL LETTER U WITH GRAVE
    '\u{00FA}', // 0xFA → LATIN SMALL LETTER U WITH ACUTE
    '\u{00FB}', // 0xFB → LATIN SMALL LETTER U WITH CIRCUMFLEX
    '\u{00FC}', // 0xFC → LATIN SMALL LETTER U WITH DIAERESIS
    '\u{00FD}', // 0xFD → LATIN SMALL LETTER Y WITH ACUTE
    '\u{00FE}', // 0xFE → LATIN SMALL LETTER THORN
    '\u{00FF}', // 0xFF → LATIN SMALL LETTER Y WITH DIAERESIS
];
426
#[cfg(test)]
mod tests {
    //! Unit tests for `PdfString` and the PDFDocEncoding helpers, including
    //! cases ported from PDFium's `fpdf_parser_decode_unittest.cpp`.

    use super::*;

    #[test]
    fn test_pdfdoc_encoding_detection() {
        let s = PdfString::from_bytes(b"Hello".to_vec());
        assert_eq!(s.encoding(), PdfStringEncoding::PdfDocEncoding);
    }

    #[test]
    fn test_utf16be_encoding_detection() {
        let mut bytes = vec![0xFE, 0xFF];
        // "Hi" in UTF-16BE
        bytes.extend_from_slice(&[0x00, 0x48, 0x00, 0x69]);
        let s = PdfString::from_bytes(bytes);
        assert_eq!(s.encoding(), PdfStringEncoding::Utf16Be);
    }

    #[test]
    fn test_pdfdoc_ascii_roundtrip() {
        let s = PdfString::from_bytes(b"Hello, World!".to_vec());
        assert_eq!(s.to_string_lossy(), "Hello, World!");
    }

    #[test]
    fn test_utf16be_decode() {
        let mut bytes = vec![0xFE, 0xFF];
        // "ABC" in UTF-16BE
        bytes.extend_from_slice(&[0x00, 0x41, 0x00, 0x42, 0x00, 0x43]);
        let s = PdfString::from_bytes(bytes);
        assert_eq!(s.to_string_lossy(), "ABC");
    }

    #[test]
    fn test_utf16be_decode_non_ascii() {
        let mut bytes = vec![0xFE, 0xFF];
        // U+00E9 (é) in UTF-16BE
        bytes.extend_from_slice(&[0x00, 0xE9]);
        let s = PdfString::from_bytes(bytes);
        assert_eq!(s.to_string_lossy(), "\u{00E9}");
    }

    #[test]
    fn test_pdfdoc_high_bytes() {
        // 0x80 → BULLET (U+2022)
        let s = PdfString::from_bytes(vec![0x80]);
        assert_eq!(s.to_string_lossy(), "\u{2022}");

        // 0x84 → EM DASH (U+2014)
        let s = PdfString::from_bytes(vec![0x84]);
        assert_eq!(s.to_string_lossy(), "\u{2014}");

        // 0x85 → EN DASH (U+2013)
        let s = PdfString::from_bytes(vec![0x85]);
        assert_eq!(s.to_string_lossy(), "\u{2013}");

        // 0x8D → LEFT DOUBLE QUOTATION MARK (U+201C)
        let s = PdfString::from_bytes(vec![0x8D]);
        assert_eq!(s.to_string_lossy(), "\u{201C}");

        // 0x8E → RIGHT DOUBLE QUOTATION MARK (U+201D)
        let s = PdfString::from_bytes(vec![0x8E]);
        assert_eq!(s.to_string_lossy(), "\u{201D}");

        // 0xA0 → EURO SIGN (U+20AC)
        let s = PdfString::from_bytes(vec![0xA0]);
        assert_eq!(s.to_string_lossy(), "\u{20AC}");
    }

    #[test]
    fn test_pdfdoc_special_low_bytes() {
        // 0x08–0x17: control chars (straight mapping per ISO 32000-2 Annex D)
        assert_eq!(pdfdoc_to_char(0x08), '\u{0008}'); // BS
        assert_eq!(pdfdoc_to_char(0x09), '\t'); // HT
        // 0x18–0x1F: typographic chars (ISO 32000-2 Annex D §D.2)
        assert_eq!(pdfdoc_to_char(0x18), '\u{02D8}'); // BREVE
        assert_eq!(pdfdoc_to_char(0x19), '\u{02C7}'); // CARON
        assert_eq!(pdfdoc_to_char(0x1A), '\u{02C6}'); // MODIFIER LETTER CIRCUMFLEX ACCENT
        assert_eq!(pdfdoc_to_char(0x1B), '\u{02D9}'); // DOT ABOVE
        assert_eq!(pdfdoc_to_char(0x1C), '\u{02DD}'); // DOUBLE ACUTE ACCENT
        assert_eq!(pdfdoc_to_char(0x1D), '\u{02DB}'); // OGONEK
        assert_eq!(pdfdoc_to_char(0x1E), '\u{02DA}'); // RING ABOVE
        assert_eq!(pdfdoc_to_char(0x1F), '\u{02DC}'); // SMALL TILDE
    }

    #[test]
    fn test_pdfdoc_undefined_bytes() {
        // 0x7F, 0x9F, 0xAD: undefined in PDFDocEncoding → U+0000 (per ISO 32000-2 Annex D)
        assert_eq!(pdfdoc_to_char(0x7F), '\u{0000}');
        assert_eq!(pdfdoc_to_char(0x9F), '\u{0000}');
        assert_eq!(pdfdoc_to_char(0xAD), '\u{0000}');
    }

    #[test]
    fn test_pdfdoc_latin1_range() {
        // 0xA1–0xFF map to U+00A1–U+00FF (Latin-1 Supplement), except 0xAD,
        // which is undefined and covered by `test_pdfdoc_undefined_bytes`.
        // (0xA0 is EURO SIGN in PDFDocEncoding, so the range starts at 0xA1.)
        for byte in (0xA1u8..=0xFF).filter(|&b| b != 0xAD) {
            let ch = pdfdoc_to_char(byte);
            assert_eq!(ch as u32, byte as u32, "byte 0x{byte:02X}");
        }
    }

    #[test]
    fn test_pdfdoc_ascii_range() {
        // 0x20–0x7E should map to their ASCII codepoints
        for byte in 0x20u8..=0x7E {
            let ch = pdfdoc_to_char(byte);
            assert_eq!(ch as u32, byte as u32, "byte 0x{byte:02X}");
        }
    }

    #[test]
    fn test_pdfdoc_encoding_table_has_256_entries() {
        assert_eq!(PDFDOC_ENCODING_TABLE.len(), 256);
    }

    #[test]
    fn test_empty_string() {
        let s = PdfString::from_bytes(Vec::new());
        assert!(s.is_empty());
        assert_eq!(s.len(), 0);
        assert_eq!(s.to_string_lossy(), "");
    }

    #[test]
    fn test_display_trait() {
        let s = PdfString::from_bytes(b"test".to_vec());
        assert_eq!(format!("{s}"), "test");
    }

    #[test]
    fn test_equality() {
        let a = PdfString::from_bytes(b"abc".to_vec());
        let b = PdfString::from_bytes(b"abc".to_vec());
        let c = PdfString::from_bytes(b"def".to_vec());
        assert_eq!(a, b);
        assert_ne!(a, c);
    }

    #[test]
    fn test_pdf_string_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<PdfString>();
    }

    #[test]
    fn test_pdfdoc_ligatures() {
        // 0x93 → LATIN SMALL LIGATURE FI (U+FB01)
        assert_eq!(pdfdoc_to_char(0x93), '\u{FB01}');
        // 0x94 → LATIN SMALL LIGATURE FL (U+FB02)
        assert_eq!(pdfdoc_to_char(0x94), '\u{FB02}');
    }

    #[test]
    fn test_pdfdoc_quotation_marks() {
        assert_eq!(pdfdoc_to_char(0x8F), '\u{2018}'); // LEFT SINGLE QUOTATION MARK
        assert_eq!(pdfdoc_to_char(0x90), '\u{2019}'); // RIGHT SINGLE QUOTATION MARK
        assert_eq!(pdfdoc_to_char(0x91), '\u{201A}'); // SINGLE LOW-9 QUOTATION MARK
        assert_eq!(pdfdoc_to_char(0x8C), '\u{201E}'); // DOUBLE LOW-9 QUOTATION MARK
        assert_eq!(pdfdoc_to_char(0x8D), '\u{201C}'); // LEFT DOUBLE QUOTATION MARK
        assert_eq!(pdfdoc_to_char(0x8E), '\u{201D}'); // RIGHT DOUBLE QUOTATION MARK
    }

    #[test]
    fn test_utf16be_odd_byte_count_ignored() {
        // UTF-16BE with odd trailing byte — chunks_exact(2) skips it
        let bytes = vec![0xFE, 0xFF, 0x00, 0x41, 0x00];
        let s = PdfString::from_bytes(bytes);
        assert_eq!(s.to_string_lossy(), "A");
    }

    #[test]
    fn test_pdfdoc_control_chars() {
        // Bytes 0x00-0x07 map to U+0000-U+0007
        for byte in 0x00u8..=0x07 {
            let ch = pdfdoc_to_char(byte);
            assert_eq!(
                ch as u32, byte as u32,
                "byte 0x{byte:02X} should map to U+{:04X}",
                byte as u32
            );
        }
    }

    #[test]
    fn test_pdfdoc_high_byte_mappings() {
        // Verify several spec-defined high-byte mappings
        assert_eq!(pdfdoc_to_char(0x80), '\u{2022}'); // BULLET
        assert_eq!(pdfdoc_to_char(0x81), '\u{2020}'); // DAGGER
        assert_eq!(pdfdoc_to_char(0x82), '\u{2021}'); // DOUBLE DAGGER
        assert_eq!(pdfdoc_to_char(0x83), '\u{2026}'); // HORIZONTAL ELLIPSIS
        assert_eq!(pdfdoc_to_char(0x86), '\u{0192}'); // LATIN SMALL LETTER F WITH HOOK
        assert_eq!(pdfdoc_to_char(0x87), '\u{2044}'); // FRACTION SLASH
        assert_eq!(pdfdoc_to_char(0x8A), '\u{2212}'); // MINUS SIGN
        assert_eq!(pdfdoc_to_char(0x8B), '\u{2030}'); // PER MILLE SIGN
        assert_eq!(pdfdoc_to_char(0x92), '\u{2122}'); // TRADE MARK SIGN
        assert_eq!(pdfdoc_to_char(0xA0), '\u{20AC}'); // EURO SIGN
    }

    #[test]
    fn test_utf16be_with_null_chars() {
        // BOM + U+0000 (null) + U+0041 ('A')
        let bytes = vec![0xFE, 0xFF, 0x00, 0x00, 0x00, 0x41];
        let s = PdfString::from_bytes(bytes);
        assert_eq!(s.to_string_lossy(), "\0A");
    }

    // -----------------------------------------------------------------------
    // Tests ported from upstream fpdf_parser_decode_unittest.cpp
    // (PDF_DecodeText / PDF_EncodeText equivalents)
    // -----------------------------------------------------------------------

    /// Upstream: TEST(ParserDecodeTest, DecodeText) — empty string
    #[test]
    fn test_parser_decode_text_empty() {
        let s = PdfString::from_bytes(vec![]);
        assert_eq!(s.to_string_lossy(), "");
    }

    /// Upstream: TEST(ParserDecodeTest, DecodeText) — ASCII text
    #[test]
    fn test_parser_decode_text_ascii() {
        let s = PdfString::from_bytes(b"the quick\tfox".to_vec());
        // PDFDocEncoding maps 0x09 straight to U+0009 (HT), so the decoded
        // text is identical to the input, matching the upstream expectation
        // L"the quick\tfox".
        assert_eq!(s.to_string_lossy(), "the quick\tfox");
    }

    /// Upstream: TEST(ParserDecodeTest, DecodeText) — UTF-16BE text
    #[test]
    fn test_parser_decode_text_utf16be() {
        // BOM + U+0330 + U+0331
        let bytes = vec![0xFE, 0xFF, 0x03, 0x30, 0x03, 0x31];
        let s = PdfString::from_bytes(bytes);
        assert_eq!(s.to_string_lossy(), "\u{0330}\u{0331}");
    }

    /// Upstream: TEST(ParserDecodeTest, DecodeText) — more UTF-16BE text
    #[test]
    fn test_parser_decode_text_utf16be_cjk() {
        let bytes = vec![
            0xFE, 0xFF, 0x7F, 0x51, 0x98, 0x75, 0x00, 0x20, 0x56, 0xFE, 0x72, 0x47, 0x00, 0x20,
            0x8D, 0x44, 0x8B, 0xAF, 0x66, 0xF4, 0x59, 0x1A, 0x00, 0x20, 0x00, 0xBB,
        ];
        let s = PdfString::from_bytes(bytes);
        assert_eq!(
            s.to_string_lossy(),
            "\u{7F51}\u{9875}\u{0020}\u{56FE}\u{7247}\u{0020}\u{8D44}\u{8BAF}\u{66F4}\u{591A}\u{0020}\u{00BB}"
        );
    }

    /// Upstream: TEST(ParserDecodeTest, DecodeText) — supplementary UTF-16BE text
    #[test]
    fn test_parser_decode_text_utf16be_supplementary() {
        // BOM + surrogate pair D83C DFA8 for U+1F3A8 (ARTIST PALETTE).
        let bytes = vec![0xFE, 0xFF, 0xD8, 0x3C, 0xDF, 0xA8];
        let s = PdfString::from_bytes(bytes);
        // A well-formed pair must decode to the supplementary character
        // itself, never to a replacement character.
        assert_eq!(s.to_string_lossy(), "\u{1F3A8}");
    }

    /// Upstream: TEST(ParserDecodeTest, DecodeTextWithUnpairedSurrogates)
    ///
    /// Unpaired surrogates in UTF-16BE → replacement characters.
    #[test]
    fn test_parser_decode_text_unpaired_surrogates() {
        // High surrogate alone: D800
        let bytes = vec![0xFE, 0xFF, 0xD8, 0x00];
        let s = PdfString::from_bytes(bytes);
        // from_utf16_lossy replaces each unpaired surrogate with one U+FFFD
        assert_eq!(
            s.to_string_lossy(),
            "\u{FFFD}",
            "high surrogate alone should produce replacement char"
        );

        // Low surrogate alone: DC00
        let bytes = vec![0xFE, 0xFF, 0xDC, 0x00];
        let s = PdfString::from_bytes(bytes);
        assert_eq!(
            s.to_string_lossy(),
            "\u{FFFD}",
            "low surrogate alone should produce replacement char"
        );
    }

    /// Upstream: TEST(ParserDecodeTest, RoundTripText) — PDFDocEncoding decode
    ///
    /// Each single-byte PDFDocEncoding value decodes to exactly the character
    /// `pdfdoc_to_char` reports (undefined codepoints 0x7F, 0x9F, 0xAD map to
    /// U+0000 per ISO 32000-2 Annex D).
    #[test]
    fn test_parser_decode_text_pdfdoc_roundtrip() {
        for byte in 0u8..=255 {
            let s = PdfString::from_bytes(vec![byte]);
            let decoded = s.to_string_lossy();

            match byte {
                0x7F | 0x9F | 0xAD => {
                    // Undefined in PDFDocEncoding → U+0000 (per ISO 32000-2 Annex D)
                    assert_eq!(
                        decoded, "\u{0000}",
                        "byte 0x{byte:02X} should map to U+0000"
                    );
                }
                _ => {
                    // The character should be valid and recoverable
                    let ch = pdfdoc_to_char(byte);
                    assert_eq!(
                        decoded.chars().next(),
                        Some(ch),
                        "byte 0x{byte:02X} should decode to U+{:04X}",
                        ch as u32
                    );
                }
            }
        }
    }

    /// Upstream: TEST(ParserDecodeTest, DecodeText) — UTF-8 with BOM
    #[test]
    fn test_parser_decode_text_utf8_bom() {
        // UTF-8 BOM (0xEF 0xBB 0xBF) + U+0330 U+0331 encoded in UTF-8
        let bytes = vec![0xEF, 0xBB, 0xBF, 0xCC, 0xB0, 0xCC, 0xB1];
        let s = PdfString::from_bytes(bytes);
        assert_eq!(s.encoding(), PdfStringEncoding::Utf8Bom);
        assert_eq!(s.to_string_lossy(), "\u{0330}\u{0331}");
    }

    /// Upstream: TEST(ParserDecodeTest, DecodeText) — supplementary UTF-8 BOM
    #[test]
    fn test_parser_decode_text_utf8_bom_supplementary() {
        // UTF-8 BOM + U+1F3A8 (ARTIST PALETTE 🎨) in UTF-8
        let bytes = vec![0xEF, 0xBB, 0xBF, 0xF0, 0x9F, 0x8E, 0xA8];
        let s = PdfString::from_bytes(bytes);
        assert_eq!(s.encoding(), PdfStringEncoding::Utf8Bom);
        assert_eq!(s.to_string_lossy(), "\u{1F3A8}");
    }

    /// Upstream: TEST(ParserDecodeTest, DecodeTextWithUnicodeEscapes) — UTF-8 BOM
    ///
    /// Language-tag escapes (U+001B...U+001B) are stripped after decoding.
    #[test]
    fn test_parser_decode_text_with_unicode_escapes_utf8_bom() {
        // UTF-8 BOM + ESC "ja" ESC + U+0020 + U+5370 U+5237 (印刷)
        // 0x1B 0x6A 0x61 = ESC j a (language tag "ja")
        // 0x1B = closing ESC
        // 0x20 = SPACE, 0xE5 0x8D 0xB0 0xE5 0x88 0xB7 = 印刷 in UTF-8
        let bytes = vec![
            0xEF, 0xBB, 0xBF, 0x1B, 0x6A, 0x61, 0x1B, 0x20, 0xE5, 0x8D, 0xB0, 0xE5, 0x88, 0xB7,
        ];
        let s = PdfString::from_bytes(bytes);
        assert_eq!(s.to_string_lossy(), "\u{0020}\u{5370}\u{5237}");
    }

    /// Upstream: TEST(ParserDecodeTest, DecodeTextWithUnicodeEscapes) — UTF-16BE
    #[test]
    fn test_parser_decode_text_with_unicode_escapes_utf16be() {
        // UTF-16BE BOM + ESC + tag bytes `6A 61` ("ja", read as a single
        // code unit) + ESC + U+0020 + U+5370 U+5237
        let bytes = vec![
            0xFE, 0xFF, 0x00, 0x1B, 0x6A, 0x61, 0x00, 0x1B, 0x00, 0x20, 0x53, 0x70, 0x52, 0x37,
        ];
        let s = PdfString::from_bytes(bytes);
        assert_eq!(s.to_string_lossy(), "\u{0020}\u{5370}\u{5237}");
    }

    /// Upstream: TEST(ParserDecodeTest, DecodeTextWithUnicodeEscapes) — trailing char
    #[test]
    fn test_parser_decode_text_with_unicode_escapes_trailing_char() {
        // UTF-16BE BOM + ESC + tag bytes `6A 61 4A 50` ("jaJP", two code
        // units) + ESC + U+0020 + U+5237. Everything between the two ESCs is
        // dropped regardless of its length.
        let bytes = vec![
            0xFE, 0xFF, 0x00, 0x1B, 0x6A, 0x61, 0x4A, 0x50, 0x00, 0x1B, 0x00, 0x20, 0x52, 0x37,
        ];
        let s = PdfString::from_bytes(bytes);
        assert_eq!(s.to_string_lossy(), "\u{0020}\u{5237}");
    }

    /// Upstream: TEST(ParserDecodeTest, DecodeTextWithInvalidUnicodeEscapes) — empty tags
    #[test]
    fn test_parser_decode_text_with_invalid_unicode_escapes_empty() {
        // UTF-8 BOM + ESC ESC (empty language tag)
        let s = PdfString::from_bytes(vec![0xEF, 0xBB, 0xBF, 0x1B, 0x1B]);
        assert_eq!(s.to_string_lossy(), "");

        // UTF-16BE + ESC ESC
        let s = PdfString::from_bytes(vec![0xFE, 0xFF, 0x00, 0x1B, 0x00, 0x1B]);
        assert_eq!(s.to_string_lossy(), "");

        // UTF-16BE + ESC ESC + trailing byte (odd-pair — ignored by chunks_exact)
        let s = PdfString::from_bytes(vec![0xFE, 0xFF, 0x00, 0x1B, 0x00, 0x1B, 0x20]);
        assert_eq!(s.to_string_lossy(), "");
    }

    /// Upstream: TEST(ParserDecodeTest, DecodeTextWithInvalidUnicodeEscapes) — text after
    #[test]
    fn test_parser_decode_text_with_invalid_unicode_escapes_text_after() {
        // UTF-8 BOM + ESC ESC + SPACE
        let s = PdfString::from_bytes(vec![0xEF, 0xBB, 0xBF, 0x1B, 0x1B, 0x20]);
        assert_eq!(s.to_string_lossy(), " ");

        // UTF-16BE + ESC ESC + U+0020
        let s = PdfString::from_bytes(vec![0xFE, 0xFF, 0x00, 0x1B, 0x00, 0x1B, 0x00, 0x20]);
        assert_eq!(s.to_string_lossy(), " ");
    }

    /// Upstream: TEST(ParserDecodeTest, EncodeText) — empty
    #[test]
    fn test_parser_encode_text_empty() {
        let s = PdfString::from_unicode("");
        assert_eq!(s.as_bytes(), b"");
    }

    /// Upstream: TEST(ParserDecodeTest, EncodeText) — ASCII
    #[test]
    fn test_parser_encode_text_ascii() {
        let s = PdfString::from_unicode("the quick\tfox");
        assert_eq!(s.encoding(), PdfStringEncoding::PdfDocEncoding);
        assert_eq!(s.as_bytes(), b"the quick\tfox");
    }

    /// Upstream: TEST(ParserDecodeTest, EncodeText) — Unicode
    #[test]
    fn test_parser_encode_text_unicode() {
        // U+0330 U+0331 not in PDFDocEncoding → UTF-16BE with BOM
        let s = PdfString::from_unicode("\u{0330}\u{0331}");
        assert_eq!(s.encoding(), PdfStringEncoding::Utf16Be);
        assert_eq!(s.as_bytes(), &[0xFE, 0xFF, 0x03, 0x30, 0x03, 0x31]);
    }

    /// Upstream: TEST(ParserDecodeTest, EncodeText) — supplementary
    #[test]
    fn test_parser_encode_text_supplementary() {
        // U+1F3A8 (🎨) requires surrogate pair in UTF-16
        let s = PdfString::from_unicode("\u{1F3A8}");
        assert_eq!(s.encoding(), PdfStringEncoding::Utf16Be);
        assert_eq!(s.as_bytes(), &[0xFE, 0xFF, 0xD8, 0x3C, 0xDF, 0xA8]);
    }

    /// Upstream: TEST(ParserDecodeTest, RoundTripText)
    ///
    /// Each PDFDocEncoding byte (0x00–0xFF) round-trips through decode→encode.
    /// Bytes 0x7F, 0x9F, 0xAD are "undefined" (map to U+0000 per ISO 32000-2
    /// Annex D); U+0000 re-encodes as PDFDocEncoding byte 0x00.
    #[test]
    fn test_parser_decode_text_pdfdoc_roundtrip_all_bytes() {
        for byte in 0u8..=0xFF {
            let original = PdfString::from_bytes(vec![byte]);
            let decoded = original.to_string_lossy();
            let reencoded = PdfString::from_unicode(&decoded);

            match byte {
                0x7F | 0x9F | 0xAD => {
                    // Undefined bytes decode to U+0000; U+0000 re-encodes as
                    // PDFDocEncoding byte 0x00.
                    assert_eq!(
                        reencoded.as_bytes(),
                        &[0x00u8],
                        "byte 0x{:02X} should re-encode as PDFDocEncoding 0x00",
                        byte
                    );
                }
                _ => {
                    assert_eq!(
                        reencoded.as_bytes(),
                        &[byte],
                        "byte 0x{:02X} should round-trip",
                        byte
                    );
                }
            }
        }
    }

    /// char_to_pdfdoc: basic checks
    #[test]
    fn test_char_to_pdfdoc_basic() {
        assert_eq!(char_to_pdfdoc(' '), Some(0x20));
        assert_eq!(char_to_pdfdoc('A'), Some(0x41));
        assert_eq!(char_to_pdfdoc('\u{FFFD}'), None); // replacement char — not in PDFDocEncoding
        assert_eq!(char_to_pdfdoc('\u{5370}'), None); // CJK — not in PDFDocEncoding
    }
}
919}