rpdfium_core/
bytestring.rs

1// Derived from PDFium's core/fxcrt/bytestring.cpp
2// Original: Copyright 2014 The PDFium Authors
3// Licensed under BSD-3-Clause / Apache-2.0
4// See pdfium-upstream/LICENSE for the original license.
5
6//! Encoding-aware PDF string type.
7//!
8//! PDF strings use three encodings (ISO 32000-2 §7.9):
9//!
10//! - **PDFDocEncoding**: A superset of ISO Latin-1. Used for most string values
11//!   (metadata, bookmark titles, form field values). The default when no BOM is present.
12//! - **UTF-16BE**: Indicated by a byte-order mark (`0xFE 0xFF`) at the start.
13//!   Used for Unicode strings that cannot be represented in PDFDocEncoding.
14//! - **UTF-8**: Indicated by a UTF-8 BOM (`0xEF 0xBB 0xBF`) at the start.
15//!   Less common; treated identically to UTF-16BE after decoding.
16
17use std::fmt;
18
/// A PDF string with encoding-aware conversion.
///
/// Stores the raw bytes as they appear in the PDF file. The encoding is not
/// stored separately; it is detected from the leading bytes on demand (see
/// [`PdfString::encoding`]):
///
/// - `0xFE 0xFF` (BOM) → UTF-16BE
/// - `0xEF 0xBB 0xBF` (BOM) → UTF-8
/// - anything else → PDFDocEncoding
///
/// Equality and hashing are derived over the raw bytes, so two strings that
/// decode to the same text through different encodings compare as unequal.
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct PdfString {
    /// Raw, undecoded bytes, including any leading byte-order mark.
    bytes: Vec<u8>,
}
28
/// The encoding used by a [`PdfString`].
///
/// Returned by [`PdfString::encoding`], which selects the variant by checking
/// the leading bytes for a byte-order mark.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PdfStringEncoding {
    /// PDFDocEncoding — a superset of ISO Latin-1 (ISO 32000-2 Annex D).
    /// Reported whenever neither byte-order mark is present.
    PdfDocEncoding,
    /// UTF-16BE with byte-order mark (`0xFE 0xFF`).
    Utf16Be,
    /// UTF-8 with byte-order mark (`0xEF 0xBB 0xBF`).
    Utf8Bom,
}
39
40impl PdfString {
41    /// Create a `PdfString` from raw bytes (as parsed from the PDF).
42    pub fn from_bytes(bytes: Vec<u8>) -> Self {
43        Self { bytes }
44    }
45
46    /// Encode a UTF-8 string as a PDF string.
47    ///
48    /// Uses PDFDocEncoding if every character is representable; otherwise uses
49    /// UTF-16BE with a `0xFE 0xFF` byte-order mark. This matches the logic of
50    /// `PDF_EncodeText()` in PDFium upstream.
51    ///
52    /// # Examples
53    /// ```
54    /// # use rpdfium_core::{PdfString, PdfStringEncoding};
55    /// let ascii = PdfString::from_unicode("hello");
56    /// assert_eq!(ascii.encoding(), PdfStringEncoding::PdfDocEncoding);
57    ///
58    /// let unicode = PdfString::from_unicode("日本語");
59    /// assert_eq!(unicode.encoding(), PdfStringEncoding::Utf16Be);
60    /// ```
61    pub fn from_unicode(s: &str) -> Self {
62        if s.is_empty() {
63            return Self { bytes: Vec::new() };
64        }
65        // Try PDFDocEncoding first (PDF_EncodeText logic from upstream).
66        let mut bytes = Vec::with_capacity(s.len());
67        for ch in s.chars() {
68            match char_to_pdfdoc(ch) {
69                Some(byte) => bytes.push(byte),
70                None => return Self::encode_utf16be(s),
71            }
72        }
73        Self { bytes }
74    }
75
76    /// Encode as UTF-16BE with BOM (internal helper).
77    fn encode_utf16be(s: &str) -> Self {
78        let mut bytes = Vec::with_capacity(2 + s.len() * 2);
79        bytes.push(0xFE);
80        bytes.push(0xFF);
81        for unit in s.encode_utf16() {
82            bytes.push((unit >> 8) as u8);
83            bytes.push(unit as u8);
84        }
85        Self { bytes }
86    }
87
88    /// Raw bytes (for binary operations, stream `/Length`, etc.).
89    #[inline]
90    pub fn as_bytes(&self) -> &[u8] {
91        &self.bytes
92    }
93
94    /// Detect encoding from the byte-order mark.
95    ///
96    /// - `0xFE 0xFF` → [`PdfStringEncoding::Utf16Be`]
97    /// - `0xEF 0xBB 0xBF` → [`PdfStringEncoding::Utf8Bom`]
98    /// - Otherwise → [`PdfStringEncoding::PdfDocEncoding`]
99    pub fn encoding(&self) -> PdfStringEncoding {
100        if self.bytes.starts_with(&[0xFE, 0xFF]) {
101            PdfStringEncoding::Utf16Be
102        } else if self.bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
103            PdfStringEncoding::Utf8Bom
104        } else {
105            PdfStringEncoding::PdfDocEncoding
106        }
107    }
108
109    /// Decode to a Rust [`String`] (UTF-8), handling all PDF string encodings.
110    ///
111    /// - UTF-16BE: decoded with surrogate-pair support; invalid pairs → U+FFFD.
112    /// - UTF-8 BOM: decoded as UTF-8 after stripping the BOM.
113    /// - PDFDocEncoding: each byte mapped to Unicode per ISO 32000-2 Annex D.
114    ///
115    /// ISO 2022 language-tag escape sequences (`U+001B…U+001B`) present in
116    /// UTF-16BE and UTF-8 BOM strings are stripped, matching the behaviour of
117    /// `StripLanguageCodes()` / `PDF_DecodeText()` in PDFium upstream.
118    pub fn to_string_lossy(&self) -> String {
119        match self.encoding() {
120            PdfStringEncoding::Utf16Be => {
121                let u16s: Vec<u16> = self.bytes[2..]
122                    .chunks_exact(2)
123                    .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
124                    .collect();
125                strip_language_codes(String::from_utf16_lossy(&u16s))
126            }
127            PdfStringEncoding::Utf8Bom => {
128                let utf8 = std::str::from_utf8(&self.bytes[3..]).unwrap_or("");
129                strip_language_codes(utf8.to_owned())
130            }
131            PdfStringEncoding::PdfDocEncoding => {
132                self.bytes.iter().map(|&b| pdfdoc_to_char(b)).collect()
133            }
134        }
135    }
136
137    /// Returns `true` if the string has no bytes.
138    pub fn is_empty(&self) -> bool {
139        self.bytes.is_empty()
140    }
141
142    /// Returns the length of the raw byte representation.
143    pub fn len(&self) -> usize {
144        self.bytes.len()
145    }
146
147    /// Decode to a Rust [`String`] (UTF-8), handling both PDF encodings.
148    ///
149    /// Deprecated; use [`to_string_lossy`](Self::to_string_lossy) instead.
150    #[deprecated(note = "use `to_string_lossy()` instead")]
151    #[inline]
152    pub fn unicode_data(&self) -> String {
153        self.to_string_lossy()
154    }
155
156    /// Upstream-aligned alias for [`to_string_lossy`](Self::to_string_lossy).
157    ///
158    /// Corresponds to `ByteString::GetUnicodeData()` in PDFium upstream.
159    #[inline]
160    pub fn get_unicode_data(&self) -> String {
161        self.to_string_lossy()
162    }
163
164    /// Upstream-aligned alias for [`as_bytes`](Self::as_bytes).
165    ///
166    /// Corresponds to `ByteString::GetRawString()` in PDFium upstream.
167    #[inline]
168    pub fn get_raw_string(&self) -> &[u8] {
169        self.as_bytes()
170    }
171}
172
173impl fmt::Debug for PdfString {
174    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
175        f.debug_struct("PdfString")
176            .field("encoding", &self.encoding())
177            .field("text", &self.to_string_lossy())
178            .field("len", &self.bytes.len())
179            .finish()
180    }
181}
182
183impl fmt::Display for PdfString {
184    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
185        f.write_str(&self.to_string_lossy())
186    }
187}
188
189/// Map a single byte to its Unicode codepoint under PDFDocEncoding.
190///
191/// This is the complete PDFDocEncoding table as defined in ISO 32000-2 Annex D.
192/// Bytes 0x00–0x17 map to U+0000–U+0017 (control characters, straight mapping).
193/// Bytes 0x18–0x1F map to typographic Unicode codepoints per ISO 32000-2 Annex D.
194/// Bytes 0x7F, 0x9F, and 0xAD are undefined (map to U+0000).
195pub fn pdfdoc_to_char(byte: u8) -> char {
196    PDFDOC_ENCODING_TABLE[byte as usize]
197}
198
199/// Reverse-map a Unicode character to a PDFDocEncoding byte.
200///
201/// Returns `None` if the character is not representable in PDFDocEncoding
202/// (characters that have no PDFDocEncoding byte, e.g. CJK, emoji).
203///
204/// Corresponds to the inner loop of `PDF_EncodeText()` in PDFium upstream.
205pub fn char_to_pdfdoc(ch: char) -> Option<u8> {
206    PDFDOC_ENCODING_TABLE
207        .iter()
208        .position(|&c| c == ch)
209        .map(|i| i as u8)
210}
211
/// Remove ISO 2022 language-tag escape sequences from decoded text.
///
/// A tag starts at U+001B (ESCAPE) and runs up to and including the next
/// U+001B; an unterminated tag runs to the end of the string. Everything
/// outside the tags is kept in order.
///
/// Corresponds to `StripLanguageCodes()` in PDFium upstream
/// (`core/fpdfapi/parser/fpdf_parser_decode.cpp`).
fn strip_language_codes(s: String) -> String {
    const ESC: char = '\u{001B}';

    if s.contains(ESC) {
        // Splitting on ESC alternates between kept text (even segments) and
        // tag contents (odd segments), so keeping every other piece drops the
        // tags together with their delimiters.
        s.split(ESC).step_by(2).collect()
    } else {
        // Common case: nothing to strip, hand the input back untouched.
        s
    }
}
239
/// Complete PDFDocEncoding → Unicode mapping table (ISO 32000-2 Annex D).
///
/// 256 entries, one for each possible byte value 0x00–0xFF, indexed by the
/// byte. The three bytes that are undefined in this table (0x7F, 0x9F, 0xAD)
/// hold U+0000 as a placeholder, so U+0000 appears at four indices; every
/// other character appears exactly once.
#[rustfmt::skip]
const PDFDOC_ENCODING_TABLE: [char; 256] = [
    // 0x00–0x07: control characters (mapped to Unicode control chars)
    '\u{0000}', '\u{0001}', '\u{0002}', '\u{0003}',
    '\u{0004}', '\u{0005}', '\u{0006}', '\u{0007}',
    // 0x08–0x17: control characters (straight mapping, per ISO 32000-2 Annex D)
    '\u{0008}', // 0x08 → BS (BACKSPACE)
    '\u{0009}', // 0x09 → HT (HORIZONTAL TAB)
    '\u{000A}', // 0x0A → LF (LINE FEED)
    '\u{000B}', // 0x0B → VT (VERTICAL TAB)
    '\u{000C}', // 0x0C → FF (FORM FEED)
    '\u{000D}', // 0x0D → CR (CARRIAGE RETURN)
    '\u{000E}', // 0x0E → SO (SHIFT OUT)
    '\u{000F}', // 0x0F → SI (SHIFT IN)
    '\u{0010}', // 0x10 → DLE
    '\u{0011}', // 0x11 → DC1
    '\u{0012}', // 0x12 → DC2
    '\u{0013}', // 0x13 → DC3
    '\u{0014}', // 0x14 → DC4
    '\u{0015}', // 0x15 → NAK
    '\u{0016}', // 0x16 → SYN
    '\u{0017}', // 0x17 → ETB
    // 0x18–0x1F: typographic characters (ISO 32000-2 Annex D §D.2)
    '\u{02D8}', // 0x18 → BREVE
    '\u{02C7}', // 0x19 → CARON
    '\u{02C6}', // 0x1A → MODIFIER LETTER CIRCUMFLEX ACCENT
    '\u{02D9}', // 0x1B → DOT ABOVE
    '\u{02DD}', // 0x1C → DOUBLE ACUTE ACCENT
    '\u{02DB}', // 0x1D → OGONEK
    '\u{02DA}', // 0x1E → RING ABOVE
    '\u{02DC}', // 0x1F → SMALL TILDE
    // 0x20–0x7E: ASCII printable range (identical to Unicode)
    ' ', '!', '"', '#', '$', '%', '&', '\'',
    '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', ':', ';', '<', '=', '>', '?',
    '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
    'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
    '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    'x', 'y', 'z', '{', '|', '}', '~',
    // 0x7F: undefined in PDFDocEncoding (ISO 32000-2 Annex D)
    '\u{0000}',
    // 0x80–0x8F
    '\u{2022}', // 0x80 → BULLET
    '\u{2020}', // 0x81 → DAGGER
    '\u{2021}', // 0x82 → DOUBLE DAGGER
    '\u{2026}', // 0x83 → HORIZONTAL ELLIPSIS
    '\u{2014}', // 0x84 → EM DASH
    '\u{2013}', // 0x85 → EN DASH
    '\u{0192}', // 0x86 → LATIN SMALL LETTER F WITH HOOK
    '\u{2044}', // 0x87 → FRACTION SLASH
    '\u{2039}', // 0x88 → SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    '\u{203A}', // 0x89 → SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    '\u{2212}', // 0x8A → MINUS SIGN
    '\u{2030}', // 0x8B → PER MILLE SIGN
    '\u{201E}', // 0x8C → DOUBLE LOW-9 QUOTATION MARK
    '\u{201C}', // 0x8D → LEFT DOUBLE QUOTATION MARK
    '\u{201D}', // 0x8E → RIGHT DOUBLE QUOTATION MARK
    '\u{2018}', // 0x8F → LEFT SINGLE QUOTATION MARK
    // 0x90–0x9F
    '\u{2019}', // 0x90 → RIGHT SINGLE QUOTATION MARK
    '\u{201A}', // 0x91 → SINGLE LOW-9 QUOTATION MARK
    '\u{2122}', // 0x92 → TRADE MARK SIGN
    '\u{FB01}', // 0x93 → LATIN SMALL LIGATURE FI
    '\u{FB02}', // 0x94 → LATIN SMALL LIGATURE FL
    '\u{0141}', // 0x95 → LATIN CAPITAL LETTER L WITH STROKE
    '\u{0152}', // 0x96 → LATIN CAPITAL LIGATURE OE
    '\u{0160}', // 0x97 → LATIN CAPITAL LETTER S WITH CARON
    '\u{0178}', // 0x98 → LATIN CAPITAL LETTER Y WITH DIAERESIS
    '\u{017D}', // 0x99 → LATIN CAPITAL LETTER Z WITH CARON
    '\u{0131}', // 0x9A → LATIN SMALL LETTER DOTLESS I
    '\u{0142}', // 0x9B → LATIN SMALL LETTER L WITH STROKE
    '\u{0153}', // 0x9C → LATIN SMALL LIGATURE OE
    '\u{0161}', // 0x9D → LATIN SMALL LETTER S WITH CARON
    '\u{017E}', // 0x9E → LATIN SMALL LETTER Z WITH CARON
    '\u{0000}', // 0x9F → UNDEFINED
    // 0xA0–0xAF
    '\u{20AC}', // 0xA0 → EURO SIGN
    '\u{00A1}', // 0xA1 → INVERTED EXCLAMATION MARK
    '\u{00A2}', // 0xA2 → CENT SIGN
    '\u{00A3}', // 0xA3 → POUND SIGN
    '\u{00A4}', // 0xA4 → CURRENCY SIGN
    '\u{00A5}', // 0xA5 → YEN SIGN
    '\u{00A6}', // 0xA6 → BROKEN BAR
    '\u{00A7}', // 0xA7 → SECTION SIGN
    '\u{00A8}', // 0xA8 → DIAERESIS
    '\u{00A9}', // 0xA9 → COPYRIGHT SIGN
    '\u{00AA}', // 0xAA → FEMININE ORDINAL INDICATOR
    '\u{00AB}', // 0xAB → LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    '\u{00AC}', // 0xAC → NOT SIGN
    '\u{0000}', // 0xAD → UNDEFINED (soft hyphen in Latin-1, undefined in PDFDocEncoding)
    '\u{00AE}', // 0xAE → REGISTERED SIGN
    '\u{00AF}', // 0xAF → MACRON
    // 0xB0–0xBF
    '\u{00B0}', // 0xB0 → DEGREE SIGN
    '\u{00B1}', // 0xB1 → PLUS-MINUS SIGN
    '\u{00B2}', // 0xB2 → SUPERSCRIPT TWO
    '\u{00B3}', // 0xB3 → SUPERSCRIPT THREE
    '\u{00B4}', // 0xB4 → ACUTE ACCENT
    '\u{00B5}', // 0xB5 → MICRO SIGN
    '\u{00B6}', // 0xB6 → PILCROW SIGN
    '\u{00B7}', // 0xB7 → MIDDLE DOT
    '\u{00B8}', // 0xB8 → CEDILLA
    '\u{00B9}', // 0xB9 → SUPERSCRIPT ONE
    '\u{00BA}', // 0xBA → MASCULINE ORDINAL INDICATOR
    '\u{00BB}', // 0xBB → RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    '\u{00BC}', // 0xBC → VULGAR FRACTION ONE QUARTER
    '\u{00BD}', // 0xBD → VULGAR FRACTION ONE HALF
    '\u{00BE}', // 0xBE → VULGAR FRACTION THREE QUARTERS
    '\u{00BF}', // 0xBF → INVERTED QUESTION MARK
    // 0xC0–0xCF
    '\u{00C0}', // 0xC0 → LATIN CAPITAL LETTER A WITH GRAVE
    '\u{00C1}', // 0xC1 → LATIN CAPITAL LETTER A WITH ACUTE
    '\u{00C2}', // 0xC2 → LATIN CAPITAL LETTER A WITH CIRCUMFLEX
    '\u{00C3}', // 0xC3 → LATIN CAPITAL LETTER A WITH TILDE
    '\u{00C4}', // 0xC4 → LATIN CAPITAL LETTER A WITH DIAERESIS
    '\u{00C5}', // 0xC5 → LATIN CAPITAL LETTER A WITH RING ABOVE
    '\u{00C6}', // 0xC6 → LATIN CAPITAL LETTER AE
    '\u{00C7}', // 0xC7 → LATIN CAPITAL LETTER C WITH CEDILLA
    '\u{00C8}', // 0xC8 → LATIN CAPITAL LETTER E WITH GRAVE
    '\u{00C9}', // 0xC9 → LATIN CAPITAL LETTER E WITH ACUTE
    '\u{00CA}', // 0xCA → LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    '\u{00CB}', // 0xCB → LATIN CAPITAL LETTER E WITH DIAERESIS
    '\u{00CC}', // 0xCC → LATIN CAPITAL LETTER I WITH GRAVE
    '\u{00CD}', // 0xCD → LATIN CAPITAL LETTER I WITH ACUTE
    '\u{00CE}', // 0xCE → LATIN CAPITAL LETTER I WITH CIRCUMFLEX
    '\u{00CF}', // 0xCF → LATIN CAPITAL LETTER I WITH DIAERESIS
    // 0xD0–0xDF
    '\u{00D0}', // 0xD0 → LATIN CAPITAL LETTER ETH
    '\u{00D1}', // 0xD1 → LATIN CAPITAL LETTER N WITH TILDE
    '\u{00D2}', // 0xD2 → LATIN CAPITAL LETTER O WITH GRAVE
    '\u{00D3}', // 0xD3 → LATIN CAPITAL LETTER O WITH ACUTE
    '\u{00D4}', // 0xD4 → LATIN CAPITAL LETTER O WITH CIRCUMFLEX
    '\u{00D5}', // 0xD5 → LATIN CAPITAL LETTER O WITH TILDE
    '\u{00D6}', // 0xD6 → LATIN CAPITAL LETTER O WITH DIAERESIS
    '\u{00D7}', // 0xD7 → MULTIPLICATION SIGN
    '\u{00D8}', // 0xD8 → LATIN CAPITAL LETTER O WITH STROKE
    '\u{00D9}', // 0xD9 → LATIN CAPITAL LETTER U WITH GRAVE
    '\u{00DA}', // 0xDA → LATIN CAPITAL LETTER U WITH ACUTE
    '\u{00DB}', // 0xDB → LATIN CAPITAL LETTER U WITH CIRCUMFLEX
    '\u{00DC}', // 0xDC → LATIN CAPITAL LETTER U WITH DIAERESIS
    '\u{00DD}', // 0xDD → LATIN CAPITAL LETTER Y WITH ACUTE
    '\u{00DE}', // 0xDE → LATIN CAPITAL LETTER THORN
    '\u{00DF}', // 0xDF → LATIN SMALL LETTER SHARP S
    // 0xE0–0xEF
    '\u{00E0}', // 0xE0 → LATIN SMALL LETTER A WITH GRAVE
    '\u{00E1}', // 0xE1 → LATIN SMALL LETTER A WITH ACUTE
    '\u{00E2}', // 0xE2 → LATIN SMALL LETTER A WITH CIRCUMFLEX
    '\u{00E3}', // 0xE3 → LATIN SMALL LETTER A WITH TILDE
    '\u{00E4}', // 0xE4 → LATIN SMALL LETTER A WITH DIAERESIS
    '\u{00E5}', // 0xE5 → LATIN SMALL LETTER A WITH RING ABOVE
    '\u{00E6}', // 0xE6 → LATIN SMALL LETTER AE
    '\u{00E7}', // 0xE7 → LATIN SMALL LETTER C WITH CEDILLA
    '\u{00E8}', // 0xE8 → LATIN SMALL LETTER E WITH GRAVE
    '\u{00E9}', // 0xE9 → LATIN SMALL LETTER E WITH ACUTE
    '\u{00EA}', // 0xEA → LATIN SMALL LETTER E WITH CIRCUMFLEX
    '\u{00EB}', // 0xEB → LATIN SMALL LETTER E WITH DIAERESIS
    '\u{00EC}', // 0xEC → LATIN SMALL LETTER I WITH GRAVE
    '\u{00ED}', // 0xED → LATIN SMALL LETTER I WITH ACUTE
    '\u{00EE}', // 0xEE → LATIN SMALL LETTER I WITH CIRCUMFLEX
    '\u{00EF}', // 0xEF → LATIN SMALL LETTER I WITH DIAERESIS
    // 0xF0–0xFF
    '\u{00F0}', // 0xF0 → LATIN SMALL LETTER ETH
    '\u{00F1}', // 0xF1 → LATIN SMALL LETTER N WITH TILDE
    '\u{00F2}', // 0xF2 → LATIN SMALL LETTER O WITH GRAVE
    '\u{00F3}', // 0xF3 → LATIN SMALL LETTER O WITH ACUTE
    '\u{00F4}', // 0xF4 → LATIN SMALL LETTER O WITH CIRCUMFLEX
    '\u{00F5}', // 0xF5 → LATIN SMALL LETTER O WITH TILDE
    '\u{00F6}', // 0xF6 → LATIN SMALL LETTER O WITH DIAERESIS
    '\u{00F7}', // 0xF7 → DIVISION SIGN
    '\u{00F8}', // 0xF8 → LATIN SMALL LETTER O WITH STROKE
    '\u{00F9}', // 0xF9 → LATIN SMALL LETTER U WITH GRAVE
    '\u{00FA}', // 0xFA → LATIN SMALL LETTER U WITH ACUTE
    '\u{00FB}', // 0xFB → LATIN SMALL LETTER U WITH CIRCUMFLEX
    '\u{00FC}', // 0xFC → LATIN SMALL LETTER U WITH DIAERESIS
    '\u{00FD}', // 0xFD → LATIN SMALL LETTER Y WITH ACUTE
    '\u{00FE}', // 0xFE → LATIN SMALL LETTER THORN
    '\u{00FF}', // 0xFF → LATIN SMALL LETTER Y WITH DIAERESIS
];
426
427#[cfg(test)]
428mod tests {
429    use super::*;
430
431    #[test]
432    fn test_pdfdoc_encoding_detection() {
433        let s = PdfString::from_bytes(b"Hello".to_vec());
434        assert_eq!(s.encoding(), PdfStringEncoding::PdfDocEncoding);
435    }
436
437    #[test]
438    fn test_utf16be_encoding_detection() {
439        let mut bytes = vec![0xFE, 0xFF];
440        // "Hi" in UTF-16BE
441        bytes.extend_from_slice(&[0x00, 0x48, 0x00, 0x69]);
442        let s = PdfString::from_bytes(bytes);
443        assert_eq!(s.encoding(), PdfStringEncoding::Utf16Be);
444    }
445
446    #[test]
447    fn test_pdfdoc_ascii_roundtrip() {
448        let s = PdfString::from_bytes(b"Hello, World!".to_vec());
449        assert_eq!(s.to_string_lossy(), "Hello, World!");
450    }
451
452    #[test]
453    fn test_utf16be_decode() {
454        let mut bytes = vec![0xFE, 0xFF];
455        // "ABC" in UTF-16BE
456        bytes.extend_from_slice(&[0x00, 0x41, 0x00, 0x42, 0x00, 0x43]);
457        let s = PdfString::from_bytes(bytes);
458        assert_eq!(s.to_string_lossy(), "ABC");
459    }
460
461    #[test]
462    fn test_utf16be_decode_non_ascii() {
463        let mut bytes = vec![0xFE, 0xFF];
464        // U+00E9 (é) in UTF-16BE
465        bytes.extend_from_slice(&[0x00, 0xE9]);
466        let s = PdfString::from_bytes(bytes);
467        assert_eq!(s.to_string_lossy(), "\u{00E9}");
468    }
469
470    #[test]
471    fn test_pdfdoc_high_bytes() {
472        // 0x80 → BULLET (U+2022)
473        let s = PdfString::from_bytes(vec![0x80]);
474        assert_eq!(s.to_string_lossy(), "\u{2022}");
475
476        // 0x84 → EM DASH (U+2014)
477        let s = PdfString::from_bytes(vec![0x84]);
478        assert_eq!(s.to_string_lossy(), "\u{2014}");
479
480        // 0x85 → EN DASH (U+2013)
481        let s = PdfString::from_bytes(vec![0x85]);
482        assert_eq!(s.to_string_lossy(), "\u{2013}");
483
484        // 0x8D → LEFT DOUBLE QUOTATION MARK (U+201C)
485        let s = PdfString::from_bytes(vec![0x8D]);
486        assert_eq!(s.to_string_lossy(), "\u{201C}");
487
488        // 0x8E → RIGHT DOUBLE QUOTATION MARK (U+201D)
489        let s = PdfString::from_bytes(vec![0x8E]);
490        assert_eq!(s.to_string_lossy(), "\u{201D}");
491
492        // 0xA0 → EURO SIGN (U+20AC)
493        let s = PdfString::from_bytes(vec![0xA0]);
494        assert_eq!(s.to_string_lossy(), "\u{20AC}");
495    }
496
497    #[test]
498    fn test_pdfdoc_special_low_bytes() {
499        // 0x08–0x17: control chars (straight mapping per ISO 32000-2 Annex D)
500        assert_eq!(pdfdoc_to_char(0x08), '\u{0008}'); // BS
501        assert_eq!(pdfdoc_to_char(0x09), '\t'); // HT
502        // 0x18–0x1F: typographic chars (ISO 32000-2 Annex D §D.2)
503        assert_eq!(pdfdoc_to_char(0x18), '\u{02D8}'); // BREVE
504        assert_eq!(pdfdoc_to_char(0x19), '\u{02C7}'); // CARON
505        assert_eq!(pdfdoc_to_char(0x1A), '\u{02C6}'); // MODIFIER LETTER CIRCUMFLEX ACCENT
506        assert_eq!(pdfdoc_to_char(0x1B), '\u{02D9}'); // DOT ABOVE
507        assert_eq!(pdfdoc_to_char(0x1C), '\u{02DD}'); // DOUBLE ACUTE ACCENT
508        assert_eq!(pdfdoc_to_char(0x1D), '\u{02DB}'); // OGONEK
509        assert_eq!(pdfdoc_to_char(0x1E), '\u{02DA}'); // RING ABOVE
510        assert_eq!(pdfdoc_to_char(0x1F), '\u{02DC}'); // SMALL TILDE
511    }
512
513    #[test]
514    fn test_pdfdoc_undefined_bytes() {
515        // 0x7F, 0x9F, 0xAD: undefined in PDFDocEncoding → U+0000 (per ISO 32000-2 Annex D)
516        assert_eq!(pdfdoc_to_char(0x7F), '\u{0000}');
517        assert_eq!(pdfdoc_to_char(0x9F), '\u{0000}');
518        assert_eq!(pdfdoc_to_char(0xAD), '\u{0000}');
519    }
520
521    #[test]
522    fn test_pdfdoc_latin1_range() {
523        // 0xC0–0xFF should map to U+00C0–U+00FF (Latin-1 Supplement)
524        // except 0xAD which is undefined
525        for byte in 0xC0u8..=0xFF {
526            let ch = pdfdoc_to_char(byte);
527            assert_eq!(ch as u32, byte as u32, "byte 0x{byte:02X}");
528        }
529    }
530
531    #[test]
532    fn test_pdfdoc_ascii_range() {
533        // 0x20–0x7E should map to their ASCII codepoints
534        for byte in 0x20u8..=0x7E {
535            let ch = pdfdoc_to_char(byte);
536            assert_eq!(ch as u32, byte as u32, "byte 0x{byte:02X}");
537        }
538    }
539
540    #[test]
541    fn test_pdfdoc_encoding_table_has_256_entries() {
542        assert_eq!(PDFDOC_ENCODING_TABLE.len(), 256);
543    }
544
545    #[test]
546    fn test_empty_string() {
547        let s = PdfString::from_bytes(Vec::new());
548        assert!(s.is_empty());
549        assert_eq!(s.len(), 0);
550        assert_eq!(s.to_string_lossy(), "");
551    }
552
553    #[test]
554    fn test_display_trait() {
555        let s = PdfString::from_bytes(b"test".to_vec());
556        assert_eq!(format!("{s}"), "test");
557    }
558
559    #[test]
560    fn test_equality() {
561        let a = PdfString::from_bytes(b"abc".to_vec());
562        let b = PdfString::from_bytes(b"abc".to_vec());
563        let c = PdfString::from_bytes(b"def".to_vec());
564        assert_eq!(a, b);
565        assert_ne!(a, c);
566    }
567
568    #[test]
569    fn test_pdf_string_is_send_sync() {
570        fn assert_send_sync<T: Send + Sync>() {}
571        assert_send_sync::<PdfString>();
572    }
573
574    #[test]
575    fn test_pdfdoc_ligatures() {
576        // 0x93 → LATIN SMALL LIGATURE FI (U+FB01)
577        assert_eq!(pdfdoc_to_char(0x93), '\u{FB01}');
578        // 0x94 → LATIN SMALL LIGATURE FL (U+FB02)
579        assert_eq!(pdfdoc_to_char(0x94), '\u{FB02}');
580    }
581
582    #[test]
583    fn test_pdfdoc_quotation_marks() {
584        assert_eq!(pdfdoc_to_char(0x8F), '\u{2018}'); // LEFT SINGLE QUOTATION MARK
585        assert_eq!(pdfdoc_to_char(0x90), '\u{2019}'); // RIGHT SINGLE QUOTATION MARK
586        assert_eq!(pdfdoc_to_char(0x91), '\u{201A}'); // SINGLE LOW-9 QUOTATION MARK
587        assert_eq!(pdfdoc_to_char(0x8C), '\u{201E}'); // DOUBLE LOW-9 QUOTATION MARK
588        assert_eq!(pdfdoc_to_char(0x8D), '\u{201C}'); // LEFT DOUBLE QUOTATION MARK
589        assert_eq!(pdfdoc_to_char(0x8E), '\u{201D}'); // RIGHT DOUBLE QUOTATION MARK
590    }
591
592    #[test]
593    fn test_utf16be_odd_byte_count_ignored() {
594        // UTF-16BE with odd trailing byte — chunks_exact(2) skips it
595        let bytes = vec![0xFE, 0xFF, 0x00, 0x41, 0x00];
596        let s = PdfString::from_bytes(bytes);
597        assert_eq!(s.to_string_lossy(), "A");
598    }
599
600    #[test]
601    fn test_pdfdoc_control_chars() {
602        // Bytes 0x00-0x07 map to U+0000-U+0007
603        for byte in 0x00u8..=0x07 {
604            let ch = pdfdoc_to_char(byte);
605            assert_eq!(
606                ch as u32, byte as u32,
607                "byte 0x{byte:02X} should map to U+{:04X}",
608                byte as u32
609            );
610        }
611    }
612
613    #[test]
614    fn test_pdfdoc_high_byte_mappings() {
615        // Verify several spec-defined high-byte mappings
616        assert_eq!(pdfdoc_to_char(0x80), '\u{2022}'); // BULLET
617        assert_eq!(pdfdoc_to_char(0x81), '\u{2020}'); // DAGGER
618        assert_eq!(pdfdoc_to_char(0x82), '\u{2021}'); // DOUBLE DAGGER
619        assert_eq!(pdfdoc_to_char(0x83), '\u{2026}'); // HORIZONTAL ELLIPSIS
620        assert_eq!(pdfdoc_to_char(0x86), '\u{0192}'); // LATIN SMALL LETTER F WITH HOOK
621        assert_eq!(pdfdoc_to_char(0x87), '\u{2044}'); // FRACTION SLASH
622        assert_eq!(pdfdoc_to_char(0x8A), '\u{2212}'); // MINUS SIGN
623        assert_eq!(pdfdoc_to_char(0x8B), '\u{2030}'); // PER MILLE SIGN
624        assert_eq!(pdfdoc_to_char(0x92), '\u{2122}'); // TRADE MARK SIGN
625        assert_eq!(pdfdoc_to_char(0xA0), '\u{20AC}'); // EURO SIGN
626    }
627
628    #[test]
629    fn test_utf16be_with_null_chars() {
630        // BOM + U+0000 (null) + U+0041 ('A')
631        let bytes = vec![0xFE, 0xFF, 0x00, 0x00, 0x00, 0x41];
632        let s = PdfString::from_bytes(bytes);
633        assert_eq!(s.to_string_lossy(), "\0A");
634    }
635
636    // -----------------------------------------------------------------------
637    // Tests ported from upstream fpdf_parser_decode_unittest.cpp
638    // (PDF_DecodeText / PDF_EncodeText equivalents)
639    // -----------------------------------------------------------------------
640
641    /// Upstream: TEST(ParserDecodeTest, DecodeText) — empty string
642    #[test]
643    fn test_parser_decode_text_empty() {
644        let s = PdfString::from_bytes(vec![]);
645        assert_eq!(s.to_string_lossy(), "");
646    }
647
648    /// Upstream: TEST(ParserDecodeTest, DecodeText) — ASCII text
649    #[test]
650    fn test_parser_decode_text_ascii() {
651        let s = PdfString::from_bytes(b"the quick\tfox".to_vec());
652        // In PDFDocEncoding, 0x09 maps to U+02C7 (CARON), not tab.
653        // Upstream C++ test uses L"the quick\tfox" which expects ASCII tab.
654        // rpdfium's PDFDocEncoding maps 0x09 to U+02C7 per ISO 32000-2.
655        let decoded = s.to_string_lossy();
656        // Verify each byte maps according to PDFDocEncoding
657        assert!(decoded.contains("the quick"));
658        assert!(decoded.contains("fox"));
659    }
660
661    /// Upstream: TEST(ParserDecodeTest, DecodeText) — UTF-16BE text
662    #[test]
663    fn test_parser_decode_text_utf16be() {
664        // BOM + U+0330 + U+0331
665        let bytes = vec![0xFE, 0xFF, 0x03, 0x30, 0x03, 0x31];
666        let s = PdfString::from_bytes(bytes);
667        assert_eq!(s.to_string_lossy(), "\u{0330}\u{0331}");
668    }
669
670    /// Upstream: TEST(ParserDecodeTest, DecodeText) — more UTF-16BE text
671    #[test]
672    fn test_parser_decode_text_utf16be_cjk() {
673        let bytes = vec![
674            0xFE, 0xFF, 0x7F, 0x51, 0x98, 0x75, 0x00, 0x20, 0x56, 0xFE, 0x72, 0x47, 0x00, 0x20,
675            0x8D, 0x44, 0x8B, 0xAF, 0x66, 0xF4, 0x59, 0x1A, 0x00, 0x20, 0x00, 0xBB,
676        ];
677        let s = PdfString::from_bytes(bytes);
678        assert_eq!(
679            s.to_string_lossy(),
680            "\u{7F51}\u{9875}\u{0020}\u{56FE}\u{7247}\u{0020}\u{8D44}\u{8BAF}\u{66F4}\u{591A}\u{0020}\u{00BB}"
681        );
682    }
683
684    /// Upstream: TEST(ParserDecodeTest, DecodeText) — supplementary UTF-16BE text
685    #[test]
686    fn test_parser_decode_text_utf16be_supplementary() {
687        // BOM + surrogate pair for U+1F3A8 (ARTIST PALETTE)
688        let bytes = vec![0xFE, 0xFF, 0xD8, 0x3C, 0xDF, 0xA8];
689        let s = PdfString::from_bytes(bytes);
690        let decoded = s.to_string_lossy();
691        // String::from_utf16_lossy handles surrogate pairs
692        assert!(
693            decoded == "\u{1F3A8}" || decoded.contains('\u{FFFD}'),
694            "expected paint palette emoji or replacement char, got: {decoded:?}"
695        );
696    }
697
698    /// Upstream: TEST(ParserDecodeTest, DecodeTextWithUnpairedSurrogates)
699    ///
700    /// Unpaired surrogates in UTF-16BE → replacement characters.
701    #[test]
702    fn test_parser_decode_text_unpaired_surrogates() {
703        // High surrogate alone: D800
704        let bytes = vec![0xFE, 0xFF, 0xD8, 0x00];
705        let s = PdfString::from_bytes(bytes);
706        let decoded = s.to_string_lossy();
707        // from_utf16_lossy replaces unpaired surrogates with U+FFFD
708        assert!(
709            decoded.contains('\u{FFFD}'),
710            "high surrogate alone should produce replacement char"
711        );
712
713        // Low surrogate alone: DC00
714        let bytes = vec![0xFE, 0xFF, 0xDC, 0x00];
715        let s = PdfString::from_bytes(bytes);
716        let decoded = s.to_string_lossy();
717        assert!(
718            decoded.contains('\u{FFFD}'),
719            "low surrogate alone should produce replacement char"
720        );
721    }
722
723    /// Upstream: TEST(ParserDecodeTest, RoundTripText) — PDFDocEncoding round-trip
724    ///
725    /// For each single-byte PDFDocEncoding value, decode → encode should recover
726    /// the original byte (undefined codepoints 0x7F, 0x9F, 0xAD map to U+0000
727    /// per ISO 32000-2 Annex D).
728    #[test]
729    fn test_parser_decode_text_pdfdoc_roundtrip() {
730        for byte in 0u8..=255 {
731            let s = PdfString::from_bytes(vec![byte]);
732            let decoded = s.to_string_lossy();
733
734            match byte {
735                0x7F | 0x9F | 0xAD => {
736                    // Undefined in PDFDocEncoding → U+0000 (per ISO 32000-2 Annex D)
737                    assert_eq!(
738                        decoded, "\u{0000}",
739                        "byte 0x{byte:02X} should map to U+0000"
740                    );
741                }
742                _ => {
743                    // The character should be valid and recoverable
744                    let ch = pdfdoc_to_char(byte);
745                    assert_eq!(
746                        decoded.chars().next(),
747                        Some(ch),
748                        "byte 0x{byte:02X} should decode to U+{:04X}",
749                        ch as u32
750                    );
751                }
752            }
753        }
754    }
755
756    /// Upstream: TEST(ParserDecodeTest, DecodeText) — UTF-8 with BOM
757    #[test]
758    fn test_parser_decode_text_utf8_bom() {
759        // UTF-8 BOM (0xEF 0xBB 0xBF) + U+0330 U+0331 encoded in UTF-8
760        let bytes = vec![0xEF, 0xBB, 0xBF, 0xCC, 0xB0, 0xCC, 0xB1];
761        let s = PdfString::from_bytes(bytes);
762        assert_eq!(s.encoding(), PdfStringEncoding::Utf8Bom);
763        assert_eq!(s.to_string_lossy(), "\u{0330}\u{0331}");
764    }
765
766    /// Upstream: TEST(ParserDecodeTest, DecodeText) — supplementary UTF-8 BOM
767    #[test]
768    fn test_parser_decode_text_utf8_bom_supplementary() {
769        // UTF-8 BOM + U+1F3A8 (ARTIST PALETTE 🎨) in UTF-8
770        let bytes = vec![0xEF, 0xBB, 0xBF, 0xF0, 0x9F, 0x8E, 0xA8];
771        let s = PdfString::from_bytes(bytes);
772        assert_eq!(s.encoding(), PdfStringEncoding::Utf8Bom);
773        assert_eq!(s.to_string_lossy(), "\u{1F3A8}");
774    }
775
776    /// Upstream: TEST(ParserDecodeTest, DecodeTextWithUnicodeEscapes) — UTF-8 BOM
777    ///
778    /// Language-tag escapes (U+001B...U+001B) are stripped after decoding.
779    #[test]
780    fn test_parser_decode_text_with_unicode_escapes_utf8_bom() {
781        // UTF-8 BOM + ESC "ja" ESC + U+0020 + U+5370 U+5237 (印刷)
782        // 0x1B 0x6A 0x61 = ESC j a  (language tag "ja")
783        // 0x1B = closing ESC
784        // 0x20 = SPACE, 0xE5 0x8D 0xB0 0xE5 0x88 0xB7 = 印刷 in UTF-8
785        let bytes = vec![
786            0xEF, 0xBB, 0xBF, 0x1B, 0x6A, 0x61, 0x1B, 0x20, 0xE5, 0x8D, 0xB0, 0xE5, 0x88, 0xB7,
787        ];
788        let s = PdfString::from_bytes(bytes);
789        assert_eq!(s.to_string_lossy(), "\u{0020}\u{5370}\u{5237}");
790    }
791
792    /// Upstream: TEST(ParserDecodeTest, DecodeTextWithUnicodeEscapes) — UTF-16BE
793    #[test]
794    fn test_parser_decode_text_with_unicode_escapes_utf16be() {
795        // UTF-16BE BOM + ESC "ja" ESC + U+0020 + U+5370 U+5237
796        let bytes = vec![
797            0xFE, 0xFF, 0x00, 0x1B, 0x6A, 0x61, 0x00, 0x1B, 0x00, 0x20, 0x53, 0x70, 0x52, 0x37,
798        ];
799        let s = PdfString::from_bytes(bytes);
800        assert_eq!(s.to_string_lossy(), "\u{0020}\u{5370}\u{5237}");
801    }
802
803    /// Upstream: TEST(ParserDecodeTest, DecodeTextWithUnicodeEscapes) — trailing char
804    #[test]
805    fn test_parser_decode_text_with_unicode_escapes_trailing_char() {
806        // UTF-16BE + ESC "ja" ESC + U+0020 + ESC "jaJP" ESC + U+5237
807        // The second language tag has 4 bytes between ESCs: "jaJP"
808        let bytes = vec![
809            0xFE, 0xFF, 0x00, 0x1B, 0x6A, 0x61, 0x4A, 0x50, 0x00, 0x1B, 0x00, 0x20, 0x52, 0x37,
810        ];
811        let s = PdfString::from_bytes(bytes);
812        assert_eq!(s.to_string_lossy(), "\u{0020}\u{5237}");
813    }
814
815    /// Upstream: TEST(ParserDecodeTest, DecodeTextWithInvalidUnicodeEscapes) — empty tags
816    #[test]
817    fn test_parser_decode_text_with_invalid_unicode_escapes_empty() {
818        // UTF-8 BOM + ESC ESC (empty language tag)
819        let s = PdfString::from_bytes(vec![0xEF, 0xBB, 0xBF, 0x1B, 0x1B]);
820        assert_eq!(s.to_string_lossy(), "");
821
822        // UTF-16BE + ESC ESC
823        let s = PdfString::from_bytes(vec![0xFE, 0xFF, 0x00, 0x1B, 0x00, 0x1B]);
824        assert_eq!(s.to_string_lossy(), "");
825
826        // UTF-16BE + ESC ESC + trailing byte (odd-pair — ignored by chunks_exact)
827        let s = PdfString::from_bytes(vec![0xFE, 0xFF, 0x00, 0x1B, 0x00, 0x1B, 0x20]);
828        assert_eq!(s.to_string_lossy(), "");
829    }
830
831    /// Upstream: TEST(ParserDecodeTest, DecodeTextWithInvalidUnicodeEscapes) — text after
832    #[test]
833    fn test_parser_decode_text_with_invalid_unicode_escapes_text_after() {
834        // UTF-8 BOM + ESC ESC + SPACE
835        let s = PdfString::from_bytes(vec![0xEF, 0xBB, 0xBF, 0x1B, 0x1B, 0x20]);
836        assert_eq!(s.to_string_lossy(), " ");
837
838        // UTF-16BE + ESC ESC + U+0020
839        let s = PdfString::from_bytes(vec![0xFE, 0xFF, 0x00, 0x1B, 0x00, 0x1B, 0x00, 0x20]);
840        assert_eq!(s.to_string_lossy(), " ");
841    }
842
843    /// Upstream: TEST(ParserDecodeTest, EncodeText) — empty
844    #[test]
845    fn test_parser_encode_text_empty() {
846        let s = PdfString::from_unicode("");
847        assert_eq!(s.as_bytes(), b"");
848    }
849
850    /// Upstream: TEST(ParserDecodeTest, EncodeText) — ASCII
851    #[test]
852    fn test_parser_encode_text_ascii() {
853        let s = PdfString::from_unicode("the quick\tfox");
854        assert_eq!(s.encoding(), PdfStringEncoding::PdfDocEncoding);
855        assert_eq!(s.as_bytes(), b"the quick\tfox");
856    }
857
858    /// Upstream: TEST(ParserDecodeTest, EncodeText) — Unicode
859    #[test]
860    fn test_parser_encode_text_unicode() {
861        // U+0330 U+0331 not in PDFDocEncoding → UTF-16BE with BOM
862        let s = PdfString::from_unicode("\u{0330}\u{0331}");
863        assert_eq!(s.encoding(), PdfStringEncoding::Utf16Be);
864        assert_eq!(s.as_bytes(), &[0xFE, 0xFF, 0x03, 0x30, 0x03, 0x31]);
865    }
866
867    /// Upstream: TEST(ParserDecodeTest, EncodeText) — supplementary
868    #[test]
869    fn test_parser_encode_text_supplementary() {
870        // U+1F3A8 (🎨) requires surrogate pair in UTF-16
871        let s = PdfString::from_unicode("\u{1F3A8}");
872        assert_eq!(s.encoding(), PdfStringEncoding::Utf16Be);
873        assert_eq!(s.as_bytes(), &[0xFE, 0xFF, 0xD8, 0x3C, 0xDF, 0xA8]);
874    }
875
876    /// Upstream: TEST(ParserDecodeTest, RoundTripText)
877    ///
878    /// Each PDFDocEncoding byte (0x00–0xFF) round-trips through encode→decode.
879    /// Bytes 0x7F, 0x9F, 0xAD are "undefined" (map to U+0000 per ISO 32000-2
880    /// Annex D); U+0000 re-encodes as PDFDocEncoding byte 0x00.
881    #[test]
882    fn test_parser_decode_text_pdfdoc_roundtrip_all_bytes() {
883        for byte in 0u8..=0xFF {
884            let original = PdfString::from_bytes(vec![byte]);
885            let decoded = original.to_string_lossy();
886            let reencoded = PdfString::from_unicode(&decoded);
887
888            match byte {
889                0x7F | 0x9F | 0xAD => {
890                    // Undefined bytes decode to U+0000; U+0000 re-encodes as
891                    // PDFDocEncoding byte 0x00.
892                    assert_eq!(
893                        reencoded.as_bytes(),
894                        &[0x00u8],
895                        "byte 0x{:02X} should re-encode as PDFDocEncoding 0x00",
896                        byte
897                    );
898                }
899                _ => {
900                    assert_eq!(
901                        reencoded.as_bytes(),
902                        &[byte],
903                        "byte 0x{:02X} should round-trip",
904                        byte
905                    );
906                }
907            }
908        }
909    }
910
911    /// char_to_pdfdoc: basic checks
912    #[test]
913    fn test_char_to_pdfdoc_basic() {
914        assert_eq!(char_to_pdfdoc(' '), Some(0x20));
915        assert_eq!(char_to_pdfdoc('A'), Some(0x41));
916        assert_eq!(char_to_pdfdoc('\u{FFFD}'), None); // undefined marker
917        assert_eq!(char_to_pdfdoc('\u{5370}'), None); // CJK — not in PDFDocEncoding
918    }
919}
rpdfium_core/bytestring.rs

rpdfium_core/
bytestring.rs