//! `pdf_lib_rs/utils/strings.rs`
//!
//! String, byte, and number-formatting helpers, plus UTF-16 and
//! PDFDocEncoding text decoding.

/// Returns the low byte of a character's Unicode scalar value.
///
/// Characters in `U+0000..=U+00FF` map to their code point unchanged. For any
/// character above `U+00FF` the upper bits are discarded by the narrowing
/// cast, so only the least-significant eight bits are returned.
pub fn to_char_code(c: char) -> u8 {
    let code_point = u32::from(c);
    // Deliberate truncation: the function's contract is "lower byte only".
    code_point as u8
}
5
/// Formats a byte as exactly two uppercase hexadecimal digits.
///
/// Values below `0x10` are left-padded with a zero, e.g. `9` becomes `"09"`.
pub fn to_hex_string(num: u8) -> String {
    const DIGITS: [char; 16] = [
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
    ];

    let mut out = String::with_capacity(2);
    // High nibble first, then low nibble.
    out.push(DIGITS[usize::from(num >> 4)]);
    out.push(DIGITS[usize::from(num & 0x0F)]);
    out
}
10
/// Formats `num` as uppercase hexadecimal, left-padded with zeros so the
/// result is at least `min_length` characters long.
///
/// The value is never truncated: if its hex form is already `min_length`
/// characters or longer, it is returned as-is.
pub fn to_hex_string_of_min_length(num: u16, min_length: usize) -> String {
    // The `0` flag plus a runtime width performs the zero-padding in one step.
    format!("{:0width$X}", num, width = min_length)
}
21
/// Parses a hexadecimal byte code (e.g. `"20"`) and returns the character
/// with that code point.
///
/// Input that is not valid hexadecimal, is empty, or does not fit in a single
/// byte yields the NUL character (`'\0'`) rather than an error.
pub fn char_from_hex_code(hex: &str) -> char {
    match u8::from_str_radix(hex, 16) {
        Ok(byte) => char::from(byte),
        // Unparseable or out-of-range input falls back to NUL.
        Err(_) => '\0',
    }
}
26
/// Writes the UTF-8 bytes of `s` into `buffer`, starting at `offset`.
///
/// Returns the number of bytes written, which is always `s.len()`.
///
/// # Panics
///
/// Panics if `buffer` is too short to hold the whole string at `offset`,
/// i.e. when `offset + s.len()` exceeds `buffer.len()`.
pub fn copy_string_into_buffer(s: &str, buffer: &mut [u8], offset: usize) -> usize {
    let source = s.as_bytes();
    let end = offset + source.len();

    // Slicing enforces the bounds; the destination is exactly as long as the source.
    let destination = &mut buffer[offset..end];
    destination.copy_from_slice(source);

    source.len()
}
34
/// Converts a number to its decimal string form, never using scientific
/// (exponent) notation.
///
/// The output uses the shortest sequence of digits that parses back to the
/// same `f64`, so `0.5` becomes `"0.5"`, `1e-7` becomes `"0.0000001"` and
/// `1e21` becomes `"1000000000000000000000"`. Integral values carry no
/// decimal point (`21.0` becomes `"21"`), and negative zero is written as
/// `"0"`.
///
/// Non-finite inputs are passed through as Rust formats them (`"NaN"`,
/// `"inf"`, `"-inf"`).
pub fn number_to_string(value: f64) -> String {
    if value == 0.0 {
        // `-0.0 == 0.0` is true, so this also strips the sign from negative zero.
        return String::from("0");
    }
    format_no_exponent(value)
}

/// Formats `value` in plain positional decimal notation.
///
/// Rust's `Display` implementation for `f64` never emits an exponent and
/// prints the shortest round-tripping digits, so no manual trimming of
/// trailing zeros is needed. It also avoids the lossy detour through `i64`,
/// which saturates for integral magnitudes above `i64::MAX`.
fn format_no_exponent(value: f64) -> String {
    value.to_string()
}
70
/// Returns an owned copy of the string's UTF-8 bytes.
///
/// For ASCII text this is one byte per character; non-ASCII characters
/// contribute their multi-byte UTF-8 encoding.
pub fn typed_array_for(s: &str) -> Vec<u8> {
    s.as_bytes().to_vec()
}
75
/// Builds a `String` in which every input byte becomes the character with
/// the same code point (`U+0000..=U+00FF`).
///
/// The bytes are not decoded as UTF-8: byte `0xE9` always yields `'é'`.
pub fn array_as_string(bytes: &[u8]) -> String {
    let mut text = String::with_capacity(bytes.len());
    for &byte in bytes {
        text.push(char::from(byte));
    }
    text
}
80
/// Concatenates all byte slices in `parts`, in order, into a single
/// newly allocated `Vec<u8>`.
///
/// An empty `parts` list (or a list of empty slices) yields an empty vector.
pub fn merge_into_typed_array(parts: &[&[u8]]) -> Vec<u8> {
    parts.concat()
}
90
/// Reports whether `bytes` begins with a UTF-16 byte-order mark.
///
/// Both the big-endian mark (`FE FF`) and the little-endian mark (`FF FE`)
/// are recognised. Inputs shorter than two bytes never match.
pub fn has_utf16_bom(bytes: &[u8]) -> bool {
    matches!(bytes, [0xFE, 0xFF, ..] | [0xFF, 0xFE, ..])
}
95
/// Decodes UTF-16 encoded bytes into a `String`.
///
/// A leading byte-order mark selects the byte order and is not included in
/// the result: `FE FF` means big-endian, `FF FE` means little-endian. When
/// no byte-order mark is present, the whole input is decoded as big-endian
/// and no bytes are skipped.
///
/// Unpaired surrogates are replaced with U+FFFD, and a trailing odd byte
/// (half of a code unit) is ignored. Inputs shorter than two bytes decode to
/// an empty string.
pub fn utf16_decode(bytes: &[u8]) -> String {
    // Strip the BOM only when one is actually there; otherwise keep all bytes.
    let (payload, little_endian) = match bytes {
        [0xFF, 0xFE, rest @ ..] => (rest, true),
        [0xFE, 0xFF, rest @ ..] => (rest, false),
        _ => (bytes, false),
    };

    // `chunks_exact` silently leaves out a dangling final byte.
    let code_units: Vec<u16> = payload
        .chunks_exact(2)
        .map(|pair| {
            let pair = [pair[0], pair[1]];
            if little_endian {
                u16::from_le_bytes(pair)
            } else {
                u16::from_be_bytes(pair)
            }
        })
        .collect();

    String::from_utf16_lossy(&code_units)
}
119
/// Encodes `text` as a sequence of UTF-16 code units, preceded by the
/// byte-order mark code unit `0xFEFF`.
///
/// Characters outside the Basic Multilingual Plane are emitted as surrogate
/// pairs. The result is a list of `u16` code units, not bytes; the byte order
/// is determined by whoever serialises the units.
pub fn utf16_encode(text: &str) -> Vec<u16> {
    const BOM: u16 = 0xFEFF;

    let mut units = Vec::with_capacity(text.len() + 1);
    units.push(BOM);
    units.extend(text.encode_utf16());
    units
}
130
/// Decodes bytes in PDFDocEncoding into a `String`.
///
/// PDFDocEncoding agrees with Latin-1 for most bytes, with these exceptions
/// (per the PDFDocEncoding table in the PDF specification, Annex D):
///
/// * `0x18..=0x1F` are spacing diacritics (breve, caron, ...), not C0 controls.
/// * `0x80..=0x9E` hold typographic punctuation, ligatures and extra Latin
///   letters.
/// * `0xA0` is the euro sign, not a non-breaking space.
/// * `0x7F`, `0x9F` and `0xAD` are undefined and decode to U+FFFD
///   (REPLACEMENT CHARACTER).
///
/// Every other byte decodes to the Unicode code point with the same value.
pub fn pdf_doc_encoding_decode(bytes: &[u8]) -> String {
    bytes
        .iter()
        .map(|&b| match b {
            // 0x18-0x1F: spacing diacritical marks
            0x18 => '\u{02D8}', // BREVE
            0x19 => '\u{02C7}', // CARON
            0x1A => '\u{02C6}', // MODIFIER LETTER CIRCUMFLEX ACCENT
            0x1B => '\u{02D9}', // DOT ABOVE
            0x1C => '\u{02DD}', // DOUBLE ACUTE ACCENT
            0x1D => '\u{02DB}', // OGONEK
            0x1E => '\u{02DA}', // RING ABOVE
            0x1F => '\u{02DC}', // SMALL TILDE
            // Code points the encoding leaves undefined
            0x7F | 0x9F | 0xAD => '\u{FFFD}', // REPLACEMENT CHARACTER
            // 0x80-0x9E: special PDFDocEncoding mappings
            0x80 => '\u{2022}', // BULLET
            0x81 => '\u{2020}', // DAGGER
            0x82 => '\u{2021}', // DOUBLE DAGGER
            0x83 => '\u{2026}', // HORIZONTAL ELLIPSIS
            0x84 => '\u{2014}', // EM DASH
            0x85 => '\u{2013}', // EN DASH
            0x86 => '\u{0192}', // LATIN SMALL F WITH HOOK
            0x87 => '\u{2044}', // FRACTION SLASH
            0x88 => '\u{2039}', // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
            0x89 => '\u{203A}', // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
            0x8A => '\u{2212}', // MINUS SIGN
            0x8B => '\u{2030}', // PER MILLE SIGN
            0x8C => '\u{201E}', // DOUBLE LOW-9 QUOTATION MARK
            0x8D => '\u{201C}', // LEFT DOUBLE QUOTATION MARK
            0x8E => '\u{201D}', // RIGHT DOUBLE QUOTATION MARK
            0x8F => '\u{2018}', // LEFT SINGLE QUOTATION MARK
            0x90 => '\u{2019}', // RIGHT SINGLE QUOTATION MARK
            0x91 => '\u{201A}', // SINGLE LOW-9 QUOTATION MARK
            0x92 => '\u{2122}', // TRADE MARK SIGN
            0x93 => '\u{FB01}', // LATIN SMALL LIGATURE FI
            0x94 => '\u{FB02}', // LATIN SMALL LIGATURE FL
            0x95 => '\u{0141}', // LATIN CAPITAL LETTER L WITH STROKE
            0x96 => '\u{0152}', // LATIN CAPITAL LIGATURE OE
            0x97 => '\u{0160}', // LATIN CAPITAL LETTER S WITH CARON
            0x98 => '\u{0178}', // LATIN CAPITAL LETTER Y WITH DIAERESIS
            0x99 => '\u{017D}', // LATIN CAPITAL LETTER Z WITH CARON
            0x9A => '\u{0131}', // LATIN SMALL LETTER DOTLESS I
            0x9B => '\u{0142}', // LATIN SMALL LETTER L WITH STROKE
            0x9C => '\u{0153}', // LATIN SMALL LIGATURE OE
            0x9D => '\u{0161}', // LATIN SMALL LETTER S WITH CARON
            0x9E => '\u{017E}', // LATIN SMALL LETTER Z WITH CARON
            0xA0 => '\u{20AC}', // EURO SIGN
            // Everything else coincides with Latin-1 / ASCII.
            _ => char::from(b),
        })
        .collect()
}
186
#[cfg(test)]
mod tests {
    use super::*;

    /// ASCII characters map to their code points.
    #[test]
    fn test_to_char_code() {
        let cases = [('A', 65u8), (' ', 32), ('\n', 10)];
        for &(input, expected) in cases.iter() {
            assert_eq!(to_char_code(input), expected);
        }
    }

    /// Bytes are rendered as two uppercase hex digits, zero-padded.
    #[test]
    fn test_to_hex_string() {
        let cases = [(0u8, "00"), (255, "FF"), (16, "10"), (9, "09")];
        for &(input, expected) in cases.iter() {
            assert_eq!(to_hex_string(input), expected);
        }
    }

    /// Two-digit hex codes are turned back into characters.
    #[test]
    fn test_char_from_hex_code() {
        let cases = [("20", ' '), ("41", 'A'), ("42", 'B')];
        for &(input, expected) in cases.iter() {
            assert_eq!(char_from_hex_code(input), expected);
        }
    }

    /// The string lands at the requested offset and the rest is untouched.
    #[test]
    fn test_copy_string_into_buffer() {
        let mut buffer = [b' '; 10];
        let written = copy_string_into_buffer("hello", &mut buffer, 2);
        assert_eq!(written, 5);
        assert_eq!(&buffer, b"  hello   ");
    }

    /// Integral floats are printed without a decimal point.
    #[test]
    fn test_number_to_string_integers() {
        let cases = [(21.0f64, "21"), (-43.0, "-43"), (0.0, "0")];
        for &(input, expected) in cases.iter() {
            assert_eq!(number_to_string(input), expected);
        }
    }

    /// ASCII strings become their byte values.
    #[test]
    fn test_typed_array_for() {
        assert_eq!(typed_array_for("ABC"), vec![65, 66, 67]);
        assert_eq!(typed_array_for("   "), vec![32, 32, 32]);
    }

    /// Both byte orders are detected; short or unmarked input is rejected.
    #[test]
    fn test_has_utf16_bom() {
        let big_endian = [0xFE, 0xFF, 0x00, 0x41];
        let little_endian = [0xFF, 0xFE, 0x41, 0x00];
        assert!(has_utf16_bom(&big_endian));
        assert!(has_utf16_bom(&little_endian));

        assert!(!has_utf16_bom(&[0x41, 0x42]));
        assert!(!has_utf16_bom(&[0xFE]));
    }

    /// "Egg " in UTF-16BE with BOM.
    #[test]
    fn test_utf16_decode_be() {
        let input = [0xFE, 0xFF, 0x00, 0x45, 0x00, 0x67, 0x00, 0x67, 0x00, 0x20];
        assert_eq!(utf16_decode(&input), "Egg ");
    }

    /// "Egg " in UTF-16LE with BOM.
    #[test]
    fn test_utf16_decode_le() {
        let input = [0xFF, 0xFE, 0x45, 0x00, 0x67, 0x00, 0x67, 0x00, 0x20, 0x00];
        assert_eq!(utf16_decode(&input), "Egg ");
    }

    /// The BOM is always emitted, followed by the text's code units.
    #[test]
    fn test_utf16_encode() {
        // Empty text: just the BOM.
        assert_eq!(utf16_encode(""), vec![0xFEFF]);
        assert_eq!(utf16_encode("A"), vec![0xFEFF, 0x0041]);
    }

    /// ASCII bytes decode to themselves ("aEbs").
    #[test]
    fn test_pdf_doc_encoding_decode_ascii() {
        let input = [0x61, 0x45, 0x62, 0x73];
        assert_eq!(pdf_doc_encoding_decode(&input), "aEbs");
    }
}