Skip to main content

qubit_codec/
c_string_literal_codec.rs

1/*******************************************************************************
2 *
3 *    Copyright (c) 2026 Haixing Hu.
4 *
5 *    SPDX-License-Identifier: Apache-2.0
6 *
7 *    Licensed under the Apache License, Version 2.0.
8 *
9 ******************************************************************************/
10//! C string literal byte codec.
11
12use crate::{
13    CodecError,
14    CodecResult,
15    Decoder,
16    Encoder,
17};
18
/// Stateless codec that converts between raw bytes and C string literal
/// fragments.
///
/// The textual form is a fragment of a C string literal, i.e. text such as
/// `PK\003\004` or `\xd0\xcf` in which bytes may be spelled with C escape
/// sequences. Surrounding quotes are neither produced nor expected, and the
/// decoded form is always a plain byte sequence.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct CStringLiteralCodec;
26
27impl CStringLiteralCodec {
28    /// Creates a C string literal codec.
29    ///
30    /// # Returns
31    /// A stateless C string literal codec.
32    pub fn new() -> Self {
33        Self
34    }
35
36    /// Encodes bytes into a C string literal fragment.
37    ///
38    /// # Parameters
39    /// - `bytes`: Raw bytes to encode.
40    ///
41    /// # Returns
42    /// A C string literal fragment without surrounding quotes.
43    pub fn encode(&self, bytes: &[u8]) -> String {
44        let mut output = String::with_capacity(bytes.len());
45        for byte in bytes {
46            push_encoded_byte(*byte, &mut output);
47        }
48        output
49    }
50
51    /// Decodes a C string literal fragment into bytes.
52    ///
53    /// # Parameters
54    /// - `text`: C string literal fragment without surrounding quotes.
55    ///
56    /// # Returns
57    /// Decoded raw bytes.
58    ///
59    /// # Errors
60    /// Returns [`CodecError::InvalidEscape`] for malformed escape sequences,
61    /// [`CodecError::InvalidDigit`] for malformed fixed-width numeric escapes,
62    /// and [`CodecError::InvalidCharacter`] for unsupported raw source characters.
63    pub fn decode(&self, text: &str) -> CodecResult<Vec<u8>> {
64        let chars = text.char_indices().collect::<Vec<_>>();
65        let mut output = Vec::with_capacity(text.len());
66        let mut position = 0;
67        while let Some(&(index, character)) = chars.get(position) {
68            if character == '\\' {
69                decode_escape(text, &chars, &mut position, &mut output)?;
70                continue;
71            }
72            validate_source_character(index, character)?;
73            output.push(character as u8);
74            position += 1;
75        }
76        Ok(output)
77    }
78}
79
80impl Encoder<[u8]> for CStringLiteralCodec {
81    type Error = CodecError;
82    type Output = String;
83
84    /// Encodes bytes into a C string literal fragment.
85    fn encode(&self, input: &[u8]) -> Result<Self::Output, Self::Error> {
86        Ok(CStringLiteralCodec::encode(self, input))
87    }
88}
89
90impl Decoder<str> for CStringLiteralCodec {
91    type Error = CodecError;
92    type Output = Vec<u8>;
93
94    /// Decodes a C string literal fragment into bytes.
95    fn decode(&self, input: &str) -> Result<Self::Output, Self::Error> {
96        CStringLiteralCodec::decode(self, input)
97    }
98}
99
100/// Decodes one escape sequence at the current position.
101///
102/// # Parameters
103/// - `text`: Original input text.
104/// - `chars`: Indexed characters from `text`.
105/// - `position`: Current character position, pointing at `\`.
106/// - `output`: Destination byte buffer.
107///
108/// # Errors
109/// Returns [`CodecError`] when the escape marker is trailing or the escape
110/// sequence is malformed.
111fn decode_escape(text: &str, chars: &[(usize, char)], position: &mut usize, output: &mut Vec<u8>) -> CodecResult<()> {
112    let marker_index = chars[*position].0;
113    *position += 1;
114    let Some(&(_, escape)) = chars.get(*position) else {
115        return Err(invalid_escape(marker_index, "\\", "incomplete escape sequence"));
116    };
117    match escape {
118        ' ' => push_simple_escape(position, output, b' '),
119        '\'' => push_simple_escape(position, output, b'\''),
120        '"' => push_simple_escape(position, output, b'"'),
121        '?' => push_simple_escape(position, output, b'?'),
122        '\\' => push_simple_escape(position, output, b'\\'),
123        'a' => push_simple_escape(position, output, 0x07),
124        'b' => push_simple_escape(position, output, 0x08),
125        'f' => push_simple_escape(position, output, 0x0c),
126        'n' => push_simple_escape(position, output, b'\n'),
127        'r' => push_simple_escape(position, output, b'\r'),
128        't' => push_simple_escape(position, output, b'\t'),
129        'v' => push_simple_escape(position, output, 0x0b),
130        'x' | 'X' => {
131            *position += 1;
132            let value = parse_variable_hex_escape(chars, position, marker_index)?;
133            output.push(value);
134        }
135        'u' => {
136            *position += 1;
137            let value = parse_fixed_hex_escape(text, chars, position, marker_index, 4)?;
138            output.push(value);
139        }
140        'U' => {
141            *position += 1;
142            let value = parse_fixed_hex_escape(text, chars, position, marker_index, 8)?;
143            output.push(value);
144        }
145        '0'..='7' => {
146            let value = parse_octal_escape(chars, position);
147            output.push(value);
148        }
149        _ => {
150            return Err(invalid_escape(
151                marker_index,
152                &format!("\\{escape}"),
153                "unsupported escape sequence",
154            ));
155        }
156    }
157    Ok(())
158}
159
/// Emits the byte of a single-character escape and steps past that character.
///
/// # Parameters
/// - `position`: Current character position, pointing at the escape character;
///   incremented by one.
/// - `output`: Destination byte buffer; `byte` is appended to it.
/// - `byte`: Byte produced by the escape sequence.
fn push_simple_escape(position: &mut usize, output: &mut Vec<u8>, byte: u8) {
    const CONSUMED_CHARACTERS: usize = 1;
    output.push(byte);
    *position += CONSUMED_CHARACTERS;
}
170
171/// Parses a variable-width hexadecimal byte escape.
172///
173/// # Parameters
174/// - `chars`: Indexed characters from the original input.
175/// - `position`: Current character position after `\x`.
176/// - `marker_index`: Byte index of the escape marker.
177///
178/// # Returns
179/// The decoded byte.
180///
181/// # Errors
182/// Returns [`CodecError::InvalidEscape`] when no hexadecimal digit follows
183/// `\x`.
184fn parse_variable_hex_escape(chars: &[(usize, char)], position: &mut usize, marker_index: usize) -> CodecResult<u8> {
185    let mut value = 0u8;
186    let mut digit_count = 0;
187    while digit_count < 2 {
188        let Some(&(_, character)) = chars.get(*position) else {
189            break;
190        };
191        let Some(digit) = hex_value(character) else {
192            break;
193        };
194        value = (value << 4) | digit;
195        *position += 1;
196        digit_count += 1;
197    }
198    if digit_count == 0 {
199        return Err(invalid_escape(
200            marker_index,
201            "\\x",
202            "expected at least one hexadecimal digit",
203        ));
204    }
205    Ok(value)
206}
207
208/// Parses a fixed-width universal byte escape.
209///
210/// # Parameters
211/// - `text`: Original input text.
212/// - `chars`: Indexed characters from `text`.
213/// - `position`: Current character position after `\u` or `\U`.
214/// - `marker_index`: Byte index of the escape marker.
215/// - `digits`: Required number of hexadecimal digits.
216///
217/// # Returns
218/// The decoded byte.
219///
220/// # Errors
221/// Returns [`CodecError::InvalidEscape`] when the escape is too short or too
222/// large for one byte, or [`CodecError::InvalidDigit`] when a required digit is
223/// not hexadecimal.
224fn parse_fixed_hex_escape(
225    text: &str,
226    chars: &[(usize, char)],
227    position: &mut usize,
228    marker_index: usize,
229    digits: usize,
230) -> CodecResult<u8> {
231    let mut value = 0u32;
232    for _ in 0..digits {
233        let Some(&(index, character)) = chars.get(*position) else {
234            let escape = text.get(marker_index..).unwrap_or("\\");
235            return Err(invalid_escape(
236                marker_index,
237                escape,
238                "incomplete universal character escape",
239            ));
240        };
241        let Some(digit) = hex_value(character) else {
242            return Err(CodecError::InvalidDigit {
243                radix: 16,
244                index,
245                character,
246            });
247        };
248        value = (value << 4) | u32::from(digit);
249        *position += 1;
250    }
251    if value > u32::from(u8::MAX) {
252        let escape = text
253            .get(marker_index..chars[*position - 1].0 + chars[*position - 1].1.len_utf8())
254            .unwrap_or("\\u");
255        return Err(invalid_escape(
256            marker_index,
257            escape,
258            "universal character value must fit in one byte",
259        ));
260    }
261    Ok(value as u8)
262}
263
264/// Parses an octal byte escape.
265///
266/// # Parameters
267/// - `chars`: Indexed characters from the original input.
268/// - `position`: Current character position, pointing at the first octal digit.
269///
270/// # Returns
271/// The decoded byte. Values above `0o377` are truncated to their low byte to
272/// match byte-oriented C literal usage.
273fn parse_octal_escape(chars: &[(usize, char)], position: &mut usize) -> u8 {
274    let mut value = 0u16;
275    let mut digit_count = 0;
276    while digit_count < 3 {
277        let Some(&(_, character)) = chars.get(*position) else {
278            break;
279        };
280        let Some(digit) = octal_value(character) else {
281            break;
282        };
283        value = (value << 3) | u16::from(digit);
284        *position += 1;
285        digit_count += 1;
286    }
287    value as u8
288}
289
290/// Validates a raw source character.
291///
292/// # Parameters
293/// - `index`: Byte index of `character` in the original input.
294/// - `character`: Raw, unescaped source character.
295///
296/// # Errors
297/// Returns [`CodecError::InvalidCharacter`] when the raw character is not a
298/// supported ASCII C string source character.
299fn validate_source_character(index: usize, character: char) -> CodecResult<()> {
300    if is_source_character(character) {
301        return Ok(());
302    }
303    Err(CodecError::InvalidCharacter {
304        index,
305        character,
306        reason: "raw source character must be printable ASCII or allowed whitespace".to_owned(),
307    })
308}
309
/// Reports whether a character is accepted in unescaped form.
///
/// Accepted characters are the ASCII range from space through `~`, plus
/// horizontal tab, line feed, vertical tab, and form feed.
///
/// # Parameters
/// - `character`: Character to inspect.
///
/// # Returns
/// `true` when `character` is accepted as a raw C string source character.
fn is_source_character(character: char) -> bool {
    const ALLOWED_WHITESPACE: [char; 4] = ['\t', '\n', '\u{0b}', '\u{0c}'];
    let printable = (' '..='~').contains(&character);
    printable || ALLOWED_WHITESPACE.contains(&character)
}
320
321/// Encodes one byte into the destination string.
322///
323/// # Parameters
324/// - `byte`: Byte to encode.
325/// - `output`: Destination string.
326fn push_encoded_byte(byte: u8, output: &mut String) {
327    match byte {
328        b'\'' => output.push_str("\\'"),
329        b'"' => output.push_str("\\\""),
330        b'?' => output.push_str("\\?"),
331        b'\\' => output.push_str("\\\\"),
332        0x07 => output.push_str("\\a"),
333        0x08 => output.push_str("\\b"),
334        0x0c => output.push_str("\\f"),
335        b'\n' => output.push_str("\\n"),
336        b'\r' => output.push_str("\\r"),
337        b'\t' => output.push_str("\\t"),
338        0x0b => output.push_str("\\v"),
339        b' '..=b'~' => output.push(byte as char),
340        _ => {
341            output.push('\\');
342            output.push('x');
343            output.push(uppercase_hex_digit(byte >> 4));
344            output.push(uppercase_hex_digit(byte & 0x0f));
345        }
346    }
347}
348
/// Maps a hexadecimal digit character to the nibble it denotes.
///
/// Both lowercase and uppercase letters `a`-`f` are recognised.
///
/// # Parameters
/// - `character`: Character to inspect.
///
/// # Returns
/// Nibble value, or `None` when `character` is not hexadecimal.
fn hex_value(character: char) -> Option<u8> {
    // `to_digit(16)` only yields values in 0..=15, so narrowing is lossless.
    character.to_digit(16).map(|nibble| nibble as u8)
}
364
/// Maps an octal digit character to the value it denotes.
///
/// # Parameters
/// - `character`: Character to inspect.
///
/// # Returns
/// Octal digit value, or `None` when `character` is not octal.
fn octal_value(character: char) -> Option<u8> {
    // `to_digit(8)` only yields values in 0..=7, so narrowing is lossless.
    character.to_digit(8).map(|digit| digit as u8)
}
378
/// Returns the uppercase hexadecimal digit for a nibble.
///
/// # Parameters
/// - `value`: Nibble value. Only the low four bits are used; higher bits are
///   ignored.
///
/// # Returns
/// Uppercase hexadecimal digit.
fn uppercase_hex_digit(value: u8) -> char {
    const DIGITS: [char; 16] = [
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
    ];
    // Masking keeps the index within the 16-entry table.
    DIGITS[usize::from(value & 0x0f)]
}
406
407/// Builds an invalid escape error.
408///
409/// # Parameters
410/// - `index`: Byte index of the escape marker in the original input.
411/// - `escape`: Escape fragment that caused the error.
412/// - `reason`: Human-readable rejection reason.
413///
414/// # Returns
415/// An invalid escape error.
416fn invalid_escape(index: usize, escape: &str, reason: &str) -> CodecError {
417    CodecError::InvalidEscape {
418        index,
419        escape: escape.to_owned(),
420        reason: reason.to_owned(),
421    }
422}