Skip to main content

qubit_codec/
c_string_literal_codec.rs

1/*******************************************************************************
2 *
3 *    Copyright (c) 2026 Haixing Hu.
4 *
5 *    SPDX-License-Identifier: Apache-2.0
6 *
7 *    Licensed under the Apache License, Version 2.0.
8 *
9 ******************************************************************************/
10//! C string literal byte codec.
11
12use crate::{
13    CodecError,
14    CodecResult,
15    Decoder,
16    Encoder,
17};
18
/// Stateless codec for byte sequences written as C string literal fragments.
///
/// The textual form uses C escape sequences to describe bytes, for example
/// `PK\003\004` or `\xd0\xcf`. Decoding yields the raw bytes; the fragment is
/// given without surrounding quotes.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct CStringLiteralCodec;
26
27impl CStringLiteralCodec {
28    /// Creates a C string literal codec.
29    ///
30    /// # Returns
31    /// A stateless C string literal codec.
32    pub fn new() -> Self {
33        Self
34    }
35
36    /// Encodes bytes into a C string literal fragment.
37    ///
38    /// # Parameters
39    /// - `bytes`: Raw bytes to encode.
40    ///
41    /// # Returns
42    /// A C string literal fragment without surrounding quotes.
43    pub fn encode(&self, bytes: &[u8]) -> String {
44        let mut output = String::with_capacity(bytes.len());
45        for byte in bytes {
46            push_encoded_byte(*byte, &mut output);
47        }
48        output
49    }
50
51    /// Decodes a C string literal fragment into bytes.
52    ///
53    /// # Parameters
54    /// - `text`: C string literal fragment without surrounding quotes.
55    ///
56    /// # Returns
57    /// Decoded raw bytes.
58    ///
59    /// # Errors
60    /// Returns [`CodecError::InvalidEscape`] for malformed escape sequences,
61    /// [`CodecError::InvalidDigit`] for malformed fixed-width numeric escapes,
62    /// and [`CodecError::InvalidCharacter`] for unsupported raw source characters.
63    pub fn decode(&self, text: &str) -> CodecResult<Vec<u8>> {
64        let chars = text.char_indices().collect::<Vec<_>>();
65        let mut output = Vec::with_capacity(text.len());
66        let mut position = 0;
67        while let Some(&(index, character)) = chars.get(position) {
68            if character == '\\' {
69                decode_escape(text, &chars, &mut position, &mut output)?;
70                continue;
71            }
72            validate_source_character(index, character)?;
73            output.push(character as u8);
74            position += 1;
75        }
76        Ok(output)
77    }
78}
79
80impl Encoder<[u8]> for CStringLiteralCodec {
81    type Error = CodecError;
82    type Output = String;
83
84    /// Encodes bytes into a C string literal fragment.
85    fn encode(&self, input: &[u8]) -> Result<Self::Output, Self::Error> {
86        Ok(CStringLiteralCodec::encode(self, input))
87    }
88}
89
90impl Decoder<str> for CStringLiteralCodec {
91    type Error = CodecError;
92    type Output = Vec<u8>;
93
94    /// Decodes a C string literal fragment into bytes.
95    fn decode(&self, input: &str) -> Result<Self::Output, Self::Error> {
96        CStringLiteralCodec::decode(self, input)
97    }
98}
99
100/// Decodes one escape sequence at the current position.
101///
102/// # Parameters
103/// - `text`: Original input text.
104/// - `chars`: Indexed characters from `text`.
105/// - `position`: Current character position, pointing at `\`.
106/// - `output`: Destination byte buffer.
107///
108/// # Errors
109/// Returns [`CodecError`] when the escape marker is trailing or the escape
110/// sequence is malformed.
111fn decode_escape(
112    text: &str,
113    chars: &[(usize, char)],
114    position: &mut usize,
115    output: &mut Vec<u8>,
116) -> CodecResult<()> {
117    let marker_index = chars[*position].0;
118    *position += 1;
119    let Some(&(_, escape)) = chars.get(*position) else {
120        return Err(invalid_escape(
121            marker_index,
122            "\\",
123            "incomplete escape sequence",
124        ));
125    };
126    match escape {
127        '\'' => push_simple_escape(position, output, b'\''),
128        '"' => push_simple_escape(position, output, b'"'),
129        '?' => push_simple_escape(position, output, b'?'),
130        '\\' => push_simple_escape(position, output, b'\\'),
131        'a' => push_simple_escape(position, output, 0x07),
132        'b' => push_simple_escape(position, output, 0x08),
133        'f' => push_simple_escape(position, output, 0x0c),
134        'n' => push_simple_escape(position, output, b'\n'),
135        'r' => push_simple_escape(position, output, b'\r'),
136        't' => push_simple_escape(position, output, b'\t'),
137        'v' => push_simple_escape(position, output, 0x0b),
138        'x' | 'X' => {
139            *position += 1;
140            let value = parse_variable_hex_escape(chars, position, marker_index)?;
141            output.push(value);
142        }
143        'u' => {
144            *position += 1;
145            let value = parse_fixed_hex_escape(text, chars, position, marker_index, 4)?;
146            output.push(value);
147        }
148        'U' => {
149            *position += 1;
150            let value = parse_fixed_hex_escape(text, chars, position, marker_index, 8)?;
151            output.push(value);
152        }
153        '0'..='7' => {
154            let value = parse_octal_escape(chars, position);
155            output.push(value);
156        }
157        _ => {
158            return Err(invalid_escape(
159                marker_index,
160                &format!("\\{escape}"),
161                "unsupported escape sequence",
162            ));
163        }
164    }
165    Ok(())
166}
167
/// Emits the byte of a one-character escape and consumes that character.
///
/// # Parameters
/// - `position`: Current character position, pointing at the escape character;
///   advanced by one.
/// - `output`: Destination byte buffer.
/// - `byte`: Byte produced by the escape sequence.
fn push_simple_escape(position: &mut usize, output: &mut Vec<u8>, byte: u8) {
    *position += 1;
    output.push(byte);
}
178
179/// Parses a variable-width hexadecimal byte escape.
180///
181/// # Parameters
182/// - `chars`: Indexed characters from the original input.
183/// - `position`: Current character position after `\x`.
184/// - `marker_index`: Byte index of the escape marker.
185///
186/// # Returns
187/// The decoded byte.
188///
189/// # Errors
190/// Returns [`CodecError::InvalidEscape`] when no hexadecimal digit follows
191/// `\x`.
192fn parse_variable_hex_escape(
193    chars: &[(usize, char)],
194    position: &mut usize,
195    marker_index: usize,
196) -> CodecResult<u8> {
197    let mut value = 0u8;
198    let mut digit_count = 0;
199    while digit_count < 2 {
200        let Some(&(_, character)) = chars.get(*position) else {
201            break;
202        };
203        let Some(digit) = hex_value(character) else {
204            break;
205        };
206        value = (value << 4) | digit;
207        *position += 1;
208        digit_count += 1;
209    }
210    if digit_count == 0 {
211        return Err(invalid_escape(
212            marker_index,
213            "\\x",
214            "expected at least one hexadecimal digit",
215        ));
216    }
217    Ok(value)
218}
219
220/// Parses a fixed-width universal byte escape.
221///
222/// # Parameters
223/// - `text`: Original input text.
224/// - `chars`: Indexed characters from `text`.
225/// - `position`: Current character position after `\u` or `\U`.
226/// - `marker_index`: Byte index of the escape marker.
227/// - `digits`: Required number of hexadecimal digits.
228///
229/// # Returns
230/// The decoded byte.
231///
232/// # Errors
233/// Returns [`CodecError::InvalidEscape`] when the escape is too short or too
234/// large for one byte, or [`CodecError::InvalidDigit`] when a required digit is
235/// not hexadecimal.
236fn parse_fixed_hex_escape(
237    text: &str,
238    chars: &[(usize, char)],
239    position: &mut usize,
240    marker_index: usize,
241    digits: usize,
242) -> CodecResult<u8> {
243    let mut value = 0u32;
244    for _ in 0..digits {
245        let Some(&(index, character)) = chars.get(*position) else {
246            let escape = text.get(marker_index..).unwrap_or("\\");
247            return Err(invalid_escape(
248                marker_index,
249                escape,
250                "incomplete universal character escape",
251            ));
252        };
253        let Some(digit) = hex_value(character) else {
254            return Err(CodecError::InvalidDigit {
255                radix: 16,
256                index,
257                character,
258            });
259        };
260        value = (value << 4) | u32::from(digit);
261        *position += 1;
262    }
263    if value > u32::from(u8::MAX) {
264        let escape = text
265            .get(marker_index..chars[*position - 1].0 + chars[*position - 1].1.len_utf8())
266            .unwrap_or("\\u");
267        return Err(invalid_escape(
268            marker_index,
269            escape,
270            "universal character value must fit in one byte",
271        ));
272    }
273    Ok(value as u8)
274}
275
276/// Parses an octal byte escape.
277///
278/// # Parameters
279/// - `chars`: Indexed characters from the original input.
280/// - `position`: Current character position, pointing at the first octal digit.
281///
282/// # Returns
283/// The decoded byte. Values above `0o377` are truncated to their low byte to
284/// match byte-oriented C literal usage.
285fn parse_octal_escape(chars: &[(usize, char)], position: &mut usize) -> u8 {
286    let mut value = 0u16;
287    let mut digit_count = 0;
288    while digit_count < 3 {
289        let Some(&(_, character)) = chars.get(*position) else {
290            break;
291        };
292        let Some(digit) = octal_value(character) else {
293            break;
294        };
295        value = (value << 3) | u16::from(digit);
296        *position += 1;
297        digit_count += 1;
298    }
299    value as u8
300}
301
302/// Validates a raw source character.
303///
304/// # Parameters
305/// - `index`: Byte index of `character` in the original input.
306/// - `character`: Raw, unescaped source character.
307///
308/// # Errors
309/// Returns [`CodecError::InvalidCharacter`] when the raw character is not a
310/// supported ASCII C string source character.
311fn validate_source_character(index: usize, character: char) -> CodecResult<()> {
312    if is_source_character(character) {
313        return Ok(());
314    }
315    Err(CodecError::InvalidCharacter {
316        index,
317        character,
318        reason: "raw source character must be printable ASCII or allowed whitespace".to_owned(),
319    })
320}
321
/// Reports whether a character may appear unescaped in the source text.
///
/// Accepted characters are horizontal tab, line feed, vertical tab, form feed,
/// and the ASCII range from space through `~`.
///
/// # Parameters
/// - `character`: Character to inspect.
///
/// # Returns
/// `true` when `character` is accepted as a raw C string source character.
fn is_source_character(character: char) -> bool {
    match character {
        '\t' | '\n' | '\u{0b}' | '\u{0c}' => true,
        other => (' '..='~').contains(&other),
    }
}
332
333/// Encodes one byte into the destination string.
334///
335/// # Parameters
336/// - `byte`: Byte to encode.
337/// - `output`: Destination string.
338fn push_encoded_byte(byte: u8, output: &mut String) {
339    match byte {
340        b'\'' => output.push_str("\\'"),
341        b'"' => output.push_str("\\\""),
342        b'?' => output.push_str("\\?"),
343        b'\\' => output.push_str("\\\\"),
344        0x07 => output.push_str("\\a"),
345        0x08 => output.push_str("\\b"),
346        0x0c => output.push_str("\\f"),
347        b'\n' => output.push_str("\\n"),
348        b'\r' => output.push_str("\\r"),
349        b'\t' => output.push_str("\\t"),
350        0x0b => output.push_str("\\v"),
351        b' '..=b'~' => output.push(byte as char),
352        _ => {
353            output.push('\\');
354            output.push('x');
355            output.push(uppercase_hex_digit(byte >> 4));
356            output.push(uppercase_hex_digit(byte & 0x0f));
357        }
358    }
359}
360
/// Maps a hexadecimal digit character to its numeric value.
///
/// Both lowercase and uppercase ASCII letters `a`-`f` are accepted.
///
/// # Parameters
/// - `character`: Character to inspect.
///
/// # Returns
/// Nibble value, or `None` when `character` is not hexadecimal.
fn hex_value(character: char) -> Option<u8> {
    // `to_digit(16)` yields values below 16, so narrowing to `u8` is lossless.
    character.to_digit(16).map(|digit| digit as u8)
}
376
/// Maps an octal digit character to its numeric value.
///
/// # Parameters
/// - `character`: Character to inspect.
///
/// # Returns
/// Octal digit value, or `None` when `character` is not octal.
fn octal_value(character: char) -> Option<u8> {
    // `to_digit(8)` yields values below 8, so narrowing to `u8` is lossless.
    character.to_digit(8).map(|digit| digit as u8)
}
390
/// Returns the uppercase hexadecimal digit for a nibble.
///
/// # Parameters
/// - `value`: Nibble value. Values above `0x0f` are masked to their low nibble.
///
/// # Returns
/// Uppercase hexadecimal digit.
fn uppercase_hex_digit(value: u8) -> char {
    const DIGITS: [u8; 16] = *b"0123456789ABCDEF";
    // Masking keeps the index within the sixteen-entry table.
    let nibble = usize::from(value & 0x0f);
    char::from(DIGITS[nibble])
}
418
419/// Builds an invalid escape error.
420///
421/// # Parameters
422/// - `index`: Byte index of the escape marker in the original input.
423/// - `escape`: Escape fragment that caused the error.
424/// - `reason`: Human-readable rejection reason.
425///
426/// # Returns
427/// An invalid escape error.
428fn invalid_escape(index: usize, escape: &str, reason: &str) -> CodecError {
429    CodecError::InvalidEscape {
430        index,
431        escape: escape.to_owned(),
432        reason: reason.to_owned(),
433    }
434}