qubit_codec/
c_string_literal_codec.rs

/*******************************************************************************
 *
 *    Copyright (c) 2026 Haixing Hu.
 *
 *    SPDX-License-Identifier: Apache-2.0
 *
 *    Licensed under the Apache License, Version 2.0.
 *
 ******************************************************************************/
//! C string literal byte codec.
11
12use crate::{
13    CodecError,
14    CodecResult,
15    Decoder,
16    Encoder,
17};
18
/// Stateless codec for byte sequences written with C-style escapes.
///
/// Textual formats frequently spell binary data as fragments such as
/// `PK\003\004` or `\xd0\xcf`. This codec converts between such fragments and
/// the raw bytes they denote. Fragments are handled without surrounding
/// quotes, and decoding always yields bytes rather than text.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct CStringLiteralCodec;
26
27impl CStringLiteralCodec {
28    /// Creates a C string literal codec.
29    ///
30    /// # Returns
31    /// A stateless C string literal codec.
32    pub fn new() -> Self {
33        Self
34    }
35
36    /// Encodes bytes into a C string literal fragment.
37    ///
38    /// # Parameters
39    /// - `bytes`: Raw bytes to encode.
40    ///
41    /// # Returns
42    /// A C string literal fragment without surrounding quotes.
43    pub fn encode(&self, bytes: &[u8]) -> String {
44        let mut output = String::with_capacity(bytes.len());
45        for byte in bytes {
46            push_encoded_byte(*byte, &mut output);
47        }
48        output
49    }
50
51    /// Decodes a C string literal fragment into bytes.
52    ///
53    /// # Parameters
54    /// - `text`: C string literal fragment without surrounding quotes.
55    ///
56    /// # Returns
57    /// Decoded raw bytes.
58    ///
59    /// # Errors
60    /// Returns [`CodecError::InvalidEscape`] for malformed escape sequences,
61    /// [`CodecError::InvalidDigit`] for malformed fixed-width numeric escapes,
62    /// and [`CodecError::InvalidCharacter`] for unsupported raw source characters.
63    pub fn decode(&self, text: &str) -> CodecResult<Vec<u8>> {
64        let chars = text.char_indices().collect::<Vec<_>>();
65        let mut output = Vec::with_capacity(text.len());
66        let mut position = 0;
67        while let Some(&(index, character)) = chars.get(position) {
68            if character == '\\' {
69                decode_escape(text, &chars, &mut position, &mut output)?;
70                continue;
71            }
72            validate_source_character(index, character)?;
73            output.push(character as u8);
74            position += 1;
75        }
76        Ok(output)
77    }
78}
79
80impl Encoder<[u8]> for CStringLiteralCodec {
81    type Error = CodecError;
82    type Output = String;
83
84    /// Encodes bytes into a C string literal fragment.
85    fn encode(&self, input: &[u8]) -> Result<Self::Output, Self::Error> {
86        Ok(CStringLiteralCodec::encode(self, input))
87    }
88}
89
90impl Decoder<str> for CStringLiteralCodec {
91    type Error = CodecError;
92    type Output = Vec<u8>;
93
94    /// Decodes a C string literal fragment into bytes.
95    fn decode(&self, input: &str) -> Result<Self::Output, Self::Error> {
96        CStringLiteralCodec::decode(self, input)
97    }
98}
99
100/// Decodes one escape sequence at the current position.
101///
102/// # Parameters
103/// - `text`: Original input text.
104/// - `chars`: Indexed characters from `text`.
105/// - `position`: Current character position, pointing at `\`.
106/// - `output`: Destination byte buffer.
107///
108/// # Errors
109/// Returns [`CodecError`] when the escape marker is trailing or the escape
110/// sequence is malformed.
111fn decode_escape(
112    text: &str,
113    chars: &[(usize, char)],
114    position: &mut usize,
115    output: &mut Vec<u8>,
116) -> CodecResult<()> {
117    let marker_index = chars[*position].0;
118    *position += 1;
119    let Some(&(_, escape)) = chars.get(*position) else {
120        return Err(invalid_escape(
121            marker_index,
122            "\\",
123            "incomplete escape sequence",
124        ));
125    };
126    match escape {
127        ' ' => push_simple_escape(position, output, b' '),
128        '\'' => push_simple_escape(position, output, b'\''),
129        '"' => push_simple_escape(position, output, b'"'),
130        '?' => push_simple_escape(position, output, b'?'),
131        '\\' => push_simple_escape(position, output, b'\\'),
132        'a' => push_simple_escape(position, output, 0x07),
133        'b' => push_simple_escape(position, output, 0x08),
134        'f' => push_simple_escape(position, output, 0x0c),
135        'n' => push_simple_escape(position, output, b'\n'),
136        'r' => push_simple_escape(position, output, b'\r'),
137        't' => push_simple_escape(position, output, b'\t'),
138        'v' => push_simple_escape(position, output, 0x0b),
139        'x' | 'X' => {
140            *position += 1;
141            let value = parse_variable_hex_escape(chars, position, marker_index)?;
142            output.push(value);
143        }
144        'u' => {
145            *position += 1;
146            let value = parse_fixed_hex_escape(text, chars, position, marker_index, 4)?;
147            output.push(value);
148        }
149        'U' => {
150            *position += 1;
151            let value = parse_fixed_hex_escape(text, chars, position, marker_index, 8)?;
152            output.push(value);
153        }
154        '0'..='7' => {
155            let value = parse_octal_escape(chars, position);
156            output.push(value);
157        }
158        _ => {
159            return Err(invalid_escape(
160                marker_index,
161                &format!("\\{escape}"),
162                "unsupported escape sequence",
163            ));
164        }
165    }
166    Ok(())
167}
168
/// Emits the byte of a one-character escape and consumes that character.
///
/// # Parameters
/// - `position`: Current character position, pointing at the escape character;
///   advanced by one.
/// - `output`: Destination byte buffer.
/// - `byte`: Byte that the escape sequence stands for.
fn push_simple_escape(position: &mut usize, output: &mut Vec<u8>, byte: u8) {
    output.extend_from_slice(&[byte]);
    *position += 1;
}
179
180/// Parses a variable-width hexadecimal byte escape.
181///
182/// # Parameters
183/// - `chars`: Indexed characters from the original input.
184/// - `position`: Current character position after `\x`.
185/// - `marker_index`: Byte index of the escape marker.
186///
187/// # Returns
188/// The decoded byte.
189///
190/// # Errors
191/// Returns [`CodecError::InvalidEscape`] when no hexadecimal digit follows
192/// `\x`.
193fn parse_variable_hex_escape(
194    chars: &[(usize, char)],
195    position: &mut usize,
196    marker_index: usize,
197) -> CodecResult<u8> {
198    let mut value = 0u8;
199    let mut digit_count = 0;
200    while digit_count < 2 {
201        let Some(&(_, character)) = chars.get(*position) else {
202            break;
203        };
204        let Some(digit) = hex_value(character) else {
205            break;
206        };
207        value = (value << 4) | digit;
208        *position += 1;
209        digit_count += 1;
210    }
211    if digit_count == 0 {
212        return Err(invalid_escape(
213            marker_index,
214            "\\x",
215            "expected at least one hexadecimal digit",
216        ));
217    }
218    Ok(value)
219}
220
221/// Parses a fixed-width universal byte escape.
222///
223/// # Parameters
224/// - `text`: Original input text.
225/// - `chars`: Indexed characters from `text`.
226/// - `position`: Current character position after `\u` or `\U`.
227/// - `marker_index`: Byte index of the escape marker.
228/// - `digits`: Required number of hexadecimal digits.
229///
230/// # Returns
231/// The decoded byte.
232///
233/// # Errors
234/// Returns [`CodecError::InvalidEscape`] when the escape is too short or too
235/// large for one byte, or [`CodecError::InvalidDigit`] when a required digit is
236/// not hexadecimal.
237fn parse_fixed_hex_escape(
238    text: &str,
239    chars: &[(usize, char)],
240    position: &mut usize,
241    marker_index: usize,
242    digits: usize,
243) -> CodecResult<u8> {
244    let mut value = 0u32;
245    for _ in 0..digits {
246        let Some(&(index, character)) = chars.get(*position) else {
247            let escape = text.get(marker_index..).unwrap_or("\\");
248            return Err(invalid_escape(
249                marker_index,
250                escape,
251                "incomplete universal character escape",
252            ));
253        };
254        let Some(digit) = hex_value(character) else {
255            return Err(CodecError::InvalidDigit {
256                radix: 16,
257                index,
258                character,
259            });
260        };
261        value = (value << 4) | u32::from(digit);
262        *position += 1;
263    }
264    if value > u32::from(u8::MAX) {
265        let escape = text
266            .get(marker_index..chars[*position - 1].0 + chars[*position - 1].1.len_utf8())
267            .unwrap_or("\\u");
268        return Err(invalid_escape(
269            marker_index,
270            escape,
271            "universal character value must fit in one byte",
272        ));
273    }
274    Ok(value as u8)
275}
276
277/// Parses an octal byte escape.
278///
279/// # Parameters
280/// - `chars`: Indexed characters from the original input.
281/// - `position`: Current character position, pointing at the first octal digit.
282///
283/// # Returns
284/// The decoded byte. Values above `0o377` are truncated to their low byte to
285/// match byte-oriented C literal usage.
286fn parse_octal_escape(chars: &[(usize, char)], position: &mut usize) -> u8 {
287    let mut value = 0u16;
288    let mut digit_count = 0;
289    while digit_count < 3 {
290        let Some(&(_, character)) = chars.get(*position) else {
291            break;
292        };
293        let Some(digit) = octal_value(character) else {
294            break;
295        };
296        value = (value << 3) | u16::from(digit);
297        *position += 1;
298        digit_count += 1;
299    }
300    value as u8
301}
302
303/// Validates a raw source character.
304///
305/// # Parameters
306/// - `index`: Byte index of `character` in the original input.
307/// - `character`: Raw, unescaped source character.
308///
309/// # Errors
310/// Returns [`CodecError::InvalidCharacter`] when the raw character is not a
311/// supported ASCII C string source character.
312fn validate_source_character(index: usize, character: char) -> CodecResult<()> {
313    if is_source_character(character) {
314        return Ok(());
315    }
316    Err(CodecError::InvalidCharacter {
317        index,
318        character,
319        reason: "raw source character must be printable ASCII or allowed whitespace".to_owned(),
320    })
321}
322
/// Reports whether a character is accepted in raw, unescaped form.
///
/// Accepted characters are horizontal tab, line feed, vertical tab, form feed
/// and the printable ASCII range from space through `~`.
///
/// # Parameters
/// - `character`: Character to inspect.
///
/// # Returns
/// `true` when `character` is accepted as a raw C string source character.
fn is_source_character(character: char) -> bool {
    match character {
        '\t' | '\n' | '\u{0b}' | '\u{0c}' => true,
        other => (' '..='~').contains(&other),
    }
}
333
334/// Encodes one byte into the destination string.
335///
336/// # Parameters
337/// - `byte`: Byte to encode.
338/// - `output`: Destination string.
339fn push_encoded_byte(byte: u8, output: &mut String) {
340    match byte {
341        b'\'' => output.push_str("\\'"),
342        b'"' => output.push_str("\\\""),
343        b'?' => output.push_str("\\?"),
344        b'\\' => output.push_str("\\\\"),
345        0x07 => output.push_str("\\a"),
346        0x08 => output.push_str("\\b"),
347        0x0c => output.push_str("\\f"),
348        b'\n' => output.push_str("\\n"),
349        b'\r' => output.push_str("\\r"),
350        b'\t' => output.push_str("\\t"),
351        0x0b => output.push_str("\\v"),
352        b' '..=b'~' => output.push(byte as char),
353        _ => {
354            output.push('\\');
355            output.push('x');
356            output.push(uppercase_hex_digit(byte >> 4));
357            output.push(uppercase_hex_digit(byte & 0x0f));
358        }
359    }
360}
361
/// Maps a hexadecimal digit character to the nibble it denotes.
///
/// Both lowercase and uppercase letters `a` to `f` are recognised.
///
/// # Parameters
/// - `character`: Character to inspect.
///
/// # Returns
/// Nibble value, or `None` when `character` is not hexadecimal.
fn hex_value(character: char) -> Option<u8> {
    // `to_digit(16)` yields values below 16, so narrowing to `u8` is lossless.
    character.to_digit(16).map(|nibble| nibble as u8)
}
377
/// Maps an octal digit character to the value it denotes.
///
/// # Parameters
/// - `character`: Character to inspect.
///
/// # Returns
/// Octal digit value, or `None` when `character` is not octal.
fn octal_value(character: char) -> Option<u8> {
    // `to_digit(8)` yields values below 8, so narrowing to `u8` is lossless.
    character.to_digit(8).map(|digit| digit as u8)
}
391
/// Maps a nibble to its uppercase hexadecimal digit.
///
/// # Parameters
/// - `value`: Nibble value. Values above `0x0f` are masked to their low nibble.
///
/// # Returns
/// Uppercase hexadecimal digit.
fn uppercase_hex_digit(value: u8) -> char {
    const DIGITS: &[u8; 16] = b"0123456789ABCDEF";
    // Masking keeps the index within the sixteen-entry table.
    let nibble = usize::from(value & 0x0f);
    char::from(DIGITS[nibble])
}
419
420/// Builds an invalid escape error.
421///
422/// # Parameters
423/// - `index`: Byte index of the escape marker in the original input.
424/// - `escape`: Escape fragment that caused the error.
425/// - `reason`: Human-readable rejection reason.
426///
427/// # Returns
428/// An invalid escape error.
429fn invalid_escape(index: usize, escape: &str, reason: &str) -> CodecError {
430    CodecError::InvalidEscape {
431        index,
432        escape: escape.to_owned(),
433        reason: reason.to_owned(),
434    }
435}
qubit_codec/c_string_literal_codec.rs

qubit_codec/
c_string_literal_codec.rs