Skip to main content

qubit_codec_misc/
c_string_literal_codec.rs

// =============================================================================
//    Copyright (c) 2026 Haixing Hu.
//
//    SPDX-License-Identifier: Apache-2.0
//
//    Licensed under the Apache License, Version 2.0.
// =============================================================================
//! C string literal byte codec.
9
10use crate::{
11    Codec,
12    MiscCodecError,
13    MiscCodecResult,
14    ValueDecoder,
15    ValueEncoder,
16};
17
/// Uppercase hexadecimal digits, indexed by nibble value (`0..=15`).
const UPPER_HEX_DIGITS: [char; 16] = [
    '0', '1', '2', '3', '4', '5', '6', '7', // nibbles 0..=7
    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', // nibbles 8..=15
];
22
23/// Encodes and decodes byte-oriented C string literal fragments.
24///
25/// This codec is intended for textual formats that embed byte sequences with C
26/// escapes, such as `PK\003\004` or `\xd0\xcf`. It decodes into raw bytes and
27/// does not require surrounding quotes.
28///
29/// Its low-level [`Codec<Value = u8, Unit = u8>`] implementation handles one
30/// raw byte or one C escape fragment. Whole-fragment iteration remains part of
31/// the owned [`encode`](Self::encode) and [`decode`](Self::decode) helpers.
32#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
33pub struct CStringLiteralCodec;
34
35impl CStringLiteralCodec {
36    /// Creates a C string literal codec.
37    ///
38    /// # Returns
39    /// A stateless C string literal codec.
40    #[inline]
41    pub fn new() -> Self {
42        Self
43    }
44
45    /// Encodes bytes into a C string literal fragment.
46    ///
47    /// # Parameters
48    /// - `bytes`: Raw bytes to encode.
49    ///
50    /// # Returns
51    /// A C string literal fragment without surrounding quotes.
52    #[inline]
53    pub fn encode(&self, bytes: &[u8]) -> String {
54        let mut output = String::with_capacity(bytes.len());
55        for byte in bytes {
56            push_encoded_byte(*byte, &mut output);
57        }
58        output
59    }
60
61    /// Decodes a C string literal fragment into bytes.
62    ///
63    /// # Parameters
64    /// - `text`: C string literal fragment without surrounding quotes.
65    ///
66    /// # Returns
67    /// Decoded raw bytes.
68    ///
69    /// # Errors
70    /// Returns [`MiscCodecError::InvalidEscape`] for malformed escape
71    /// sequences, [`MiscCodecError::InvalidDigit`] for malformed
72    /// fixed-width numeric escapes,
73    /// and [`MiscCodecError::InvalidCharacter`] for unsupported raw source
74    /// characters.
75    #[inline]
76    pub fn decode(&self, text: &str) -> MiscCodecResult<Vec<u8>> {
77        let input = text.as_bytes();
78        let mut output = Vec::with_capacity(text.len());
79        let mut index = 0;
80        while index < input.len() {
81            let (decoded, consumed) = decode_c_string_literal_unit(
82                input,
83                index,
84                CStringLiteralParseContext::CompleteText(text),
85            )?;
86            debug_assert!(consumed > 0);
87            output.push(decoded);
88            index += consumed;
89        }
90        Ok(output)
91    }
92}
93
94impl ValueEncoder<[u8]> for CStringLiteralCodec {
95    type Error = MiscCodecError;
96    type Output = String;
97
98    /// Encodes bytes into a C string literal fragment.
99    #[inline]
100    fn encode(&self, input: &[u8]) -> Result<Self::Output, Self::Error> {
101        Ok(CStringLiteralCodec::encode(self, input))
102    }
103}
104
105impl ValueDecoder<str> for CStringLiteralCodec {
106    type Error = MiscCodecError;
107    type Output = Vec<u8>;
108
109    /// Decodes a C string literal fragment into bytes.
110    #[inline]
111    fn decode(&self, input: &str) -> Result<Self::Output, Self::Error> {
112        CStringLiteralCodec::decode(self, input)
113    }
114}
115
116unsafe impl Codec for CStringLiteralCodec {
117    type Value = u8;
118    type Unit = u8;
119    type DecodeError = MiscCodecError;
120    type EncodeError = MiscCodecError;
121
122    /// Returns the shortest representation length for one byte.
123    #[inline(always)]
124    fn min_units_per_value(&self) -> core::num::NonZeroUsize {
125        core::num::NonZeroUsize::MIN
126    }
127
128    /// Returns the longest supported universal escape length for one byte.
129    #[inline(always)]
130    fn max_units_per_value(&self) -> core::num::NonZeroUsize {
131        unsafe { core::num::NonZeroUsize::new_unchecked(10) }
132    }
133
134    /// Decodes one raw byte or one C escape fragment.
135    #[inline]
136    unsafe fn decode_unchecked(
137        &self,
138        input: &[u8],
139        index: usize,
140    ) -> Result<(u8, core::num::NonZeroUsize), Self::DecodeError> {
141        debug_assert!(index < input.len());
142
143        let (value, consumed) = decode_c_string_literal_byte(input, index)?;
144        debug_assert!(consumed > 0);
145        // SAFETY: `decode_c_string_literal_byte` returns a non-zero width for
146        // every successful raw byte or escape.
147        let consumed =
148            unsafe { core::num::NonZeroUsize::new_unchecked(consumed) };
149        Ok((value, consumed))
150    }
151
152    /// Encodes one byte as a raw byte or C escape fragment.
153    #[inline]
154    unsafe fn encode_unchecked(
155        &self,
156        value: &u8,
157        output: &mut [u8],
158        index: usize,
159    ) -> Result<usize, Self::EncodeError> {
160        let required = match *value {
161            b'\'' | b'"' | b'?' | b'\\' | 0x07 | 0x08 | 0x0c | b'\n'
162            | b'\r' | b'\t' | 0x0b => 2,
163            b' '..=b'~' => 1,
164            _ => 4,
165        };
166        debug_assert!(index + required <= output.len());
167
168        Ok(write_encoded_byte(*value, output, index))
169    }
170}
171
172/// Parsing context for one C string literal unit.
173///
174/// Complete text parsing preserves owned decoder diagnostics, while streaming
175/// byte parsing reports incomplete fragments so buffered callers can retry.
176#[derive(Debug, Clone, Copy)]
177enum CStringLiteralParseContext<'a> {
178    /// Parsing a complete UTF-8 literal fragment.
179    CompleteText(&'a str),
180    /// Parsing one byte unit for a streaming codec caller.
181    StreamingBytes,
182}
183
184impl CStringLiteralParseContext<'_> {
185    /// Tests whether parsing is for a complete text fragment.
186    ///
187    /// # Returns
188    /// `true` when incomplete trailing escapes should be reported as malformed
189    /// complete input instead of as retryable incomplete input.
190    #[inline(always)]
191    fn is_complete_text(self) -> bool {
192        matches!(self, Self::CompleteText(_))
193    }
194
195    /// Builds the error for a trailing escape marker.
196    ///
197    /// # Parameters
198    /// - `marker_index`: Byte index of the escape marker.
199    /// - `available`: Available unit count from `marker_index`.
200    ///
201    /// # Returns
202    /// A malformed escape error for complete text, or an incomplete-input error
203    /// for streaming byte parsing.
204    fn trailing_escape_error(
205        self,
206        marker_index: usize,
207        available: usize,
208    ) -> MiscCodecError {
209        match self {
210            Self::CompleteText(_) => {
211                invalid_escape(marker_index, "\\", "incomplete escape sequence")
212            }
213            Self::StreamingBytes => MiscCodecError::Incomplete {
214                required: 2,
215                available,
216            },
217        }
218    }
219
220    /// Gets the source character at a byte index for diagnostics.
221    ///
222    /// # Parameters
223    /// - `input`: Encoded byte units.
224    /// - `index`: Byte index to inspect.
225    ///
226    /// # Returns
227    /// The UTF-8 source character for complete text, or the byte mapped to a
228    /// Unicode scalar value for byte parsing.
229    fn source_character(self, input: &[u8], index: usize) -> char {
230        match self {
231            Self::CompleteText(text) => text
232                .get(index..)
233                .and_then(|rest| rest.chars().next())
234                .unwrap_or(char::from(input[index])),
235            Self::StreamingBytes => char::from(input[index]),
236        }
237    }
238
239    /// Builds a raw source character rejection reason.
240    ///
241    /// # Returns
242    /// The diagnostic reason matching the parsing context.
243    #[inline(always)]
244    fn raw_source_reason(self) -> &'static str {
245        match self {
246            Self::CompleteText(_) => {
247                "raw source character must be printable ASCII or allowed whitespace"
248            }
249            Self::StreamingBytes => {
250                "raw source byte must be printable ASCII or allowed whitespace"
251            }
252        }
253    }
254
255    /// Builds an escape fragment for diagnostics.
256    ///
257    /// # Parameters
258    /// - `input`: Encoded byte units.
259    /// - `start`: Start byte index.
260    /// - `end`: Exclusive fallback byte end index for byte parsing.
261    ///
262    /// # Returns
263    /// A displayable escape fragment.
264    fn escape_fragment(self, input: &[u8], start: usize, end: usize) -> String {
265        match self {
266            Self::CompleteText(text) => text
267                .get(start..end)
268                .or(text.get(start..))
269                .unwrap_or("\\")
270                .to_owned(),
271            Self::StreamingBytes => escape_fragment(input, start, end),
272        }
273    }
274}
275
276/// Encodes one byte into the destination string.
277///
278/// # Parameters
279/// - `byte`: Byte to encode.
280/// - `output`: Destination string.
281#[inline]
282fn push_encoded_byte(byte: u8, output: &mut String) {
283    match byte {
284        b'\'' => output.push_str("\\'"),
285        b'"' => output.push_str("\\\""),
286        b'?' => output.push_str("\\?"),
287        b'\\' => output.push_str("\\\\"),
288        0x07 => output.push_str("\\a"),
289        0x08 => output.push_str("\\b"),
290        0x0c => output.push_str("\\f"),
291        b'\n' => output.push_str("\\n"),
292        b'\r' => output.push_str("\\r"),
293        b'\t' => output.push_str("\\t"),
294        0x0b => output.push_str("\\v"),
295        b' '..=b'~' => output.push(byte as char),
296        _ => {
297            output.push('\\');
298            output.push('x');
299            output.push(uppercase_hex_digit(byte >> 4));
300            output.push(uppercase_hex_digit(byte & 0x0f));
301        }
302    }
303}
304
305/// Decodes one byte-oriented C string literal fragment from `input`.
306///
307/// # Parameters
308/// - `input`: Encoded byte units.
309/// - `index`: Start index in `input`.
310///
311/// # Returns
312/// Decoded byte and consumed unit count.
313///
314/// # Errors
315/// Returns [`MiscCodecError`] when the raw byte or escape fragment is invalid.
316#[inline]
317fn decode_c_string_literal_byte(
318    input: &[u8],
319    index: usize,
320) -> MiscCodecResult<(u8, usize)> {
321    decode_c_string_literal_unit(
322        input,
323        index,
324        CStringLiteralParseContext::StreamingBytes,
325    )
326}
327
328/// Decodes one C string literal unit from `input`.
329///
330/// # Parameters
331/// - `input`: Encoded byte units.
332/// - `index`: Start index in `input`.
333/// - `context`: Complete-text or streaming-byte parsing context.
334///
335/// # Returns
336/// Decoded byte and consumed unit count.
337///
338/// # Errors
339/// Returns [`MiscCodecError`] when the raw byte or escape fragment is invalid.
340#[inline]
341fn decode_c_string_literal_unit(
342    input: &[u8],
343    index: usize,
344    context: CStringLiteralParseContext<'_>,
345) -> MiscCodecResult<(u8, usize)> {
346    let available = input.len().saturating_sub(index);
347    if available == 0 {
348        return Err(MiscCodecError::Incomplete {
349            required: 1,
350            available,
351        });
352    }
353    let byte = input[index];
354    if byte != b'\\' {
355        validate_source_unit(input, index, byte, context)?;
356        return Ok((byte, 1));
357    }
358    if available < 2 {
359        return Err(context.trailing_escape_error(index, available));
360    }
361    let escape = input[index + 1];
362    match escape {
363        b' ' => Ok((b' ', 2)),
364        b'\'' => Ok((b'\'', 2)),
365        b'"' => Ok((b'"', 2)),
366        b'?' => Ok((b'?', 2)),
367        b'\\' => Ok((b'\\', 2)),
368        b'a' => Ok((0x07, 2)),
369        b'b' => Ok((0x08, 2)),
370        b'f' => Ok((0x0c, 2)),
371        b'n' => Ok((b'\n', 2)),
372        b'r' => Ok((b'\r', 2)),
373        b't' => Ok((b'\t', 2)),
374        b'v' => Ok((0x0b, 2)),
375        b'x' | b'X' => {
376            if !context.is_complete_text() {
377                ensure_variable_hex_escape_complete(input, index, available)?;
378            }
379            parse_variable_hex_escape_units(input, index)
380        }
381        b'u' => {
382            if !context.is_complete_text() {
383                ensure_fixed_escape_complete(available, 6)?;
384            }
385            parse_fixed_hex_escape_units(input, index, 4, context)
386        }
387        b'U' => {
388            if !context.is_complete_text() {
389                ensure_fixed_escape_complete(available, 10)?;
390            }
391            parse_fixed_hex_escape_units(input, index, 8, context)
392        }
393        b'0'..=b'7' => {
394            ensure_octal_escape_complete(input, index, available)?;
395            Ok(parse_octal_escape_units(input, index))
396        }
397        _ => Err(invalid_escape(
398            index,
399            &context.escape_fragment(input, index, index + 2),
400            "unsupported escape sequence",
401        )),
402    }
403}
404
405/// Ensures a variable-width `\x` escape has enough units to decide one value.
406///
407/// # Parameters
408/// - `input`: Encoded byte units.
409/// - `index`: Start index of the escape marker.
410/// - `available`: Available unit count from `index`.
411///
412/// # Errors
413/// Returns [`MiscCodecError::Incomplete`] when more units are required.
414#[inline]
415fn ensure_variable_hex_escape_complete(
416    _input: &[u8],
417    _index: usize,
418    available: usize,
419) -> MiscCodecResult<()> {
420    if available < 3 {
421        return Err(MiscCodecError::Incomplete {
422            required: 3,
423            available,
424        });
425    }
426    Ok(())
427}
428
429/// Ensures a fixed-width universal byte escape has enough units.
430///
431/// # Parameters
432/// - `available`: Available unit count from `index`.
433/// - `required`: Required unit count for this escape form.
434///
435/// # Errors
436/// Returns [`MiscCodecError::Incomplete`] when more units are required.
437#[inline]
438fn ensure_fixed_escape_complete(
439    available: usize,
440    required: usize,
441) -> MiscCodecResult<()> {
442    if available < required {
443        return Err(MiscCodecError::Incomplete {
444            required,
445            available,
446        });
447    }
448    Ok(())
449}
450
451/// Ensures an octal escape has enough units to decide one value.
452///
453/// # Parameters
454/// - `input`: Encoded byte units.
455/// - `index`: Start index of the escape marker.
456/// - `available`: Available unit count from `index`.
457///
458/// # Errors
459/// Returns [`MiscCodecError::Incomplete`] when more units are required.
460#[inline]
461fn ensure_octal_escape_complete(
462    _input: &[u8],
463    _index: usize,
464    _available: usize,
465) -> MiscCodecResult<()> {
466    Ok(())
467}
468
469/// Validates a raw source unit.
470///
471/// # Parameters
472/// - `input`: Encoded byte units.
473/// - `index`: Byte index in the encoded input.
474/// - `byte`: Raw source byte.
475/// - `context`: Parsing context used for diagnostics.
476///
477/// # Errors
478/// Returns [`MiscCodecError::InvalidCharacter`] when the byte is not allowed as
479/// a raw C string source byte.
480#[inline]
481fn validate_source_unit(
482    input: &[u8],
483    index: usize,
484    byte: u8,
485    context: CStringLiteralParseContext<'_>,
486) -> MiscCodecResult<()> {
487    if matches!(byte, b'\t' | b'\n' | 0x0b | 0x0c | b' '..=b'~') {
488        return Ok(());
489    }
490    Err(MiscCodecError::InvalidCharacter {
491        index,
492        character: context.source_character(input, index),
493        reason: context.raw_source_reason().to_owned(),
494    })
495}
496
497/// Parses a byte-oriented `\x` escape from `input`.
498///
499/// # Parameters
500/// - `input`: Encoded byte units.
501/// - `marker_index`: Byte index of the escape marker.
502///
503/// # Returns
504/// Decoded byte and consumed unit count.
505///
506/// # Errors
507/// Returns [`MiscCodecError::InvalidEscape`] when no hexadecimal digit follows
508/// `\x`.
509#[inline]
510fn parse_variable_hex_escape_units(
511    input: &[u8],
512    marker_index: usize,
513) -> MiscCodecResult<(u8, usize)> {
514    let mut value = 0u8;
515    let mut digit_count = 0usize;
516    let mut index = marker_index + 2;
517    while digit_count < 2 {
518        let Some(&byte) = input.get(index) else {
519            break;
520        };
521        let Some(digit) = hex_value(char::from(byte)) else {
522            break;
523        };
524        value = (value << 4) | digit;
525        index += 1;
526        digit_count += 1;
527    }
528    if digit_count == 0 {
529        return Err(invalid_escape(
530            marker_index,
531            "\\x",
532            "expected at least one hexadecimal digit",
533        ));
534    }
535    Ok((value, 2 + digit_count))
536}
537
538/// Parses a fixed-width universal byte escape from `input`.
539///
540/// # Parameters
541/// - `input`: Encoded byte units.
542/// - `marker_index`: Byte index of the escape marker.
543/// - `digits`: Required hexadecimal digit count.
544///
545/// # Returns
546/// Decoded byte and consumed unit count.
547///
548/// # Errors
549/// Returns [`MiscCodecError::InvalidEscape`] when the escape is incomplete or
550/// larger than one byte, or [`MiscCodecError::InvalidDigit`] when a required
551/// digit is not hexadecimal.
552#[inline]
553fn parse_fixed_hex_escape_units(
554    input: &[u8],
555    marker_index: usize,
556    digits: usize,
557    context: CStringLiteralParseContext<'_>,
558) -> MiscCodecResult<(u8, usize)> {
559    let mut value = 0u32;
560    let mut index = marker_index + 2;
561    for _ in 0..digits {
562        let Some(_) = input.get(index) else {
563            return Err(invalid_escape(
564                marker_index,
565                &context.escape_fragment(input, marker_index, input.len()),
566                "incomplete universal character escape",
567            ));
568        };
569        let character = context.source_character(input, index);
570        let Some(digit) = hex_value(character) else {
571            return Err(MiscCodecError::InvalidDigit {
572                radix: 16,
573                index,
574                character,
575            });
576        };
577        value = (value << 4) | u32::from(digit);
578        index += 1;
579    }
580    if value > u32::from(u8::MAX) {
581        return Err(invalid_escape(
582            marker_index,
583            &context.escape_fragment(input, marker_index, index),
584            "universal character value must fit in one byte",
585        ));
586    }
587    Ok((value as u8, 2 + digits))
588}
589
590/// Parses an octal byte escape from `input`.
591///
592/// # Parameters
593/// - `input`: Encoded byte units.
594/// - `marker_index`: Byte index of the escape marker.
595///
596/// # Returns
597/// Decoded byte and consumed unit count. Values above `0o377` are truncated to
598/// their low byte to match the owned decoder.
599#[inline]
600fn parse_octal_escape_units(input: &[u8], marker_index: usize) -> (u8, usize) {
601    let mut value = 0u16;
602    let mut digit_count = 0usize;
603    let mut index = marker_index + 1;
604    while digit_count < 3 {
605        let Some(&byte) = input.get(index) else {
606            break;
607        };
608        let Some(digit) = octal_value(char::from(byte)) else {
609            break;
610        };
611        value = (value << 3) | u16::from(digit);
612        index += 1;
613        digit_count += 1;
614    }
615    (value as u8, 1 + digit_count)
616}
617
618/// Encodes one byte into `output`.
619///
620/// # Parameters
621/// - `byte`: Byte to encode.
622/// - `output`: Destination unit buffer.
623/// - `index`: Start index in `output`.
624///
625/// # Returns
626/// Number of units written.
627#[inline]
628fn write_encoded_byte(byte: u8, output: &mut [u8], index: usize) -> usize {
629    match byte {
630        b'\'' => write_ascii_escape(output, index, b'\''),
631        b'"' => write_ascii_escape(output, index, b'"'),
632        b'?' => write_ascii_escape(output, index, b'?'),
633        b'\\' => write_ascii_escape(output, index, b'\\'),
634        0x07 => write_ascii_escape(output, index, b'a'),
635        0x08 => write_ascii_escape(output, index, b'b'),
636        0x0c => write_ascii_escape(output, index, b'f'),
637        b'\n' => write_ascii_escape(output, index, b'n'),
638        b'\r' => write_ascii_escape(output, index, b'r'),
639        b'\t' => write_ascii_escape(output, index, b't'),
640        0x0b => write_ascii_escape(output, index, b'v'),
641        b' '..=b'~' => {
642            output[index] = byte;
643            1
644        }
645        _ => {
646            output[index] = b'\\';
647            output[index + 1] = b'x';
648            output[index + 2] = uppercase_hex_digit(byte >> 4) as u8;
649            output[index + 3] = uppercase_hex_digit(byte & 0x0f) as u8;
650            4
651        }
652    }
653}
654
/// Writes a backslash followed by one escape unit.
///
/// # Parameters
/// - `output`: Destination unit buffer.
/// - `index`: Start index in `output`.
/// - `escape`: ASCII escape letter after the backslash.
///
/// # Returns
/// Number of units written, always `2`.
///
/// # Panics
/// Panics when `output` has fewer than two units available at `index`.
#[inline(always)]
fn write_ascii_escape(output: &mut [u8], index: usize, escape: u8) -> usize {
    let pair = [b'\\', escape];
    for (offset, unit) in pair.iter().enumerate() {
        output[index + offset] = *unit;
    }
    pair.len()
}
670
/// Builds an ASCII-ish escape fragment from encoded units.
///
/// Each byte in `start..end` is mapped to the Unicode scalar value with the
/// same number, so bytes above `0x7f` appear as Latin-1 characters.
///
/// # Parameters
/// - `input`: Encoded byte units.
/// - `start`: Start index.
/// - `end`: Exclusive end index; clamped to `input.len()`.
///
/// # Returns
/// String fragment used in diagnostics. The fragment is empty when `start`
/// lies beyond the clamped end, so building a diagnostic never panics.
fn escape_fragment(input: &[u8], start: usize, end: usize) -> String {
    let bounded_end = end.min(input.len());
    input
        .get(start..bounded_end)
        .unwrap_or_default()
        .iter()
        .copied()
        .map(char::from)
        .collect()
}
687
/// Converts one hexadecimal character to its nibble value.
///
/// Both lowercase and uppercase ASCII hexadecimal digits are accepted.
///
/// # Parameters
/// - `character`: Character to inspect.
///
/// # Returns
/// Nibble value, or `None` when `character` is not hexadecimal.
#[inline(always)]
fn hex_value(character: char) -> Option<u8> {
    // `to_digit(16)` yields at most 15, so the cast is lossless.
    character.to_digit(16).map(|digit| digit as u8)
}
704
/// Converts one octal character to its value.
///
/// # Parameters
/// - `character`: Character to inspect.
///
/// # Returns
/// Octal digit value, or `None` when `character` is not octal.
#[inline(always)]
fn octal_value(character: char) -> Option<u8> {
    // `to_digit(8)` yields at most 7, so the cast is lossless.
    character.to_digit(8).map(|digit| digit as u8)
}
719
720/// Converts one nibble to an uppercase hexadecimal digit.
721///
722/// # Parameters
723/// - `value`: Nibble value. Values above `0x0f` are masked to their low nibble.
724///
725/// # Returns
726/// Uppercase hexadecimal digit.
727#[inline(always)]
728fn uppercase_hex_digit(value: u8) -> char {
729    UPPER_HEX_DIGITS[(value & 0x0f) as usize]
730}
731
732/// Builds an invalid escape error.
733///
734/// # Parameters
735/// - `index`: Byte index of the escape marker in the original input.
736/// - `escape`: Escape fragment that caused the error.
737/// - `reason`: Human-readable rejection reason.
738///
739/// # Returns
740/// An invalid escape error.
741fn invalid_escape(index: usize, escape: &str, reason: &str) -> MiscCodecError {
742    MiscCodecError::InvalidEscape {
743        index,
744        escape: escape.to_owned(),
745        reason: reason.to_owned(),
746    }
747}