tick_encoding/
lib.rs

1#![cfg_attr(feature = "safe", deny(unsafe_code))]
2#![cfg_attr(not(feature = "std"), no_std)]
3#![cfg_attr(feature = "std", doc = include_str!("../README.md"))]
4
5pub(crate) mod decoder;
6pub(crate) mod encoder;
7pub mod iter;
8
9#[cfg(feature = "alloc")]
10extern crate alloc;
11
12#[cfg(feature = "alloc")]
13use alloc::{borrow::Cow, string::String, vec::Vec};
14
/// Tick-encode `input` and return the result as a string.
///
/// The input is scanned for the first byte that needs escaping. When there
/// is no such byte the input is returned as-is, borrowed; otherwise a new
/// `String` is allocated holding the untouched prefix followed by the
/// encoded remainder.
///
/// ## Example
///
/// ```
/// # #![cfg(feature = "alloc")]
/// let encoded = tick_encoding::encode(b"hello world!");
/// assert_eq!(encoded, "hello world!");
///
/// let encoded = tick_encoding::encode(&[0x00, 0xFF]);
/// assert_eq!(encoded, "`00`FF");
/// ```
#[cfg(feature = "alloc")]
pub fn encode(input: &[u8]) -> Cow<str> {
    // Locate the first byte that cannot be copied through verbatim
    let Some(split) = input.iter().position(|&byte| requires_escape(byte)) else {
        debug_assert!(input.is_ascii());

        // SAFETY: No byte requires escaping, so the whole input is ASCII
        // (and therefore valid UTF-8)
        return Cow::Borrowed(from_utf8_unchecked_potentially_unsafe(input));
    };

    // `clean` needs no escaping; `dirty` starts with the first byte that does
    let (clean, dirty) = input.split_at(split);
    debug_assert!(clean.is_ascii());

    let mut encoded = String::with_capacity(input.len() + 1);

    // SAFETY: Every byte in `clean` passed the `requires_escape` check, so
    // it is ASCII (and therefore valid UTF-8)
    encoded.push_str(from_utf8_unchecked_potentially_unsafe(clean));

    // Escape everything from the first offending byte onwards
    encode_to_string(dirty, &mut encoded);
    Cow::Owned(encoded)
}
61
62/// Return an iterator that encodes the bytes from the input iterator.
63///
64/// ## Example
65///
66/// ```
67/// let iter = tick_encoding::encode_iter(b"x: \x00".iter().copied());
68/// assert_eq!(iter.collect::<String>(), "x: `00");
69/// ```
70pub fn encode_iter<I>(iter: I) -> iter::EncodeIter<I::IntoIter>
71where
72    I: IntoIterator<Item = u8>,
73{
74    iter::EncodeIter::new(iter.into_iter())
75}
76
/// Decode tick-encoded `input` back into the bytes it represents.
///
/// When the input contains nothing that needs to be unescaped, the result
/// is borrowed straight from the input; otherwise a new `Vec` is allocated.
///
/// Returns an error if the input isn't a valid ASCII string, or isn't a
/// valid canonical tick-encoding.
///
/// ## Example
///
/// ```
/// # #![cfg(feature = "alloc")]
/// let decoded = tick_encoding::decode(b"hello world!").unwrap();
/// assert_eq!(decoded, "hello world!".as_bytes());
///
/// let decoded = tick_encoding::decode(b"`00`FF").unwrap();
/// assert_eq!(decoded, [0x00, 0xFF].as_slice());
/// ```
#[cfg(feature = "alloc")]
pub fn decode(input: &[u8]) -> Result<Cow<[u8]>, DecodeError> {
    // Locate the first byte that is not already a plain, unescaped byte
    let Some(split) = input.iter().position(|&byte| requires_escape(byte)) else {
        return Ok(Cow::Borrowed(input));
    };

    // `literal` can be copied through untouched; `encoded` still has to be
    // decoded (and validated)
    let (literal, encoded) = input.split_at(split);

    let mut decoded = Vec::with_capacity(literal.len() + 1);
    decoded.extend_from_slice(literal);
    decode_to_vec(encoded, &mut decoded)?;

    Ok(Cow::Owned(decoded))
}
114
115/// Return an iterator that decodes the tick-encoded characters from the input
116/// iterator. Returns `Some(Err(_))` if the input character sequence is invalid,
117/// then returns `None` after that.
118///
119/// ## Example
120///
121/// ```
122/// let iter = tick_encoding::decode_iter(b"`00`01".iter().copied());
123/// assert_eq!(iter.collect::<Result<Vec<_>, _>>().unwrap(), vec![0x00, 0x01]);
124/// ```
125pub fn decode_iter<I>(iter: I) -> iter::DecodeIter<I::IntoIter>
126where
127    I: IntoIterator<Item = u8>,
128{
129    iter::DecodeIter::new(iter.into_iter())
130}
131
132/// Take a byte slice containing a tick-encoded ASCII string, and decode it
133/// in-place, writing back into the same byte slice. Returns a sub-slice
134/// containing just the decoded bytes (the bytes past the returned sub-slice
135/// are left unchanged).
136///
137/// ## Example
138///
139/// ```rust
140/// let mut buffer = b"bytes: `00`01`02`03".to_vec();
141/// let decoded = tick_encoding::decode_in_place(&mut buffer).unwrap();
142/// assert_eq!(decoded, b"bytes: \x00\x01\x02\x03");
143/// ```
144pub fn decode_in_place(input: &mut [u8]) -> Result<&mut [u8], DecodeError> {
145    // Get the first index that isn't already a valid unescaped byte
146    let Some(escape_index) = input.iter().position(|byte| requires_escape(*byte)) else {
147        // Nothing needs to be unescaped
148        return Ok(input);
149    };
150
151    // Walk through the rest of the input. The bytes between `0..head` have been
152    // decoded, and the bytes between `tail..input.len()` are still encoded.
153    // Since the encoded form is always as long as the decoded form or longer,
154    // `head` will always be less than or equal to `tail`.
155    //
156    // This technique is very similar to the one from `in-place-string-map` (see
157    // https://crates.io/crates/in-place-string-map), but works on a byte slice
158    // instead.
159    let mut head = escape_index;
160    let mut tail = escape_index;
161    while tail < input.len() {
162        if input[tail] == b'`' {
163            let escaped = input.get(tail + 1).ok_or(DecodeError::UnexpectedEnd)?;
164            match escaped {
165                b'`' => {
166                    input[head] = b'`';
167                    tail += 2;
168                    head += 1;
169                }
170                high => {
171                    let low = input.get(tail + 2).ok_or(DecodeError::UnexpectedEnd)?;
172                    let byte = hex_bytes_to_byte(*high, *low)?;
173                    input[head] = byte;
174                    tail += 3;
175                    head += 1;
176                }
177            }
178        } else if requires_escape(input[tail]) {
179            return Err(DecodeError::InvalidByte(input[tail]));
180        } else {
181            input[head] = input[tail];
182            tail += 1;
183            head += 1;
184        }
185    }
186
187    let decoded = &mut input[..head];
188    Ok(decoded)
189}
190
/// Returns true if the given byte must be escaped with a backtick.
///
/// The following ASCII bytes **do not** require escaping, and are left
/// un-escaped in a tick-encoded string:
///
/// - Tab (`\t`, 0x09)
/// - Newline (`\n`, 0x0A)
/// - Carriage return (`\r`, 0x0D)
/// - Space (` `, 0x20)
/// - Printable characters except backtick (0x21 to 0x5F, 0x61 to 0x7E)
///
/// Every other byte requires escaping: the backtick itself (0x60), the
/// remaining ASCII control characters, DEL (0x7F), and all non-ASCII bytes
/// (0x80 to 0xFF).
pub const fn requires_escape(byte: u8) -> bool {
    // The two ranges cover space through `~`, skipping only the backtick
    // (0x60), which sits between `_` (0x5F) and `a` (0x61)
    !matches!(byte, b'\t' | b'\n' | b'\r' | b' '..=b'_' | b'a'..=b'~')
}
208
/// Tick-encode `input`, appending the encoded text to the end of `output`.
///
/// Returns how many bytes / characters were appended; the two counts are
/// the same because only ASCII characters are ever appended.
///
/// ## Example
///
/// ```
/// # #![cfg(feature = "alloc")]
/// let mut output = String::new();
/// let count = tick_encoding::encode_to_string("hello, world! 🙂".as_bytes(), &mut output);
/// assert_eq!(output, "hello, world! `F0`9F`99`82");
/// assert_eq!(count, 26);
/// ```
#[cfg(feature = "alloc")]
pub fn encode_to_string(input: &[u8], output: &mut String) -> usize {
    // The encoded form is at least as long as the input
    output.reserve(input.len());

    input
        .iter()
        .map(|&byte| -> usize {
            match byte {
                // A literal backtick is written as a doubled backtick
                b'`' => {
                    output.push_str("``");
                    2
                }
                // Any other byte needing escape becomes a backtick followed
                // by two uppercase hex digits
                _ if requires_escape(byte) => {
                    let [high, low] = byte_to_hex_chars(byte);
                    output.push('`');
                    output.push(high);
                    output.push(low);
                    3
                }
                // Everything else is copied through unchanged
                _ => {
                    output.push(byte as char);
                    1
                }
            }
        })
        .sum()
}
245
/// Encode the given input, and append the result to `output`. Returns
/// the number of bytes appended.
///
/// Bytes that do not require escaping (see [`requires_escape`]) are copied
/// through unchanged, a backtick is written as two backticks, and every
/// other byte is written as a backtick followed by two uppercase hex digits.
///
/// ## Example
///
/// ```
/// # #![cfg(feature = "alloc")]
/// let mut output = vec![];
/// let count = tick_encoding::encode_to_vec("hello, world! 🙂".as_bytes(), &mut output);
/// assert_eq!(output, b"hello, world! `F0`9F`99`82");
/// assert_eq!(count, 26);
/// ```
#[cfg(feature = "alloc")]
pub fn encode_to_vec(input: &[u8], output: &mut Vec<u8>) -> usize {
    // Remember where we started so the appended length can be reported
    let start = output.len();

    // The encoded form is at least as long as the input
    output.reserve(input.len());

    for &byte in input {
        match byte {
            // A literal backtick is written as a doubled backtick
            b'`' => output.extend_from_slice(b"``"),
            _ if requires_escape(byte) => {
                let [high, low] = byte_to_hex_bytes(byte);
                output.extend_from_slice(&[b'`', high, low]);
            }
            _ => output.push(byte),
        }
    }

    output.len() - start
}
278
/// Decode the given tick-encoded ASCII input, and append the result to
/// `output`. Returns the number of bytes appended. Returns an error
/// if the input isn't a valid ASCII string, or isn't a valid canonical
/// tick-encoding.
///
/// If an error is returned, the bytes decoded before the error was detected
/// have already been appended to `output`.
///
/// ## Example
///
/// ```
/// # #![cfg(feature = "alloc")]
/// let mut output = vec![];
/// let count = tick_encoding::decode_to_vec(b"hello, world! `F0`9F`99`82", &mut output).unwrap();
/// let output_str = core::str::from_utf8(&output).unwrap();
/// assert_eq!(output_str, "hello, world! 🙂");
/// assert_eq!(count, 18);
/// ```
#[cfg(feature = "alloc")]
pub fn decode_to_vec(input: &[u8], output: &mut Vec<u8>) -> Result<usize, DecodeError> {
    // Remember where we started so the appended length can be reported
    let start = output.len();

    // A manual `while let` is needed (rather than `for`) because an escape
    // sequence pulls additional bytes from the same iterator
    let mut bytes = input.iter().copied();
    while let Some(byte) = bytes.next() {
        let decoded = match byte {
            b'`' => match bytes.next().ok_or(DecodeError::UnexpectedEnd)? {
                // A doubled backtick stands for one literal backtick
                b'`' => b'`',
                // Otherwise the backtick introduces two hex digits
                high => {
                    let low = bytes.next().ok_or(DecodeError::UnexpectedEnd)?;
                    hex_bytes_to_byte(high, low)?
                }
            },
            // Bytes that should have been escaped are not allowed verbatim
            _ if requires_escape(byte) => return Err(DecodeError::InvalidByte(byte)),
            _ => byte,
        };
        output.push(decoded);
    }

    Ok(output.len() - start)
}
322
/// Convert a byte into its two uppercase ASCII hex digits, high nibble
/// first (e.g. `0x9A` becomes `[b'9', b'A']`).
const fn byte_to_hex_bytes(byte: u8) -> [u8; 2] {
    // Indexed by nibble value (0 to 15)
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    let upper_nibble = (byte >> 4) as usize;
    let lower_nibble = (byte & 0x0F) as usize;

    [HEX_DIGITS[upper_nibble], HEX_DIGITS[lower_nibble]]
}
340
341const fn byte_to_hex_chars(byte: u8) -> [char; 2] {
342    let [high_byte, low_byte] = byte_to_hex_bytes(byte);
343    [high_byte as char, low_byte as char]
344}
345
346const fn hex_bytes_to_byte(high: u8, low: u8) -> Result<u8, DecodeError> {
347    enum HexCharResult {
348        Valid(u8),
349        Lowercase,
350        Invalid,
351    }
352
353    let high_value = match high {
354        b'0'..=b'9' => HexCharResult::Valid(high - b'0'),
355        b'A'..=b'F' => HexCharResult::Valid(high - b'A' + 10),
356        b'a'..=b'f' => HexCharResult::Lowercase,
357        _ => HexCharResult::Invalid,
358    };
359
360    let low_value = match low {
361        b'0'..=b'9' => HexCharResult::Valid(low - b'0'),
362        b'A'..=b'F' => HexCharResult::Valid(low - b'A' + 10),
363        b'a'..=b'f' => HexCharResult::Lowercase,
364        _ => HexCharResult::Invalid,
365    };
366
367    let byte = match (high_value, low_value) {
368        (HexCharResult::Valid(high_value), HexCharResult::Valid(low_value)) => {
369            (high_value << 4) | low_value
370        }
371        (HexCharResult::Invalid, _) | (_, HexCharResult::Invalid) => {
372            return Err(DecodeError::InvalidHex(EscapedHex(high, low)));
373        }
374        (HexCharResult::Lowercase, _) | (_, HexCharResult::Lowercase) => {
375            return Err(DecodeError::LowercaseHex(EscapedHex(high, low)));
376        }
377    };
378
379    if byte == b'`' || !requires_escape(byte) {
380        return Err(DecodeError::UnexpectedEscape(
381            EscapedHex(high, low),
382            byte as char,
383        ));
384    }
385
386    Ok(byte)
387}
388
/// Convert ASCII bytes to a `&str`.
///
/// This is the variant used when the `safe` feature is enabled: the bytes
/// are fully validated as UTF-8, and the function panics if validation
/// fails. Callers must only pass bytes they know to be ASCII.
#[cfg(feature = "safe")]
fn from_utf8_unchecked_potentially_unsafe(bytes: &[u8]) -> &str {
    core::str::from_utf8(bytes).expect("caller must only pass ASCII bytes, which are valid UTF-8")
}

/// Convert ASCII bytes to a `&str` without validating them.
///
/// This is the variant used when the `safe` feature is disabled. Callers
/// must only pass bytes they know to be ASCII; this is checked with a
/// `debug_assert!` only, so passing non-UTF-8 bytes in a release build is
/// undefined behavior.
#[cfg(not(feature = "safe"))]
fn from_utf8_unchecked_potentially_unsafe(bytes: &[u8]) -> &str {
    debug_assert!(bytes.is_ascii());

    // SAFETY: Callers only pass bytes that they have already established to
    // be ASCII, and every ASCII byte sequence is valid UTF-8
    unsafe { core::str::from_utf8_unchecked(bytes) }
}
399
/// An error trying to decode a tick-encoded string.
#[derive(Debug)]
#[cfg_attr(feature = "std", derive(thiserror::Error))]
pub enum DecodeError {
    /// Encountered an invalid byte in the string. This could either be a
    /// non-ASCII byte or an ASCII byte that requires escaping (see
    /// [`requires_escape`]). The field holds the offending byte.
    #[cfg_attr(feature = "std", error("invalid encoded byte 0x{0:02x}"))]
    InvalidByte(u8),
    /// Reached the end of the string following a backtick (\`). A backtick
    /// must be followed by either another backtick or a 2-digit hex value.
    #[cfg_attr(feature = "std", error("unexpected end after `"))]
    UnexpectedEnd,
    /// Tried to decode a 2-digit hex value, but the value does not require
    /// escaping (see [`requires_escape`]). The fields hold the escape
    /// sequence that was found and the character it decodes to.
    #[cfg_attr(feature = "std", error("unexpected escape {0}, expected {1}"))]
    UnexpectedEscape(EscapedHex, char),
    /// Tried to decode a 2-digit hex value, but the hex value contained
    /// the values `[a-f]`. Escaped hex values must use `[A-F]`.
    #[cfg_attr(feature = "std", error("expected uppercase hex sequence, found {0}"))]
    LowercaseHex(EscapedHex),
    /// Tried to decode a 2-digit hex value, but an invalid hex digit
    /// was found. Escaped hex values must use the characters `[0-9A-F]`.
    #[cfg_attr(feature = "std", error("invalid hex sequence {0}"))]
    InvalidHex(EscapedHex),
}
426
/// A two-digit escaped hex sequence, prefixed with a backtick.
///
/// The fields hold the two raw bytes that followed the backtick in the
/// encoded input, high digit first. They are stored exactly as found, so
/// they are not necessarily valid hex digits (or even printable).
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct EscapedHex(pub u8, pub u8);
429
430impl core::fmt::Debug for EscapedHex {
431    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
432        let Self(high, low) = self;
433        if requires_escape(*high) || requires_escape(*low) {
434            f.debug_tuple("EscapedHex")
435                .field(&self.0)
436                .field(&self.1)
437                .finish()
438        } else {
439            f.debug_tuple("EscapedHex")
440                .field(&(*high as char))
441                .field(&(*low as char))
442                .finish()
443        }
444    }
445}
446
447impl core::fmt::Display for EscapedHex {
448    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
449        let Self(high, low) = self;
450        if requires_escape(*high) || requires_escape(*low) {
451            write!(f, "0x{high:02X} 0x{low:02X}")
452        } else {
453            write!(f, "`{}{}", *high as char, *low as char)
454        }
455    }
456}