Skip to main content

tick_encoding/
lib.rs

1#![cfg_attr(feature = "safe", deny(unsafe_code))]
2#![cfg_attr(not(feature = "std"), no_std)]
3#![cfg_attr(feature = "std", doc = include_str!("../README.md"))]
4
5pub(crate) mod decoder;
6pub(crate) mod encoder;
7pub mod iter;
8
9#[cfg(feature = "alloc")]
10extern crate alloc;
11
12#[cfg(feature = "alloc")]
13use alloc::{borrow::Cow, string::String, vec::Vec};
14
/// Per-byte lookup table: `REQUIRES_ESCAPE_TABLE[b]` is `true` when the byte
/// `b` must be escaped, and `false` when it may appear verbatim.
const REQUIRES_ESCAPE_TABLE: [bool; 256] = {
    let mut table = [true; 256];

    // Classify every possible byte value exactly once.
    let mut index = 0usize;
    while index < 256 {
        // `index` is always below 256 here, so the cast is lossless.
        let byte = index as u8;
        table[index] = match byte {
            // Whitespace that is allowed to appear unescaped
            b'\t' | b'\n' | b'\r' => false,
            // The backtick is the escape character itself, so it is never
            // allowed verbatim even though it is printable
            b'`' => true,
            // Remaining printable ASCII: space through tilde
            b' '..=b'~' => false,
            // Other control characters, DEL, and all non-ASCII bytes
            _ => true,
        };
        index += 1;
    }

    table
};
35
/// Sentinel stored in [`HEX_NIBBLE_DECODE_TABLE`] for bytes that are not hex
/// digits at all.
const HEX_NIBBLE_DECODE_INVALID_ERR: u8 = 0xFF;
/// Sentinel stored in [`HEX_NIBBLE_DECODE_TABLE`] for the lowercase hex
/// digits `a`-`f`, which are recognised but rejected.
const HEX_NIBBLE_DECODE_LOWERCASE_ERR: u8 = 0xFE;

/// Per-byte lookup table mapping an ASCII hex character to its nibble value.
///
/// Each entry is one of:
/// - `0x00..=0x0F`: the value of a valid digit (`0`-`9` or uppercase `A`-`F`)
/// - `HEX_NIBBLE_DECODE_LOWERCASE_ERR`: the byte is a lowercase digit `a`-`f`
/// - `HEX_NIBBLE_DECODE_INVALID_ERR`: the byte is not a hex digit
const HEX_NIBBLE_DECODE_TABLE: [u8; 256] = {
    let mut table = [HEX_NIBBLE_DECODE_INVALID_ERR; 256];

    let mut index = 0usize;
    while index < 256 {
        // `index` is always below 256 here, so the cast is lossless.
        let ch = index as u8;
        table[index] = match ch {
            b'0'..=b'9' => ch - b'0',
            b'A'..=b'F' => ch - b'A' + 10,
            b'a'..=b'f' => HEX_NIBBLE_DECODE_LOWERCASE_ERR,
            _ => HEX_NIBBLE_DECODE_INVALID_ERR,
        };
        index += 1;
    }

    table
};
71
/// Tick-encode `input` into a string.
///
/// Bytes that need escaping are replaced by their escaped form. When the
/// input contains no such bytes, no allocation happens and the returned
/// string borrows directly from `input`.
///
/// ## Example
///
/// ```
/// # #![cfg(feature = "alloc")]
/// let encoded = tick_encoding::encode(b"hello world!");
/// assert_eq!(encoded, "hello world!");
///
/// let encoded = tick_encoding::encode(&[0x00, 0xFF]);
/// assert_eq!(encoded, "`00`FF");
/// ```
#[cfg(feature = "alloc")]
#[must_use]
pub fn encode(input: &[u8]) -> Cow<'_, str> {
    // Fast path: if nothing needs escaping, the input already is its own
    // encoding and can be handed back as-is.
    let Some(first_escape) = input.iter().position(|&byte| requires_escape(byte)) else {
        debug_assert!(input.is_ascii());

        // Soundness: no byte requires escaping, so every byte is plain ASCII
        // and the whole slice is therefore valid UTF-8.
        return Cow::Borrowed(from_utf8_unchecked_potentially_unsafe(input));
    };

    // `clean` is the prefix that can be copied verbatim; `rest` starts at the
    // first byte that must be escaped.
    let (clean, rest) = input.split_at(first_escape);
    debug_assert!(clean.is_ascii());

    // Soundness: same argument as above, restricted to the prefix that was
    // scanned and found free of bytes needing an escape.
    let clean = from_utf8_unchecked_potentially_unsafe(clean);

    let mut output = String::with_capacity(input.len() + 1);
    output.push_str(clean);
    encode_to_string(rest, &mut output);
    Cow::Owned(output)
}
121
122/// Return an iterator that encodes the bytes from the input iterator.
123///
124/// ## Example
125///
126/// ```
127/// let iter = tick_encoding::encode_iter(b"x: \x00".iter().copied());
128/// assert_eq!(iter.collect::<String>(), "x: `00");
129/// ```
130pub fn encode_iter<I>(iter: I) -> iter::EncodeIter<I::IntoIter>
131where
132    I: IntoIterator<Item = u8>,
133{
134    iter::EncodeIter::new(iter.into_iter())
135}
136
/// Decode tick-encoded `input` back into raw bytes.
///
/// When the input contains nothing that needs to be un-escaped, no
/// allocation happens and the returned bytes borrow directly from `input`.
///
/// # Errors
///
/// Returns a [`DecodeError`] if the input is not a valid ASCII string or is
/// not a valid canonical tick-encoding.
///
/// ## Example
///
/// ```
/// # #![cfg(feature = "alloc")]
/// let decoded = tick_encoding::decode(b"hello world!").unwrap();
/// assert_eq!(decoded, "hello world!".as_bytes());
///
/// let decoded = tick_encoding::decode(b"`00`FF").unwrap();
/// assert_eq!(decoded, [0x00, 0xFF].as_slice());
/// ```
#[cfg(feature = "alloc")]
pub fn decode(input: &[u8]) -> Result<Cow<'_, [u8]>, DecodeError> {
    // Fast path: every byte is already a valid unescaped byte.
    let Some(first_escape) = input.iter().position(|&byte| requires_escape(byte)) else {
        return Ok(Cow::Borrowed(input));
    };

    // `plain` needs no work; `encoded` begins at the first backtick (or at
    // the first invalid byte, which the decoder below will reject).
    let (plain, encoded) = input.split_at(first_escape);

    // Lower bound on the output size: the verbatim prefix plus the remainder
    // shrunk as if it consisted entirely of 3-byte escapes (rounded up).
    let min_decoded_len = plain.len() + (encoded.len() + 2) / 3;
    let mut output = Vec::with_capacity(min_decoded_len);
    output.extend_from_slice(plain);

    decode_to_vec(encoded, &mut output)?;
    Ok(Cow::Owned(output))
}
182
183/// Return an iterator that decodes the tick-encoded characters from the input
184/// iterator. Returns `Some(Err(_))` if the input character sequence is invalid,
185/// then returns `None` after that.
186///
187/// ## Example
188///
189/// ```
190/// let iter = tick_encoding::decode_iter(b"`00`01".iter().copied());
191/// assert_eq!(iter.collect::<Result<Vec<_>, _>>().unwrap(), vec![0x00, 0x01]);
192/// ```
193pub fn decode_iter<I>(iter: I) -> iter::DecodeIter<I::IntoIter>
194where
195    I: IntoIterator<Item = u8>,
196{
197    iter::DecodeIter::new(iter.into_iter())
198}
199
200/// Decode a tick-encoded ASCII string in-place.
201///
202/// Takes a byte slice containing a tick-encoded ASCII string, and decodes it
203/// in-place, writing back into the same byte slice. Returns a sub-slice
204/// containing just the decoded bytes (the bytes past the returned sub-slice
205/// are left unchanged).
206///
207/// # Errors
208///
209/// Returns a [`DecodeError`] if the input is not valid tick-encoded data.
210///
211/// ## Example
212///
213/// ```rust
214/// let mut buffer = b"bytes: `00`01`02`03".to_vec();
215/// let decoded = tick_encoding::decode_in_place(&mut buffer).unwrap();
216/// assert_eq!(decoded, b"bytes: \x00\x01\x02\x03");
217/// ```
218pub fn decode_in_place(input: &mut [u8]) -> Result<&mut [u8], DecodeError> {
219    // Get the first index that isn't already a valid unescaped byte
220    let Some(escape_index) = input.iter().position(|byte| requires_escape(*byte)) else {
221        // Nothing needs to be unescaped
222        return Ok(input);
223    };
224
225    // Walk through the rest of the input. The bytes between `0..head` have been
226    // decoded, and the bytes between `tail..input.len()` are still encoded.
227    // Since the encoded form is always as long as the decoded form or longer,
228    // `head` will always be less than or equal to `tail`.
229    //
230    // This technique is very similar to the one from `in-place-string-map` (see
231    // https://crates.io/crates/in-place-string-map), but works on a byte slice
232    // instead.
233    let mut head = escape_index;
234    let mut tail = escape_index;
235    while tail < input.len() {
236        if input[tail] == b'`' {
237            let escaped = input.get(tail + 1).ok_or(DecodeError::UnexpectedEnd)?;
238            match escaped {
239                b'`' => {
240                    input[head] = b'`';
241                    tail += 2;
242                    head += 1;
243                }
244                high => {
245                    let low = input.get(tail + 2).ok_or(DecodeError::UnexpectedEnd)?;
246                    let byte = hex_bytes_to_byte(*high, *low)?;
247                    input[head] = byte;
248                    tail += 3;
249                    head += 1;
250                }
251            }
252        } else if requires_escape(input[tail]) {
253            return Err(DecodeError::InvalidByte(input[tail]));
254        } else {
255            input[head] = input[tail];
256            tail += 1;
257            head += 1;
258        }
259    }
260
261    let decoded = &mut input[..head];
262    Ok(decoded)
263}
264
/// Returns true if the given byte must be escaped with a backtick.
///
/// The following ASCII bytes **do not** require escaping, and are left
/// un-escaped in a tick-encoded string:
///
/// - Tab (`\t`, 0x09)
/// - Newline (`\n`, 0x0A)
/// - Carriage return (`\r`, 0x0D)
/// - Space (` `, 0x20)
/// - Printable characters except backtick (0x21 to 0x5F, 0x61 to 0x7E)
///
/// Every other byte requires escaping: the backtick itself (0x60), the
/// remaining ASCII control characters, DEL (0x7F), and all non-ASCII bytes
/// (0x80 to 0xFF).
#[inline]
#[must_use]
pub const fn requires_escape(byte: u8) -> bool {
    // Constant-time table lookup; a `u8` index can never be out of bounds
    // for the 256-entry table.
    REQUIRES_ESCAPE_TABLE[byte as usize]
}
280
/// Tick-encode `input` and append the encoded text to `output`.
///
/// Returns how many bytes were appended. Only ASCII characters are ever
/// appended, so this is also the number of characters appended.
///
/// ## Example
///
/// ```
/// # #![cfg(feature = "alloc")]
/// let mut output = String::new();
/// let count = tick_encoding::encode_to_string("hello, world! 🙂".as_bytes(), &mut output);
/// assert_eq!(output, "hello, world! `F0`9F`99`82");
/// assert_eq!(count, 26);
/// ```
#[cfg(feature = "alloc")]
pub fn encode_to_string(input: &[u8], output: &mut String) -> usize {
    // Remember where we started so the appended length can be derived at the
    // end instead of being tallied by hand.
    let start_len = output.len();

    // The encoded form is at least as long as the input.
    output.reserve(input.len());

    for &byte in input {
        match byte {
            // A literal backtick is written doubled
            b'`' => output.push_str("``"),
            // Any other byte needing an escape becomes backtick + 2 hex digits
            _ if requires_escape(byte) => {
                let [high, low] = byte_to_hex_chars(byte);
                output.push('`');
                output.push(high);
                output.push(low);
            }
            // Everything else is plain ASCII and is copied verbatim
            _ => output.push(char::from(byte)),
        }
    }

    output.len() - start_len
}
317
/// Tick-encode `input` and append the encoded bytes to `output`.
///
/// Returns how many bytes were appended.
///
/// ## Example
///
/// ```
/// let mut output = vec![];
/// let count = tick_encoding::encode_to_vec("hello, world! 🙂".as_bytes(), &mut output);
/// assert_eq!(output, b"hello, world! `F0`9F`99`82");
/// assert_eq!(count, 26);
/// ```
#[cfg(feature = "alloc")]
pub fn encode_to_vec(input: &[u8], output: &mut Vec<u8>) -> usize {
    // Remember where we started so the appended length can be derived at the
    // end instead of being tallied by hand.
    let start_len = output.len();

    // The encoded form is at least as long as the input.
    output.reserve(input.len());

    for &byte in input {
        match byte {
            // A literal backtick is written doubled
            b'`' => output.extend_from_slice(b"``"),
            // Any other byte needing an escape becomes backtick + 2 hex digits
            _ if requires_escape(byte) => {
                let [high, low] = byte_to_hex_bytes(byte);
                output.extend_from_slice(&[b'`', high, low]);
            }
            // Everything else is copied verbatim
            _ => output.push(byte),
        }
    }

    output.len() - start_len
}
350
/// Decode tick-encoded ASCII `input` and append the decoded bytes to
/// `output`.
///
/// Returns how many bytes were appended.
///
/// # Errors
///
/// Returns a [`DecodeError`] if the input is not a valid ASCII string or is
/// not a valid canonical tick-encoding. Decoding stops at the first error;
/// bytes decoded before that point have already been appended to `output`.
///
/// ## Example
///
/// ```
/// let mut output = vec![];
/// let count = tick_encoding::decode_to_vec(b"hello, world! `F0`9F`99`82", &mut output).unwrap();
/// let output_str = core::str::from_utf8(&output).unwrap();
/// assert_eq!(output_str, "hello, world! 🙂");
/// assert_eq!(count, 18);
/// ```
#[cfg(feature = "alloc")]
pub fn decode_to_vec(input: &[u8], output: &mut Vec<u8>) -> Result<usize, DecodeError> {
    let start_len = output.len();

    // A manual `while let` is required: escape sequences pull extra bytes
    // from the same iterator inside the loop body.
    let mut remaining = input.iter().copied();
    while let Some(byte) = remaining.next() {
        let decoded = match byte {
            b'`' => match remaining.next().ok_or(DecodeError::UnexpectedEnd)? {
                // Doubled backtick: a literal backtick
                b'`' => b'`',
                // Otherwise the backtick must introduce two hex digits
                high => {
                    let low = remaining.next().ok_or(DecodeError::UnexpectedEnd)?;
                    hex_bytes_to_byte(high, low)?
                }
            },
            // A byte that should have been escaped appeared verbatim
            _ if requires_escape(byte) => return Err(DecodeError::InvalidByte(byte)),
            _ => byte,
        };
        output.push(decoded);
    }

    Ok(output.len() - start_len)
}
398
/// Map a nibble (`0..=15`) to the ASCII code of its uppercase hex digit.
#[inline]
const fn nibble_to_hex(n: u8) -> u8 {
    match n {
        // 0-9 map straight onto '0'-'9'
        0..=9 => n + b'0',
        // 10-15 map onto 'A'-'F'; the extra 7 skips the ASCII characters that
        // sit between '9' and 'A'
        _ => n + b'0' + 7,
    }
}
406
407/// Convert a byte to its two-character uppercase hex representation.
408#[inline]
409const fn byte_to_hex_bytes(byte: u8) -> [u8; 2] {
410    [nibble_to_hex(byte >> 4), nibble_to_hex(byte & 0x0F)]
411}
412
413const fn byte_to_hex_chars(byte: u8) -> [char; 2] {
414    let [high_byte, low_byte] = byte_to_hex_bytes(byte);
415    [high_byte as char, low_byte as char]
416}
417
418/// Decode two hex ASCII characters into a single byte.
419///
420/// Returns an error if:
421/// - Either character is not a valid hex digit (`InvalidHex`)
422/// - Either character is lowercase a-f (`LowercaseHex`)
423/// - The decoded byte doesn't require escaping (`UnexpectedEscape`)
424#[inline]
425const fn hex_bytes_to_byte(high: u8, low: u8) -> Result<u8, DecodeError> {
426    let high_value = HEX_NIBBLE_DECODE_TABLE[high as usize];
427    let low_value = HEX_NIBBLE_DECODE_TABLE[low as usize];
428
429    match (high_value, low_value) {
430        // Both valid hex digits (0x00-0x0F)
431        (0..=0x0F, 0..=0x0F) => {
432            let byte = (high_value << 4) | low_value;
433
434            if byte == b'`' || !requires_escape(byte) {
435                return Err(DecodeError::UnexpectedEscape(
436                    EscapedHex(high, low),
437                    byte as char,
438                ));
439            }
440
441            Ok(byte)
442        }
443        // At least one invalid character
444        (HEX_NIBBLE_DECODE_INVALID_ERR, _) | (_, HEX_NIBBLE_DECODE_INVALID_ERR) => {
445            Err(DecodeError::InvalidHex(EscapedHex(high, low)))
446        }
447        // Must be lowercase
448        _ => Err(DecodeError::LowercaseHex(EscapedHex(high, low))),
449    }
450}
451
/// Convert bytes that the caller expects to be ASCII into a `&str`.
///
/// This is the checked variant, compiled when the `safe` feature is enabled:
/// the bytes are validated with [`core::str::from_utf8`], and the call
/// panics (through `unwrap`) if they are not valid UTF-8.
#[cfg(feature = "safe")]
fn from_utf8_unchecked_potentially_unsafe(bytes: &[u8]) -> &str {
    core::str::from_utf8(bytes).unwrap()
}

/// Convert bytes that the caller expects to be ASCII into a `&str`.
///
/// This is the unchecked variant, compiled when the `safe` feature is
/// disabled: no UTF-8 validation is performed. The ASCII precondition is
/// only verified by a `debug_assert!`, so it is the caller's responsibility
/// whenever debug assertions are turned off.
#[cfg(not(feature = "safe"))]
fn from_utf8_unchecked_potentially_unsafe(bytes: &[u8]) -> &str {
    debug_assert!(bytes.is_ascii());
    // SAFETY: ASCII is a subset of UTF-8, so this is sound provided `bytes`
    // is entirely ASCII. The caller visible in this file (`encode`) only
    // passes bytes for which `requires_escape` returned false, all of which
    // are ASCII.
    unsafe { core::str::from_utf8_unchecked(bytes) }
}
462
/// An error trying to decode a tick-encoded string.
#[derive(Debug)]
#[cfg_attr(feature = "std", derive(thiserror::Error))]
pub enum DecodeError {
    /// Encountered an invalid byte in the string. This could either be a
    /// non-ASCII byte or an ASCII byte that requires escaping (see
    /// [`requires_escape`]). The field holds the offending byte.
    #[cfg_attr(feature = "std", error("invalid encoded byte 0x{0:02x}"))]
    InvalidByte(u8),
    /// Reached the end of the string following a backtick (\`). A backtick
    /// must be followed by either another backtick or a 2-digit hex value.
    #[cfg_attr(feature = "std", error("unexpected end after `"))]
    UnexpectedEnd,
    /// Tried to decode a 2-digit hex value, but the value does not require
    /// escaping (see [`requires_escape`]), or it is a backtick, which must
    /// be written as two backticks instead. The fields hold the escape
    /// sequence as written and the character it decodes to.
    #[cfg_attr(feature = "std", error("unexpected escape {0}, expected {1}"))]
    UnexpectedEscape(EscapedHex, char),
    /// Tried to decode a 2-digit hex value, but the hex value contained
    /// the values `[a-f]`. Escaped hex values must use `[A-F]`.
    #[cfg_attr(feature = "std", error("expected uppercase hex sequence, found {0}"))]
    LowercaseHex(EscapedHex),
    /// Tried to decode a 2-digit hex value, but an invalid hex digit
    /// was found. Escaped hex values must use the characters `[0-9A-F]`.
    #[cfg_attr(feature = "std", error("invalid hex sequence {0}"))]
    InvalidHex(EscapedHex),
}
489
/// A two-digit escaped hex sequence, prefixed with a backtick.
///
/// The fields are the two raw input bytes that followed the backtick, in
/// the order they appeared (high digit first, then low digit). They are
/// stored as found in the input, so they are not necessarily valid hex
/// digits.
pub struct EscapedHex(pub u8, pub u8);
492
493impl core::fmt::Debug for EscapedHex {
494    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
495        let Self(high, low) = self;
496        if requires_escape(*high) || requires_escape(*low) {
497            f.debug_tuple("EscapedHex")
498                .field(&self.0)
499                .field(&self.1)
500                .finish()
501        } else {
502            f.debug_tuple("EscapedHex")
503                .field(&(*high as char))
504                .field(&(*low as char))
505                .finish()
506        }
507    }
508}
509
510impl core::fmt::Display for EscapedHex {
511    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
512        let Self(high, low) = self;
513        if requires_escape(*high) || requires_escape(*low) {
514            write!(f, "0x{high:02X} 0x{low:02X}")
515        } else {
516            write!(f, "`{}{}", *high as char, *low as char)
517        }
518    }
519}