tick_encoding/lib.rs
1#![cfg_attr(feature = "safe", deny(unsafe_code))]
2#![cfg_attr(not(feature = "std"), no_std)]
3#![cfg_attr(feature = "std", doc = include_str!("../README.md"))]
4
5pub(crate) mod decoder;
6pub(crate) mod encoder;
7pub mod iter;
8
9#[cfg(feature = "alloc")]
10extern crate alloc;
11
12#[cfg(feature = "alloc")]
13use alloc::{borrow::Cow, string::String, vec::Vec};
14
/// Lookup table, indexed by byte value, telling whether that byte must be
/// escaped in a tick-encoded string.
///
/// An entry is `false` only for tab, newline, carriage return, and the
/// printable ASCII range (space through tilde) minus the backtick. Every
/// other entry is `true`.
const REQUIRES_ESCAPE_TABLE: [bool; 256] = {
    let mut table = [false; 256];

    // Walk every possible byte value once and classify it.
    let mut byte = u8::MIN;
    loop {
        // Bytes that may appear verbatim: the three whitespace controls and
        // printable ASCII, with the backtick carved out
        let passes_through =
            matches!(byte, b'\t' | b'\n' | b'\r' | b' '..=b'~') && byte != b'`';
        table[byte as usize] = !passes_through;

        // Stop before the increment would overflow
        if byte == u8::MAX {
            break;
        }
        byte += 1;
    }

    table
};
35
/// Sentinel stored in `HEX_NIBBLE_DECODE_TABLE` for bytes that are not hex
/// digits at all.
const HEX_NIBBLE_DECODE_INVALID_ERR: u8 = 0xFF;
/// Sentinel stored in `HEX_NIBBLE_DECODE_TABLE` for the lowercase hex digits
/// `a`-`f`, which are recognized but rejected.
const HEX_NIBBLE_DECODE_LOWERCASE_ERR: u8 = 0xFE;

/// Lookup table for hex ASCII character to nibble.
///
/// Values:
/// - 0x00-0x0F: Valid hex digit (`0`-`9` or uppercase `A`-`F`)
/// - `HEX_NIBBLE_DECODE_LOWERCASE_ERR`: Lowercase hex digit (a-f)
/// - `HEX_NIBBLE_DECODE_INVALID_ERR`: Invalid character
const HEX_NIBBLE_DECODE_TABLE: [u8; 256] = {
    // Start with every byte marked invalid, then fill in the known digits
    let mut table = [HEX_NIBBLE_DECODE_INVALID_ERR; 256];

    // Digits '0'-'9' -> 0-9
    let mut i = b'0';
    while i <= b'9' {
        table[i as usize] = i - b'0';
        i += 1;
    }

    // Uppercase 'A'-'F' -> 10-15
    i = b'A';
    while i <= b'F' {
        table[i as usize] = i - b'A' + 10;
        i += 1;
    }

    // Lowercase 'a'-'f' -> lowercase error
    i = b'a';
    while i <= b'f' {
        table[i as usize] = HEX_NIBBLE_DECODE_LOWERCASE_ERR;
        i += 1;
    }

    table
};
71
/// Tick-encode `input` into a string.
///
/// When none of the input bytes need escaping, the returned string borrows
/// directly from `input`; otherwise a new string is allocated.
///
/// ## Example
///
/// ```
/// # #![cfg(feature = "alloc")]
/// let encoded = tick_encoding::encode(b"hello world!");
/// assert_eq!(encoded, "hello world!");
///
/// let encoded = tick_encoding::encode(&[0x00, 0xFF]);
/// assert_eq!(encoded, "`00`FF");
/// ```
#[cfg(feature = "alloc")]
#[must_use]
pub fn encode(input: &[u8]) -> Cow<'_, str> {
    // Locate the first byte that cannot be emitted verbatim
    let Some(first_escape) = input.iter().position(|&byte| requires_escape(byte)) else {
        debug_assert!(input.is_ascii());

        // SAFETY: No byte requires escaping, so the whole input is ASCII
        // (and therefore valid UTF-8)
        return Cow::Borrowed(from_utf8_unchecked_potentially_unsafe(input));
    };

    // Everything before `first_escape` can be copied as-is; the rest has to
    // go through the encoder
    let (verbatim, remainder) = input.split_at(first_escape);
    debug_assert!(verbatim.is_ascii());

    // SAFETY: No byte in the prefix requires escaping, so the prefix is
    // ASCII (and therefore valid UTF-8)
    let verbatim = from_utf8_unchecked_potentially_unsafe(verbatim);

    let mut output = String::with_capacity(input.len() + 1);
    output.push_str(verbatim);
    encode_to_string(remainder, &mut output);
    Cow::Owned(output)
}
121
122/// Return an iterator that encodes the bytes from the input iterator.
123///
124/// ## Example
125///
126/// ```
127/// let iter = tick_encoding::encode_iter(b"x: \x00".iter().copied());
128/// assert_eq!(iter.collect::<String>(), "x: `00");
129/// ```
130pub fn encode_iter<I>(iter: I) -> iter::EncodeIter<I::IntoIter>
131where
132 I: IntoIterator<Item = u8>,
133{
134 iter::EncodeIter::new(iter.into_iter())
135}
136
/// Decode tick-encoded `input` into bytes.
///
/// When the input contains nothing that needs un-escaping, the result
/// borrows directly from `input`; otherwise a new vector is allocated.
///
/// Returns an error if the input isn't a valid ASCII string, or isn't a
/// valid canonical tick-encoding.
///
/// # Errors
///
/// Returns a [`DecodeError`] if the input is not valid tick-encoded data.
///
/// ## Example
///
/// ```
/// # #![cfg(feature = "alloc")]
/// let decoded = tick_encoding::decode(b"hello world!").unwrap();
/// assert_eq!(decoded, "hello world!".as_bytes());
///
/// let decoded = tick_encoding::decode(b"`00`FF").unwrap();
/// assert_eq!(decoded, [0x00, 0xFF].as_slice());
/// ```
#[cfg(feature = "alloc")]
pub fn decode(input: &[u8]) -> Result<Cow<'_, [u8]>, DecodeError> {
    // Locate the first byte that is not already a valid unescaped byte
    let Some(first_escape) = input.iter().position(|&byte| requires_escape(byte)) else {
        return Ok(Cow::Borrowed(input));
    };

    // The prefix is copied verbatim; only the suffix needs real decoding
    let (verbatim, encoded) = input.split_at(first_escape);

    // Reserve the smallest size the output can possibly have, i.e. the
    // prefix plus one byte per three remaining bytes (rounded up), as if
    // every remaining byte were part of a hex escape
    let min_output_len = verbatim.len() + (encoded.len() + 2) / 3;
    let mut output = Vec::with_capacity(min_output_len);
    output.extend_from_slice(verbatim);

    decode_to_vec(encoded, &mut output)?;
    Ok(Cow::Owned(output))
}
182
183/// Return an iterator that decodes the tick-encoded characters from the input
184/// iterator. Returns `Some(Err(_))` if the input character sequence is invalid,
185/// then returns `None` after that.
186///
187/// ## Example
188///
189/// ```
190/// let iter = tick_encoding::decode_iter(b"`00`01".iter().copied());
191/// assert_eq!(iter.collect::<Result<Vec<_>, _>>().unwrap(), vec![0x00, 0x01]);
192/// ```
193pub fn decode_iter<I>(iter: I) -> iter::DecodeIter<I::IntoIter>
194where
195 I: IntoIterator<Item = u8>,
196{
197 iter::DecodeIter::new(iter.into_iter())
198}
199
200/// Decode a tick-encoded ASCII string in-place.
201///
202/// Takes a byte slice containing a tick-encoded ASCII string, and decodes it
203/// in-place, writing back into the same byte slice. Returns a sub-slice
204/// containing just the decoded bytes (the bytes past the returned sub-slice
205/// are left unchanged).
206///
207/// # Errors
208///
209/// Returns a [`DecodeError`] if the input is not valid tick-encoded data.
210///
211/// ## Example
212///
213/// ```rust
214/// let mut buffer = b"bytes: `00`01`02`03".to_vec();
215/// let decoded = tick_encoding::decode_in_place(&mut buffer).unwrap();
216/// assert_eq!(decoded, b"bytes: \x00\x01\x02\x03");
217/// ```
218pub fn decode_in_place(input: &mut [u8]) -> Result<&mut [u8], DecodeError> {
219 // Get the first index that isn't already a valid unescaped byte
220 let Some(escape_index) = input.iter().position(|byte| requires_escape(*byte)) else {
221 // Nothing needs to be unescaped
222 return Ok(input);
223 };
224
225 // Walk through the rest of the input. The bytes between `0..head` have been
226 // decoded, and the bytes between `tail..input.len()` are still encoded.
227 // Since the encoded form is always as long as the decoded form or longer,
228 // `head` will always be less than or equal to `tail`.
229 //
230 // This technique is very similar to the one from `in-place-string-map` (see
231 // https://crates.io/crates/in-place-string-map), but works on a byte slice
232 // instead.
233 let mut head = escape_index;
234 let mut tail = escape_index;
235 while tail < input.len() {
236 if input[tail] == b'`' {
237 let escaped = input.get(tail + 1).ok_or(DecodeError::UnexpectedEnd)?;
238 match escaped {
239 b'`' => {
240 input[head] = b'`';
241 tail += 2;
242 head += 1;
243 }
244 high => {
245 let low = input.get(tail + 2).ok_or(DecodeError::UnexpectedEnd)?;
246 let byte = hex_bytes_to_byte(*high, *low)?;
247 input[head] = byte;
248 tail += 3;
249 head += 1;
250 }
251 }
252 } else if requires_escape(input[tail]) {
253 return Err(DecodeError::InvalidByte(input[tail]));
254 } else {
255 input[head] = input[tail];
256 tail += 1;
257 head += 1;
258 }
259 }
260
261 let decoded = &mut input[..head];
262 Ok(decoded)
263}
264
265/// Returns true if the given byte must be escaped with a backtick.
266///
267/// The following ASCII bytes **do not** require escaping, and are left
268/// un-escaped in a tick-encoded string:
269///
270/// - Tab (`\t`, 0x09)
271/// - Newline (`\n`, 0x0A)
272/// - Carriage return (`\r`, 0x0D)
273/// - Space (` `, 0x20)
274/// - Printable characters except backtick (0x21 to 0x59, 0x61 to 0x7E)
275#[inline]
276#[must_use]
277pub const fn requires_escape(byte: u8) -> bool {
278 REQUIRES_ESCAPE_TABLE[byte as usize]
279}
280
/// Tick-encode `input` and append the encoded text to `output`.
///
/// Returns how many bytes were appended. Since only ASCII characters are
/// ever appended, this is also the number of characters appended.
///
/// ## Example
///
/// ```
/// # #![cfg(feature = "alloc")]
/// let mut output = String::new();
/// let count = tick_encoding::encode_to_string("hello, world! 🙂".as_bytes(), &mut output);
/// assert_eq!(output, "hello, world! `F0`9F`99`82");
/// assert_eq!(count, 26);
/// ```
#[cfg(feature = "alloc")]
pub fn encode_to_string(input: &[u8], output: &mut String) -> usize {
    // Everything appended is single-byte ASCII, so the growth of the string
    // in bytes is exactly the number of characters written
    let initial_len = output.len();

    // The encoded form is never shorter than the input
    output.reserve(input.len());

    for byte in input.iter().copied() {
        match byte {
            // A literal backtick is written doubled
            b'`' => output.push_str("``"),
            // Any other byte needing an escape becomes backtick + two hex digits
            escaped if requires_escape(escaped) => {
                let [high, low] = byte_to_hex_chars(escaped);
                output.push('`');
                output.push(high);
                output.push(low);
            }
            // Everything else passes through unchanged
            plain => output.push(char::from(plain)),
        }
    }

    output.len() - initial_len
}
317
/// Encode the given input, and append the result to `output`. Returns
/// the number of bytes appended.
///
/// A backtick is appended as two backticks, any other byte for which
/// [`requires_escape`] is true is appended as a backtick followed by two
/// uppercase hex digits, and all remaining bytes are appended unchanged.
///
/// ## Example
///
/// ```
/// # #![cfg(feature = "alloc")]
/// let mut output = vec![];
/// let count = tick_encoding::encode_to_vec("hello, world! 🙂".as_bytes(), &mut output);
/// assert_eq!(output, b"hello, world! `F0`9F`99`82");
/// assert_eq!(count, 26);
/// ```
#[cfg(feature = "alloc")]
pub fn encode_to_vec(input: &[u8], output: &mut Vec<u8>) -> usize {
    let mut written = 0;

    // The encoded form is never shorter than the input
    output.reserve(input.len());

    for &byte in input {
        if byte == b'`' {
            // A literal backtick is written doubled
            output.extend_from_slice(b"``");
            written += 2;
        } else if requires_escape(byte) {
            // Backtick followed by the two uppercase hex digits of the byte
            let [high, low] = byte_to_hex_bytes(byte);
            output.extend_from_slice(&[b'`', high, low]);

            written += 3;
        } else {
            output.push(byte);
            written += 1;
        }
    }

    written
}
350
/// Decode tick-encoded ASCII input and append the result to a vector.
///
/// Returns the number of bytes appended. Returns an error if the input
/// isn't a valid ASCII string, or isn't a valid canonical tick-encoding.
///
/// # Errors
///
/// Returns a [`DecodeError`] if the input is not valid tick-encoded data.
/// Bytes decoded before the error was detected have already been appended
/// to `output` at that point.
///
/// ## Example
///
/// ```
/// # #![cfg(feature = "alloc")]
/// let mut output = vec![];
/// let count = tick_encoding::decode_to_vec(b"hello, world! `F0`9F`99`82", &mut output).unwrap();
/// let output_str = core::str::from_utf8(&output).unwrap();
/// assert_eq!(output_str, "hello, world! 🙂");
/// assert_eq!(count, 18);
/// ```
#[cfg(feature = "alloc")]
pub fn decode_to_vec(input: &[u8], output: &mut Vec<u8>) -> Result<usize, DecodeError> {
    let mut written = 0;

    // A `for` loop can't be used here, since an escape sequence pulls extra
    // bytes from the iterator inside the loop body
    let mut iter = input.iter();
    while let Some(&byte) = iter.next() {
        if byte == b'`' {
            // A backtick must be followed by another backtick or two hex digits
            let escaped = iter.next().ok_or(DecodeError::UnexpectedEnd)?;
            match escaped {
                b'`' => {
                    output.push(b'`');
                    written += 1;
                }
                high => {
                    let low = iter.next().ok_or(DecodeError::UnexpectedEnd)?;
                    let byte = hex_bytes_to_byte(*high, *low)?;
                    output.push(byte);
                    written += 1;
                }
            }
        } else if requires_escape(byte) {
            // A byte that should have been escaped appeared verbatim
            return Err(DecodeError::InvalidByte(byte));
        } else {
            output.push(byte);
            written += 1;
        }
    }

    Ok(written)
}
398
/// Map a nibble (0-15) to the ASCII code of its uppercase hex digit.
#[inline]
const fn nibble_to_hex(n: u8) -> u8 {
    match n {
        // 0-9 map onto '0'-'9'
        0..=9 => b'0' + n,
        // 10-15 map onto 'A'-'F'
        _ => b'A' + (n - 10),
    }
}
406
407/// Convert a byte to its two-character uppercase hex representation.
408#[inline]
409const fn byte_to_hex_bytes(byte: u8) -> [u8; 2] {
410 [nibble_to_hex(byte >> 4), nibble_to_hex(byte & 0x0F)]
411}
412
413const fn byte_to_hex_chars(byte: u8) -> [char; 2] {
414 let [high_byte, low_byte] = byte_to_hex_bytes(byte);
415 [high_byte as char, low_byte as char]
416}
417
418/// Decode two hex ASCII characters into a single byte.
419///
420/// Returns an error if:
421/// - Either character is not a valid hex digit (`InvalidHex`)
422/// - Either character is lowercase a-f (`LowercaseHex`)
423/// - The decoded byte doesn't require escaping (`UnexpectedEscape`)
424#[inline]
425const fn hex_bytes_to_byte(high: u8, low: u8) -> Result<u8, DecodeError> {
426 let high_value = HEX_NIBBLE_DECODE_TABLE[high as usize];
427 let low_value = HEX_NIBBLE_DECODE_TABLE[low as usize];
428
429 match (high_value, low_value) {
430 // Both valid hex digits (0x00-0x0F)
431 (0..=0x0F, 0..=0x0F) => {
432 let byte = (high_value << 4) | low_value;
433
434 if byte == b'`' || !requires_escape(byte) {
435 return Err(DecodeError::UnexpectedEscape(
436 EscapedHex(high, low),
437 byte as char,
438 ));
439 }
440
441 Ok(byte)
442 }
443 // At least one invalid character
444 (HEX_NIBBLE_DECODE_INVALID_ERR, _) | (_, HEX_NIBBLE_DECODE_INVALID_ERR) => {
445 Err(DecodeError::InvalidHex(EscapedHex(high, low)))
446 }
447 // Must be lowercase
448 _ => Err(DecodeError::LowercaseHex(EscapedHex(high, low))),
449 }
450}
451
/// Convert bytes that the caller knows to be ASCII into a `&str`.
///
/// This is the variant used when the `safe` feature is enabled: the bytes
/// are fully validated, and the function panics if they are not valid UTF-8.
#[cfg(feature = "safe")]
fn from_utf8_unchecked_potentially_unsafe(bytes: &[u8]) -> &str {
    let validated = core::str::from_utf8(bytes);
    validated.unwrap()
}

/// Convert bytes that the caller knows to be ASCII into a `&str`.
///
/// This is the variant used when the `safe` feature is disabled: the bytes
/// are not validated, apart from an ASCII check in debug builds.
#[cfg(not(feature = "safe"))]
fn from_utf8_unchecked_potentially_unsafe(bytes: &[u8]) -> &str {
    debug_assert!(bytes.is_ascii());

    // SAFETY: Callers only pass bytes they have already established to be
    // ASCII, and ASCII is always valid UTF-8
    unsafe { core::str::from_utf8_unchecked(bytes) }
}
462
463/// An error trying to decode a tick-encoded string.
464#[derive(Debug)]
465#[cfg_attr(feature = "std", derive(thiserror::Error))]
466pub enum DecodeError {
467 /// Encountered an invalid byte in the string. This could either by a
468 /// non-ASCII byte or an ASCII byte that requires escaping (see
469 /// [`requires_escape`]).
470 #[cfg_attr(feature = "std", error("invalid encoded byte 0x{0:02x}"))]
471 InvalidByte(u8),
472 /// Reached the end of the string following a backtick (\`). A backtick
473 /// must be followed by either another backtick or a 2-digit hex value.
474 #[cfg_attr(feature = "std", error("unexpected end after `"))]
475 UnexpectedEnd,
476 /// Tried to decode a 2-digit hex value, but the value does not require
477 /// escaping (see [`requires_escape`]).
478 #[cfg_attr(feature = "std", error("unexpected escape {0}, expected {1}"))]
479 UnexpectedEscape(EscapedHex, char),
480 /// Tried to decode a 2-digit hex value, but the hex value contained
481 /// the values `[a-f]`. Escaped hex values must use `[A-F]`.
482 #[cfg_attr(feature = "std", error("expected uppercase hex sequence, found {0}"))]
483 LowercaseHex(EscapedHex),
484 /// Tried to decode a 2-digit hex value, but an invalid hex digit
485 /// was found. Escaped hex values must use the characters `[0-9A-F]`.
486 #[cfg_attr(feature = "std", error("invalid hex sequence {0}"))]
487 InvalidHex(EscapedHex),
488}
489
490/// A two-digit escaped hex sequence, prefixed with a backtick.
491pub struct EscapedHex(pub u8, pub u8);
492
493impl core::fmt::Debug for EscapedHex {
494 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
495 let Self(high, low) = self;
496 if requires_escape(*high) || requires_escape(*low) {
497 f.debug_tuple("EscapedHex")
498 .field(&self.0)
499 .field(&self.1)
500 .finish()
501 } else {
502 f.debug_tuple("EscapedHex")
503 .field(&(*high as char))
504 .field(&(*low as char))
505 .finish()
506 }
507 }
508}
509
510impl core::fmt::Display for EscapedHex {
511 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
512 let Self(high, low) = self;
513 if requires_escape(*high) || requires_escape(*low) {
514 write!(f, "0x{high:02X} 0x{low:02X}")
515 } else {
516 write!(f, "`{}{}", *high as char, *low as char)
517 }
518 }
519}