reifydb_core/util/encoding/binary.rs
1// Copyright (c) reifydb.com 2025
2// This file is licensed under the AGPL-3.0-or-later, see license.md file
3
4// This file includes and modifies code from the toydb project (https://github.com/erikgrinaker/toydb),
5// originally licensed under the Apache License, Version 2.0.
6// Original copyright:
7// Copyright (c) 2024 Erik Grinaker
8//
9// The original Apache License can be found at:
10// http://www.apache.org/licenses/LICENSE-2.0
11
12use crate::util::CowVec;
13
14/// Decodes a raw byte vector from a Unicode string. Code points in the
15/// range U+0080 to U+00FF are converted back to bytes 0x80 to 0xff.
16/// This allows using e.g. \xff in the input string literal, and getting
17/// back a 0xff byte in the byte vector. Otherwise, char(0xff) yields
18/// the UTF-8 bytes 0xc3bf, which is the U+00FF code point as UTF-8.
19/// These characters are effectively represented as ISO-8859-1 rather
20/// than UTF-8, but it allows precise use of the entire u8 value range.
21pub fn decode_binary(s: &str) -> CowVec<u8> {
22 let mut buf = [0; 4];
23 let mut bytes = Vec::new();
24 for c in s.chars() {
25 // u32 is the Unicode code point, not the UTF-8 encoding.
26 match c as u32 {
27 b @ 0x80..=0xff => bytes.push(b as u8),
28 _ => bytes.extend(c.encode_utf8(&mut buf).as_bytes()),
29 }
30 }
31 CowVec::new(bytes)
32}