bonsaidb_core/document/
id.rs

1use std::borrow::Cow;
2use std::fmt::{Display, Write};
3use std::hash::Hash;
4use std::mem::size_of;
5use std::ops::Deref;
6use std::str::FromStr;
7
8use actionable::Identifier;
9use serde::de::Visitor;
10use serde::{Deserialize, Serialize};
11use tinyvec::{Array, TinyVec};
12
13use crate::key::{ByteSource, Key, KeyEncoding, KeyKind, KeyVisitor};
14
15/// The serialized representation of a document's unique ID.
16#[derive(Default, Ord, Hash, Eq, PartialEq, PartialOrd, Clone)]
17pub struct DocumentId(TinyVec<[u8; Self::INLINE_SIZE]>);
18
19impl Deref for DocumentId {
20    type Target = [u8];
21
22    fn deref(&self) -> &[u8] {
23        &self.0
24    }
25}
26
27impl std::fmt::Debug for DocumentId {
28    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
29        f.write_str("DocumentId(")?;
30        arc_bytes::print_bytes(self, f)?;
31        f.write_char(')')
32    }
33}
34
35impl Display for DocumentId {
36    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
37        if let Ok(string) = std::str::from_utf8(self.as_ref()) {
38            if string.bytes().all(|b| (32..=127).contains(&b)) {
39                return f.write_str(string);
40            }
41        }
42
43        if let Some((first_nonzero_byte, _)) = self
44            .as_ref()
45            .iter()
46            .copied()
47            .enumerate()
48            .find(|(_index, b)| *b != 0)
49        {
50            if first_nonzero_byte > 0 {
51                write!(f, "{first_nonzero_byte:x}$")?;
52            } else {
53                f.write_char('$')?;
54            }
55
56            for (index, byte) in self[first_nonzero_byte..].iter().enumerate() {
57                if index > 0 {
58                    write!(f, "{byte:02x}")?;
59                } else {
60                    write!(f, "{byte:x}")?;
61                }
62            }
63            Ok(())
64        } else {
65            // All zeroes
66            write!(f, "{:x}$", self.len())
67        }
68    }
69}
70
71impl<'a> From<DocumentId> for Identifier<'a> {
72    fn from(id: DocumentId) -> Self {
73        Identifier::from(id.to_vec())
74    }
75}
76
77impl<'a> From<&'a DocumentId> for Identifier<'a> {
78    fn from(id: &'a DocumentId) -> Self {
79        Identifier::from(&**id)
80    }
81}
82
#[test]
fn document_id_identifier_tests() {
    // An ID built from a string converts to the same identifier as the string.
    let from_text = Identifier::from(DocumentId::new("hello").unwrap());
    assert_eq!(from_text, Identifier::from("hello"));

    // An ID built from an integer converts to the same identifier as the integer.
    let from_number = Identifier::from(DocumentId::from_u64(1));
    assert_eq!(from_number, Identifier::from(1));
}
94
95/// An invalid hexadecimal character was encountered.
96#[derive(thiserror::Error, Debug)]
97#[error("invalid hexadecimal bytes")]
98pub struct InvalidHexadecimal;
99
100const fn decode_hex_nibble(byte: u8) -> Result<u8, InvalidHexadecimal> {
101    match byte {
102        b'0'..=b'9' => Ok(byte - b'0'),
103        b'A'..=b'F' => Ok(byte - b'A' + 10),
104        b'a'..=b'f' => Ok(byte - b'a' + 10),
105        _ => Err(InvalidHexadecimal),
106    }
107}
108
109impl FromStr for DocumentId {
110    type Err = crate::Error;
111
112    fn from_str(s: &str) -> Result<Self, Self::Err> {
113        if s.is_empty() {
114            return Ok(Self::default());
115        }
116
117        let bytes = s.as_bytes();
118        if let Some((pound_offset, _)) = s.bytes().enumerate().find(|(_index, b)| *b == b'$') {
119            if pound_offset > 5 {
120                return Err(crate::Error::DocumentIdTooLong);
121            }
122
123            let preceding_zeroes = if pound_offset > 0 {
124                let mut length = TinyVec::<[u8; 1]>::new();
125                decode_big_endian_hex(&bytes[0..pound_offset], &mut length)?;
126                let mut zeroes = [0_u8; size_of::<usize>()];
127                let offset = zeroes.len() - length.len();
128                zeroes[offset..].copy_from_slice(&length);
129                usize::from_be_bytes(zeroes)
130            } else {
131                0
132            };
133
134            let mut id = TinyVec::new();
135            decode_big_endian_hex(&bytes[pound_offset + 1..], &mut id)?;
136            if preceding_zeroes > 0 {
137                let total_length = preceding_zeroes + id.len();
138                if total_length > Self::MAX_LENGTH {
139                    return Err(crate::Error::DocumentIdTooLong);
140                }
141                // The full length indicated a longer ID, so we need to prefix some null bytes.
142                id.splice(0..0, std::iter::repeat(0).take(preceding_zeroes));
143            }
144            Ok(Self(id))
145        } else if bytes.len() > Self::MAX_LENGTH {
146            Err(crate::Error::DocumentIdTooLong)
147        } else {
148            // UTF-8 representable
149            Self::try_from(bytes)
150        }
151    }
152}
153
154fn decode_big_endian_hex<A: Array<Item = u8>>(
155    bytes: &[u8],
156    output: &mut TinyVec<A>,
157) -> Result<(), crate::Error> {
158    let mut chunks = if bytes.len() & 1 == 0 {
159        bytes.chunks_exact(2)
160    } else {
161        // Odd amount of bytes, special case the first char
162        output.push(decode_hex_nibble(bytes[0])?);
163        bytes[1..].chunks_exact(2)
164    };
165    for chunk in &mut chunks {
166        let upper = decode_hex_nibble(chunk[0])?;
167        let lower = decode_hex_nibble(chunk[1])?;
168        output.push(upper << 4 | lower);
169    }
170    if !chunks.remainder().is_empty() {
171        return Err(crate::Error::from(InvalidHexadecimal));
172    }
173    Ok(())
174}
175
#[test]
fn document_id_parsing() {
    /// Checks that `bytes` is displayed as `display` and that parsing the
    /// displayed text yields `bytes` again.
    fn assert_round_trip(bytes: &[u8], display: &str) {
        let id = DocumentId::try_from(bytes).unwrap();
        let rendered = id.to_string();
        assert_eq!(rendered, display);
        let reparsed: DocumentId = rendered.parse().unwrap();
        assert_eq!(&*reparsed, bytes);
    }

    assert_round_trip(b"hello", "hello");
    assert_round_trip(b"\x00\x0a\xaf\xfa", "1$aaffa");
    assert_round_trip(&1_u128.to_be_bytes(), "f$1");
    assert_round_trip(&17_u8.to_be_bytes(), "$11");
    assert_round_trip(&[0_u8; 63], "3f$");
    // Exercise the longest possible all-zero ID as well, expressed in terms of
    // MAX_LENGTH so that max-length formatting stays covered whenever the
    // limit changes.
    let longest = vec![0_u8; DocumentId::MAX_LENGTH];
    let longest_display = format!("{:x}$", DocumentId::MAX_LENGTH);
    assert_round_trip(&longest, &longest_display);
}
199
200impl<'a> TryFrom<&'a [u8]> for DocumentId {
201    type Error = crate::Error;
202
203    fn try_from(bytes: &'a [u8]) -> Result<Self, Self::Error> {
204        if bytes.len() <= Self::MAX_LENGTH {
205            Ok(Self(TinyVec::from(bytes)))
206        } else {
207            Err(crate::Error::DocumentIdTooLong)
208        }
209    }
210}
211
212impl<'a> TryFrom<Cow<'a, [u8]>> for DocumentId {
213    type Error = crate::Error;
214
215    fn try_from(bytes: Cow<'a, [u8]>) -> Result<Self, Self::Error> {
216        Self::try_from(bytes.as_ref())
217    }
218}
219
220impl<const N: usize> TryFrom<[u8; N]> for DocumentId {
221    type Error = crate::Error;
222
223    fn try_from(bytes: [u8; N]) -> Result<Self, Self::Error> {
224        Self::try_from(&bytes[..])
225    }
226}
227
228impl DocumentId {
229    const INLINE_SIZE: usize = 16;
230    /// The maximum size able to be stored in a document's unique id.
231    pub const MAX_LENGTH: usize = 65_535;
232
233    /// Returns a new instance with `value` as the identifier..
234    pub fn new<PrimaryKey: for<'k> Key<'k>, PrimaryKeyRef: KeyEncoding<PrimaryKey> + ?Sized>(
235        value: &PrimaryKeyRef,
236    ) -> Result<Self, crate::Error> {
237        let bytes = value
238            .as_ord_bytes()
239            .map_err(|err| crate::Error::other("key serialization", err))?;
240        Self::try_from(&bytes[..])
241    }
242
243    /// Returns a new document ID for a u64. This is equivalent to
244    /// `DocumentId::new(id)`, but since this function accepts a non-generic
245    /// type, it can help with type inference in some expressions.
246    #[must_use]
247    #[allow(clippy::missing_panics_doc)] // Unwrap is impossible to fail.
248    pub fn from_u64(id: u64) -> Self {
249        Self::try_from(&id.to_be_bytes()[..]).unwrap()
250    }
251
252    /// Returns a new document ID for a u32. This is equivalent to
253    /// `DocumentId::new(id)`, but since this function accepts a non-generic
254    /// type, it can help with type inference in some expressions.
255    #[must_use]
256    #[allow(clippy::missing_panics_doc)] // Unwrap is impossible to fail.
257    pub fn from_u32(id: u32) -> Self {
258        Self::try_from(&id.to_be_bytes()[..]).unwrap()
259    }
260
261    /// Returns the contained value, deserialized back to its original type.
262    pub fn deserialize<'k, PrimaryKey: Key<'k>>(&'k self) -> Result<PrimaryKey, crate::Error> {
263        PrimaryKey::from_ord_bytes(ByteSource::Borrowed(self.as_ref()))
264            .map_err(|err| crate::Error::other("key serialization", err))
265    }
266}
267
268impl Serialize for DocumentId {
269    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
270    where
271        S: serde::Serializer,
272    {
273        serializer.serialize_bytes(self.as_ref())
274    }
275}
276
277impl<'de> Deserialize<'de> for DocumentId {
278    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
279    where
280        D: serde::Deserializer<'de>,
281    {
282        deserializer.deserialize_byte_buf(DocumentIdVisitor)
283    }
284}
285
286struct DocumentIdVisitor;
287
288impl<'de> Visitor<'de> for DocumentIdVisitor {
289    type Value = DocumentId;
290
291    fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
292        formatter.write_str("a document id (bytes)")
293    }
294
295    fn visit_bytes<E>(self, v: &[u8]) -> Result<Self::Value, E>
296    where
297        E: serde::de::Error,
298    {
299        Ok(DocumentId(TinyVec::from(v)))
300    }
301}
302
303impl<'k> Key<'k> for DocumentId {
304    const CAN_OWN_BYTES: bool = false;
305
306    fn from_ord_bytes<'e>(bytes: ByteSource<'k, 'e>) -> Result<Self, Self::Error> {
307        Self::try_from(bytes.as_ref())
308    }
309}
310
311impl<PrimaryKey> KeyEncoding<PrimaryKey> for DocumentId
312where
313    PrimaryKey: for<'pk> Key<'pk>,
314{
315    type Error = crate::Error;
316
317    const LENGTH: Option<usize> = None;
318
319    fn describe<Visitor>(visitor: &mut Visitor)
320    where
321        Visitor: KeyVisitor,
322    {
323        visitor.visit_type(KeyKind::Bytes);
324    }
325
326    fn as_ord_bytes(&self) -> Result<Cow<'_, [u8]>, Self::Error> {
327        Ok(Cow::Borrowed(self))
328    }
329}