tree_buf/internal/types/
string.rs

1use crate::internal::encodings::varint::*;
2use crate::prelude::*;
3use rle::RLE;
4use std::borrow::Borrow;
5use std::vec::IntoIter;
6
// TODO: Consider compressed Unicode (SCSU?) for String in general,
// but in particular for schema strings. Since schema strings need only
// be compared, and are usually not displayed, we can do bit-for-bit comparisons.
// (Make sure that's true for SCSU, which may allow multiple encodings!)
11
/// Writes `value` into `stream` as a length (via `encode_usize`) followed
/// directly by the string's raw UTF-8 bytes.
#[cfg(feature = "encode")]
pub fn encode_str<O: EncodeOptions>(value: &str, stream: &mut EncoderStream<'_, O>) {
    let utf8 = value.as_bytes();
    // The length written is the byte length, not the number of chars.
    encode_usize(utf8.len(), stream);
    stream.bytes.extend_from_slice(utf8);
}
17
/// Obtains `len` bytes from `bytes` through `decode_bytes` and validates them
/// as UTF-8, returning a `&str` that borrows from the input buffer.
///
/// Errors from `decode_bytes` and from UTF-8 validation are both propagated
/// with `?`.
#[cfg(feature = "decode")]
fn decode_str_len<'a>(len: usize, bytes: &'a [u8], offset: &'_ mut usize) -> DecodeResult<&'a str> {
    let raw = decode_bytes(len, bytes, offset)?;
    let text = std::str::from_utf8(raw)?;
    Ok(text)
}
23
/// Decodes one length-prefixed string: a prefix varint holding the byte
/// length, then that many bytes handed to `decode_str_len`.
#[cfg(feature = "decode")]
pub fn decode_str<'a>(bytes: &'a [u8], offset: &'_ mut usize) -> DecodeResult<&'a str> {
    let byte_count = decode_prefix_varint(bytes, offset)?;
    // NOTE(review): `as usize` would truncate a length wider than the
    // platform's pointer size — confirm whether that needs an explicit error.
    decode_str_len(byte_count as usize, bytes, offset)
}
29
#[cfg(feature = "encode")]
impl Encodable for String {
    type EncoderArray = Vec<&'static String>;

    /// Encodes this string as a root value and returns the type id describing
    /// how it was written.
    ///
    /// Strings of 0 to 3 bytes get a dedicated id (`Str0`..`Str3`) and are
    /// written with no length prefix. Anything longer is tagged `Str` and
    /// written as a prefix varint length followed by the bytes.
    fn encode_root<O: EncodeOptions>(&self, stream: &mut EncoderStream<'_, O>) -> RootTypeId {
        let utf8 = self.as_bytes();

        // Pick the type id; only the general case needs an explicit length.
        let type_id = match utf8.len() {
            0 => RootTypeId::Str0,
            1 => RootTypeId::Str1,
            2 => RootTypeId::Str2,
            3 => RootTypeId::Str3,
            _ => {
                encode_prefix_varint(utf8.len() as u64, stream.bytes);
                RootTypeId::Str
            }
        };

        // Every case ends with the raw bytes; for the empty string this
        // appends nothing.
        stream.bytes.extend_from_slice(utf8);
        type_id
    }
}
58
#[cfg(feature = "encode")]
impl EncoderArray<String> for Vec<&'static String> {
    /// Remembers a reference to `value` so it can be written out by `flush`.
    fn buffer_one<'a, 'b: 'a>(&'a mut self, value: &'b String) {
        // TODO: Working around lifetime issues for lack of GAT.
        // A quick check makes this appear to be sound, since the signature
        // requires that the value outlive self.
        //
        // The big safety problem is that we then give these references
        // away when flushing. We happen to know that nothing saves the
        // references, but when things like threading come into play it's
        // hard to know.
        //
        // TODO: Use extend_lifetime crate
        //
        // SAFETY (see caveats above): only the lifetime is changed; the
        // stored reference must not be used after `'b` ends.
        let extended = unsafe { std::mem::transmute::<&'b String, &'static String>(value) };
        self.push(extended);
    }

    /// Writes all buffered strings to `stream`, letting `compress` choose
    /// among the plain UTF-8, run-length and dictionary compressors.
    fn flush<O: EncodeOptions>(self, stream: &mut EncoderStream<'_, O>) -> ArrayTypeId {
        profile_method!(flush);

        let plain = Utf8Compressor;
        let run_length = RLE::new((Utf8Compressor,));
        let dictionary = Dictionary::new((Utf8Compressor,));
        let candidates = (plain, run_length, dictionary);

        compress(self.as_slice(), stream, &candidates)
    }
}
82
#[cfg(feature = "decode")]
impl Decodable for String {
    // TODO: Use lifetimes to make this decode lazy rather than IntoIter
    type DecoderArray = IntoIter<String>;

    /// Produces an owned `String` from a root branch.
    ///
    /// Only `DynRootBranch::String` is accepted; any other branch yields
    /// `DecodeError::SchemaMismatch`.
    fn decode(sticks: DynRootBranch<'_>, _options: &impl DecodeOptions) -> DecodeResult<Self> {
        profile_method!(decode);
        if let DynRootBranch::String(text) = sticks {
            Ok(text.to_owned())
        } else {
            Err(DecodeError::SchemaMismatch)
        }
    }
}
95
#[cfg(feature = "decode")]
impl InfallibleDecoderArray for IntoIter<String> {
    type Decode = String;

    /// Eagerly decodes an array branch into owned strings and returns an
    /// iterator over them.
    ///
    /// Handles three encodings: plain length-prefixed UTF-8
    /// (`DynArrayBranch::String`), run-length (`RLE`) and `Dictionary`. The
    /// latter two decode their inner values by recursing into this function.
    /// Any other branch is a `DecodeError::SchemaMismatch`.
    fn new_infallible(sticks: DynArrayBranch<'_>, options: &impl DecodeOptions) -> DecodeResult<Self> {
        profile_method!(new_infallible);

        match sticks {
            DynArrayBranch::String(bytes) => {
                profile_section!(str_utf8);

                // Each entry is copied out of the buffer into its own String.
                let owned = decode_all(&bytes, |buf, pos| decode_str(buf, pos).map(|s| s.to_owned()))?;
                Ok(owned.into_iter())
            }
            DynArrayBranch::RLE { runs, values } => {
                let expander = RleIterator::new(runs, values, options, |inner| Self::new_infallible(inner, options))?;
                let expanded: Vec<_> = expander.collect();
                Ok(expanded.into_iter())
            }
            DynArrayBranch::Dictionary { indices, values } => {
                let lookup = DictionaryIterator::new(indices, values, options, |inner| Self::new_infallible(inner, options))?;
                let resolved: Vec<_> = lookup.collect();
                Ok(resolved.into_iter())
            }
            _ => Err(DecodeError::SchemaMismatch),
        }
    }

    /// Returns the next decoded string, or an empty string once the
    /// iterator is exhausted.
    fn decode_next_infallible(&mut self) -> Self::Decode {
        match self.next() {
            Some(text) => text,
            None => String::new(),
        }
    }
}
127
/// Compressor that stores each string as a prefix-varint byte length
/// followed by its UTF-8 bytes, reporting `ArrayTypeId::Utf8`.
#[cfg(feature = "encode")]
pub(crate) struct Utf8Compressor;

// TODO: The Borrow<String> here is interesting. Can we get rid of other lifetimes?
#[cfg(feature = "encode")]
impl<T: Borrow<String>> Compressor<T> for Utf8Compressor {
    /// Computes the size `compress` would produce without encoding anything:
    /// the per-string varint and byte counts, plus a varint for that total.
    fn fast_size_for<O: EncodeOptions>(&self, data: &[T], _options: &O) -> Result<usize, ()> {
        profile_method!(fast_size_for);
        let payload: usize = data
            .iter()
            .map(|item| {
                let text: &String = item.borrow();
                size_for_varint(text.len() as u64) + text.len()
            })
            .sum();
        Ok(payload + size_for_varint(payload as u64))
    }

    /// Writes every string in `data` as `varint(len) ++ bytes`, wrapped by
    /// `encode_with_len`, and returns `ArrayTypeId::Utf8`.
    fn compress<O: EncodeOptions>(&self, data: &[T], stream: &mut EncoderStream<'_, O>) -> Result<ArrayTypeId, ()> {
        profile_method!(compress);

        stream.encode_with_len(|inner| {
            for item in data {
                let text: &String = item.borrow();
                encode_prefix_varint(text.len() as u64, inner.bytes);
                inner.bytes.extend_from_slice(text.as_bytes());
            }
        });

        Ok(ArrayTypeId::Utf8)
    }
}