utf8_builder/
lib.rs

1/*!
2# UTF-8 Builder
3
Build and validate UTF-8 data from chunks. A chunk does not have to be complete UTF-8 data on its own; a multi-byte character may be split across chunks.
5
6## Motives and Examples
7
When a Rust program reads UTF-8 data, it can store all of the data in memory and then use `String::from_utf8(vec)` to validate it and convert it into a `String` instance.
9
However, it is better to perform UTF-8 validation while fetching the data and storing it in memory. That way, if the data is not valid UTF-8, we do not waste memory and time storing all of it.
11
12```rust
13use utf8_builder::Utf8Builder;
14
15const TEXT1: &str = "is is English.";
16const TEXT2: &str = "這是中文。";
17
18let mut builder = Utf8Builder::new();
19
20builder.push(b'T').unwrap();
21builder.push_char('h').unwrap();
22builder.push_str(TEXT1).unwrap();
23builder.push_chunk(TEXT2.as_bytes()).unwrap();
24
25let result = builder.finalize().unwrap();
26
27assert_eq!(format!("Th{}{}", TEXT1, TEXT2), result);
28```
29
30## No Std
31
32Disable the default features to compile this crate without std.
33
34```toml
35[dependencies.utf8-builder]
36version = "*"
37default-features = false
38```
39*/
40
41#![cfg_attr(not(feature = "std"), no_std)]
42
43extern crate alloc;
44
45mod error;
46
47use core::cmp::Ordering;
48
49use alloc::string::String;
50use alloc::vec::Vec;
51
52pub use error::Utf8Error;
53
/// An incremental builder that collects UTF-8 data pushed in pieces and
/// checks it while it is being collected.
#[derive(Clone, Debug, Default)]
pub struct Utf8Builder {
    /// The bytes accepted so far.
    buffer: Vec<u8>,
    /// Number of bytes already stored for the character that is still
    /// incomplete; `0` when no character is pending.
    sl: u8,
    /// Total number of bytes the incomplete character is expected to occupy.
    /// Only meaningful while `sl` is non-zero (it is not reset afterwards).
    sel: u8,
}
63
64impl Utf8Builder {
65    /// Constructs a new, empty `Utf8Builder`.
66    #[inline]
67    pub const fn new() -> Self {
68        Utf8Builder {
69            buffer: Vec::new(),
70            sl: 0,
71            sel: 0,
72        }
73    }
74
75    /// Constructs a new, empty `with_capacity` with a specific capacity.
76    #[inline]
77    pub fn with_capacity(capacity: usize) -> Self {
78        Utf8Builder {
79            buffer: Vec::with_capacity(capacity),
80            sl: 0,
81            sel: 0,
82        }
83    }
84
85    /// Reserves capacity for at least `additional` more elements to be inserted in the given `Utf8Builder`.
86    #[inline]
87    pub fn reserve(&mut self, additional: usize) {
88        self.buffer.reserve(additional);
89    }
90
91    /// Returns the number of elements in the buffer.
92    #[inline]
93    pub fn len(&self) -> usize {
94        self.buffer.len()
95    }
96
97    /// Returns `true` if the builder contains no data.
98    #[inline]
99    pub fn is_empty(&self) -> bool {
100        self.buffer.is_empty()
101    }
102}
103
104impl Utf8Builder {
105    /// Returns whether the current data are valid UTF-8
106    #[inline]
107    pub fn is_valid(&self) -> bool {
108        self.sl == 0
109    }
110
111    /// Try to get the `String` instance.
112    #[inline]
113    pub fn finalize(self) -> Result<String, Utf8Error> {
114        if self.is_valid() {
115            let s = unsafe { String::from_utf8_unchecked(self.buffer) };
116
117            Ok(s)
118        } else {
119            Err(Utf8Error)
120        }
121    }
122}
123
124impl Utf8Builder {
125    /// Pushes a byte.
126    pub fn push(&mut self, b: u8) -> Result<(), Utf8Error> {
127        if self.sl == 0 {
128            let w = utf8_width::get_width(b);
129
130            match w {
131                0 => return Err(Utf8Error),
132                1 => {
133                    self.buffer.push(b);
134                }
135                _ => {
136                    self.buffer.push(b);
137                    self.sl = 1;
138                    self.sel = w as u8;
139                }
140            }
141        } else if self.sl + 1 == self.sel {
142            self.buffer.push(b);
143
144            self.sl = 0;
145            // self.sel = 0; // no need
146        } else {
147            self.buffer.push(b);
148
149            self.sl += 1;
150        }
151
152        Ok(())
153    }
154
155    /// Pushes a `&str`.
156    #[inline]
157    pub fn push_str(&mut self, s: &str) -> Result<(), Utf8Error> {
158        if self.sl == 0 {
159            self.buffer.extend_from_slice(s.as_bytes());
160
161            Ok(())
162        } else {
163            Err(Utf8Error)
164        }
165    }
166
167    /// Pushes a char.
168    pub fn push_char(&mut self, c: char) -> Result<(), Utf8Error> {
169        if self.sl == 0 {
170            self.buffer.reserve(4);
171
172            let len = self.buffer.len();
173
174            unsafe {
175                self.buffer.set_len(len + 4);
176            }
177
178            let c = c.encode_utf8(&mut self.buffer[len..]).len();
179
180            unsafe {
181                self.buffer.set_len(len + c);
182            }
183
184            Ok(())
185        } else {
186            Err(Utf8Error)
187        }
188    }
189
190    /// Pushes a chunk.
191    pub fn push_chunk(&mut self, chunk: &[u8]) -> Result<(), Utf8Error> {
192        let chunk_size = chunk.len();
193
194        if chunk_size == 0 {
195            return Ok(());
196        }
197
198        let mut e = if self.sl > 0 {
199            let r = (self.sel - self.sl) as usize;
200
201            match r.cmp(&chunk_size) {
202                Ordering::Greater => {
203                    let sl = self.sl as usize;
204                    let nsl = sl + chunk_size;
205
206                    self.buffer.extend_from_slice(chunk);
207
208                    self.sl = nsl as u8;
209
210                    return Ok(());
211                }
212                Ordering::Equal => {
213                    self.buffer.extend_from_slice(chunk);
214
215                    self.sl = 0;
216                    // self.sel = 0; // no need
217
218                    return Ok(());
219                }
220                Ordering::Less => {
221                    self.buffer.extend_from_slice(&chunk[..r]);
222
223                    self.sl = 0;
224                    // self.sel = 0; // no need
225
226                    r
227                }
228            }
229        } else {
230            0usize
231        };
232
233        loop {
234            let w = utf8_width::get_width(chunk[e]);
235
236            if w == 0 {
237                return Err(Utf8Error);
238            }
239
240            let r = chunk_size - e;
241
242            if r >= w {
243                self.buffer.extend_from_slice(&chunk[e..e + w]);
244
245                e += w;
246
247                if e == chunk_size {
248                    break;
249                }
250            } else {
251                self.buffer.extend_from_slice(&chunk[e..]);
252
253                self.sl = r as u8;
254                self.sel = w as u8;
255
256                break;
257            }
258        }
259
260        Ok(())
261    }
262}
263
264impl From<&str> for Utf8Builder {
265    #[inline]
266    fn from(s: &str) -> Self {
267        Utf8Builder {
268            buffer: s.as_bytes().to_vec(),
269            sl: 0,
270            sel: 0,
271        }
272    }
273}
274
275impl From<String> for Utf8Builder {
276    #[inline]
277    fn from(s: String) -> Self {
278        Utf8Builder {
279            buffer: s.into_bytes(),
280            sl: 0,
281            sel: 0,
282        }
283    }
284}