typst_library/foundations/
bytes.rs

1use std::any::Any;
2use std::fmt::{self, Debug, Formatter};
3use std::hash::{Hash, Hasher};
4use std::ops::{Add, AddAssign, Deref};
5use std::str::Utf8Error;
6use std::sync::Arc;
7
8use ecow::{EcoString, eco_format};
9use serde::{Serialize, Serializer};
10use typst_syntax::Lines;
11use typst_utils::LazyHash;
12
13use crate::diag::{StrResult, bail};
14use crate::foundations::{Array, Reflect, Repr, Str, Value, cast, func, scope, ty};
15
/// A sequence of bytes.
///
/// This is conceptually similar to an array of [integers]($int) between `{0}`
/// and `{255}`, but represented much more efficiently. You can iterate over it
/// using a [for loop]($scripting/#loops).
///
/// You can convert
/// - a [string]($str) or an [array] of integers to bytes with the [`bytes`]
///   constructor
/// - bytes to a string with the [`str`] constructor, with UTF-8 encoding
/// - bytes to an array of integers with the [`array`] constructor
///
/// When [reading]($read) data from a file, you can decide whether to load it
/// as a string or as raw bytes.
///
/// ```example
/// #bytes((123, 160, 22, 0)) \
/// #bytes("Hello 😃")
///
/// #let data = read(
///   "rhino.png",
///   encoding: none,
/// )
///
/// // Magic bytes.
/// #array(data.slice(0, 4)) \
/// #str(data.slice(1, 4))
/// ```
#[ty(scope, cast)]
#[derive(Clone, Hash)]
// `PartialEq` is written by hand further down, but it only delegates to the
// inner `Arc`, which is the same single field the derived `Hash` covers.
#[allow(clippy::derived_hash_with_manual_eq)]
pub struct Bytes(Arc<LazyHash<dyn Bytelike>>);
48
49impl Bytes {
50    /// Create `Bytes` from anything byte-like.
51    ///
52    /// The `data` type will directly back this bytes object. This means you can
53    /// e.g. pass `&'static [u8]` or `[u8; 8]` and no extra vector will be
54    /// allocated.
55    ///
56    /// If the type is `Vec<u8>` and the `Bytes` are unique (i.e. not cloned),
57    /// the vector will be reused when mutating to the `Bytes`.
58    ///
59    /// If your source type is a string, prefer [`Bytes::from_string`] to
60    /// directly use the UTF-8 encoded string data without any copying.
61    pub fn new<T>(data: T) -> Self
62    where
63        T: AsRef<[u8]> + Send + Sync + 'static,
64    {
65        Self(Arc::new(LazyHash::new(data)))
66    }
67
68    /// Create `Bytes` from anything string-like, implicitly viewing the UTF-8
69    /// representation.
70    ///
71    /// The `data` type will directly back this bytes object. This means you can
72    /// e.g. pass `String` or `EcoString` without any copying.
73    pub fn from_string<T>(data: T) -> Self
74    where
75        T: AsRef<str> + Send + Sync + 'static,
76    {
77        Self(Arc::new(LazyHash::new(StrWrapper(data))))
78    }
79
80    /// Return `true` if the length is 0.
81    pub fn is_empty(&self) -> bool {
82        self.as_slice().is_empty()
83    }
84
85    /// Return a view into the bytes.
86    pub fn as_slice(&self) -> &[u8] {
87        self
88    }
89
90    /// Try to view the bytes as an UTF-8 string.
91    ///
92    /// If these bytes were created via `Bytes::from_string`, UTF-8 validation
93    /// is skipped.
94    pub fn as_str(&self) -> Result<&str, Utf8Error> {
95        self.inner().as_str()
96    }
97
98    /// Return a copy of the bytes as a vector.
99    pub fn to_vec(&self) -> Vec<u8> {
100        self.as_slice().to_vec()
101    }
102
103    /// Try to turn the bytes into a `Str`.
104    ///
105    /// - If these bytes were created via `Bytes::from_string::<Str>`, the
106    ///   string is cloned directly.
107    /// - If these bytes were created via `Bytes::from_string`, but from a
108    ///   different type of string, UTF-8 validation is still skipped.
109    pub fn to_str(&self) -> Result<Str, Utf8Error> {
110        match self.inner().as_any().downcast_ref::<Str>() {
111            Some(string) => Ok(string.clone()),
112            None => self.as_str().map(Into::into),
113        }
114    }
115
116    /// Resolve an index or throw an out of bounds error.
117    fn locate(&self, index: i64) -> StrResult<usize> {
118        self.locate_opt(index).ok_or_else(|| out_of_bounds(index, self.len()))
119    }
120
121    /// Resolve an index, if it is within bounds.
122    ///
123    /// `index == len` is considered in bounds.
124    fn locate_opt(&self, index: i64) -> Option<usize> {
125        let len = self.as_slice().len();
126        let wrapped =
127            if index >= 0 { Some(index) } else { (len as i64).checked_add(index) };
128        wrapped.and_then(|v| usize::try_from(v).ok()).filter(|&v| v <= len)
129    }
130
131    /// Access the inner `dyn Bytelike`.
132    fn inner(&self) -> &dyn Bytelike {
133        &**self.0
134    }
135}
136
137#[scope]
138impl Bytes {
139    /// Converts a value to bytes.
140    ///
141    /// - Strings are encoded in UTF-8.
142    /// - Arrays of integers between `{0}` and `{255}` are converted directly. The
143    ///   dedicated byte representation is much more efficient than the array
144    ///   representation and thus typically used for large byte buffers (e.g. image
145    ///   data).
146    ///
147    /// ```example
148    /// #bytes("Hello 😃") \
149    /// #bytes((123, 160, 22, 0))
150    /// ```
151    #[func(constructor)]
152    pub fn construct(
153        /// The value that should be converted to bytes.
154        value: ToBytes,
155    ) -> Bytes {
156        value.0
157    }
158
159    /// The length in bytes.
160    #[func(title = "Length")]
161    pub fn len(&self) -> usize {
162        self.as_slice().len()
163    }
164
165    /// Returns the byte at the specified index. Returns the default value if
166    /// the index is out of bounds or fails with an error if no default value
167    /// was specified.
168    #[func]
169    pub fn at(
170        &self,
171        /// The index at which to retrieve the byte.
172        index: i64,
173        /// A default value to return if the index is out of bounds.
174        #[named]
175        default: Option<Value>,
176    ) -> StrResult<Value> {
177        self.locate_opt(index)
178            .and_then(|i| self.as_slice().get(i).map(|&b| Value::Int(b.into())))
179            .or(default)
180            .ok_or_else(|| out_of_bounds_no_default(index, self.len()))
181    }
182
183    /// Extracts a subslice of the bytes. Fails with an error if the start or
184    /// end index is out of bounds.
185    #[func]
186    pub fn slice(
187        &self,
188        /// The start index (inclusive).
189        start: i64,
190        /// The end index (exclusive). If omitted, the whole slice until the end
191        /// is extracted.
192        #[default]
193        end: Option<i64>,
194        /// The number of items to extract. This is equivalent to passing
195        /// `start + count` as the `end` position. Mutually exclusive with
196        /// `end`.
197        #[named]
198        count: Option<i64>,
199    ) -> StrResult<Bytes> {
200        let start = self.locate(start)?;
201        let end = end.or(count.map(|c| start as i64 + c));
202        let end = self.locate(end.unwrap_or(self.len() as i64))?.max(start);
203        let slice = &self.as_slice()[start..end];
204
205        // We could hold a view into the original bytes here instead of
206        // making a copy, but it's unclear when that's worth it. Java
207        // originally did that for strings, but went back on it because a
208        // very small view into a very large buffer would be a sort of
209        // memory leak.
210        Ok(Bytes::new(slice.to_vec()))
211    }
212}
213
214impl Debug for Bytes {
215    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
216        write!(f, "Bytes({})", self.len())
217    }
218}
219
220impl Repr for Bytes {
221    fn repr(&self) -> EcoString {
222        eco_format!("bytes({})", self.len())
223    }
224}
225
226impl Deref for Bytes {
227    type Target = [u8];
228
229    fn deref(&self) -> &Self::Target {
230        self.inner().as_bytes()
231    }
232}
233
// Marker impl: declares the hand-written `PartialEq` for `Bytes` to be a full
// equivalence relation.
impl Eq for Bytes {}
235
236impl PartialEq for Bytes {
237    fn eq(&self, other: &Self) -> bool {
238        self.0.eq(&other.0)
239    }
240}
241
242impl AsRef<[u8]> for Bytes {
243    fn as_ref(&self) -> &[u8] {
244        self
245    }
246}
247
248impl Add for Bytes {
249    type Output = Self;
250
251    fn add(mut self, rhs: Self) -> Self::Output {
252        self += rhs;
253        self
254    }
255}
256
257impl AddAssign for Bytes {
258    fn add_assign(&mut self, rhs: Self) {
259        if rhs.is_empty() {
260            // Nothing to do
261        } else if self.is_empty() {
262            *self = rhs;
263        } else if let Some(vec) = Arc::get_mut(&mut self.0)
264            .and_then(|unique| unique.as_any_mut().downcast_mut::<Vec<u8>>())
265        {
266            vec.extend_from_slice(&rhs);
267        } else {
268            *self = Self::new([self.as_slice(), rhs.as_slice()].concat());
269        }
270    }
271}
272
273impl Serialize for Bytes {
274    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
275    where
276        S: Serializer,
277    {
278        if serializer.is_human_readable() {
279            serializer.serialize_str(&self.repr())
280        } else {
281            serializer.serialize_bytes(self)
282        }
283    }
284}
285
286impl TryFrom<&Bytes> for Lines<String> {
287    type Error = Utf8Error;
288
289    #[comemo::memoize]
290    fn try_from(value: &Bytes) -> Result<Lines<String>, Utf8Error> {
291        let text = value.as_str()?;
292        Ok(Lines::new(text.to_string()))
293    }
294}
295
/// Any type that can back a byte buffer.
trait Bytelike: Send + Sync {
    /// Borrows the raw bytes.
    fn as_bytes(&self) -> &[u8];
    /// Views the bytes as UTF-8 text, or reports why that is not possible.
    fn as_str(&self) -> Result<&str, Utf8Error>;
    /// Upcasts to `Any` so that callers can downcast to the concrete backing
    /// type.
    fn as_any(&self) -> &dyn Any;
    /// Mutable counterpart of `as_any`.
    fn as_any_mut(&mut self) -> &mut dyn Any;
}
303
304impl<T> Bytelike for T
305where
306    T: AsRef<[u8]> + Send + Sync + 'static,
307{
308    fn as_bytes(&self) -> &[u8] {
309        self.as_ref()
310    }
311
312    fn as_str(&self) -> Result<&str, Utf8Error> {
313        std::str::from_utf8(self.as_ref())
314    }
315
316    fn as_any(&self) -> &dyn Any {
317        self
318    }
319
320    fn as_any_mut(&mut self) -> &mut dyn Any {
321        self
322    }
323}
324
325impl Hash for dyn Bytelike {
326    fn hash<H: Hasher>(&self, state: &mut H) {
327        self.as_bytes().hash(state);
328    }
329}
330
/// Makes string-like objects usable with `Bytes`.
///
/// The wrapped value is known to be a string, which lets the `Bytelike`
/// implementation hand out `&str` without re-validating UTF-8.
struct StrWrapper<T>(T);
333
334impl<T> Bytelike for StrWrapper<T>
335where
336    T: AsRef<str> + Send + Sync + 'static,
337{
338    fn as_bytes(&self) -> &[u8] {
339        self.0.as_ref().as_bytes()
340    }
341
342    fn as_str(&self) -> Result<&str, Utf8Error> {
343        Ok(self.0.as_ref())
344    }
345
346    fn as_any(&self) -> &dyn Any {
347        self
348    }
349
350    fn as_any_mut(&mut self) -> &mut dyn Any {
351        self
352    }
353}
354
/// A value that can be cast to bytes.
///
/// Wraps the resulting `Bytes`; the accepted input types are listed in the
/// accompanying `cast!` block.
pub struct ToBytes(Bytes);
357
cast! {
    ToBytes,
    // Strings back the bytes directly via `Bytes::from_string`.
    v: Str => Self(Bytes::from_string(v)),
    // Arrays must consist solely of integers in `0..=255`. The first item that
    // is out of range or not an integer aborts the conversion with an error.
    v: Array => Self(v.iter()
        .map(|item| match item {
            Value::Int(byte @ 0..=255) => Ok(*byte as u8),
            Value::Int(_) => bail!("number must be between 0 and 255"),
            value => Err(<u8 as Reflect>::error(value)),
        })
        .collect::<Result<Vec<u8>, _>>()
        .map(Bytes::new)?
    ),
    // Existing bytes are passed through unchanged.
    v: Bytes => Self(v),
}
372
/// The out of bounds access error message.
///
/// Reports both the offending `index` and the buffer's `len`. Marked `#[cold]`
/// because it is only called on error paths.
#[cold]
fn out_of_bounds(index: i64, len: usize) -> EcoString {
    eco_format!("byte index out of bounds (index: {index}, len: {len})")
}
378
/// The out of bounds access error message when no default value was given.
///
/// Reports both the offending `index` and the buffer's `len`. Marked `#[cold]`
/// because it is only called on error paths.
#[cold]
fn out_of_bounds_no_default(index: i64, len: usize) -> EcoString {
    eco_format!(
        // The trailing `\` joins both lines into a single-line message.
        "byte index out of bounds (index: {index}, len: {len}) \
         and no default value was specified",
    )
}