typst_library/foundations/
bytes.rs

1use std::any::Any;
2use std::fmt::{self, Debug, Formatter};
3use std::hash::{Hash, Hasher};
4use std::ops::{Add, AddAssign, Deref};
5use std::str::Utf8Error;
6use std::sync::Arc;
7
8use ecow::{eco_format, EcoString};
9use serde::{Serialize, Serializer};
10use typst_utils::LazyHash;
11
12use crate::diag::{bail, StrResult};
13use crate::foundations::{cast, func, scope, ty, Array, Reflect, Repr, Str, Value};
14
/// A sequence of bytes.
///
/// This is conceptually similar to an array of [integers]($int) between `{0}`
/// and `{255}`, but represented much more efficiently. You can iterate over it
/// using a [for loop]($scripting/#loops).
///
/// You can convert
/// - a [string]($str) or an [array] of integers to bytes with the [`bytes`]
///   constructor
/// - bytes to a string with the [`str`] constructor, with UTF-8 encoding
/// - bytes to an array of integers with the [`array`] constructor
///
/// When [reading]($read) data from a file, you can decide whether to load it
/// as a string or as raw bytes.
///
/// ```example
/// #bytes((123, 160, 22, 0)) \
/// #bytes("Hello 😃")
///
/// #let data = read(
///   "rhino.png",
///   encoding: none,
/// )
///
/// // Magic bytes.
/// #array(data.slice(0, 4)) \
/// #str(data.slice(1, 4))
/// ```
#[ty(scope, cast)]
#[derive(Clone, Hash)]
// `Hash` is derived while `PartialEq` is written by hand elsewhere in this
// file, which is exactly what this lint flags. Both go through the same inner
// field. NOTE(review): their agreement relies on `LazyHash`'s own `Hash` and
// `PartialEq` impls being consistent with each other -- confirm in
// `typst_utils`.
#[allow(clippy::derived_hash_with_manual_eq)]
// The backing data is type-erased (`dyn Bytelike`) so that any byte-like or
// string-like value can back the buffer, and it sits behind an `Arc`, so
// cloning `Bytes` does not copy the data.
pub struct Bytes(Arc<LazyHash<dyn Bytelike>>);
47
48impl Bytes {
49    /// Create `Bytes` from anything byte-like.
50    ///
51    /// The `data` type will directly back this bytes object. This means you can
52    /// e.g. pass `&'static [u8]` or `[u8; 8]` and no extra vector will be
53    /// allocated.
54    ///
55    /// If the type is `Vec<u8>` and the `Bytes` are unique (i.e. not cloned),
56    /// the vector will be reused when mutating to the `Bytes`.
57    ///
58    /// If your source type is a string, prefer [`Bytes::from_string`] to
59    /// directly use the UTF-8 encoded string data without any copying.
60    pub fn new<T>(data: T) -> Self
61    where
62        T: AsRef<[u8]> + Send + Sync + 'static,
63    {
64        Self(Arc::new(LazyHash::new(data)))
65    }
66
67    /// Create `Bytes` from anything string-like, implicitly viewing the UTF-8
68    /// representation.
69    ///
70    /// The `data` type will directly back this bytes object. This means you can
71    /// e.g. pass `String` or `EcoString` without any copying.
72    pub fn from_string<T>(data: T) -> Self
73    where
74        T: AsRef<str> + Send + Sync + 'static,
75    {
76        Self(Arc::new(LazyHash::new(StrWrapper(data))))
77    }
78
79    /// Return `true` if the length is 0.
80    pub fn is_empty(&self) -> bool {
81        self.as_slice().is_empty()
82    }
83
84    /// Return a view into the bytes.
85    pub fn as_slice(&self) -> &[u8] {
86        self
87    }
88
89    /// Try to view the bytes as an UTF-8 string.
90    ///
91    /// If these bytes were created via `Bytes::from_string`, UTF-8 validation
92    /// is skipped.
93    pub fn as_str(&self) -> Result<&str, Utf8Error> {
94        self.inner().as_str()
95    }
96
97    /// Return a copy of the bytes as a vector.
98    pub fn to_vec(&self) -> Vec<u8> {
99        self.as_slice().to_vec()
100    }
101
102    /// Try to turn the bytes into a `Str`.
103    ///
104    /// - If these bytes were created via `Bytes::from_string::<Str>`, the
105    ///   string is cloned directly.
106    /// - If these bytes were created via `Bytes::from_string`, but from a
107    ///   different type of string, UTF-8 validation is still skipped.
108    pub fn to_str(&self) -> Result<Str, Utf8Error> {
109        match self.inner().as_any().downcast_ref::<Str>() {
110            Some(string) => Ok(string.clone()),
111            None => self.as_str().map(Into::into),
112        }
113    }
114
115    /// Resolve an index or throw an out of bounds error.
116    fn locate(&self, index: i64) -> StrResult<usize> {
117        self.locate_opt(index).ok_or_else(|| out_of_bounds(index, self.len()))
118    }
119
120    /// Resolve an index, if it is within bounds.
121    ///
122    /// `index == len` is considered in bounds.
123    fn locate_opt(&self, index: i64) -> Option<usize> {
124        let len = self.as_slice().len();
125        let wrapped =
126            if index >= 0 { Some(index) } else { (len as i64).checked_add(index) };
127        wrapped.and_then(|v| usize::try_from(v).ok()).filter(|&v| v <= len)
128    }
129
130    /// Access the inner `dyn Bytelike`.
131    fn inner(&self) -> &dyn Bytelike {
132        &**self.0
133    }
134}
135
136#[scope]
137impl Bytes {
138    /// Converts a value to bytes.
139    ///
140    /// - Strings are encoded in UTF-8.
141    /// - Arrays of integers between `{0}` and `{255}` are converted directly. The
142    ///   dedicated byte representation is much more efficient than the array
143    ///   representation and thus typically used for large byte buffers (e.g. image
144    ///   data).
145    ///
146    /// ```example
147    /// #bytes("Hello 😃") \
148    /// #bytes((123, 160, 22, 0))
149    /// ```
150    #[func(constructor)]
151    pub fn construct(
152        /// The value that should be converted to bytes.
153        value: ToBytes,
154    ) -> Bytes {
155        value.0
156    }
157
158    /// The length in bytes.
159    #[func(title = "Length")]
160    pub fn len(&self) -> usize {
161        self.as_slice().len()
162    }
163
164    /// Returns the byte at the specified index. Returns the default value if
165    /// the index is out of bounds or fails with an error if no default value
166    /// was specified.
167    #[func]
168    pub fn at(
169        &self,
170        /// The index at which to retrieve the byte.
171        index: i64,
172        /// A default value to return if the index is out of bounds.
173        #[named]
174        default: Option<Value>,
175    ) -> StrResult<Value> {
176        self.locate_opt(index)
177            .and_then(|i| self.as_slice().get(i).map(|&b| Value::Int(b.into())))
178            .or(default)
179            .ok_or_else(|| out_of_bounds_no_default(index, self.len()))
180    }
181
182    /// Extracts a subslice of the bytes. Fails with an error if the start or
183    /// end index is out of bounds.
184    #[func]
185    pub fn slice(
186        &self,
187        /// The start index (inclusive).
188        start: i64,
189        /// The end index (exclusive). If omitted, the whole slice until the end
190        /// is extracted.
191        #[default]
192        end: Option<i64>,
193        /// The number of items to extract. This is equivalent to passing
194        /// `start + count` as the `end` position. Mutually exclusive with
195        /// `end`.
196        #[named]
197        count: Option<i64>,
198    ) -> StrResult<Bytes> {
199        let mut end = end;
200        if end.is_none() {
201            end = count.map(|c: i64| start + c);
202        }
203
204        let start = self.locate(start)?;
205        let end = self.locate(end.unwrap_or(self.len() as i64))?.max(start);
206        let slice = &self.as_slice()[start..end];
207
208        // We could hold a view into the original bytes here instead of
209        // making a copy, but it's unclear when that's worth it. Java
210        // originally did that for strings, but went back on it because a
211        // very small view into a very large buffer would be a sort of
212        // memory leak.
213        Ok(Bytes::new(slice.to_vec()))
214    }
215}
216
217impl Debug for Bytes {
218    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
219        write!(f, "Bytes({})", self.len())
220    }
221}
222
223impl Repr for Bytes {
224    fn repr(&self) -> EcoString {
225        eco_format!("bytes({})", self.len())
226    }
227}
228
229impl Deref for Bytes {
230    type Target = [u8];
231
232    fn deref(&self) -> &Self::Target {
233        self.inner().as_bytes()
234    }
235}
236
237impl Eq for Bytes {}
238
239impl PartialEq for Bytes {
240    fn eq(&self, other: &Self) -> bool {
241        self.0.eq(&other.0)
242    }
243}
244
245impl AsRef<[u8]> for Bytes {
246    fn as_ref(&self) -> &[u8] {
247        self
248    }
249}
250
251impl Add for Bytes {
252    type Output = Self;
253
254    fn add(mut self, rhs: Self) -> Self::Output {
255        self += rhs;
256        self
257    }
258}
259
260impl AddAssign for Bytes {
261    fn add_assign(&mut self, rhs: Self) {
262        if rhs.is_empty() {
263            // Nothing to do
264        } else if self.is_empty() {
265            *self = rhs;
266        } else if let Some(vec) = Arc::get_mut(&mut self.0)
267            .and_then(|unique| unique.as_any_mut().downcast_mut::<Vec<u8>>())
268        {
269            vec.extend_from_slice(&rhs);
270        } else {
271            *self = Self::new([self.as_slice(), rhs.as_slice()].concat());
272        }
273    }
274}
275
276impl Serialize for Bytes {
277    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
278    where
279        S: Serializer,
280    {
281        if serializer.is_human_readable() {
282            serializer.serialize_str(&eco_format!("{self:?}"))
283        } else {
284            serializer.serialize_bytes(self)
285        }
286    }
287}
288
/// Any type that can back a byte buffer.
///
/// Implementors must be `Send + Sync`, as required by the supertrait bounds.
trait Bytelike: Send + Sync {
    /// Returns the backing data as a byte slice.
    fn as_bytes(&self) -> &[u8];
    /// Tries to view the backing data as a UTF-8 string.
    fn as_str(&self) -> Result<&str, Utf8Error>;
    /// Exposes the value as `dyn Any` so that it can be downcast to a
    /// concrete type.
    fn as_any(&self) -> &dyn Any;
    /// Mutable counterpart of `as_any`.
    fn as_any_mut(&mut self) -> &mut dyn Any;
}
296
297impl<T> Bytelike for T
298where
299    T: AsRef<[u8]> + Send + Sync + 'static,
300{
301    fn as_bytes(&self) -> &[u8] {
302        self.as_ref()
303    }
304
305    fn as_str(&self) -> Result<&str, Utf8Error> {
306        std::str::from_utf8(self.as_ref())
307    }
308
309    fn as_any(&self) -> &dyn Any {
310        self
311    }
312
313    fn as_any_mut(&mut self) -> &mut dyn Any {
314        self
315    }
316}
317
318impl Hash for dyn Bytelike {
319    fn hash<H: Hasher>(&self, state: &mut H) {
320        self.as_bytes().hash(state);
321    }
322}
323
324/// Makes string-like objects usable with `Bytes`.
325struct StrWrapper<T>(T);
326
327impl<T> Bytelike for StrWrapper<T>
328where
329    T: AsRef<str> + Send + Sync + 'static,
330{
331    fn as_bytes(&self) -> &[u8] {
332        self.0.as_ref().as_bytes()
333    }
334
335    fn as_str(&self) -> Result<&str, Utf8Error> {
336        Ok(self.0.as_ref())
337    }
338
339    fn as_any(&self) -> &dyn Any {
340        self
341    }
342
343    fn as_any_mut(&mut self) -> &mut dyn Any {
344        self
345    }
346}
347
/// A value that can be cast to bytes.
pub struct ToBytes(Bytes);

cast! {
    ToBytes,
    // Strings back the bytes directly through `Bytes::from_string`.
    v: Str => Self(Bytes::from_string(v)),
    // Arrays are accepted only if every item is an integer in `0..=255`.
    // Collecting into a `Result` stops at the first offending item, and the
    // trailing `?` propagates that error out of the cast.
    v: Array => Self(v.iter()
        .map(|item| match item {
            Value::Int(byte @ 0..=255) => Ok(*byte as u8),
            Value::Int(_) => bail!("number must be between 0 and 255"),
            value => Err(<u8 as Reflect>::error(value)),
        })
        .collect::<Result<Vec<u8>, _>>()
        .map(Bytes::new)?
    ),
    // Existing bytes are passed through unchanged.
    v: Bytes => Self(v),
}
365
/// The out of bounds access error message.
///
/// Reports the offending `index` exactly as the caller supplied it, together
/// with the length `len` of the byte buffer.
// Marked `#[cold]` because this only runs on the error path.
#[cold]
fn out_of_bounds(index: i64, len: usize) -> EcoString {
    eco_format!("byte index out of bounds (index: {index}, len: {len})")
}
371
/// The out of bounds access error message when no default value was given.
///
/// Reports the offending `index` exactly as the caller supplied it, together
/// with the length `len` of the byte buffer.
// Marked `#[cold]` because this only runs on the error path.
#[cold]
fn out_of_bounds_no_default(index: i64, len: usize) -> EcoString {
    eco_format!(
        // The trailing backslash joins both source lines into one message.
        "byte index out of bounds (index: {index}, len: {len}) \
         and no default value was specified",
    )
}