Skip to main content

typst_library/foundations/
bytes.rs

1use std::any::Any;
2use std::fmt::{self, Debug, Formatter};
3use std::hash::{Hash, Hasher};
4use std::ops::{Add, AddAssign, Deref};
5use std::str::Utf8Error;
6use std::sync::Arc;
7
8use ecow::{EcoString, eco_format};
9use serde::{Serialize, Serializer};
10use typst_syntax::{Lines, Source};
11use typst_utils::LazyHash;
12
13use crate::diag::{StrResult, bail};
14use crate::foundations::{Array, Reflect, Repr, Str, Value, cast, func, scope, ty};
15
/// A sequence of bytes.
///
/// This is conceptually similar to an array of @int[integers] between `{0}` and
/// `{255}`, but represented much more efficiently. You can iterate over it
/// using a @reference:scripting:loops[for loop].
///
/// You can convert
/// - a @str[string] or an @array[array] of integers to bytes with the @bytes
///   constructor
/// - bytes to a string with the @str constructor, with UTF-8 encoding
/// - bytes to an array of integers with the @array constructor
///
/// When @read[reading] data from a file, you can decide whether to load it as a
/// string or as raw bytes.
///
/// ```example
/// #bytes((123, 160, 22, 0)) \
/// #bytes("Hello 😃")
///
/// #let data = read(
///   "rhino.png",
///   encoding: none,
/// )
///
/// // Magic bytes.
/// #array(data.slice(0, 4)) \
/// #str(data.slice(1, 4))
/// ```
#[ty(scope, cast)]
#[derive(Clone, Hash)]
// Implementation note: the backing storage is type-erased behind
// `dyn Bytelike` and reference-counted, so cloning `Bytes` clones the `Arc`
// rather than the data. The concrete backing type is recovered by downcasting
// where an allocation can be reused (see `into_vec` / `into_string`).
// NOTE(review): `LazyHash` presumably computes and caches the hash on demand;
// confirm against `typst_utils`.
pub struct Bytes(Arc<LazyHash<dyn Bytelike>>);
47
48impl Bytes {
49    /// Create `Bytes` from anything byte-like.
50    ///
51    /// The `data` type will directly back this bytes object. This means you can
52    /// e.g. pass `&'static [u8]` or `[u8; 8]` and no extra vector will be
53    /// allocated.
54    ///
55    /// If the type is `Vec<u8>` and the `Bytes` are unique (i.e. not cloned),
56    /// the vector will be reused when mutating to the `Bytes`.
57    ///
58    /// If your source type is a string, prefer [`Bytes::from_string`] to
59    /// directly use the UTF-8 encoded string data without any copying.
60    pub fn new<T>(data: T) -> Self
61    where
62        T: AsRef<[u8]> + Send + Sync + 'static,
63    {
64        Self(Arc::new(LazyHash::new(data)))
65    }
66
67    /// Create `Bytes` from anything string-like, implicitly viewing the UTF-8
68    /// representation.
69    ///
70    /// The `data` type will directly back this bytes object. This means you can
71    /// e.g. pass `String` or `EcoString` without any copying.
72    pub fn from_string<T>(data: T) -> Self
73    where
74        T: AsRef<str> + Send + Sync + 'static,
75    {
76        Self(Arc::new(LazyHash::new(StrWrapper(data))))
77    }
78
79    /// Return `true` if the length is 0.
80    pub fn is_empty(&self) -> bool {
81        self.as_slice().is_empty()
82    }
83
84    /// Return a view into the bytes.
85    pub fn as_slice(&self) -> &[u8] {
86        self
87    }
88
89    /// Try to view the bytes as an UTF-8 string.
90    ///
91    /// If these bytes were created via `Bytes::from_string`, UTF-8 validation
92    /// is skipped.
93    pub fn as_str(&self) -> Result<&str, Utf8Error> {
94        self.inner().as_str()
95    }
96
97    /// Attempts to take ownership of an underlying vector. If this is not
98    /// possible, returns a newly allocated vector with the byte data.
99    ///
100    /// For the underlying allocation to be reused, the bytes must have been
101    /// created via [`Bytes::new`] from a [`Vec<u8>`] and the reference count
102    /// must be 1.
103    pub fn into_vec(mut self) -> Vec<u8> {
104        match self.to_underlying_mut::<Vec<u8>>() {
105            Some(vec) => std::mem::take(vec),
106            None => self.as_slice().to_vec(),
107        }
108    }
109
110    /// Attempts to take ownership of an underlying string or byte vector. If
111    /// this is not possible, returns a newly allocated vector with the byte
112    /// data.
113    ///
114    /// For the underlying allocation to be reused, the bytes must have been
115    /// created via [`Bytes::new`] from a [`Vec<u8>`] or via
116    /// [`Bytes::from_string`] from a [`String`] and the reference count must be
117    /// 1.
118    pub fn into_string(mut self) -> Result<String, IntoStringError> {
119        if let Some(string) = self.to_underlying_string_mut::<String>() {
120            return Ok(std::mem::take(string));
121        }
122
123        let result = if let Some(vec) = self.to_underlying_mut::<Vec<u8>>() {
124            match String::from_utf8(std::mem::take(vec)) {
125                Ok(string) => return Ok(string),
126                Err(err) => {
127                    let error = err.utf8_error();
128                    *vec = err.into_bytes();
129                    Err(error)
130                }
131            }
132        } else {
133            self.as_str().map(ToOwned::to_owned)
134        };
135
136        result.map_err(|error| IntoStringError { bytes: self, error })
137    }
138
139    /// Try to turn the bytes into a `Str`.
140    ///
141    /// - If these bytes were created via `Bytes::from_string::<Str>`, the
142    ///   string is cloned directly.
143    /// - If these bytes were created via `Bytes::from_string`, but from a
144    ///   different type of string, UTF-8 validation is still skipped.
145    pub fn to_str(&self) -> Result<Str, Utf8Error> {
146        match (self.inner() as &dyn Any).downcast_ref::<Str>() {
147            Some(string) => Ok(string.clone()),
148            None => self.as_str().map(Into::into),
149        }
150    }
151
152    /// Try to produce line metadata for these bytes. Fails if the bytes are not
153    /// UTF-8 decodable.
154    ///
155    /// If the bytes were created from a [`Source`] file via
156    /// [`Bytes::from_string`], the source file's line metadata is reused.
157    /// Otherwise, line metadata is computed with internal memoization.
158    pub fn lines(&self) -> Result<Lines<String>, Utf8Error> {
159        #[comemo::memoize]
160        fn compute(bytes: &Bytes) -> Result<Lines<String>, Utf8Error> {
161            let text = bytes.as_str()?;
162            Ok(Lines::new(text.to_string()))
163        }
164
165        // Small optimization: If this comes from a source file via
166        // `Bytes::from_string`, we can directly use its lines.
167        match self.to_underlying_string::<Source>() {
168            Some(source) => Ok(source.lines().clone()),
169            None => compute(self),
170        }
171    }
172}
173
174impl Bytes {
175    /// Resolve an index or throw an out of bounds error.
176    fn locate(&self, index: i64) -> StrResult<usize> {
177        self.locate_opt(index).ok_or_else(|| out_of_bounds(index, self.len()))
178    }
179
180    /// Resolve an index, if it is within bounds.
181    ///
182    /// `index == len` is considered in bounds.
183    fn locate_opt(&self, index: i64) -> Option<usize> {
184        let len = self.as_slice().len();
185        let wrapped =
186            if index >= 0 { Some(index) } else { (len as i64).checked_add(index) };
187        wrapped.and_then(|v| usize::try_from(v).ok()).filter(|&v| v <= len)
188    }
189
190    /// Try to access a vector this was built from via [`Bytes::new`].
191    fn to_underlying_mut<T>(&mut self) -> Option<&mut T>
192    where
193        T: AsRef<[u8]> + Send + Sync + 'static,
194    {
195        Arc::get_mut(&mut self.0).and_then(|unique| {
196            let inner: &mut dyn Bytelike = &mut **unique;
197            (inner as &mut dyn Any).downcast_mut::<T>()
198        })
199    }
200
201    /// Try to access a string this was built from via [`Bytes::from_string`].
202    fn to_underlying_string<T>(&self) -> Option<&T>
203    where
204        T: AsRef<str> + Send + Sync + 'static,
205    {
206        (self.inner() as &dyn Any)
207            .downcast_ref::<StrWrapper<T>>()
208            .map(|wrapper| &wrapper.0)
209    }
210
211    /// Try to mutably access a string this was built from via [`Bytes::from_string`].
212    fn to_underlying_string_mut<T>(&mut self) -> Option<&mut T>
213    where
214        T: AsRef<str> + Send + Sync + 'static,
215    {
216        Arc::get_mut(&mut self.0).and_then(|unique| {
217            let inner: &mut dyn Bytelike = &mut **unique;
218            (inner as &mut dyn Any)
219                .downcast_mut::<StrWrapper<T>>()
220                .map(|wrapper| &mut wrapper.0)
221        })
222    }
223
224    /// Access the inner `dyn Bytelike`.
225    fn inner(&self) -> &dyn Bytelike {
226        &**self.0
227    }
228}
229
230#[scope]
231impl Bytes {
232    /// Converts a value to bytes.
233    ///
234    /// - Strings are encoded in UTF-8.
235    /// - Arrays of integers between `{0}` and `{255}` are converted directly.
236    ///   The dedicated byte representation is much more efficient than the
237    ///   array representation and thus typically used for large byte buffers
238    ///   (e.g. image data).
239    ///
240    /// ```example
241    /// #bytes("Hello 😃") \
242    /// #bytes((123, 160, 22, 0))
243    /// ```
244    #[func(constructor)]
245    pub fn construct(
246        /// The value that should be converted to bytes.
247        value: ToBytes,
248    ) -> Bytes {
249        value.0
250    }
251
252    /// The length in bytes.
253    #[func(title = "Length")]
254    pub fn len(&self) -> usize {
255        self.as_slice().len()
256    }
257
258    /// Returns the byte at the specified index. Returns the default value if
259    /// the index is out of bounds or fails with an error if no default value
260    /// was specified.
261    #[func]
262    pub fn at(
263        &self,
264        /// The index at which to retrieve the byte.
265        index: i64,
266        /// A default value to return if the index is out of bounds.
267        #[named]
268        default: Option<Value>,
269    ) -> StrResult<Value> {
270        self.locate_opt(index)
271            .and_then(|i| self.as_slice().get(i).map(|&b| Value::Int(b.into())))
272            .or(default)
273            .ok_or_else(|| out_of_bounds_no_default(index, self.len()))
274    }
275
276    /// Extracts a subslice of the bytes. Fails with an error if the start or
277    /// end index is out of bounds.
278    #[func]
279    pub fn slice(
280        &self,
281        /// The start index (inclusive).
282        start: i64,
283        /// The end index (exclusive). If omitted, the whole slice until the end
284        /// is extracted.
285        #[default]
286        end: Option<i64>,
287        /// The number of items to extract. This is equivalent to passing
288        /// `start + count` as the `end` position. Mutually exclusive with
289        /// `end`.
290        #[named]
291        count: Option<i64>,
292    ) -> StrResult<Bytes> {
293        let start = self.locate(start)?;
294        let end = end.or(count.map(|c| start as i64 + c));
295        let end = self.locate(end.unwrap_or(self.len() as i64))?.max(start);
296        let slice = &self.as_slice()[start..end];
297
298        // We could hold a view into the original bytes here instead of
299        // making a copy, but it's unclear when that's worth it. Java
300        // originally did that for strings, but went back on it because a
301        // very small view into a very large buffer would be a sort of
302        // memory leak.
303        Ok(Bytes::new(slice.to_vec()))
304    }
305}
306
307impl Debug for Bytes {
308    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
309        write!(f, "Bytes({})", self.len())
310    }
311}
312
313impl Repr for Bytes {
314    fn repr(&self) -> EcoString {
315        eco_format!("bytes({})", self.len())
316    }
317}
318
319impl Deref for Bytes {
320    type Target = [u8];
321
322    fn deref(&self) -> &Self::Target {
323        self.inner().as_bytes()
324    }
325}
326
327impl Eq for Bytes {}
328
329impl PartialEq for Bytes {
330    fn eq(&self, other: &Self) -> bool {
331        self.0.eq(&other.0)
332    }
333}
334
335impl AsRef<[u8]> for Bytes {
336    fn as_ref(&self) -> &[u8] {
337        self
338    }
339}
340
341impl Add for Bytes {
342    type Output = Self;
343
344    fn add(mut self, rhs: Self) -> Self::Output {
345        self += rhs;
346        self
347    }
348}
349
350impl AddAssign for Bytes {
351    fn add_assign(&mut self, rhs: Self) {
352        if rhs.is_empty() {
353            // Nothing to do
354        } else if self.is_empty() {
355            *self = rhs;
356        } else if let Some(vec) = self.to_underlying_mut::<Vec<u8>>() {
357            vec.extend_from_slice(&rhs);
358        } else {
359            *self = Self::new([self.as_slice(), rhs.as_slice()].concat());
360        }
361    }
362}
363
364impl Serialize for Bytes {
365    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
366    where
367        S: Serializer,
368    {
369        if serializer.is_human_readable() {
370            serializer.serialize_str(&self.repr())
371        } else {
372            serializer.serialize_bytes(self)
373        }
374    }
375}
376
/// An error that can occur in [`Bytes::into_string`].
///
/// Returned when the bytes are not valid UTF-8. The bytes are handed back so
/// that the caller does not lose them.
#[derive(Debug)]
pub struct IntoStringError {
    /// The bytes that could not be converted into a string.
    pub bytes: Bytes,
    /// The UTF-8 error that made the conversion fail.
    pub error: Utf8Error,
}
383
/// Any type that can back a byte buffer.
///
/// The `Any` supertrait allows recovering the concrete backing type through
/// downcasting.
trait Bytelike: Any + Send + Sync {
    /// Returns the backing data as a byte slice.
    fn as_bytes(&self) -> &[u8];
    /// Returns the backing data as a string slice, or a UTF-8 error if the
    /// data is not valid UTF-8.
    fn as_str(&self) -> Result<&str, Utf8Error>;
}
389
390impl<T> Bytelike for T
391where
392    T: AsRef<[u8]> + Send + Sync + 'static,
393{
394    fn as_bytes(&self) -> &[u8] {
395        self.as_ref()
396    }
397
398    fn as_str(&self) -> Result<&str, Utf8Error> {
399        std::str::from_utf8(self.as_ref())
400    }
401}
402
403impl Hash for dyn Bytelike {
404    fn hash<H: Hasher>(&self, state: &mut H) {
405        self.as_bytes().hash(state);
406    }
407}
408
409/// Makes string-like objects usable with `Bytes`.
410struct StrWrapper<T>(T);
411
412impl<T> Bytelike for StrWrapper<T>
413where
414    T: AsRef<str> + Send + Sync + 'static,
415{
416    fn as_bytes(&self) -> &[u8] {
417        self.0.as_ref().as_bytes()
418    }
419
420    fn as_str(&self) -> Result<&str, Utf8Error> {
421        Ok(self.0.as_ref())
422    }
423}
424
/// A value that can be cast to bytes.
///
/// Accepts strings, arrays of integers between 0 and 255, and bytes; see the
/// `cast!` rules below.
pub struct ToBytes(Bytes);

cast! {
    ToBytes,
    // Strings back the bytes directly through their UTF-8 data.
    v: Str => Self(Bytes::from_string(v)),
    // Arrays must consist solely of integers in `0..=255`; the first
    // offending item aborts the conversion with an error.
    v: Array => Self(v.iter()
        .map(|item| match item {
            Value::Int(byte @ 0..=255) => Ok(*byte as u8),
            Value::Int(_) => bail!("number must be between 0 and 255"),
            value => Err(<u8 as Reflect>::error(value)),
        })
        .collect::<Result<Vec<u8>, _>>()
        .map(Bytes::new)?
    ),
    // Bytes are passed through unchanged.
    v: Bytes => Self(v),
}
442
/// The out of bounds access error message.
///
/// `index` is the index as requested by the caller and `len` the number of
/// bytes; both are embedded in the message. Marked `#[cold]` because it is
/// only called on error paths.
#[cold]
fn out_of_bounds(index: i64, len: usize) -> EcoString {
    eco_format!("byte index out of bounds (index: {index}, len: {len})")
}
448
/// The out of bounds access error message when no default value was given.
///
/// `index` is the index as requested by the caller and `len` the number of
/// bytes; both are embedded in the message. Marked `#[cold]` because it is
/// only called on error paths.
#[cold]
fn out_of_bounds_no_default(index: i64, len: usize) -> EcoString {
    eco_format!(
        "byte index out of bounds (index: {index}, len: {len}) \
         and no default value was specified",
    )
}
457
#[cfg(test)]
mod tests {
    use super::*;

    /// A uniquely owned string must come back with the same allocation.
    #[test]
    fn test_bytes_into_string_lone() {
        let original = String::from("hello world");
        let before = original.as_ptr();
        let roundtripped = Bytes::from_string(original).into_string().unwrap();
        assert_eq!(before, roundtripped.as_ptr());
    }

    /// With shared ownership, a round-trip may produce a copy.
    #[test]
    fn test_bytes_into_string_shared() {
        let original = String::from("hello world");
        let before = original.as_ptr();
        let first = Bytes::from_string(original);
        let second = first.clone();
        let from_first = first.into_string().unwrap();
        let from_second = second.into_string().unwrap();
        // While a clone is still alive, the conversion has to copy.
        assert_ne!(before, from_first.as_ptr());
        // The last remaining owner gets the original string back.
        assert_eq!(before, from_second.as_ptr());
    }

    /// A backing vector can be reused as the string's allocation, too.
    #[test]
    fn test_bytes_into_string_from_vec() {
        let original = String::from("hello world").into_bytes();
        let before = original.as_ptr();
        let roundtripped = Bytes::new(original).into_string().unwrap().into_bytes();
        assert_eq!(before, roundtripped.as_ptr());
    }

    /// When a backing vector turns out not to be valid UTF-8, the error must
    /// still carry the original bytes.
    #[test]
    fn test_bytes_into_string_from_vec_error() {
        let invalid = b"hello world\xFF";
        let failure = Bytes::new(Vec::from(invalid)).into_string().unwrap_err();
        assert_eq!(failure.bytes.as_slice(), invalid);
    }
}
507}