Skip to main content

typst_kit/
files.rs

1//! File loading and management.
2
3use std::fs;
4use std::mem;
5use std::path::{Path, PathBuf};
6use std::str;
7use std::str::Utf8Error;
8use std::sync::Arc;
9
10use parking_lot::Mutex;
11use rustc_hash::FxHashMap;
12use typst_library::diag::{FileError, FileResult};
13use typst_library::foundations::Bytes;
14use typst_syntax::{FileId, Source, VirtualPath};
15
16#[cfg(feature = "system-files")]
17use {crate::packages::SystemPackages, typst_syntax::VirtualRoot};
18
/// Holds loaded files and sources.
///
/// This type is backed by a file loader of your choosing. Internally, it
/// handles caching of loaded files and creation of Typst [sources](Source).
/// This is the right level of abstraction if you're building a Typst
/// integration that's concerned with providing input bytes on-demand, but does
/// not require tighter integration with Typst [`Source`s](Source). It is
/// appropriate for most clients.
///
/// If you need more control, you can skip this and implement custom logic that
/// directly handles the [`World::source`](typst_library::World::source) and
/// [`World::file`](typst_library::World::file) requests. A language server is
/// an example of an integration that might want to go even deeper, to create,
/// manage, and edit source files by itself. If you go the manual route, ensure
/// that those methods are cheap on repeated calls (either through caching or by
/// virtue of always being cheap).
#[derive(Default)]
pub struct FileStore<L> {
    /// The backend that provides the raw data of a file on demand.
    loader: L,
    /// The cached state of every file that was requested so far, keyed by its
    /// file ID. Guarded by a mutex so that slots can be created and updated
    /// through a shared reference to the store.
    slots: Mutex<FxHashMap<FileId, FileSlot>>,
}
40
41impl<L> FileStore<L>
42where
43    L: FileLoader,
44{
45    /// Creates a new file store that loads file data via the provided `loader`.
46    pub fn new(loader: L) -> Self {
47        Self { loader, slots: Mutex::new(FxHashMap::default()) }
48    }
49
50    /// Returns a reference to the underlying loader.
51    pub fn loader(&self) -> &L {
52        &self.loader
53    }
54
55    /// Returns a mutable reference to the underlying loader.
56    pub fn loader_mut(&mut self) -> &mut L {
57        &mut self.loader
58    }
59
60    /// Drops the store, extracting the underlying loader.
61    pub fn into_loader(self) -> L {
62        self.loader
63    }
64
65    /// Retrieves the given file id as a Typst source.
66    ///
67    /// Can directly be used to implement
68    /// [`World::source`](typst_library::World::source).
69    pub fn source(&self, id: FileId) -> FileResult<Source> {
70        self.slot(id, |slot| slot.source(&self.loader, id))
71    }
72
73    /// Retrieves the given file id as a raw file.
74    ///
75    /// Can directly be used to implement
76    /// [`World::file`](typst_library::World::file).
77    pub fn file(&self, id: FileId) -> FileResult<Bytes> {
78        self.slot(id, |slot| slot.file(&self.loader, id))
79    }
80
81    /// Returns all files that were referenced since the last
82    /// [`reset()`](Self::reset).
83    ///
84    /// Also returns a reference to the loader so that the IDs can be resolved
85    /// with it. It couldn't be accessed through [`.loader()`](Self::loader)
86    /// while iterating because of overlapping borrows.
87    ///
88    /// The dependencies are returned in arbitrary order! If you want to get a
89    /// consistent result, you should sort them by a suitable criterion after
90    /// the fact.
91    pub fn dependencies(&mut self) -> (&L, impl Iterator<Item = FileId> + '_) {
92        let iter = self
93            .slots
94            .get_mut()
95            .iter()
96            .filter(|(_, slot)| slot.accessed())
97            .map(|(&id, _)| id);
98        (&self.loader, iter)
99    }
100
101    /// Resets the store.
102    ///
103    /// This marks all loaded file as stale. On subsequent accesses, they will
104    /// be loaded once more through the underlying loader. Moreover, calls to
105    /// [`dependencies()`](Self::dependencies) will not yield files accessed
106    /// before the call to `reset()`.
107    ///
108    /// Unlike when creating an entirely new store, source files will be edited
109    /// in place with updated data, leading to improved incremental compilation
110    /// performance.
111    pub fn reset(&mut self) {
112        #[allow(clippy::iter_over_hash_type, reason = "order does not matter")]
113        for slot in self.slots.get_mut().values_mut() {
114            slot.reset();
115        }
116    }
117
118    /// Access the canonical slot for the given file id.
119    fn slot<F, T>(&self, id: FileId, f: F) -> FileResult<T>
120    where
121        F: FnOnce(&mut FileSlot) -> FileResult<T>,
122    {
123        let mut map = self.slots.lock();
124        f(map.entry(id).or_default())
125    }
126}
127
/// Holds the state for a file.
///
/// A slot starts out empty and moves forward as the file is requested as raw
/// bytes and/or as a source. A reset moves it back to the empty state.
enum FileSlot {
    /// Nothing is loaded yet, but we may have a stale source from before a
    /// reset (i.e. from an earlier compilation) that we can reuse and edit in
    /// place.
    ///
    /// Transitions to
    /// - loaded when a file is requested
    /// - parsed if a source is requested and the data could be loaded
    ///   (otherwise to loaded).
    Empty(Stale<Source>),
    /// The slot has been requested as a `file()` but not as a `source()` (at
    /// least since the last reset). We can still have a stale, reusable source.
    ///
    /// The first field is the cached outcome of the load, which may be an
    /// error.
    ///
    /// Transitions to
    /// - parsed when a source is requested and the load had succeeded. If the
    ///   load had failed, the stored error is returned again and the slot
    ///   stays as it is.
    Loaded(FileResult<Bytes>, Stale<Source>),
    /// The slot has been requested as a `source()` and potentially as a
    /// `file()`.
    ///
    /// The first field is the source, or the UTF-8 error if the data could not
    /// be decoded. The second field holds the raw bytes of the file.
    ///
    /// If possible, the bytes are backed by the source (via
    /// `Bytes::from_string(source)`) so that we can serve `file()` and
    /// `source()` requests from the same underlying data. Note that this is not
    /// possible if the data has a UTF8-BOM as it is stripped for the source,
    /// but should be retained in the file.
    Parsed(Result<Source, Utf8Error>, Bytes),
}
155
/// Holds a source that is not up to date, but may be updated to the newest
/// state for better incremental performance than parsing and numbering it from
/// scratch.
///
/// `None` means that there is nothing available for reuse.
type Stale<T> = Option<T>;
160
161impl FileSlot {
162    /// Whether the slot has been accessed in any way since the last reset.
163    fn accessed(&self) -> bool {
164        !matches!(self, Self::Empty(_))
165    }
166
167    /// Resets the slot to its empty state.
168    fn reset(&mut self) {
169        let stale = match mem::take(self) {
170            Self::Parsed(Ok(source), _) => Some(source),
171            _ => None,
172        };
173        *self = Self::Empty(stale);
174    }
175
176    /// Retrieves the slot's bytes.
177    fn file(&mut self, loader: &impl FileLoader, id: FileId) -> FileResult<Bytes> {
178        match self {
179            Self::Empty(stale) => {
180                let result = loader.load(id);
181                *self = Self::Loaded(result.clone(), mem::take(stale));
182                result
183            }
184            Self::Loaded(result, _) => result.clone(),
185            Self::Parsed(_, bytes) => Ok(bytes.clone()),
186        }
187    }
188
189    /// Retrieves the source for this slot.
190    fn source(&mut self, loader: &impl FileLoader, id: FileId) -> FileResult<Source> {
191        // When we already have a source or error, this returns. Otherwise, it
192        // loads or extracts the bytes and a potential stale source file.
193        let (bytes, stale) = match self {
194            Self::Empty(stale) => match loader.load(id) {
195                Ok(bytes) => (bytes, mem::take(stale)),
196                Err(err) => {
197                    *self = Self::Loaded(Err(err.clone()), mem::take(stale));
198                    return Err(err);
199                }
200            },
201            Self::Loaded(Ok(_), _) => match mem::take(self) {
202                Self::Loaded(Ok(bytes), stale) => (bytes, stale),
203                _ => unreachable!(),
204            },
205            Self::Loaded(Err(err), _) => return Err(err.clone()),
206            Self::Parsed(source, _) => return Ok(source.clone()?),
207        };
208
209        const UTF8_BOM: &[u8] = b"\xef\xbb\xbf";
210        let without_bom = bytes.strip_prefix(UTF8_BOM);
211
212        // Create a source file, with various attempts to reuse things.
213        let (result, bytes) = if let Some(mut source) = stale {
214            let result = str::from_utf8(without_bom.unwrap_or(&bytes)).map(|new| {
215                // If we have a stale source file, reuse it.
216                source.replace(new);
217                source
218            });
219            (result, bytes)
220        } else if let Some(rest) = without_bom {
221            // If we had a BOM, we can't reuse the bytes for a string, so we
222            // just create a source with a cloned string.
223            (str::from_utf8(rest).map(|text| Source::new(id, text.into())), bytes)
224        } else {
225            // If we had no BOM, we attempt to reuse an existing `String` or
226            // `Vec<u8>` within the `Bytes`, backing the `Bytes` with the
227            // resulting `Source` instead. This way, we can transition from
228            // a vector-backed file to a source without reallocating.
229            match bytes.into_string().map(|text| Source::new(id, text)) {
230                Ok(source) => (Ok(source.clone()), Bytes::from_string(source)),
231                Err(err) => (Err(err.error), err.bytes),
232            }
233        };
234
235        *self = Self::Parsed(result.clone(), bytes);
236        Ok(result?)
237    }
238}
239
240impl Default for FileSlot {
241    fn default() -> Self {
242        Self::Empty(None)
243    }
244}
245
/// Provides data for files, backing a [`FileStore`].
///
/// If you want to load files in a different way, the first step would be to
/// create your own type that implements [`FileLoader`]. For an example, you can
/// take a look at how `typst-cli` implements it.
///
/// If you need even more control, you can also skip the [`FileStore`] and
/// implement fully custom logic that directly handles the
/// [`World::source`](typst_library::World::source) and
/// [`World::file`](typst_library::World::file) requests.
pub trait FileLoader {
    /// Load the data for the given file ID.
    ///
    /// Generally, here you'll want to match on the
    /// [`root()`](typst_syntax::RootedPath::root) of the `id` to check whether
    /// the file should be loaded from the project or a package. Then, you'll
    /// load the data at the path
    /// [`id.vpath()`](typst_syntax::RootedPath::vpath) in the project /
    /// package.
    ///
    /// A [`FileStore`] caches the outcome of this call, including errors, and
    /// only asks again for the same file after it has been reset. The loader
    /// thus doesn't need to cache anything by itself.
    fn load(&self, id: FileId) -> FileResult<Bytes>;
}
267
268impl<F: FileLoader> FileLoader for Box<F> {
269    fn load(&self, id: FileId) -> FileResult<Bytes> {
270        (**self).load(id)
271    }
272}
273
274impl<F: FileLoader> FileLoader for Arc<F> {
275    fn load(&self, id: FileId) -> FileResult<Bytes> {
276        (**self).load(id)
277    }
278}
279
/// Serves project files from a directory and package files from standard
/// locations.
///
/// With this implementation,
/// - project files are loaded from a project root directory through an
///   [`FsRoot`].
/// - package files are loaded from configured directories and/or the official
///   Typst Universe package registry via [`SystemPackages`].
#[cfg(feature = "system-files")]
#[derive(Debug)]
pub struct SystemFiles {
    /// The root that files of the project itself are served from.
    project: FsRoot,
    /// Yields the root for the package that a file belongs to.
    packages: SystemPackages,
}
294
#[cfg(feature = "system-files")]
impl SystemFiles {
    /// Builds an instance that serves project files from the `project` root
    /// and package files as configured by `packages`.
    pub fn new(project: FsRoot, packages: SystemPackages) -> Self {
        Self { project, packages }
    }

    /// Determines where the file with the given `id` lives in the file system.
    ///
    /// Fails if the root of the file can't be obtained or if the file's
    /// virtual path can't be resolved within that root.
    pub fn resolve(&self, id: FileId) -> FileResult<PathBuf> {
        let root = self.root(id)?;
        root.resolve(id.vpath())
    }

    /// Determines the root that the file with the given `id` resides in.
    ///
    /// For project files, this is a copy of the project root. For package
    /// files, the root is obtained from the system packages, which can fail.
    pub fn root(&self, id: FileId) -> FileResult<FsRoot> {
        match id.root() {
            VirtualRoot::Project => Ok(self.project.clone()),
            VirtualRoot::Package(spec) => Ok(self.packages.obtain(spec)?),
        }
    }
}
316
#[cfg(feature = "system-files")]
impl FileLoader for SystemFiles {
    /// Loads the file from the root (project or package) it resides in.
    fn load(&self, id: FileId) -> FileResult<Bytes> {
        let root = self.root(id)?;
        root.load(id.vpath())
    }
}
323
/// A [root](typst_syntax::VirtualRoot) that is backed by a file system directory.
///
/// A Typst project forms a root. Similarly, each package has its own root.
/// Through this mechanism, projects and packages are isolated from each other.
///
/// The wrapped path is the directory against which virtual paths are resolved.
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct FsRoot(PathBuf);
330
331impl FsRoot {
332    /// Creates a new instance with the given root path.
333    pub fn new(root: PathBuf) -> Self {
334        Self(root)
335    }
336
337    /// The path at which the root resides in the file system.
338    pub fn path(&self) -> &Path {
339        &self.0
340    }
341
342    /// Resolves the real file system path for the given virtual path in this
343    /// root.
344    pub fn resolve(&self, path: &VirtualPath) -> FileResult<PathBuf> {
345        path.realize(&self.0).map_err(Into::into)
346    }
347
348    /// Loads file data from the given virtual path in this root.
349    pub fn load(&self, path: &VirtualPath) -> FileResult<Bytes> {
350        // Join the path to the root. If it tries to escape, deny access. Note:
351        // It can still escape via symlinks.
352        let path = self.resolve(path)?;
353        let f = |e| FileError::from_io(e, &path);
354        if fs::metadata(&path).map_err(f)?.is_dir() {
355            Err(FileError::IsDirectory)
356        } else {
357            fs::read(&path).map(Bytes::new).map_err(f)
358        }
359    }
360}
361
#[cfg(test)]
mod tests {
    use typst_syntax::{RootedPath, VirtualRoot};

    use super::*;

    /// Test that a file that's first been loaded as raw bytes correctly
    /// transitions into the source state.
    #[test]
    fn test_file_store_source_via_file() {
        let store = FileStore::new(TestLoader(1));
        store.file(id("a.typ")).must_be(A_TEXT);
        store.source(id("a.typ")).must_be(A_TEXT);
    }

    /// With BOM, the storage cannot be reused and the data differs.
    #[test]
    fn test_file_store_bom() {
        let store = FileStore::new(TestLoader(1));
        store.file(id("b.typ")).must_be(B_DATA);
        store.source(id("b.typ")).must_be(B_TEXT);
    }

    /// Check that data which isn't valid UTF-8 can be served as a file, but
    /// not as a source, and that the failed source request leaves the raw
    /// bytes intact.
    #[test]
    fn test_file_store_invalid_utf8() {
        let store = FileStore::new(TestLoader(1));
        store.file(id("c.bin")).must_be(C_DATA);
        assert!(store.source(id("c.bin")).is_err());
        // The error is cached, so a repeated request fails in the same way.
        assert!(store.source(id("c.bin")).is_err());
        store.file(id("c.bin")).must_be(C_DATA);
    }

    /// Check that a file request that's already been served as a source reuses
    /// the same underlying buffer.
    #[test]
    fn test_file_store_storage_reuse() {
        let store = FileStore::new(TestLoader(1));
        let a_source = store.source(id("a.typ")).unwrap();
        let a_file = store.file(id("a.typ")).unwrap();
        a_file.must_be(A_TEXT);
        a_source.must_be(A_TEXT);
        assert!(std::ptr::eq(a_file.as_slice().as_ptr(), a_source.text().as_ptr()));
    }

    /// Check that resetting reloads files.
    #[test]
    fn test_file_store_cycles() {
        let mut store = FileStore::new(TestLoader(1));
        // Collects the paths of all current dependencies in sorted order, as
        // the store yields them in arbitrary order.
        let deps = |store: &mut FileStore<TestLoader>| {
            let (_, iter) = store.dependencies();
            let mut vec = iter
                .map(|id| id.get().vpath().get_without_slash())
                .collect::<Vec<_>>();
            vec.sort();
            vec
        };
        store.source(id("a.typ")).must_be(A_TEXT);
        store.source(id("d.typ")).must_be("1");
        // A failed load still counts as a dependency.
        assert_eq!(store.file(id("e.bin")), Err(FileError::NotFound("e.bin".into())));
        assert_eq!(deps(&mut store), ["a.typ", "d.typ", "e.bin"]);
        store.loader_mut().0 = 5;
        store.reset();
        store.source(id("d.typ")).must_be("5");
        store.file(id("e.bin")).must_be(E_TEXT);
        // `a.typ` wasn't requested after the reset and is thus not listed.
        assert_eq!(deps(&mut store), ["d.typ", "e.bin"]);
    }

    const A_TEXT: &str = "Hello from A";
    const B_DATA: &[u8] = b"\xef\xbb\xbfHello from B";
    const B_TEXT: &str = "Hello from B";
    const C_DATA: &[u8] = b"a\xFF\xFF\xFFb";
    const E_TEXT: &str = "A secret";

    /// An in-memory loader whose output depends on the wrapped number:
    /// `d.typ` contains the number and `e.bin` only exists if it exceeds 3.
    struct TestLoader(usize);

    impl FileLoader for TestLoader {
        fn load(&self, id: FileId) -> FileResult<Bytes> {
            Ok(match id.vpath().get_without_slash() {
                "a.typ" => Bytes::new(Vec::from(A_TEXT)),
                "b.typ" => Bytes::new(B_DATA),
                "c.bin" => Bytes::new(C_DATA),
                "d.typ" => Bytes::from_string(format!("{}", self.0)),
                "e.bin" if self.0 > 3 => Bytes::from_string(E_TEXT),
                path => return Err(FileError::NotFound(path.into())),
            })
        }
    }

    /// Creates the ID of the project file at the given path.
    fn id(path: &str) -> FileId {
        RootedPath::new(VirtualRoot::Project, VirtualPath::new(path).unwrap()).intern()
    }

    /// Asserts that an output of the store holds the expected data.
    trait OutputExt {
        fn must_be(&self, data: impl AsRef<[u8]>);
    }

    impl OutputExt for Source {
        #[track_caller]
        fn must_be(&self, data: impl AsRef<[u8]>) {
            assert_eq!(self.text().as_bytes(), data.as_ref());
        }
    }

    impl OutputExt for Bytes {
        #[track_caller]
        fn must_be(&self, data: impl AsRef<[u8]>) {
            assert_eq!(self.as_slice(), data.as_ref());
        }
    }

    impl<T: OutputExt> OutputExt for FileResult<T> {
        #[track_caller]
        fn must_be(&self, data: impl AsRef<[u8]>) {
            self.as_ref().unwrap().must_be(data);
        }
    }
}
469}