Skip to main content

typst_kit/
files.rs

1//! File loading and management.
2
3use std::fs;
4use std::mem;
5use std::path::{Path, PathBuf};
6use std::str;
7use std::str::Utf8Error;
8
9use parking_lot::Mutex;
10use rustc_hash::FxHashMap;
11use typst_library::diag::{FileError, FileResult};
12use typst_library::foundations::Bytes;
13use typst_syntax::{FileId, Source, VirtualPath};
14
15#[cfg(feature = "system-files")]
16use {crate::packages::SystemPackages, typst_syntax::VirtualRoot};
17
18/// Holds loaded files and sources.
19///
20/// This type is backed by a file loader of your choosing. Internally, it
21/// handles caching of loaded files and creation of Typst [sources](Source).
22/// This is the right level of abstraction if you're building a Typst
23/// integration that's concerned with providing input bytes on-demand, but does
24/// not require tighter integration with Typst [`Source`s](Source). It is
25/// appropriate for most clients.
26///
27/// If you need more control, you can skip this and implement custom logic that
28/// directly handles the [`World::source`](typst_library::World::source) and
29/// [`World::file`](typst_library::World::file) requests. A language server is
30/// an example of an integration that might want to go even deeper,  to create,
31/// manage, and edit source files by itself. If you go the manual route, ensure
32/// that those methods are cheap on repeated calls (either through caching or by
33/// virtue of always being cheap).
34#[derive(Default)]
35pub struct FileStore<L> {
36    loader: L,
37    slots: Mutex<FxHashMap<FileId, FileSlot>>,
38}
39
40impl<L> FileStore<L>
41where
42    L: FileLoader,
43{
44    /// Creates a new file store that loads file data via the provided `loader`.
45    pub fn new(loader: L) -> Self {
46        Self { loader, slots: Mutex::new(FxHashMap::default()) }
47    }
48
49    /// Returns a reference to the underlying loader.
50    pub fn loader(&self) -> &L {
51        &self.loader
52    }
53
54    /// Returns a mutable reference to the underlying loader.
55    pub fn loader_mut(&mut self) -> &mut L {
56        &mut self.loader
57    }
58
59    /// Drops the store, extracting the underlying loader.
60    pub fn into_loader(self) -> L {
61        self.loader
62    }
63
64    /// Retrieves the given file id as a Typst source.
65    ///
66    /// Can directly be used to implement
67    /// [`World::source`](typst_library::World::source).
68    pub fn source(&self, id: FileId) -> FileResult<Source> {
69        self.slot(id, |slot| slot.source(&self.loader, id))
70    }
71
72    /// Retrieves the given file id as a raw file.
73    ///
74    /// Can directly be used to implement
75    /// [`World::file`](typst_library::World::file).
76    pub fn file(&self, id: FileId) -> FileResult<Bytes> {
77        self.slot(id, |slot| slot.file(&self.loader, id))
78    }
79
80    /// Returns all files that were referenced since the last
81    /// [`reset()`](Self::reset).
82    ///
83    /// Also returns a reference to the loader so that the IDs can be resolved
84    /// with it. It couldn't be accessed through [`.loader()`](Self::loader)
85    /// while iterating because of overlapping borrows.
86    ///
87    /// The dependencies are returned in arbitrary order! If you want to get a
88    /// consistent result, you should sort them by a suitable criterion after
89    /// the fact.
90    pub fn dependencies(&mut self) -> (&L, impl Iterator<Item = FileId> + '_) {
91        let iter = self
92            .slots
93            .get_mut()
94            .iter()
95            .filter(|(_, slot)| slot.accessed())
96            .map(|(&id, _)| id);
97        (&self.loader, iter)
98    }
99
100    /// Resets the store.
101    ///
102    /// This marks all loaded file as stale. On subsequent accesses, they will
103    /// be loaded once more through the underlying loader. Moreover, calls to
104    /// [`dependencies()`](Self::dependencies) will not yield files accessed
105    /// before the call to `reset()`.
106    ///
107    /// Unlike when creating an entirely new store, source files will be edited
108    /// in place with updated data, leading to improved incremental compilation
109    /// performance.
110    pub fn reset(&mut self) {
111        #[allow(clippy::iter_over_hash_type, reason = "order does not matter")]
112        for slot in self.slots.get_mut().values_mut() {
113            slot.reset();
114        }
115    }
116
117    /// Access the canonical slot for the given file id.
118    fn slot<F, T>(&self, id: FileId, f: F) -> FileResult<T>
119    where
120        F: FnOnce(&mut FileSlot) -> FileResult<T>,
121    {
122        let mut map = self.slots.lock();
123        f(map.entry(id).or_default())
124    }
125}
126
127/// Holds the state for a file.
128enum FileSlot {
129    /// Nothing is loaded that, but we may have a stale source from before a
130    /// reset (i.e. from an earlier compilation) that we can reuse and edit in
131    /// place.
132    ///
133    /// Transitions to
134    /// - loaded when a file is requested
135    /// - to parsed if a source is requested and the data could be loaded
136    ///   (otherwise to loaded).
137    Empty(Stale<Source>),
138    /// The slot has been requested as a `file()` but not as a `source()` (at
139    /// least since the last reset). We can still have a stale, reusable source.
140    ///
141    /// Transitions to
142    /// - parsed when a source is requested
143    Loaded(FileResult<Bytes>, Stale<Source>),
144    /// The slot has been requested as a `source()` and potentially as a
145    /// `file()`.
146    ///
147    /// If possible, the bytes are backed by the source (via
148    /// `Bytes::from_string(source)`) so that we can serve `file()` and
149    /// `source()` requests from the same underlying data. Note that this is not
150    /// possible if the data has a UTF8-BOM as it is stripped for the source,
151    /// but should be retained in the file.
152    Parsed(Result<Source, Utf8Error>, Bytes),
153}
154
/// A value from before the last reset that is out of date, but kept around
/// because bringing it up to date gives better incremental performance than
/// parsing and numbering a source from scratch.
type Stale<T> = Option<T>;
159
160impl FileSlot {
161    /// Whether the slot has been accessed in any way since the last reset.
162    fn accessed(&self) -> bool {
163        !matches!(self, Self::Empty(_))
164    }
165
166    /// Resets the slot to its empty state.
167    fn reset(&mut self) {
168        let stale = match mem::take(self) {
169            Self::Parsed(Ok(source), _) => Some(source),
170            _ => None,
171        };
172        *self = Self::Empty(stale);
173    }
174
175    /// Retrieves the slot's bytes.
176    fn file(&mut self, loader: &impl FileLoader, id: FileId) -> FileResult<Bytes> {
177        match self {
178            Self::Empty(stale) => {
179                let result = loader.load(id);
180                *self = Self::Loaded(result.clone(), mem::take(stale));
181                result
182            }
183            Self::Loaded(result, _) => result.clone(),
184            Self::Parsed(_, bytes) => Ok(bytes.clone()),
185        }
186    }
187
188    /// Retrieves the source for this slot.
189    fn source(&mut self, loader: &impl FileLoader, id: FileId) -> FileResult<Source> {
190        // When we already have a source or error, this returns. Otherwise, it
191        // loads or extracts the bytes and a potential stale source file.
192        let (bytes, stale) = match self {
193            Self::Empty(stale) => match loader.load(id) {
194                Ok(bytes) => (bytes, mem::take(stale)),
195                Err(err) => {
196                    *self = Self::Loaded(Err(err.clone()), mem::take(stale));
197                    return Err(err);
198                }
199            },
200            Self::Loaded(Ok(_), _) => match mem::take(self) {
201                Self::Loaded(Ok(bytes), stale) => (bytes, stale),
202                _ => unreachable!(),
203            },
204            Self::Loaded(Err(err), _) => return Err(err.clone()),
205            Self::Parsed(source, _) => return Ok(source.clone()?),
206        };
207
208        const UTF8_BOM: &[u8] = b"\xef\xbb\xbf";
209        let without_bom = bytes.strip_prefix(UTF8_BOM);
210
211        // Create a source file, with various attempts to reuse things.
212        let (result, bytes) = if let Some(mut source) = stale {
213            let result = str::from_utf8(without_bom.unwrap_or(&bytes)).map(|new| {
214                // If we have a stale source file, reuse it.
215                source.replace(new);
216                source
217            });
218            (result, bytes)
219        } else if let Some(rest) = without_bom {
220            // If we had a BOM, we can't reuse the bytes for a string, so we
221            // just create a source with a cloned string.
222            (str::from_utf8(rest).map(|text| Source::new(id, text.into())), bytes)
223        } else {
224            // If we had no BOM, we attempt to reuse an existing `String` or
225            // `Vec<u8>` within the `Bytes`, backing the `Bytes` with the
226            // resulting `Source` instead. This way, we can transition from
227            // a vector-backed file to a source without reallocating.
228            match bytes.into_string().map(|text| Source::new(id, text)) {
229                Ok(source) => (Ok(source.clone()), Bytes::from_string(source)),
230                Err(err) => (Err(err.error), err.bytes),
231            }
232        };
233
234        *self = Self::Parsed(result.clone(), bytes);
235        Ok(result?)
236    }
237}
238
239impl Default for FileSlot {
240    fn default() -> Self {
241        Self::Empty(None)
242    }
243}
244
245/// Provides data for files, backing a [`FileStore`].
246///
247/// If you want to load files in a different way, the first step would be to
248/// create your own type that implements [`FileLoader`]. For an example, you can
249/// take a look at how `typst-cli` implements it.
250///
251/// If you need even more control, you can also skip the [`FileStore`] and
252/// implement fully custom logic that directly handles the
253/// [`World::source`](typst_library::World::source) and
254/// [`World::file`](typst_library::World::file) requests.
255pub trait FileLoader {
256    /// Load the data for the given file ID.
257    ///
258    /// Generally, here you'll want to match on the
259    /// [`root()`](typst_syntax::RootedPath::root) of the `id` to check whether
260    /// the file should be loaded from the project or a package. Then, you'll
261    /// load the data at the path
262    /// [`id.vpath()`](typst_syntax::RootedPath::vpath) in the project /
263    /// package.
264    fn load(&self, id: FileId) -> FileResult<Bytes>;
265}
266
/// A file loader that takes project files from a directory and package files
/// from standard locations.
///
/// In this implementation,
/// - project files come from a project root directory, represented by an
///   [`FsRoot`].
/// - package files come from configured directories and/or the official Typst
///   Universe package registry, by way of [`SystemPackages`].
#[cfg(feature = "system-files")]
pub struct SystemFiles {
    /// The root for files that belong to the project.
    project: FsRoot,
    /// Provides the roots for files that belong to packages.
    packages: SystemPackages,
}
280
#[cfg(feature = "system-files")]
impl SystemFiles {
    /// Creates a new instance from the file system root for project files and
    /// the configuration for system packages.
    pub fn new(project: FsRoot, packages: SystemPackages) -> Self {
        Self { project, packages }
    }

    /// Determines where the file with the given `id` lives in the file
    /// system.
    ///
    /// Fails if the root of the file cannot be determined, see
    /// [`root()`](Self::root).
    pub fn resolve(&self, id: FileId) -> FileResult<PathBuf> {
        let root = self.root(id)?;
        Ok(root.resolve(id.vpath()))
    }

    /// Determines the root that the given file ID belongs to.
    ///
    /// Project files yield a clone of the project root. For package files,
    /// the root is obtained from the configured [`SystemPackages`], which can
    /// fail.
    pub fn root(&self, id: FileId) -> FileResult<FsRoot> {
        let root = match id.root() {
            VirtualRoot::Project => self.project.clone(),
            VirtualRoot::Package(spec) => self.packages.obtain(spec)?,
        };
        Ok(root)
    }
}
302
#[cfg(feature = "system-files")]
impl FileLoader for SystemFiles {
    /// Determines the root of the file and loads the data at the file's
    /// virtual path from it.
    fn load(&self, id: FileId) -> FileResult<Bytes> {
        let root = self.root(id)?;
        root.load(id.vpath())
    }
}
309
/// A file system directory that backs a [root](typst_syntax::VirtualRoot).
///
/// Each Typst project forms a root and so does each package. This is what
/// isolates projects and packages from each other.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct FsRoot(PathBuf);
316
317impl FsRoot {
318    /// Creates a new instance with the given root path.
319    pub fn new(root: PathBuf) -> Self {
320        Self(root)
321    }
322
323    /// The path at which the root resides in the file system.
324    pub fn path(&self) -> &Path {
325        &self.0
326    }
327
328    /// Resolves the real file system path for the given virtual path in this
329    /// root.
330    pub fn resolve(&self, path: &VirtualPath) -> PathBuf {
331        path.realize(&self.0)
332    }
333
334    /// Loads file data from the given virtual path in this root.
335    pub fn load(&self, path: &VirtualPath) -> FileResult<Bytes> {
336        // Join the path to the root. If it tries to escape, deny access. Note:
337        // It can still escape via symlinks.
338        let path = self.resolve(path);
339        let f = |e| FileError::from_io(e, &path);
340        if fs::metadata(&path).map_err(f)?.is_dir() {
341            Err(FileError::IsDirectory)
342        } else {
343            fs::read(&path).map(Bytes::new).map_err(f)
344        }
345    }
346}
347
#[cfg(test)]
mod tests {
    use typst_syntax::{RootedPath, VirtualRoot};

    use super::*;

    /// Test that a file that's first been loaded as raw bytes correctly
    /// transitions into the source state.
    #[test]
    fn test_file_store_source_via_file() {
        let store = FileStore::new(TestLoader(1));
        store.file(id("a.typ")).must_be(A_TEXT);
        store.source(id("a.typ")).must_be(A_TEXT);
    }

    /// With BOM, the storage cannot be reused and the data differs.
    #[test]
    fn test_file_store_bom() {
        let store = FileStore::new(TestLoader(1));
        store.file(id("b.typ")).must_be(B_DATA);
        store.source(id("b.typ")).must_be(B_TEXT);
    }

    /// Test that a file request that's already been served as a source reuses
    /// the same underlying buffer.
    #[test]
    fn test_file_store_storage_reuse() {
        let store = FileStore::new(TestLoader(1));
        let a_source = store.source(id("a.typ")).unwrap();
        let a_file = store.file(id("a.typ")).unwrap();
        a_file.must_be(A_TEXT);
        a_source.must_be(A_TEXT);
        assert!(std::ptr::eq(a_file.as_slice().as_ptr(), a_source.text().as_ptr()));
    }

    /// Test that data which isn't valid UTF-8 can be served as a raw file,
    /// but not as a source, and that the raw bytes remain available after the
    /// failed source request.
    #[test]
    fn test_file_store_invalid_utf8() {
        let store = FileStore::new(TestLoader(1));
        store.file(id("c.bin")).must_be(C_DATA);
        assert!(store.source(id("c.bin")).is_err());
        store.file(id("c.bin")).must_be(C_DATA);
    }

    /// Check that resetting reloads files.
    #[test]
    fn test_file_store_cycles() {
        let mut store = FileStore::new(TestLoader(1));
        // Collects the paths of the current dependencies in sorted order, as
        // the store yields them in arbitrary order.
        let deps = |store: &mut FileStore<TestLoader>| {
            let (_, iter) = store.dependencies();
            let mut vec = iter
                .map(|id| id.get().vpath().get_without_slash())
                .collect::<Vec<_>>();
            vec.sort();
            vec
        };
        store.source(id("a.typ")).must_be(A_TEXT);
        store.source(id("d.typ")).must_be("1");
        // A failed load still counts as a dependency.
        assert_eq!(store.file(id("e.bin")), Err(FileError::NotFound("e.bin".into())));
        assert_eq!(deps(&mut store), ["a.typ", "d.typ", "e.bin"]);
        // Change what the loader returns, then start a new cycle.
        store.loader_mut().0 = 5;
        store.reset();
        store.source(id("d.typ")).must_be("5");
        store.file(id("e.bin")).must_be(E_TEXT);
        assert_eq!(deps(&mut store), ["d.typ", "e.bin"]);
    }

    const A_TEXT: &str = "Hello from A";
    const B_DATA: &[u8] = b"\xef\xbb\xbfHello from B";
    const B_TEXT: &str = "Hello from B";
    const C_DATA: &[u8] = b"a\xFF\xFF\xFFb";
    const E_TEXT: &str = "A secret";

    /// A loader with a fixed set of in-memory files. The number determines
    /// the contents of `d.typ` and whether `e.bin` exists.
    struct TestLoader(usize);

    impl FileLoader for TestLoader {
        fn load(&self, id: FileId) -> FileResult<Bytes> {
            Ok(match id.vpath().get_without_slash() {
                "a.typ" => Bytes::new(Vec::from(A_TEXT)),
                "b.typ" => Bytes::new(B_DATA),
                "c.bin" => Bytes::new(C_DATA),
                "d.typ" => Bytes::from_string(self.0.to_string()),
                "e.bin" if self.0 > 3 => Bytes::from_string(E_TEXT),
                path => return Err(FileError::NotFound(path.into())),
            })
        }
    }

    /// Creates the file id for a path in the project root.
    fn id(path: &str) -> FileId {
        RootedPath::new(VirtualRoot::Project, VirtualPath::new(path).unwrap()).intern()
    }

    /// Asserts that an output of the store consists of the expected data.
    trait OutputExt {
        fn must_be(&self, data: impl AsRef<[u8]>);
    }

    impl OutputExt for Source {
        #[track_caller]
        fn must_be(&self, data: impl AsRef<[u8]>) {
            assert_eq!(self.text().as_bytes(), data.as_ref());
        }
    }

    impl OutputExt for Bytes {
        #[track_caller]
        fn must_be(&self, data: impl AsRef<[u8]>) {
            assert_eq!(self.as_slice(), data.as_ref());
        }
    }

    impl<T: OutputExt> OutputExt for FileResult<T> {
        #[track_caller]
        fn must_be(&self, data: impl AsRef<[u8]>) {
            self.as_ref().unwrap().must_be(data);
        }
    }
}