typst_kit/files.rs
1//! File loading and management.
2
3use std::fs;
4use std::mem;
5use std::path::{Path, PathBuf};
6use std::str;
7use std::str::Utf8Error;
8
9use parking_lot::Mutex;
10use rustc_hash::FxHashMap;
11use typst_library::diag::{FileError, FileResult};
12use typst_library::foundations::Bytes;
13use typst_syntax::{FileId, Source, VirtualPath};
14
15#[cfg(feature = "system-files")]
16use {crate::packages::SystemPackages, typst_syntax::VirtualRoot};
17
18/// Holds loaded files and sources.
19///
20/// This type is backed by a file loader of your choosing. Internally, it
21/// handles caching of loaded files and creation of Typst [sources](Source).
22/// This is the right level of abstraction if you're building a Typst
23/// integration that's concerned with providing input bytes on-demand, but does
24/// not require tighter integration with Typst [`Source`s](Source). It is
25/// appropriate for most clients.
26///
27/// If you need more control, you can skip this and implement custom logic that
28/// directly handles the [`World::source`](typst_library::World::source) and
29/// [`World::file`](typst_library::World::file) requests. A language server is
30/// an example of an integration that might want to go even deeper, to create,
31/// manage, and edit source files by itself. If you go the manual route, ensure
32/// that those methods are cheap on repeated calls (either through caching or by
33/// virtue of always being cheap).
34#[derive(Default)]
35pub struct FileStore<L> {
36 loader: L,
37 slots: Mutex<FxHashMap<FileId, FileSlot>>,
38}
39
40impl<L> FileStore<L>
41where
42 L: FileLoader,
43{
44 /// Creates a new file store that loads file data via the provided `loader`.
45 pub fn new(loader: L) -> Self {
46 Self { loader, slots: Mutex::new(FxHashMap::default()) }
47 }
48
49 /// Returns a reference to the underlying loader.
50 pub fn loader(&self) -> &L {
51 &self.loader
52 }
53
54 /// Returns a mutable reference to the underlying loader.
55 pub fn loader_mut(&mut self) -> &mut L {
56 &mut self.loader
57 }
58
59 /// Drops the store, extracting the underlying loader.
60 pub fn into_loader(self) -> L {
61 self.loader
62 }
63
64 /// Retrieves the given file id as a Typst source.
65 ///
66 /// Can directly be used to implement
67 /// [`World::source`](typst_library::World::source).
68 pub fn source(&self, id: FileId) -> FileResult<Source> {
69 self.slot(id, |slot| slot.source(&self.loader, id))
70 }
71
72 /// Retrieves the given file id as a raw file.
73 ///
74 /// Can directly be used to implement
75 /// [`World::file`](typst_library::World::file).
76 pub fn file(&self, id: FileId) -> FileResult<Bytes> {
77 self.slot(id, |slot| slot.file(&self.loader, id))
78 }
79
80 /// Returns all files that were referenced since the last
81 /// [`reset()`](Self::reset).
82 ///
83 /// Also returns a reference to the loader so that the IDs can be resolved
84 /// with it. It couldn't be accessed through [`.loader()`](Self::loader)
85 /// while iterating because of overlapping borrows.
86 ///
87 /// The dependencies are returned in arbitrary order! If you want to get a
88 /// consistent result, you should sort them by a suitable criterion after
89 /// the fact.
90 pub fn dependencies(&mut self) -> (&L, impl Iterator<Item = FileId> + '_) {
91 let iter = self
92 .slots
93 .get_mut()
94 .iter()
95 .filter(|(_, slot)| slot.accessed())
96 .map(|(&id, _)| id);
97 (&self.loader, iter)
98 }
99
100 /// Resets the store.
101 ///
102 /// This marks all loaded file as stale. On subsequent accesses, they will
103 /// be loaded once more through the underlying loader. Moreover, calls to
104 /// [`dependencies()`](Self::dependencies) will not yield files accessed
105 /// before the call to `reset()`.
106 ///
107 /// Unlike when creating an entirely new store, source files will be edited
108 /// in place with updated data, leading to improved incremental compilation
109 /// performance.
110 pub fn reset(&mut self) {
111 #[allow(clippy::iter_over_hash_type, reason = "order does not matter")]
112 for slot in self.slots.get_mut().values_mut() {
113 slot.reset();
114 }
115 }
116
117 /// Access the canonical slot for the given file id.
118 fn slot<F, T>(&self, id: FileId, f: F) -> FileResult<T>
119 where
120 F: FnOnce(&mut FileSlot) -> FileResult<T>,
121 {
122 let mut map = self.slots.lock();
123 f(map.entry(id).or_default())
124 }
125}
126
127/// Holds the state for a file.
128enum FileSlot {
129 /// Nothing is loaded that, but we may have a stale source from before a
130 /// reset (i.e. from an earlier compilation) that we can reuse and edit in
131 /// place.
132 ///
133 /// Transitions to
134 /// - loaded when a file is requested
135 /// - to parsed if a source is requested and the data could be loaded
136 /// (otherwise to loaded).
137 Empty(Stale<Source>),
138 /// The slot has been requested as a `file()` but not as a `source()` (at
139 /// least since the last reset). We can still have a stale, reusable source.
140 ///
141 /// Transitions to
142 /// - parsed when a source is requested
143 Loaded(FileResult<Bytes>, Stale<Source>),
144 /// The slot has been requested as a `source()` and potentially as a
145 /// `file()`.
146 ///
147 /// If possible, the bytes are backed by the source (via
148 /// `Bytes::from_string(source)`) so that we can serve `file()` and
149 /// `source()` requests from the same underlying data. Note that this is not
150 /// possible if the data has a UTF8-BOM as it is stripped for the source,
151 /// but should be retained in the file.
152 Parsed(Result<Source, Utf8Error>, Bytes),
153}
154
/// An outdated value that is kept around for reuse.
///
/// For sources, bringing a stale one up to the newest state gives better
/// incremental performance than parsing and numbering a new one from scratch.
type Stale<T> = Option<T>;
159
160impl FileSlot {
161 /// Whether the slot has been accessed in any way since the last reset.
162 fn accessed(&self) -> bool {
163 !matches!(self, Self::Empty(_))
164 }
165
166 /// Resets the slot to its empty state.
167 fn reset(&mut self) {
168 let stale = match mem::take(self) {
169 Self::Parsed(Ok(source), _) => Some(source),
170 _ => None,
171 };
172 *self = Self::Empty(stale);
173 }
174
175 /// Retrieves the slot's bytes.
176 fn file(&mut self, loader: &impl FileLoader, id: FileId) -> FileResult<Bytes> {
177 match self {
178 Self::Empty(stale) => {
179 let result = loader.load(id);
180 *self = Self::Loaded(result.clone(), mem::take(stale));
181 result
182 }
183 Self::Loaded(result, _) => result.clone(),
184 Self::Parsed(_, bytes) => Ok(bytes.clone()),
185 }
186 }
187
188 /// Retrieves the source for this slot.
189 fn source(&mut self, loader: &impl FileLoader, id: FileId) -> FileResult<Source> {
190 // When we already have a source or error, this returns. Otherwise, it
191 // loads or extracts the bytes and a potential stale source file.
192 let (bytes, stale) = match self {
193 Self::Empty(stale) => match loader.load(id) {
194 Ok(bytes) => (bytes, mem::take(stale)),
195 Err(err) => {
196 *self = Self::Loaded(Err(err.clone()), mem::take(stale));
197 return Err(err);
198 }
199 },
200 Self::Loaded(Ok(_), _) => match mem::take(self) {
201 Self::Loaded(Ok(bytes), stale) => (bytes, stale),
202 _ => unreachable!(),
203 },
204 Self::Loaded(Err(err), _) => return Err(err.clone()),
205 Self::Parsed(source, _) => return Ok(source.clone()?),
206 };
207
208 const UTF8_BOM: &[u8] = b"\xef\xbb\xbf";
209 let without_bom = bytes.strip_prefix(UTF8_BOM);
210
211 // Create a source file, with various attempts to reuse things.
212 let (result, bytes) = if let Some(mut source) = stale {
213 let result = str::from_utf8(without_bom.unwrap_or(&bytes)).map(|new| {
214 // If we have a stale source file, reuse it.
215 source.replace(new);
216 source
217 });
218 (result, bytes)
219 } else if let Some(rest) = without_bom {
220 // If we had a BOM, we can't reuse the bytes for a string, so we
221 // just create a source with a cloned string.
222 (str::from_utf8(rest).map(|text| Source::new(id, text.into())), bytes)
223 } else {
224 // If we had no BOM, we attempt to reuse an existing `String` or
225 // `Vec<u8>` within the `Bytes`, backing the `Bytes` with the
226 // resulting `Source` instead. This way, we can transition from
227 // a vector-backed file to a source without reallocating.
228 match bytes.into_string().map(|text| Source::new(id, text)) {
229 Ok(source) => (Ok(source.clone()), Bytes::from_string(source)),
230 Err(err) => (Err(err.error), err.bytes),
231 }
232 };
233
234 *self = Self::Parsed(result.clone(), bytes);
235 Ok(result?)
236 }
237}
238
239impl Default for FileSlot {
240 fn default() -> Self {
241 Self::Empty(None)
242 }
243}
244
245/// Provides data for files, backing a [`FileStore`].
246///
247/// If you want to load files in a different way, the first step would be to
248/// create your own type that implements [`FileLoader`]. For an example, you can
249/// take a look at how `typst-cli` implements it.
250///
251/// If you need even more control, you can also skip the [`FileStore`] and
252/// implement fully custom logic that directly handles the
253/// [`World::source`](typst_library::World::source) and
254/// [`World::file`](typst_library::World::file) requests.
255pub trait FileLoader {
256 /// Load the data for the given file ID.
257 ///
258 /// Generally, here you'll want to match on the
259 /// [`root()`](typst_syntax::RootedPath::root) of the `id` to check whether
260 /// the file should be loaded from the project or a package. Then, you'll
261 /// load the data at the path
262 /// [`id.vpath()`](typst_syntax::RootedPath::vpath) in the project /
263 /// package.
264 fn load(&self, id: FileId) -> FileResult<Bytes>;
265}
266
/// A loader that takes project files from a directory and package files from
/// standard locations.
///
/// In this implementation,
/// - project files come from a project root directory, represented by an
///   [`FsRoot`].
/// - package files come from configured directories and/or the official Typst
///   Universe package registry, by way of [`SystemPackages`].
#[cfg(feature = "system-files")]
pub struct SystemFiles {
    /// The root directory of the project.
    project: FsRoot,
    /// Provides the roots of packages.
    packages: SystemPackages,
}
280
#[cfg(feature = "system-files")]
impl SystemFiles {
    /// Builds an instance from the file system root holding the project files
    /// and the configuration for system packages.
    pub fn new(project: FsRoot, packages: SystemPackages) -> Self {
        Self { project, packages }
    }

    /// Determines where the file with the given `id` lives in the file
    /// system.
    ///
    /// Fails if the root of the file cannot be determined (see
    /// [`root()`](Self::root)).
    pub fn resolve(&self, id: FileId) -> FileResult<PathBuf> {
        let root = self.root(id)?;
        Ok(root.resolve(id.vpath()))
    }

    /// Determines the root that the given file ID resides in.
    ///
    /// For project files, this is a clone of the project root. For package
    /// files, the root is obtained from the system packages, which may fail.
    pub fn root(&self, id: FileId) -> FileResult<FsRoot> {
        let root = match id.root() {
            VirtualRoot::Project => self.project.clone(),
            VirtualRoot::Package(spec) => self.packages.obtain(spec)?,
        };
        Ok(root)
    }
}
302
#[cfg(feature = "system-files")]
impl FileLoader for SystemFiles {
    /// Finds the root that the file belongs to and loads the file's virtual
    /// path from it.
    fn load(&self, id: FileId) -> FileResult<Bytes> {
        let root = self.root(id)?;
        root.load(id.vpath())
    }
}
309
/// A file system directory that backs a [root](typst_syntax::VirtualRoot).
///
/// Every Typst project forms a root, and so does every package. This is the
/// mechanism that keeps projects and packages isolated from each other.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct FsRoot(PathBuf);
316
317impl FsRoot {
318 /// Creates a new instance with the given root path.
319 pub fn new(root: PathBuf) -> Self {
320 Self(root)
321 }
322
323 /// The path at which the root resides in the file system.
324 pub fn path(&self) -> &Path {
325 &self.0
326 }
327
328 /// Resolves the real file system path for the given virtual path in this
329 /// root.
330 pub fn resolve(&self, path: &VirtualPath) -> PathBuf {
331 path.realize(&self.0)
332 }
333
334 /// Loads file data from the given virtual path in this root.
335 pub fn load(&self, path: &VirtualPath) -> FileResult<Bytes> {
336 // Join the path to the root. If it tries to escape, deny access. Note:
337 // It can still escape via symlinks.
338 let path = self.resolve(path);
339 let f = |e| FileError::from_io(e, &path);
340 if fs::metadata(&path).map_err(f)?.is_dir() {
341 Err(FileError::IsDirectory)
342 } else {
343 fs::read(&path).map(Bytes::new).map_err(f)
344 }
345 }
346}
347
#[cfg(test)]
mod tests {
    use typst_syntax::{RootedPath, VirtualRoot};

    use super::*;

    /// Test that a file that's first been loaded as raw bytes correctly
    /// transitions into the source state.
    #[test]
    fn test_file_store_source_via_file() {
        let store = FileStore::new(TestLoader(1));
        store.file(id("a.typ")).must_be(A_TEXT);
        store.source(id("a.typ")).must_be(A_TEXT);
    }

    /// With BOM, the storage cannot be reused and the data differs.
    #[test]
    fn test_file_store_bom() {
        let store = FileStore::new(TestLoader(1));
        store.file(id("b.typ")).must_be(B_DATA);
        store.source(id("b.typ")).must_be(B_TEXT);
    }

    /// Check that a file request that's already been served as a source reuses
    /// the same underlying buffer.
    #[test]
    fn test_file_store_storage_reuse() {
        let store = FileStore::new(TestLoader(1));
        let a_source = store.source(id("a.typ")).unwrap();
        let a_file = store.file(id("a.typ")).unwrap();
        a_file.must_be(A_TEXT);
        a_source.must_be(A_TEXT);
        assert!(std::ptr::eq(a_file.as_slice().as_ptr(), a_source.text().as_ptr()));
    }

    /// Check that data which isn't valid UTF-8 fails as a source, but is
    /// still served unchanged as a raw file, regardless of the request order.
    #[test]
    fn test_file_store_invalid_utf8() {
        // Source first, then file.
        let store = FileStore::new(TestLoader(1));
        assert!(store.source(id("c.bin")).is_err());
        store.file(id("c.bin")).must_be(C_DATA);
        // The decoding failure is cached, so asking again fails again.
        assert!(store.source(id("c.bin")).is_err());

        // File first, then source.
        let store = FileStore::new(TestLoader(1));
        store.file(id("c.bin")).must_be(C_DATA);
        assert!(store.source(id("c.bin")).is_err());
        store.file(id("c.bin")).must_be(C_DATA);
    }

    /// Check that resetting reloads files.
    #[test]
    fn test_file_store_cycles() {
        let mut store = FileStore::new(TestLoader(1));
        // Collects the dependencies' paths in sorted order, as the store
        // yields them in arbitrary order.
        let deps = |store: &mut FileStore<TestLoader>| {
            let (_, iter) = store.dependencies();
            let mut vec = iter
                .map(|id| id.get().vpath().get_without_slash())
                .collect::<Vec<_>>();
            vec.sort();
            vec
        };
        store.source(id("a.typ")).must_be(A_TEXT);
        store.source(id("d.typ")).must_be("1");
        assert_eq!(store.file(id("e.bin")), Err(FileError::NotFound("e.bin".into())));
        assert_eq!(deps(&mut store), ["a.typ", "d.typ", "e.bin"]);
        // Change what the loader serves: `d.typ` gets new contents and
        // `e.bin` starts to exist.
        store.loader_mut().0 = 5;
        store.reset();
        store.source(id("d.typ")).must_be("5");
        store.file(id("e.bin")).must_be(E_TEXT);
        assert_eq!(deps(&mut store), ["d.typ", "e.bin"]);
    }

    const A_TEXT: &str = "Hello from A";
    const B_DATA: &[u8] = b"\xef\xbb\xbfHello from B";
    const B_TEXT: &str = "Hello from B";
    const C_DATA: &[u8] = b"a\xFF\xFF\xFFb";
    const E_TEXT: &str = "A secret";

    /// A loader serving a fixed set of in-memory files. The number determines
    /// the contents of `d.typ` and whether `e.bin` exists (only if above 3).
    struct TestLoader(usize);

    impl FileLoader for TestLoader {
        fn load(&self, id: FileId) -> FileResult<Bytes> {
            Ok(match id.vpath().get_without_slash() {
                "a.typ" => Bytes::new(Vec::from(A_TEXT)),
                "b.typ" => Bytes::new(B_DATA),
                "c.bin" => Bytes::new(C_DATA),
                "d.typ" => Bytes::from_string(self.0.to_string()),
                "e.bin" if self.0 > 3 => Bytes::from_string(E_TEXT),
                path => return Err(FileError::NotFound(path.into())),
            })
        }
    }

    /// Creates the ID of the project file at the given path.
    fn id(path: &str) -> FileId {
        RootedPath::new(VirtualRoot::Project, VirtualPath::new(path).unwrap()).intern()
    }

    /// Asserts that an output of the store consists of the expected data.
    trait OutputExt {
        fn must_be(&self, data: impl AsRef<[u8]>);
    }

    impl OutputExt for Source {
        #[track_caller]
        fn must_be(&self, data: impl AsRef<[u8]>) {
            assert_eq!(self.text().as_bytes(), data.as_ref());
        }
    }

    impl OutputExt for Bytes {
        #[track_caller]
        fn must_be(&self, data: impl AsRef<[u8]>) {
            assert_eq!(self.as_slice(), data.as_ref());
        }
    }

    impl<T: OutputExt> OutputExt for FileResult<T> {
        #[track_caller]
        fn must_be(&self, data: impl AsRef<[u8]>) {
            self.as_ref().unwrap().must_be(data);
        }
    }
}