typst_kit/files.rs
1//! File loading and management.
2
3use std::fs;
4use std::mem;
5use std::path::{Path, PathBuf};
6use std::str;
7use std::str::Utf8Error;
8use std::sync::Arc;
9
10use parking_lot::Mutex;
11use rustc_hash::FxHashMap;
12use typst_library::diag::{FileError, FileResult};
13use typst_library::foundations::Bytes;
14use typst_syntax::{FileId, Source, VirtualPath};
15
16#[cfg(feature = "system-files")]
17use {crate::packages::SystemPackages, typst_syntax::VirtualRoot};
18
19/// Holds loaded files and sources.
20///
21/// This type is backed by a file loader of your choosing. Internally, it
22/// handles caching of loaded files and creation of Typst [sources](Source).
23/// This is the right level of abstraction if you're building a Typst
24/// integration that's concerned with providing input bytes on-demand, but does
25/// not require tighter integration with Typst [`Source`s](Source). It is
26/// appropriate for most clients.
27///
28/// If you need more control, you can skip this and implement custom logic that
29/// directly handles the [`World::source`](typst_library::World::source) and
30/// [`World::file`](typst_library::World::file) requests. A language server is
31/// an example of an integration that might want to go even deeper, to create,
32/// manage, and edit source files by itself. If you go the manual route, ensure
33/// that those methods are cheap on repeated calls (either through caching or by
34/// virtue of always being cheap).
35#[derive(Default)]
36pub struct FileStore<L> {
37 loader: L,
38 slots: Mutex<FxHashMap<FileId, FileSlot>>,
39}
40
41impl<L> FileStore<L>
42where
43 L: FileLoader,
44{
45 /// Creates a new file store that loads file data via the provided `loader`.
46 pub fn new(loader: L) -> Self {
47 Self { loader, slots: Mutex::new(FxHashMap::default()) }
48 }
49
50 /// Returns a reference to the underlying loader.
51 pub fn loader(&self) -> &L {
52 &self.loader
53 }
54
55 /// Returns a mutable reference to the underlying loader.
56 pub fn loader_mut(&mut self) -> &mut L {
57 &mut self.loader
58 }
59
60 /// Drops the store, extracting the underlying loader.
61 pub fn into_loader(self) -> L {
62 self.loader
63 }
64
65 /// Retrieves the given file id as a Typst source.
66 ///
67 /// Can directly be used to implement
68 /// [`World::source`](typst_library::World::source).
69 pub fn source(&self, id: FileId) -> FileResult<Source> {
70 self.slot(id, |slot| slot.source(&self.loader, id))
71 }
72
73 /// Retrieves the given file id as a raw file.
74 ///
75 /// Can directly be used to implement
76 /// [`World::file`](typst_library::World::file).
77 pub fn file(&self, id: FileId) -> FileResult<Bytes> {
78 self.slot(id, |slot| slot.file(&self.loader, id))
79 }
80
81 /// Returns all files that were referenced since the last
82 /// [`reset()`](Self::reset).
83 ///
84 /// Also returns a reference to the loader so that the IDs can be resolved
85 /// with it. It couldn't be accessed through [`.loader()`](Self::loader)
86 /// while iterating because of overlapping borrows.
87 ///
88 /// The dependencies are returned in arbitrary order! If you want to get a
89 /// consistent result, you should sort them by a suitable criterion after
90 /// the fact.
91 pub fn dependencies(&mut self) -> (&L, impl Iterator<Item = FileId> + '_) {
92 let iter = self
93 .slots
94 .get_mut()
95 .iter()
96 .filter(|(_, slot)| slot.accessed())
97 .map(|(&id, _)| id);
98 (&self.loader, iter)
99 }
100
101 /// Resets the store.
102 ///
103 /// This marks all loaded file as stale. On subsequent accesses, they will
104 /// be loaded once more through the underlying loader. Moreover, calls to
105 /// [`dependencies()`](Self::dependencies) will not yield files accessed
106 /// before the call to `reset()`.
107 ///
108 /// Unlike when creating an entirely new store, source files will be edited
109 /// in place with updated data, leading to improved incremental compilation
110 /// performance.
111 pub fn reset(&mut self) {
112 #[allow(clippy::iter_over_hash_type, reason = "order does not matter")]
113 for slot in self.slots.get_mut().values_mut() {
114 slot.reset();
115 }
116 }
117
118 /// Access the canonical slot for the given file id.
119 fn slot<F, T>(&self, id: FileId, f: F) -> FileResult<T>
120 where
121 F: FnOnce(&mut FileSlot) -> FileResult<T>,
122 {
123 let mut map = self.slots.lock();
124 f(map.entry(id).or_default())
125 }
126}
127
128/// Holds the state for a file.
129enum FileSlot {
130 /// Nothing is loaded that, but we may have a stale source from before a
131 /// reset (i.e. from an earlier compilation) that we can reuse and edit in
132 /// place.
133 ///
134 /// Transitions to
135 /// - loaded when a file is requested
136 /// - to parsed if a source is requested and the data could be loaded
137 /// (otherwise to loaded).
138 Empty(Stale<Source>),
139 /// The slot has been requested as a `file()` but not as a `source()` (at
140 /// least since the last reset). We can still have a stale, reusable source.
141 ///
142 /// Transitions to
143 /// - parsed when a source is requested
144 Loaded(FileResult<Bytes>, Stale<Source>),
145 /// The slot has been requested as a `source()` and potentially as a
146 /// `file()`.
147 ///
148 /// If possible, the bytes are backed by the source (via
149 /// `Bytes::from_string(source)`) so that we can serve `file()` and
150 /// `source()` requests from the same underlying data. Note that this is not
151 /// possible if the data has a UTF8-BOM as it is stripped for the source,
152 /// but should be retained in the file.
153 Parsed(Result<Source, Utf8Error>, Bytes),
154}
155
/// An optional, outdated value that is kept around for reuse.
///
/// In this file it wraps a source that is no longer up to date. Bringing such
/// a source to the newest state gives better incremental performance than
/// parsing and numbering it from scratch.
type Stale<T> = Option<T>;
160
161impl FileSlot {
162 /// Whether the slot has been accessed in any way since the last reset.
163 fn accessed(&self) -> bool {
164 !matches!(self, Self::Empty(_))
165 }
166
167 /// Resets the slot to its empty state.
168 fn reset(&mut self) {
169 let stale = match mem::take(self) {
170 Self::Parsed(Ok(source), _) => Some(source),
171 _ => None,
172 };
173 *self = Self::Empty(stale);
174 }
175
176 /// Retrieves the slot's bytes.
177 fn file(&mut self, loader: &impl FileLoader, id: FileId) -> FileResult<Bytes> {
178 match self {
179 Self::Empty(stale) => {
180 let result = loader.load(id);
181 *self = Self::Loaded(result.clone(), mem::take(stale));
182 result
183 }
184 Self::Loaded(result, _) => result.clone(),
185 Self::Parsed(_, bytes) => Ok(bytes.clone()),
186 }
187 }
188
189 /// Retrieves the source for this slot.
190 fn source(&mut self, loader: &impl FileLoader, id: FileId) -> FileResult<Source> {
191 // When we already have a source or error, this returns. Otherwise, it
192 // loads or extracts the bytes and a potential stale source file.
193 let (bytes, stale) = match self {
194 Self::Empty(stale) => match loader.load(id) {
195 Ok(bytes) => (bytes, mem::take(stale)),
196 Err(err) => {
197 *self = Self::Loaded(Err(err.clone()), mem::take(stale));
198 return Err(err);
199 }
200 },
201 Self::Loaded(Ok(_), _) => match mem::take(self) {
202 Self::Loaded(Ok(bytes), stale) => (bytes, stale),
203 _ => unreachable!(),
204 },
205 Self::Loaded(Err(err), _) => return Err(err.clone()),
206 Self::Parsed(source, _) => return Ok(source.clone()?),
207 };
208
209 const UTF8_BOM: &[u8] = b"\xef\xbb\xbf";
210 let without_bom = bytes.strip_prefix(UTF8_BOM);
211
212 // Create a source file, with various attempts to reuse things.
213 let (result, bytes) = if let Some(mut source) = stale {
214 let result = str::from_utf8(without_bom.unwrap_or(&bytes)).map(|new| {
215 // If we have a stale source file, reuse it.
216 source.replace(new);
217 source
218 });
219 (result, bytes)
220 } else if let Some(rest) = without_bom {
221 // If we had a BOM, we can't reuse the bytes for a string, so we
222 // just create a source with a cloned string.
223 (str::from_utf8(rest).map(|text| Source::new(id, text.into())), bytes)
224 } else {
225 // If we had no BOM, we attempt to reuse an existing `String` or
226 // `Vec<u8>` within the `Bytes`, backing the `Bytes` with the
227 // resulting `Source` instead. This way, we can transition from
228 // a vector-backed file to a source without reallocating.
229 match bytes.into_string().map(|text| Source::new(id, text)) {
230 Ok(source) => (Ok(source.clone()), Bytes::from_string(source)),
231 Err(err) => (Err(err.error), err.bytes),
232 }
233 };
234
235 *self = Self::Parsed(result.clone(), bytes);
236 Ok(result?)
237 }
238}
239
240impl Default for FileSlot {
241 fn default() -> Self {
242 Self::Empty(None)
243 }
244}
245
246/// Provides data for files, backing a [`FileStore`].
247///
248/// If you want to load files in a different way, the first step would be to
249/// create your own type that implements [`FileLoader`]. For an example, you can
250/// take a look at how `typst-cli` implements it.
251///
252/// If you need even more control, you can also skip the [`FileStore`] and
253/// implement fully custom logic that directly handles the
254/// [`World::source`](typst_library::World::source) and
255/// [`World::file`](typst_library::World::file) requests.
256pub trait FileLoader {
257 /// Load the data for the given file ID.
258 ///
259 /// Generally, here you'll want to match on the
260 /// [`root()`](typst_syntax::RootedPath::root) of the `id` to check whether
261 /// the file should be loaded from the project or a package. Then, you'll
262 /// load the data at the path
263 /// [`id.vpath()`](typst_syntax::RootedPath::vpath) in the project /
264 /// package.
265 fn load(&self, id: FileId) -> FileResult<Bytes>;
266}
267
268impl<F: FileLoader> FileLoader for Box<F> {
269 fn load(&self, id: FileId) -> FileResult<Bytes> {
270 (**self).load(id)
271 }
272}
273
274impl<F: FileLoader> FileLoader for Arc<F> {
275 fn load(&self, id: FileId) -> FileResult<Bytes> {
276 (**self).load(id)
277 }
278}
279
/// A loader that serves project files from a directory and package files
/// from standard locations.
///
/// With this implementation,
/// - project files are loaded from a project root directory through an
///   [`FsRoot`],
/// - package files are loaded from configured directories and/or the official
///   Typst Universe package registry via [`SystemPackages`].
#[cfg(feature = "system-files")]
#[derive(Debug)]
pub struct SystemFiles {
    /// The file system root of the project.
    project: FsRoot,
    /// Where the roots of packages are obtained from.
    packages: SystemPackages,
}
294
#[cfg(feature = "system-files")]
impl SystemFiles {
    /// Builds a new instance from a file system root for project files and a
    /// configuration for system packages.
    pub fn new(project: FsRoot, packages: SystemPackages) -> Self {
        Self { project, packages }
    }

    /// Determines where in the file system the file with the given `id`
    /// resides.
    pub fn resolve(&self, id: FileId) -> FileResult<PathBuf> {
        let root = self.root(id)?;
        root.resolve(id.vpath())
    }

    /// Determines the root that contains the file with the given ID.
    ///
    /// For project files, this is a clone of the project root. For package
    /// files, the root is obtained from the system packages, which can fail.
    pub fn root(&self, id: FileId) -> FileResult<FsRoot> {
        match id.root() {
            VirtualRoot::Project => Ok(self.project.clone()),
            VirtualRoot::Package(spec) => Ok(self.packages.obtain(spec)?),
        }
    }
}
316
#[cfg(feature = "system-files")]
impl FileLoader for SystemFiles {
    /// Finds the root the file belongs to and loads the data from the file's
    /// virtual path within that root.
    fn load(&self, id: FileId) -> FileResult<Bytes> {
        let root = self.root(id)?;
        root.load(id.vpath())
    }
}
323
/// A file system directory that backs a [root](typst_syntax::VirtualRoot).
///
/// Every Typst project forms a root, and so does every package. This is the
/// mechanism that isolates projects and packages from each other.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct FsRoot(PathBuf);
330
331impl FsRoot {
332 /// Creates a new instance with the given root path.
333 pub fn new(root: PathBuf) -> Self {
334 Self(root)
335 }
336
337 /// The path at which the root resides in the file system.
338 pub fn path(&self) -> &Path {
339 &self.0
340 }
341
342 /// Resolves the real file system path for the given virtual path in this
343 /// root.
344 pub fn resolve(&self, path: &VirtualPath) -> FileResult<PathBuf> {
345 path.realize(&self.0).map_err(Into::into)
346 }
347
348 /// Loads file data from the given virtual path in this root.
349 pub fn load(&self, path: &VirtualPath) -> FileResult<Bytes> {
350 // Join the path to the root. If it tries to escape, deny access. Note:
351 // It can still escape via symlinks.
352 let path = self.resolve(path)?;
353 let f = |e| FileError::from_io(e, &path);
354 if fs::metadata(&path).map_err(f)?.is_dir() {
355 Err(FileError::IsDirectory)
356 } else {
357 fs::read(&path).map(Bytes::new).map_err(f)
358 }
359 }
360}
361
#[cfg(test)]
mod tests {
    use typst_syntax::{RootedPath, VirtualRoot};

    use super::*;

    const A_TEXT: &str = "Hello from A";
    const B_DATA: &[u8] = b"\xef\xbb\xbfHello from B";
    const B_TEXT: &str = "Hello from B";
    const C_DATA: &[u8] = b"a\xFF\xFF\xFFb";
    const E_TEXT: &str = "A secret";

    /// An in-memory loader with a fixed set of files.
    ///
    /// The number is the content of `d.typ`, and `e.bin` only exists once the
    /// number exceeds 3.
    struct TestLoader(usize);

    impl FileLoader for TestLoader {
        fn load(&self, id: FileId) -> FileResult<Bytes> {
            let bytes = match id.vpath().get_without_slash() {
                "a.typ" => Bytes::new(Vec::from(A_TEXT)),
                "b.typ" => Bytes::new(B_DATA),
                "c.bin" => Bytes::new(C_DATA),
                "d.typ" => Bytes::from_string(self.0.to_string()),
                "e.bin" if self.0 > 3 => Bytes::from_string(E_TEXT),
                missing => return Err(FileError::NotFound(missing.into())),
            };
            Ok(bytes)
        }
    }

    /// Creates the ID of the project file at the given path.
    fn id(path: &str) -> FileId {
        let vpath = VirtualPath::new(path).unwrap();
        RootedPath::new(VirtualRoot::Project, vpath).intern()
    }

    /// Asserts that an output consists of exactly the given bytes.
    trait OutputExt {
        fn must_be(&self, data: impl AsRef<[u8]>);
    }

    impl OutputExt for Source {
        #[track_caller]
        fn must_be(&self, data: impl AsRef<[u8]>) {
            assert_eq!(self.text().as_bytes(), data.as_ref());
        }
    }

    impl OutputExt for Bytes {
        #[track_caller]
        fn must_be(&self, data: impl AsRef<[u8]>) {
            assert_eq!(self.as_slice(), data.as_ref());
        }
    }

    impl<T: OutputExt> OutputExt for FileResult<T> {
        #[track_caller]
        fn must_be(&self, data: impl AsRef<[u8]>) {
            let output = self.as_ref().unwrap();
            output.must_be(data);
        }
    }

    /// Checks that a file that was first loaded as raw bytes correctly
    /// transitions into the source state.
    #[test]
    fn test_file_store_source_via_file() {
        let store = FileStore::new(TestLoader(1));
        let a = id("a.typ");
        store.file(a).must_be(A_TEXT);
        store.source(a).must_be(A_TEXT);
    }

    /// With a BOM, the storage cannot be reused and the data differs.
    #[test]
    fn test_file_store_bom() {
        let store = FileStore::new(TestLoader(1));
        let b = id("b.typ");
        store.file(b).must_be(B_DATA);
        store.source(b).must_be(B_TEXT);
    }

    /// Checks that a file request for something that has already been served
    /// as a source reuses the same underlying buffer.
    #[test]
    fn test_file_store_storage_reuse() {
        let store = FileStore::new(TestLoader(1));
        let a = id("a.typ");
        let source = store.source(a).unwrap();
        let file = store.file(a).unwrap();
        file.must_be(A_TEXT);
        source.must_be(A_TEXT);
        assert_eq!(file.as_slice().as_ptr(), source.text().as_ptr());
    }

    /// Checks that resetting reloads files.
    #[test]
    fn test_file_store_cycles() {
        let mut store = FileStore::new(TestLoader(1));

        // Collects the paths of all dependencies in sorted order.
        let deps = |store: &mut FileStore<TestLoader>| {
            let (_, ids) = store.dependencies();
            let mut paths: Vec<_> =
                ids.map(|id| id.get().vpath().get_without_slash()).collect();
            paths.sort_unstable();
            paths
        };

        // First cycle: `e.bin` does not exist yet.
        store.source(id("a.typ")).must_be(A_TEXT);
        store.source(id("d.typ")).must_be("1");
        assert_eq!(store.file(id("e.bin")), Err(FileError::NotFound("e.bin".into())));
        assert_eq!(deps(&mut store), ["a.typ", "d.typ", "e.bin"]);

        // Second cycle: the loader now serves different data.
        store.loader_mut().0 = 5;
        store.reset();
        store.source(id("d.typ")).must_be("5");
        store.file(id("e.bin")).must_be(E_TEXT);
        assert_eq!(deps(&mut store), ["d.typ", "e.bin"]);
    }
}
469}