Skip to main content

typst_batch/process/
scan.rs

1//! Fast scanning API (skips Layout phase).
2//!
3//! # Example
4//!
5//! ```ignore
6//! use typst_batch::Scanner;
7//!
8//! // Simple scan
9//! let result = Scanner::new(root).scan(path)?;
10//! let links = result.extract(LinkExtractor::new());
11//!
12//! // With sys.inputs
13//! let result = Scanner::new(root)
14//!     .with_inputs([("draft", true)])
15//!     .scan(path)?;
16//!
17//! // Multiple extractions in one pass
18//! let (links, headings) = result.extract((
19//!     LinkExtractor::new(),
20//!     HeadingExtractor::new(),
21//! ));
22//! ```
23
24use std::ops::ControlFlow;
25use std::path::{Path, PathBuf};
26
27use serde_json::Value as JsonValue;
28use typst::comemo::Track;
29use typst::diag::SourceDiagnostic;
30use typst::engine::{Route, Sink, Traced};
31use typst::foundations::{Content, Dict, Label};
32use typst::introspection::MetadataElem;
33use typst::loading::DataSource;
34use typst::model::{Destination, HeadingElem, LinkElem, LinkTarget};
35use typst::utils::PicoStr;
36use typst::visualize::ImageElem;
37use typst::World;
38use typst::ROUTINES;
39use typst_html::{HtmlAttr, HtmlElem};
40
41use super::inputs::WithInputs;
42use super::session::{AccessedDeps, CompileSession};
43use crate::diagnostic::{has_errors, CompileError};
44use crate::resource::file::PackageId;
45use crate::world::TypstWorld;
46
47/// Builder for fast Typst scanning (Eval-only, skips Layout).
48///
49/// Significantly faster than [`Compiler`](super::compile::Compiler) because it skips:
50/// - Layout calculation
51/// - Frame generation
52/// - HTML document creation
53pub struct Scanner<'a> {
54    root: &'a Path,
55    inputs: Option<Dict>,
56}
57
58impl<'a> WithInputs for Scanner<'a> {
59    fn inputs_mut(&mut self) -> &mut Option<Dict> {
60        &mut self.inputs
61    }
62}
63
64impl<'a> Scanner<'a> {
65    /// Create a new scanner with the given root directory.
66    pub fn new(root: &'a Path) -> Self {
67        Self { root, inputs: None }
68    }
69
70    /// Execute the scan on a single file.
71    pub fn scan<P: AsRef<Path>>(self, path: P) -> Result<ScanResult, CompileError> {
72        let path = path.as_ref();
73        let world = self.build_world(path);
74        scan_impl(&world)
75    }
76
77    fn build_world(&self, path: &Path) -> TypstWorld {
78        match &self.inputs {
79            Some(inputs) => TypstWorld::builder(path, self.root)
80                .with_local_cache()
81                .no_fonts()
82                .with_inputs_dict(inputs.clone())
83                .build(),
84            None => TypstWorld::builder(path, self.root)
85                .with_local_cache()
86                .no_fonts()
87                .build(),
88        }
89    }
90}
91
92/// Result of fast scanning (Eval-only, no Layout).
93#[derive(Debug)]
94pub struct ScanResult {
95    /// The document's Content tree for extraction.
96    content: Content,
97    /// Files and packages accessed during scanning.
98    accessed: AccessedDeps,
99    /// Scan diagnostics (warnings only).
100    diagnostics: Vec<SourceDiagnostic>,
101}
102
103impl ScanResult {
104    /// Extract data using an extractor.
105    ///
106    /// For advanced use cases or custom extractors.
107    ///
108    /// # Examples
109    ///
110    /// ```ignore
111    /// // Multiple extractors (tuple)
112    /// let (links, headings) = result.extract((
113    ///     LinkExtractor::new(),
114    ///     HeadingExtractor::new(),
115    /// ));
116    /// ```
117    #[inline]
118    pub fn extract<E: Extractor>(&self, extractor: E) -> E::Output {
119        extract(&self.content, extractor)
120    }
121
122    /// Extract all links from the document.
123    #[inline]
124    pub fn links(&self) -> Vec<Link> {
125        self.extract(LinkExtractor::new())
126    }
127
128    /// Extract all headings from the document.
129    #[inline]
130    pub fn headings(&self) -> Vec<Heading> {
131        self.extract(HeadingExtractor::new())
132    }
133
134    /// Extract metadata by label.
135    #[inline]
136    pub fn metadata(&self, label: &str) -> Option<JsonValue> {
137        MetadataExtractor::new(label).and_then(|e| self.extract(e))
138    }
139
140    /// Get the raw content tree.
141    pub fn content(&self) -> &Content {
142        &self.content
143    }
144
145    /// Get files and packages accessed during scanning.
146    pub fn accessed(&self) -> &AccessedDeps {
147        &self.accessed
148    }
149
150    /// Get files accessed during scanning.
151    pub fn accessed_files(&self) -> &[PathBuf] {
152        &self.accessed.files
153    }
154
155    /// Get packages accessed during scanning.
156    ///
157    /// Useful for detecting virtual package usage (e.g., `@myapp/data`).
158    pub fn accessed_packages(&self) -> &[PackageId] {
159        &self.accessed.packages
160    }
161
162    /// Get scan diagnostics.
163    pub fn diagnostics(&self) -> &[SourceDiagnostic] {
164        &self.diagnostics
165    }
166}
167
168/// Trait for extracting data from Typst Content.
169pub trait Extractor: Sized {
170    /// The type returned after extraction.
171    type Output;
172
173    /// Visit a content element during traversal.
174    ///
175    /// Return `ControlFlow::Break(())` to stop traversal early.
176    fn visit(&mut self, elem: &Content) -> ControlFlow<()>;
177
178    /// Finalize and return the extracted data.
179    fn finish(self) -> Self::Output;
180}
181
182/// Extract data from Content using an extractor.
183pub fn extract<E: Extractor>(content: &Content, mut extractor: E) -> E::Output {
184    let _ = content.traverse(&mut |elem: Content| extractor.visit(&elem));
185    extractor.finish()
186}
187
188macro_rules! impl_extractor_for_tuple {
189    ($first:ident $(, $rest:ident)*) => {
190        impl<$first: Extractor $(, $rest: Extractor)*> Extractor for ($first, $($rest,)*) {
191            type Output = ($first::Output, $($rest::Output,)*);
192
193            #[allow(non_snake_case)]
194            fn visit(&mut self, elem: &Content) -> ControlFlow<()> {
195                let ($first, $($rest,)*) = self;
196                $first.visit(elem)?;
197                $($rest.visit(elem)?;)*
198                ControlFlow::Continue(())
199            }
200
201            #[allow(non_snake_case)]
202            fn finish(self) -> Self::Output {
203                let ($first, $($rest,)*) = self;
204                ($first.finish(), $($rest.finish(),)*)
205            }
206        }
207
208        impl_extractor_for_tuple!($($rest),*);
209    };
210    () => {};
211}
212
213impl_extractor_for_tuple!(A, B, C, D, E, F, G, H);
214
215/// Extracts all links from the document.
216#[derive(Debug, Default)]
217pub struct LinkExtractor {
218    links: Vec<Link>,
219    href_attr: Option<HtmlAttr>,
220    src_attr: Option<HtmlAttr>,
221}
222
223impl LinkExtractor {
224    /// Create a new link extractor.
225    pub fn new() -> Self {
226        Self {
227            links: Vec::new(),
228            href_attr: HtmlAttr::intern("href").ok(),
229            src_attr: HtmlAttr::intern("src").ok(),
230        }
231    }
232}
233
234impl Extractor for LinkExtractor {
235    type Output = Vec<Link>;
236
237    fn visit(&mut self, elem: &Content) -> ControlFlow<()> {
238        if let Some(link) = elem.to_packed::<LinkElem>()
239            && let LinkTarget::Dest(Destination::Url(url)) = &link.dest
240        {
241            self.links.push(Link {
242                dest: url.as_str().to_string(),
243                source: LinkSource::Link,
244            });
245        }
246
247        if let Some(html_elem) = elem.to_packed::<HtmlElem>() {
248            let attrs = html_elem.attrs.get_cloned(Default::default());
249
250            if let Some(href) = self.href_attr
251                && let Some(value) = attrs.get(href)
252            {
253                self.links.push(Link {
254                    dest: value.to_string(),
255                    source: LinkSource::Href,
256                });
257            }
258
259            if let Some(src) = self.src_attr
260                && let Some(value) = attrs.get(src)
261            {
262                self.links.push(Link {
263                    dest: value.to_string(),
264                    source: LinkSource::Src,
265                });
266            }
267        }
268
269        // Extract image source paths
270        if let Some(image) = elem.to_packed::<ImageElem>() {
271            if let DataSource::Path(path) = &image.source.source {
272                self.links.push(Link {
273                    dest: path.to_string(),
274                    source: LinkSource::Image,
275                });
276            }
277        }
278
279        ControlFlow::Continue(())
280    }
281
282    fn finish(self) -> Self::Output {
283        self.links
284    }
285}
286
/// A link extracted from the document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Link {
    /// The link destination, exactly as it appeared in the document.
    pub dest: String,
    /// Where this link came from.
    pub source: LinkSource,
}

/// The source of a link.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LinkSource {
    /// From `#link()` element.
    Link,
    /// From `href` attribute.
    Href,
    /// From `src` attribute.
    Src,
    /// From `#image()` element source path.
    Image,
}

impl Link {
    /// Split `dest` into `(scheme, rest)` when it begins with a syntactically
    /// valid URI scheme followed by `:`.
    ///
    /// A scheme is an ASCII letter followed by ASCII letters, digits, `+`,
    /// `-` or `.`. Requiring the scheme at the very start prevents a `://`
    /// buried in a query string (e.g. `/redirect?url=https://x`) from being
    /// mistaken for an absolute URL.
    fn split_scheme(&self) -> Option<(&str, &str)> {
        let (scheme, rest) = self.dest.split_once(':')?;
        let mut chars = scheme.chars();
        let starts_with_letter = chars.next().is_some_and(|c| c.is_ascii_alphabetic());
        let valid = starts_with_letter
            && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'));
        valid.then_some((scheme, rest))
    }

    /// Check if HTTP/HTTPS link (`http://…` or `https://…`; the scheme is
    /// matched case-insensitively).
    #[inline]
    pub fn is_http(&self) -> bool {
        self.split_scheme().is_some_and(|(scheme, rest)| {
            rest.starts_with("//")
                && (scheme.eq_ignore_ascii_case("http") || scheme.eq_ignore_ascii_case("https"))
        })
    }

    /// Check if external link.
    ///
    /// External means one of:
    /// - a leading `scheme://`,
    /// - a protocol-relative destination (`//host/…`),
    /// - a `mailto:` or `tel:` destination (case-insensitive).
    #[inline]
    pub fn is_external(&self) -> bool {
        // `//host/path` inherits the scheme of the current page but still
        // points at another host.
        if self.dest.starts_with("//") {
            return true;
        }
        self.split_scheme().is_some_and(|(scheme, rest)| {
            rest.starts_with("//")
                || scheme.eq_ignore_ascii_case("mailto")
                || scheme.eq_ignore_ascii_case("tel")
        })
    }

    /// Check if site-root link (starts with a single `/`, not `//`).
    #[inline]
    pub fn is_site_root(&self) -> bool {
        self.dest.starts_with('/') && !self.dest.starts_with("//")
    }

    /// Check if fragment link (starts with `#`).
    #[inline]
    pub fn is_fragment(&self) -> bool {
        self.dest.starts_with('#')
    }

    /// Check if relative link: neither external, site-root, nor fragment.
    #[inline]
    pub fn is_relative(&self) -> bool {
        !self.is_external() && !self.is_site_root() && !self.is_fragment()
    }
}
342
343/// Extracts all headings from the document.
344#[derive(Debug, Default)]
345pub struct HeadingExtractor {
346    headings: Vec<Heading>,
347}
348
349impl HeadingExtractor {
350    /// Create a new heading extractor.
351    pub fn new() -> Self {
352        Self::default()
353    }
354}
355
356impl Extractor for HeadingExtractor {
357    type Output = Vec<Heading>;
358
359    fn visit(&mut self, elem: &Content) -> ControlFlow<()> {
360        if let Some(heading) = elem.to_packed::<HeadingElem>() {
361            let level = heading.resolve_level(Default::default()).get() as u8;
362            let text = heading.body.plain_text().to_string();
363            // Extract supplement if it's custom Content (not Auto or Func)
364            let supplement = heading
365                .supplement
366                .get_cloned(Default::default())
367                .custom()
368                .flatten()
369                .and_then(|s| match s {
370                    typst::model::Supplement::Content(c) => Some(c.plain_text().to_string()),
371                    typst::model::Supplement::Func(_) => None,
372                })
373                .filter(|s| s != "Section");
374            self.headings.push(Heading { level, text, supplement });
375        }
376        ControlFlow::Continue(())
377    }
378
379    fn finish(self) -> Self::Output {
380        self.headings
381    }
382}
383
384/// A heading extracted from the document.
385#[derive(Debug, Clone, PartialEq, Eq)]
386pub struct Heading {
387    /// Heading level (1-6).
388    pub level: u8,
389    /// Heading text content (plain text from body).
390    pub text: String,
391    /// Heading supplement (e.g., "Section", "Chapter").
392    /// Used for custom heading IDs when not default.
393    pub supplement: Option<String>,
394}
395
396/// Extracts metadata by label.
397#[derive(Debug)]
398pub struct MetadataExtractor {
399    label: Label,
400    value: Option<JsonValue>,
401}
402
403impl MetadataExtractor {
404    /// Create a new metadata extractor for the given label.
405    pub fn new(label: &str) -> Option<Self> {
406        Some(Self {
407            label: Label::new(PicoStr::intern(label))?,
408            value: None,
409        })
410    }
411}
412
413impl Extractor for MetadataExtractor {
414    type Output = Option<JsonValue>;
415
416    fn visit(&mut self, elem: &Content) -> ControlFlow<()> {
417        if self.value.is_some() {
418            return ControlFlow::Break(());
419        }
420
421        if let Some(meta) = elem.to_packed::<MetadataElem>()
422            && meta.label() == Some(self.label)
423        {
424            self.value = serde_json::to_value(&meta.value).ok();
425            return ControlFlow::Break(());
426        }
427        ControlFlow::Continue(())
428    }
429
430    fn finish(self) -> Self::Output {
431        self.value
432    }
433}
434
435/// Internal scan implementation, exposed for BatchCompiler reuse.
436pub(crate) fn scan_impl(world: &TypstWorld) -> Result<ScanResult, CompileError> {
437    let session = CompileSession::start();
438    let line_offset = world.prelude_line_count();
439
440    let traced = Traced::default();
441    let mut sink = Sink::new();
442
443    let source = world
444        .source(world.main())
445        .map_err(|e| CompileError::html_export(format!("Failed to read source: {e:?}")))?;
446
447    let world_ref: &dyn World = world;
448    let result = typst_eval::eval(
449        &ROUTINES,
450        world_ref.track(),
451        traced.track(),
452        sink.track_mut(),
453        Route::default().track(),
454        &source,
455    );
456
457    let warnings = sink.warnings();
458
459    let module = result.map_err(|errors| {
460        let all_diags: Vec<_> = errors.iter().chain(&warnings).cloned().collect();
461        CompileError::compilation_with_offset(world, all_diags, line_offset)
462    })?;
463
464    if has_errors(&warnings) {
465        return Err(CompileError::compilation_with_offset(world, warnings.to_vec(), line_offset));
466    }
467
468    let accessed = session.finish(world.root());
469
470    Ok(ScanResult {
471        content: module.content(),
472        accessed,
473        diagnostics: warnings.to_vec(),
474    })
475}
476
477
478
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Write `source` to `test.typ` inside `dir` and return the file's path.
    fn write_source(dir: &TempDir, source: &str) -> PathBuf {
        let file = dir.path().join("test.typ");
        fs::write(&file, source).unwrap();
        file
    }

    /// Write `source` into `dir` and scan it with a default scanner,
    /// panicking if the scan fails.
    fn scan_source(dir: &TempDir, source: &str) -> ScanResult {
        let file = write_source(dir, source);
        Scanner::new(dir.path()).scan(&file).unwrap()
    }

    #[test]
    fn test_scanner_basic() {
        let dir = TempDir::new().unwrap();
        let file = write_source(&dir, "= Hello\nWorld");

        let result = Scanner::new(dir.path()).scan(&file);
        assert!(result.is_ok());
        assert!(!result.unwrap().content().is_empty());
    }

    #[test]
    fn test_scanner_with_inputs() {
        let dir = TempDir::new().unwrap();
        let file = write_source(
            &dir,
            r#"#let x = sys.inputs.at("key", default: "none")
= #x"#,
        );

        let result = Scanner::new(dir.path())
            .with_inputs([("key", "value")])
            .scan(&file);
        assert!(result.is_ok());
    }

    #[test]
    fn test_extract_links() {
        let dir = TempDir::new().unwrap();
        let result = scan_source(
            &dir,
            r#"
#link("https://example.com")[External]
#link("/local")[Local]
"#,
        );
        let links = result.extract(LinkExtractor::new());

        assert_eq!(links.len(), 2);
        assert!(links.iter().any(|l| l.is_http()));
        assert!(links.iter().any(|l| l.is_site_root()));
    }

    #[test]
    fn test_extract_image_links() {
        let dir = TempDir::new().unwrap();

        // Create a dummy image file (just the PNG signature bytes).
        let img_path = dir.path().join("test.png");
        fs::write(&img_path, [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A]).unwrap();

        let result = scan_source(&dir, r#"#image("test.png")"#);
        let links = result.extract(LinkExtractor::new());

        // The failure message carries the links instead of printing them on
        // every run.
        assert_eq!(links.len(), 1, "links: {links:?}");
        assert_eq!(links[0].source, LinkSource::Image);
        assert!(links[0].dest.contains("test.png"));
    }

    #[test]
    fn test_extract_headings() {
        let dir = TempDir::new().unwrap();
        let result = scan_source(
            &dir,
            r#"
= Level 1
== Level 2
=== Level 3
"#,
        );
        let headings = result.extract(HeadingExtractor::new());

        assert_eq!(headings.len(), 3);
        assert_eq!(headings[0].level, 1);
        assert_eq!(headings[1].level, 2);
        assert_eq!(headings[2].level, 3);
    }

    #[test]
    fn test_extract_metadata() {
        let dir = TempDir::new().unwrap();
        let result = scan_source(&dir, r#"#metadata((title: "Test")) <meta>"#);
        let meta = result.extract(MetadataExtractor::new("meta").unwrap());

        assert!(meta.is_some());
    }

    #[test]
    fn test_extract_tuple() {
        let dir = TempDir::new().unwrap();
        let result = scan_source(
            &dir,
            r#"
= Heading
#link("https://example.com")[Link]
"#,
        );
        let (links, headings) = result.extract((
            LinkExtractor::new(),
            HeadingExtractor::new(),
        ));

        assert_eq!(links.len(), 1);
        assert_eq!(headings.len(), 1);
    }

    #[test]
    fn test_link_classification() {
        let http = Link { dest: "http://x.com".into(), source: LinkSource::Link };
        let https = Link { dest: "https://x.com".into(), source: LinkSource::Link };
        let mailto = Link { dest: "mailto:a@b.com".into(), source: LinkSource::Href };
        let root = Link { dest: "/about".into(), source: LinkSource::Link };
        let fragment = Link { dest: "#section".into(), source: LinkSource::Link };
        let relative = Link { dest: "./img.png".into(), source: LinkSource::Src };

        assert!(http.is_http() && http.is_external());
        assert!(https.is_http() && https.is_external());
        assert!(!mailto.is_http() && mailto.is_external());
        assert!(root.is_site_root() && !root.is_external());
        assert!(fragment.is_fragment());
        assert!(relative.is_relative());
    }
}