1use std::ops::ControlFlow;
25use std::path::{Path, PathBuf};
26
27use serde_json::Value as JsonValue;
28use typst::comemo::Track;
29use typst::diag::SourceDiagnostic;
30use typst::engine::{Route, Sink, Traced};
31use typst::foundations::{Content, Dict, Label};
32use typst::introspection::MetadataElem;
33use typst::loading::DataSource;
34use typst::model::{Destination, HeadingElem, LinkElem, LinkTarget};
35use typst::utils::PicoStr;
36use typst::visualize::ImageElem;
37use typst::World;
38use typst::ROUTINES;
39use typst_html::{HtmlAttr, HtmlElem};
40
41use super::inputs::WithInputs;
42use super::session::{AccessedDeps, CompileSession};
43use crate::diagnostic::{has_errors, CompileError};
44use crate::resource::file::PackageId;
45use crate::world::TypstWorld;
46
47pub struct Scanner<'a> {
54 root: &'a Path,
55 inputs: Option<Dict>,
56}
57
58impl<'a> WithInputs for Scanner<'a> {
59 fn inputs_mut(&mut self) -> &mut Option<Dict> {
60 &mut self.inputs
61 }
62}
63
64impl<'a> Scanner<'a> {
65 pub fn new(root: &'a Path) -> Self {
67 Self { root, inputs: None }
68 }
69
70 pub fn scan<P: AsRef<Path>>(self, path: P) -> Result<ScanResult, CompileError> {
72 let path = path.as_ref();
73 let world = self.build_world(path);
74 scan_impl(&world)
75 }
76
77 fn build_world(&self, path: &Path) -> TypstWorld {
78 match &self.inputs {
79 Some(inputs) => TypstWorld::builder(path, self.root)
80 .with_local_cache()
81 .no_fonts()
82 .with_inputs_dict(inputs.clone())
83 .build(),
84 None => TypstWorld::builder(path, self.root)
85 .with_local_cache()
86 .no_fonts()
87 .build(),
88 }
89 }
90}
91
92#[derive(Debug)]
94pub struct ScanResult {
95 content: Content,
97 accessed: AccessedDeps,
99 diagnostics: Vec<SourceDiagnostic>,
101}
102
103impl ScanResult {
104 #[inline]
118 pub fn extract<E: Extractor>(&self, extractor: E) -> E::Output {
119 extract(&self.content, extractor)
120 }
121
122 #[inline]
124 pub fn links(&self) -> Vec<Link> {
125 self.extract(LinkExtractor::new())
126 }
127
128 #[inline]
130 pub fn headings(&self) -> Vec<Heading> {
131 self.extract(HeadingExtractor::new())
132 }
133
134 #[inline]
136 pub fn metadata(&self, label: &str) -> Option<JsonValue> {
137 MetadataExtractor::new(label).and_then(|e| self.extract(e))
138 }
139
140 pub fn content(&self) -> &Content {
142 &self.content
143 }
144
145 pub fn accessed(&self) -> &AccessedDeps {
147 &self.accessed
148 }
149
150 pub fn accessed_files(&self) -> &[PathBuf] {
152 &self.accessed.files
153 }
154
155 pub fn accessed_packages(&self) -> &[PackageId] {
159 &self.accessed.packages
160 }
161
162 pub fn diagnostics(&self) -> &[SourceDiagnostic] {
164 &self.diagnostics
165 }
166}
167
168pub trait Extractor: Sized {
170 type Output;
172
173 fn visit(&mut self, elem: &Content) -> ControlFlow<()>;
177
178 fn finish(self) -> Self::Output;
180}
181
182pub fn extract<E: Extractor>(content: &Content, mut extractor: E) -> E::Output {
184 let _ = content.traverse(&mut |elem: Content| extractor.visit(&elem));
185 extractor.finish()
186}
187
188macro_rules! impl_extractor_for_tuple {
189 ($first:ident $(, $rest:ident)*) => {
190 impl<$first: Extractor $(, $rest: Extractor)*> Extractor for ($first, $($rest,)*) {
191 type Output = ($first::Output, $($rest::Output,)*);
192
193 #[allow(non_snake_case)]
194 fn visit(&mut self, elem: &Content) -> ControlFlow<()> {
195 let ($first, $($rest,)*) = self;
196 $first.visit(elem)?;
197 $($rest.visit(elem)?;)*
198 ControlFlow::Continue(())
199 }
200
201 #[allow(non_snake_case)]
202 fn finish(self) -> Self::Output {
203 let ($first, $($rest,)*) = self;
204 ($first.finish(), $($rest.finish(),)*)
205 }
206 }
207
208 impl_extractor_for_tuple!($($rest),*);
209 };
210 () => {};
211}
212
213impl_extractor_for_tuple!(A, B, C, D, E, F, G, H);
214
215#[derive(Debug, Default)]
217pub struct LinkExtractor {
218 links: Vec<Link>,
219 href_attr: Option<HtmlAttr>,
220 src_attr: Option<HtmlAttr>,
221}
222
223impl LinkExtractor {
224 pub fn new() -> Self {
226 Self {
227 links: Vec::new(),
228 href_attr: HtmlAttr::intern("href").ok(),
229 src_attr: HtmlAttr::intern("src").ok(),
230 }
231 }
232}
233
234impl Extractor for LinkExtractor {
235 type Output = Vec<Link>;
236
237 fn visit(&mut self, elem: &Content) -> ControlFlow<()> {
238 if let Some(link) = elem.to_packed::<LinkElem>()
239 && let LinkTarget::Dest(Destination::Url(url)) = &link.dest
240 {
241 self.links.push(Link {
242 dest: url.as_str().to_string(),
243 source: LinkSource::Link,
244 });
245 }
246
247 if let Some(html_elem) = elem.to_packed::<HtmlElem>() {
248 let attrs = html_elem.attrs.get_cloned(Default::default());
249
250 if let Some(href) = self.href_attr
251 && let Some(value) = attrs.get(href)
252 {
253 self.links.push(Link {
254 dest: value.to_string(),
255 source: LinkSource::Href,
256 });
257 }
258
259 if let Some(src) = self.src_attr
260 && let Some(value) = attrs.get(src)
261 {
262 self.links.push(Link {
263 dest: value.to_string(),
264 source: LinkSource::Src,
265 });
266 }
267 }
268
269 if let Some(image) = elem.to_packed::<ImageElem>() {
271 if let DataSource::Path(path) = &image.source.source {
272 self.links.push(Link {
273 dest: path.to_string(),
274 source: LinkSource::Image,
275 });
276 }
277 }
278
279 ControlFlow::Continue(())
280 }
281
282 fn finish(self) -> Self::Output {
283 self.links
284 }
285}
286
/// A link-like reference found in scanned content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Link {
    /// Destination exactly as written in the source (URL or path).
    pub dest: String,
    /// Which kind of element or attribute the destination came from.
    pub source: LinkSource,
}

/// Origin of a [`Link`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LinkSource {
    /// A `link` element with a URL destination.
    Link,
    /// The `href` attribute of an HTML element.
    Href,
    /// The `src` attribute of an HTML element.
    Src,
    /// An image loaded from a path.
    Image,
}

impl Link {
    /// Whether the destination uses the `http` or `https` scheme.
    ///
    /// Schemes are matched case-insensitively, as URL schemes are.
    #[inline]
    pub fn is_http(&self) -> bool {
        self.has_prefix_ignore_case("http://") || self.has_prefix_ignore_case("https://")
    }

    /// Whether the destination points outside the site.
    ///
    /// True for `scheme://...` URLs, protocol-relative `//host/...` URLs, and
    /// `mailto:` / `tel:` links. A `://` that merely appears later in the
    /// string (for example inside a query parameter) does not count.
    #[inline]
    pub fn is_external(&self) -> bool {
        self.dest.starts_with("//")
            || self.has_authority_scheme()
            || self.has_prefix_ignore_case("mailto:")
            || self.has_prefix_ignore_case("tel:")
    }

    /// Whether the destination is an absolute path within the site
    /// (`/about`), as opposed to a protocol-relative URL (`//host/...`).
    #[inline]
    pub fn is_site_root(&self) -> bool {
        self.dest.starts_with('/') && !self.dest.starts_with("//")
    }

    /// Whether the destination is a same-page fragment (`#section`).
    #[inline]
    pub fn is_fragment(&self) -> bool {
        self.dest.starts_with('#')
    }

    /// Whether the destination is none of external, site-root or fragment.
    ///
    /// An empty destination counts as relative.
    #[inline]
    pub fn is_relative(&self) -> bool {
        !self.is_external() && !self.is_site_root() && !self.is_fragment()
    }

    /// ASCII-case-insensitive `starts_with`.
    ///
    /// `str::get` returns `None` when `dest` is too short or when the cut
    /// would fall inside a multi-byte character, so this never panics.
    fn has_prefix_ignore_case(&self, prefix: &str) -> bool {
        self.dest
            .get(..prefix.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(prefix))
    }

    /// Whether `dest` begins with `<scheme>://`, where the scheme is a letter
    /// followed by letters, digits, `+`, `-` or `.`.
    fn has_authority_scheme(&self) -> bool {
        let Some((scheme, _)) = self.dest.split_once("://") else {
            return false;
        };

        let mut chars = scheme.chars();
        chars.next().is_some_and(|c| c.is_ascii_alphabetic())
            && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
    }
}
342
343#[derive(Debug, Default)]
345pub struct HeadingExtractor {
346 headings: Vec<Heading>,
347}
348
349impl HeadingExtractor {
350 pub fn new() -> Self {
352 Self::default()
353 }
354}
355
356impl Extractor for HeadingExtractor {
357 type Output = Vec<Heading>;
358
359 fn visit(&mut self, elem: &Content) -> ControlFlow<()> {
360 if let Some(heading) = elem.to_packed::<HeadingElem>() {
361 let level = heading.resolve_level(Default::default()).get() as u8;
362 let text = heading.body.plain_text().to_string();
363 let supplement = heading
365 .supplement
366 .get_cloned(Default::default())
367 .custom()
368 .flatten()
369 .and_then(|s| match s {
370 typst::model::Supplement::Content(c) => Some(c.plain_text().to_string()),
371 typst::model::Supplement::Func(_) => None,
372 })
373 .filter(|s| s != "Section");
374 self.headings.push(Heading { level, text, supplement });
375 }
376 ControlFlow::Continue(())
377 }
378
379 fn finish(self) -> Self::Output {
380 self.headings
381 }
382}
383
384#[derive(Debug, Clone, PartialEq, Eq)]
386pub struct Heading {
387 pub level: u8,
389 pub text: String,
391 pub supplement: Option<String>,
394}
395
396#[derive(Debug)]
398pub struct MetadataExtractor {
399 label: Label,
400 value: Option<JsonValue>,
401}
402
403impl MetadataExtractor {
404 pub fn new(label: &str) -> Option<Self> {
406 Some(Self {
407 label: Label::new(PicoStr::intern(label))?,
408 value: None,
409 })
410 }
411}
412
413impl Extractor for MetadataExtractor {
414 type Output = Option<JsonValue>;
415
416 fn visit(&mut self, elem: &Content) -> ControlFlow<()> {
417 if self.value.is_some() {
418 return ControlFlow::Break(());
419 }
420
421 if let Some(meta) = elem.to_packed::<MetadataElem>()
422 && meta.label() == Some(self.label)
423 {
424 self.value = serde_json::to_value(&meta.value).ok();
425 return ControlFlow::Break(());
426 }
427 ControlFlow::Continue(())
428 }
429
430 fn finish(self) -> Self::Output {
431 self.value
432 }
433}
434
435pub(crate) fn scan_impl(world: &TypstWorld) -> Result<ScanResult, CompileError> {
437 let session = CompileSession::start();
438 let line_offset = world.prelude_line_count();
439
440 let traced = Traced::default();
441 let mut sink = Sink::new();
442
443 let source = world
444 .source(world.main())
445 .map_err(|e| CompileError::html_export(format!("Failed to read source: {e:?}")))?;
446
447 let world_ref: &dyn World = world;
448 let result = typst_eval::eval(
449 &ROUTINES,
450 world_ref.track(),
451 traced.track(),
452 sink.track_mut(),
453 Route::default().track(),
454 &source,
455 );
456
457 let warnings = sink.warnings();
458
459 let module = result.map_err(|errors| {
460 let all_diags: Vec<_> = errors.iter().chain(&warnings).cloned().collect();
461 CompileError::compilation_with_offset(world, all_diags, line_offset)
462 })?;
463
464 if has_errors(&warnings) {
465 return Err(CompileError::compilation_with_offset(world, warnings.to_vec(), line_offset));
466 }
467
468 let accessed = session.finish(world.root());
469
470 Ok(ScanResult {
471 content: module.content(),
472 accessed,
473 diagnostics: warnings.to_vec(),
474 })
475}
476
477
478
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Creates a temporary project whose `test.typ` contains `body`.
    ///
    /// The `TempDir` is returned so the directory outlives the scan.
    fn project(body: &str) -> (TempDir, PathBuf) {
        let dir = TempDir::new().unwrap();
        let file = dir.path().join("test.typ");
        fs::write(&file, body).unwrap();
        (dir, file)
    }

    /// Shorthand for building a [`Link`] in classification tests.
    fn link(dest: &str, source: LinkSource) -> Link {
        Link { dest: dest.into(), source }
    }

    /// A plain document scans successfully and yields non-empty content.
    #[test]
    fn test_scanner_basic() {
        let (dir, file) = project("= Hello\nWorld");

        let result = Scanner::new(dir.path()).scan(&file);
        assert!(result.is_ok());
        assert!(!result.unwrap().content().is_empty());
    }

    /// Inputs supplied through `with_inputs` do not break the scan.
    #[test]
    fn test_scanner_with_inputs() {
        let (dir, file) = project(
            r#"#let x = sys.inputs.at("key", default: "none")
= #x"#,
        );

        let result = Scanner::new(dir.path())
            .with_inputs([("key", "value")])
            .scan(&file);
        assert!(result.is_ok());
    }

    /// Both an external and a site-root `link` are picked up.
    #[test]
    fn test_extract_links() {
        let (dir, file) = project(
            r#"
#link("https://example.com")[External]
#link("/local")[Local]
"#,
        );

        let scanned = Scanner::new(dir.path()).scan(&file).unwrap();
        let links = scanned.extract(LinkExtractor::new());

        assert_eq!(links.len(), 2);
        assert!(links.iter().any(|l| l.is_http()));
        assert!(links.iter().any(|l| l.is_site_root()));
    }

    /// A path-based image shows up as a link with `LinkSource::Image`.
    #[test]
    fn test_extract_image_links() {
        // Only the PNG signature is written to the image file.
        const PNG_SIGNATURE: [u8; 8] = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];

        let (dir, file) = project(r#"#image("test.png")"#);
        fs::write(dir.path().join("test.png"), &PNG_SIGNATURE).unwrap();

        let scanned = Scanner::new(dir.path()).scan(&file).unwrap();
        let links = scanned.extract(LinkExtractor::new());

        eprintln!("links: {:?}", links);
        assert_eq!(links.len(), 1);
        assert_eq!(links[0].source, LinkSource::Image);
        assert!(links[0].dest.contains("test.png"));
    }

    /// Heading levels are reported in document order.
    #[test]
    fn test_extract_headings() {
        let (dir, file) = project(
            r#"
= Level 1
== Level 2
=== Level 3
"#,
        );

        let scanned = Scanner::new(dir.path()).scan(&file).unwrap();
        let headings = scanned.extract(HeadingExtractor::new());

        assert_eq!(headings.len(), 3);
        for (heading, expected) in headings.iter().zip([1u8, 2, 3]) {
            assert_eq!(heading.level, expected);
        }
    }

    /// A labelled metadata element is found by its label.
    #[test]
    fn test_extract_metadata() {
        let (dir, file) = project(r#"#metadata((title: "Test")) <meta>"#);

        let scanned = Scanner::new(dir.path()).scan(&file).unwrap();
        let meta = scanned.extract(MetadataExtractor::new("meta").unwrap());

        assert!(meta.is_some());
    }

    /// A tuple of extractors runs both over the same traversal.
    #[test]
    fn test_extract_tuple() {
        let (dir, file) = project(
            r#"
= Heading
#link("https://example.com")[Link]
"#,
        );

        let scanned = Scanner::new(dir.path()).scan(&file).unwrap();
        let (links, headings) =
            scanned.extract((LinkExtractor::new(), HeadingExtractor::new()));

        assert_eq!(links.len(), 1);
        assert_eq!(headings.len(), 1);
    }

    /// The `Link` predicates agree on representative destinations.
    #[test]
    fn test_link_classification() {
        let http = link("http://x.com", LinkSource::Link);
        let https = link("https://x.com", LinkSource::Link);
        let mailto = link("mailto:a@b.com", LinkSource::Href);
        let root = link("/about", LinkSource::Link);
        let fragment = link("#section", LinkSource::Link);
        let relative = link("./img.png", LinkSource::Src);

        assert!(http.is_http() && http.is_external());
        assert!(https.is_http() && https.is_external());
        assert!(!mailto.is_http() && mailto.is_external());
        assert!(root.is_site_root() && !root.is_external());
        assert!(fragment.is_fragment());
        assert!(relative.is_relative());
    }
}