typst_library/introspection/
introspector.rs

1use std::collections::{BTreeSet, HashMap, HashSet};
2use std::fmt::{self, Debug, Formatter};
3use std::hash::Hash;
4use std::num::NonZeroUsize;
5use std::sync::RwLock;
6
7use ecow::EcoVec;
8use smallvec::SmallVec;
9use typst_utils::NonZeroExt;
10
11use crate::diag::{bail, StrResult};
12use crate::foundations::{Content, Label, Repr, Selector};
13use crate::html::{HtmlElement, HtmlNode};
14use crate::introspection::{Location, Tag};
15use crate::layout::{Frame, FrameItem, Page, Point, Position, Transform};
16use crate::model::Numbering;
17
18/// Can be queried for elements and their positions.
19#[derive(Default, Clone)]
20pub struct Introspector {
21    /// The number of pages in the document.
22    pages: usize,
23    /// The page numberings, indexed by page number minus 1.
24    page_numberings: Vec<Option<Numbering>>,
25    /// The page supplements, indexed by page number minus 1.
26    page_supplements: Vec<Content>,
27
28    /// All introspectable elements.
29    elems: Vec<Pair>,
30    /// Lists all elements with a specific hash key. This is used for
31    /// introspector-assisted location assignment during measurement.
32    keys: MultiMap<u128, Location>,
33
34    /// Accelerates lookup of elements by location.
35    locations: HashMap<Location, usize>,
36    /// Accelerates lookup of elements by label.
37    labels: MultiMap<Label, usize>,
38
39    /// Caches queries done on the introspector. This is important because
40    /// even if all top-level queries are distinct, they often have shared
41    /// subqueries. Example: Individual counter queries with `before` that
42    /// all depend on a global counter query.
43    queries: QueryCache,
44}
45
46/// A pair of content and its position.
47type Pair = (Content, Position);
48
49impl Introspector {
50    /// Creates an introspector for a page list.
51    #[typst_macros::time(name = "introspect pages")]
52    pub fn paged(pages: &[Page]) -> Self {
53        IntrospectorBuilder::new().build_paged(pages)
54    }
55
56    /// Creates an introspector for HTML.
57    #[typst_macros::time(name = "introspect html")]
58    pub fn html(root: &HtmlElement) -> Self {
59        IntrospectorBuilder::new().build_html(root)
60    }
61
62    /// Iterates over all locatable elements.
63    pub fn all(&self) -> impl Iterator<Item = &Content> + '_ {
64        self.elems.iter().map(|(c, _)| c)
65    }
66
67    /// Retrieves the element with the given index.
68    #[track_caller]
69    fn get_by_idx(&self, idx: usize) -> &Content {
70        &self.elems[idx].0
71    }
72
73    /// Retrieves the position of the element with the given index.
74    #[track_caller]
75    fn get_pos_by_idx(&self, idx: usize) -> Position {
76        self.elems[idx].1
77    }
78
79    /// Retrieves an element by its location.
80    fn get_by_loc(&self, location: &Location) -> Option<&Content> {
81        self.locations.get(location).map(|&idx| self.get_by_idx(idx))
82    }
83
84    /// Retrieves the position of the element with the given index.
85    fn get_pos_by_loc(&self, location: &Location) -> Option<Position> {
86        self.locations.get(location).map(|&idx| self.get_pos_by_idx(idx))
87    }
88
89    /// Performs a binary search for `elem` among the `list`.
90    fn binary_search(&self, list: &[Content], elem: &Content) -> Result<usize, usize> {
91        list.binary_search_by_key(&self.elem_index(elem), |elem| self.elem_index(elem))
92    }
93
94    /// Gets the index of this element.
95    fn elem_index(&self, elem: &Content) -> usize {
96        self.loc_index(&elem.location().unwrap())
97    }
98
99    /// Gets the index of the element with this location among all.
100    fn loc_index(&self, location: &Location) -> usize {
101        self.locations.get(location).copied().unwrap_or(usize::MAX)
102    }
103}
104
105#[comemo::track]
106impl Introspector {
107    /// Query for all matching elements.
108    pub fn query(&self, selector: &Selector) -> EcoVec<Content> {
109        let hash = typst_utils::hash128(selector);
110        if let Some(output) = self.queries.get(hash) {
111            return output;
112        }
113
114        let output = match selector {
115            Selector::Elem(..) => self
116                .all()
117                .filter(|elem| selector.matches(elem, None))
118                .cloned()
119                .collect(),
120            Selector::Location(location) => {
121                self.get_by_loc(location).cloned().into_iter().collect()
122            }
123            Selector::Label(label) => self
124                .labels
125                .get(label)
126                .iter()
127                .map(|&idx| self.get_by_idx(idx).clone())
128                .collect(),
129            Selector::Or(selectors) => selectors
130                .iter()
131                .flat_map(|sel| self.query(sel))
132                .map(|elem| self.elem_index(&elem))
133                .collect::<BTreeSet<usize>>()
134                .into_iter()
135                .map(|idx| self.get_by_idx(idx).clone())
136                .collect(),
137            Selector::And(selectors) => {
138                let mut results: Vec<_> =
139                    selectors.iter().map(|sel| self.query(sel)).collect();
140
141                // Extract the smallest result list and then keep only those
142                // elements in the smallest list that are also in all other
143                // lists.
144                results
145                    .iter()
146                    .enumerate()
147                    .min_by_key(|(_, vec)| vec.len())
148                    .map(|(i, _)| i)
149                    .map(|i| results.swap_remove(i))
150                    .iter()
151                    .flatten()
152                    .filter(|candidate| {
153                        results
154                            .iter()
155                            .all(|other| self.binary_search(other, candidate).is_ok())
156                    })
157                    .cloned()
158                    .collect()
159            }
160            Selector::Before { selector, end, inclusive } => {
161                let mut list = self.query(selector);
162                if let Some(end) = self.query_first(end) {
163                    // Determine which elements are before `end`.
164                    let split = match self.binary_search(&list, &end) {
165                        // Element itself is contained.
166                        Ok(i) => i + *inclusive as usize,
167                        // Element itself is not contained.
168                        Err(i) => i,
169                    };
170                    list = list[..split].into();
171                }
172                list
173            }
174            Selector::After { selector, start, inclusive } => {
175                let mut list = self.query(selector);
176                if let Some(start) = self.query_first(start) {
177                    // Determine which elements are after `start`.
178                    let split = match self.binary_search(&list, &start) {
179                        // Element itself is contained.
180                        Ok(i) => i + !*inclusive as usize,
181                        // Element itself is not contained.
182                        Err(i) => i,
183                    };
184                    list = list[split..].into();
185                }
186                list
187            }
188            // Not supported here.
189            Selector::Can(_) | Selector::Regex(_) => EcoVec::new(),
190        };
191
192        self.queries.insert(hash, output.clone());
193        output
194    }
195
196    /// Query for the first element that matches the selector.
197    pub fn query_first(&self, selector: &Selector) -> Option<Content> {
198        match selector {
199            Selector::Location(location) => self.get_by_loc(location).cloned(),
200            Selector::Label(label) => self
201                .labels
202                .get(label)
203                .first()
204                .map(|&idx| self.get_by_idx(idx).clone()),
205            _ => self.query(selector).first().cloned(),
206        }
207    }
208
209    /// Query for the first element that matches the selector.
210    pub fn query_unique(&self, selector: &Selector) -> StrResult<Content> {
211        match selector {
212            Selector::Location(location) => self
213                .get_by_loc(location)
214                .cloned()
215                .ok_or_else(|| "element does not exist in the document".into()),
216            Selector::Label(label) => self.query_label(*label).cloned(),
217            _ => {
218                let elems = self.query(selector);
219                if elems.len() > 1 {
220                    bail!("selector matches multiple elements",);
221                }
222                elems
223                    .into_iter()
224                    .next()
225                    .ok_or_else(|| "selector does not match any element".into())
226            }
227        }
228    }
229
230    /// Query for a unique element with the label.
231    pub fn query_label(&self, label: Label) -> StrResult<&Content> {
232        match *self.labels.get(&label) {
233            [idx] => Ok(self.get_by_idx(idx)),
234            [] => bail!("label `{}` does not exist in the document", label.repr()),
235            _ => bail!("label `{}` occurs multiple times in the document", label.repr()),
236        }
237    }
238
239    /// This is an optimized version of
240    /// `query(selector.before(end, true).len()` used by counters and state.
241    pub fn query_count_before(&self, selector: &Selector, end: Location) -> usize {
242        // See `query()` for details.
243        let list = self.query(selector);
244        if let Some(end) = self.get_by_loc(&end) {
245            match self.binary_search(&list, end) {
246                Ok(i) => i + 1,
247                Err(i) => i,
248            }
249        } else {
250            list.len()
251        }
252    }
253
254    /// The total number pages.
255    pub fn pages(&self) -> NonZeroUsize {
256        NonZeroUsize::new(self.pages).unwrap_or(NonZeroUsize::ONE)
257    }
258
259    /// Find the page number for the given location.
260    pub fn page(&self, location: Location) -> NonZeroUsize {
261        self.position(location).page
262    }
263
264    /// Find the position for the given location.
265    pub fn position(&self, location: Location) -> Position {
266        self.get_pos_by_loc(&location)
267            .unwrap_or(Position { page: NonZeroUsize::ONE, point: Point::zero() })
268    }
269
270    /// Gets the page numbering for the given location, if any.
271    pub fn page_numbering(&self, location: Location) -> Option<&Numbering> {
272        let page = self.page(location);
273        self.page_numberings
274            .get(page.get() - 1)
275            .and_then(|slot| slot.as_ref())
276    }
277
278    /// Gets the page supplement for the given location, if any.
279    pub fn page_supplement(&self, location: Location) -> Content {
280        let page = self.page(location);
281        self.page_supplements.get(page.get() - 1).cloned().unwrap_or_default()
282    }
283
284    /// Try to find a location for an element with the given `key` hash
285    /// that is closest after the `anchor`.
286    ///
287    /// This is used for introspector-assisted location assignment during
288    /// measurement. See the "Dealing with Measurement" section of the
289    /// [`Locator`](crate::introspection::Locator) docs for more details.
290    pub fn locator(&self, key: u128, anchor: Location) -> Option<Location> {
291        let anchor = self.loc_index(&anchor);
292        self.keys
293            .get(&key)
294            .iter()
295            .copied()
296            .min_by_key(|loc| self.loc_index(loc).wrapping_sub(anchor))
297    }
298}
299
300impl Debug for Introspector {
301    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
302        f.pad("Introspector(..)")
303    }
304}
305
306/// A map from one keys to multiple elements.
307#[derive(Clone)]
308struct MultiMap<K, V>(HashMap<K, SmallVec<[V; 1]>>);
309
310impl<K, V> MultiMap<K, V>
311where
312    K: Hash + Eq,
313{
314    fn get(&self, key: &K) -> &[V] {
315        self.0.get(key).map_or(&[], |vec| vec.as_slice())
316    }
317
318    fn insert(&mut self, key: K, value: V) {
319        self.0.entry(key).or_default().push(value);
320    }
321
322    fn take(&mut self, key: &K) -> Option<impl Iterator<Item = V>> {
323        self.0.remove(key).map(|vec| vec.into_iter())
324    }
325}
326
327impl<K, V> Default for MultiMap<K, V> {
328    fn default() -> Self {
329        Self(HashMap::new())
330    }
331}
332
333/// Caches queries.
334#[derive(Default)]
335struct QueryCache(RwLock<HashMap<u128, EcoVec<Content>>>);
336
337impl QueryCache {
338    fn get(&self, hash: u128) -> Option<EcoVec<Content>> {
339        self.0.read().unwrap().get(&hash).cloned()
340    }
341
342    fn insert(&self, hash: u128, output: EcoVec<Content>) {
343        self.0.write().unwrap().insert(hash, output);
344    }
345}
346
347impl Clone for QueryCache {
348    fn clone(&self) -> Self {
349        Self(RwLock::new(self.0.read().unwrap().clone()))
350    }
351}
352
353/// Builds the introspector.
354#[derive(Default)]
355struct IntrospectorBuilder {
356    pages: usize,
357    page_numberings: Vec<Option<Numbering>>,
358    page_supplements: Vec<Content>,
359    seen: HashSet<Location>,
360    insertions: MultiMap<Location, Vec<Pair>>,
361    keys: MultiMap<u128, Location>,
362    locations: HashMap<Location, usize>,
363    labels: MultiMap<Label, usize>,
364}
365
366impl IntrospectorBuilder {
367    /// Create an empty builder.
368    fn new() -> Self {
369        Self::default()
370    }
371
372    /// Build an introspector for a page list.
373    fn build_paged(mut self, pages: &[Page]) -> Introspector {
374        self.pages = pages.len();
375        self.page_numberings.reserve(pages.len());
376        self.page_supplements.reserve(pages.len());
377
378        // Discover all elements.
379        let mut elems = Vec::new();
380        for (i, page) in pages.iter().enumerate() {
381            self.page_numberings.push(page.numbering.clone());
382            self.page_supplements.push(page.supplement.clone());
383            self.discover_in_frame(
384                &mut elems,
385                &page.frame,
386                NonZeroUsize::new(1 + i).unwrap(),
387                Transform::identity(),
388            );
389        }
390
391        self.finalize(elems)
392    }
393
394    /// Build an introspector for an HTML document.
395    fn build_html(mut self, root: &HtmlElement) -> Introspector {
396        let mut elems = Vec::new();
397        self.discover_in_html(&mut elems, root);
398        self.finalize(elems)
399    }
400
401    /// Processes the tags in the frame.
402    fn discover_in_frame(
403        &mut self,
404        sink: &mut Vec<Pair>,
405        frame: &Frame,
406        page: NonZeroUsize,
407        ts: Transform,
408    ) {
409        for (pos, item) in frame.items() {
410            match item {
411                FrameItem::Group(group) => {
412                    let ts = ts
413                        .pre_concat(Transform::translate(pos.x, pos.y))
414                        .pre_concat(group.transform);
415
416                    if let Some(parent) = group.parent {
417                        let mut nested = vec![];
418                        self.discover_in_frame(&mut nested, &group.frame, page, ts);
419                        self.insertions.insert(parent, nested);
420                    } else {
421                        self.discover_in_frame(sink, &group.frame, page, ts);
422                    }
423                }
424                FrameItem::Tag(tag) => {
425                    self.discover_in_tag(
426                        sink,
427                        tag,
428                        Position { page, point: pos.transform(ts) },
429                    );
430                }
431                _ => {}
432            }
433        }
434    }
435
436    /// Processes the tags in the HTML element.
437    fn discover_in_html(&mut self, sink: &mut Vec<Pair>, elem: &HtmlElement) {
438        for child in &elem.children {
439            match child {
440                HtmlNode::Tag(tag) => self.discover_in_tag(
441                    sink,
442                    tag,
443                    Position { page: NonZeroUsize::ONE, point: Point::zero() },
444                ),
445                HtmlNode::Text(_, _) => {}
446                HtmlNode::Element(elem) => self.discover_in_html(sink, elem),
447                HtmlNode::Frame(frame) => self.discover_in_frame(
448                    sink,
449                    frame,
450                    NonZeroUsize::ONE,
451                    Transform::identity(),
452                ),
453            }
454        }
455    }
456
457    /// Handle a tag.
458    fn discover_in_tag(&mut self, sink: &mut Vec<Pair>, tag: &Tag, position: Position) {
459        match tag {
460            Tag::Start(elem) => {
461                let loc = elem.location().unwrap();
462                if self.seen.insert(loc) {
463                    sink.push((elem.clone(), position));
464                }
465            }
466            Tag::End(loc, key) => {
467                self.keys.insert(*key, *loc);
468            }
469        }
470    }
471
472    /// Build a complete introspector with all acceleration structures from a
473    /// list of top-level pairs.
474    fn finalize(mut self, root: Vec<Pair>) -> Introspector {
475        self.locations.reserve(self.seen.len());
476
477        // Save all pairs and their descendants in the correct order.
478        let mut elems = Vec::with_capacity(self.seen.len());
479        for pair in root {
480            self.visit(&mut elems, pair);
481        }
482
483        Introspector {
484            pages: self.pages,
485            page_numberings: self.page_numberings,
486            page_supplements: self.page_supplements,
487            elems,
488            keys: self.keys,
489            locations: self.locations,
490            labels: self.labels,
491            queries: QueryCache::default(),
492        }
493    }
494
495    /// Saves a pair and all its descendants into `elems` and populates the
496    /// acceleration structures.
497    fn visit(&mut self, elems: &mut Vec<Pair>, pair: Pair) {
498        let elem = &pair.0;
499        let loc = elem.location().unwrap();
500        let idx = elems.len();
501
502        // Populate the location acceleration map.
503        self.locations.insert(loc, idx);
504
505        // Populate the label acceleration map.
506        if let Some(label) = elem.label() {
507            self.labels.insert(label, idx);
508        }
509
510        // Save the element.
511        elems.push(pair);
512
513        // Process potential descendants.
514        if let Some(insertions) = self.insertions.take(&loc) {
515            for pair in insertions.flatten() {
516                self.visit(elems, pair);
517            }
518        }
519    }
520}