typst_library/introspection/
introspector.rs

1use std::collections::BTreeSet;
2use std::fmt::{self, Debug, Formatter};
3use std::hash::Hash;
4use std::num::NonZeroUsize;
5use std::sync::RwLock;
6
7use ecow::{EcoString, EcoVec};
8use rustc_hash::{FxHashMap, FxHashSet};
9use smallvec::SmallVec;
10use typst_utils::NonZeroExt;
11
12use crate::diag::{StrResult, bail};
13use crate::foundations::{Content, Label, Repr, Selector};
14use crate::introspection::{Location, Tag};
15use crate::layout::{Frame, FrameItem, Point, Position, Transform};
16use crate::model::Numbering;
17
18/// Can be queried for elements and their positions.
19#[derive(Default, Clone)]
20pub struct Introspector {
21    /// The number of pages in the document.
22    pages: usize,
23    /// The page numberings, indexed by page number minus 1.
24    page_numberings: Vec<Option<Numbering>>,
25    /// The page supplements, indexed by page number minus 1.
26    page_supplements: Vec<Content>,
27
28    /// All introspectable elements.
29    elems: Vec<Pair>,
30    /// Lists all elements with a specific hash key. This is used for
31    /// introspector-assisted location assignment during measurement.
32    keys: MultiMap<u128, Location>,
33
34    /// Accelerates lookup of elements by location.
35    locations: FxHashMap<Location, usize>,
36    /// Accelerates lookup of elements by label.
37    labels: MultiMap<Label, usize>,
38
39    /// Maps from element locations to assigned HTML IDs. This used to support
40    /// intra-doc links in HTML export. In paged export, is is simply left
41    /// empty and [`Self::html_id`] is not used.
42    html_ids: FxHashMap<Location, EcoString>,
43
44    /// Caches queries done on the introspector. This is important because
45    /// even if all top-level queries are distinct, they often have shared
46    /// subqueries. Example: Individual counter queries with `before` that
47    /// all depend on a global counter query.
48    queries: QueryCache,
49}
50
51/// A pair of content and its position.
52type Pair = (Content, Position);
53
54impl Introspector {
55    /// Iterates over all locatable elements.
56    pub fn all(&self) -> impl Iterator<Item = &Content> + '_ {
57        self.elems.iter().map(|(c, _)| c)
58    }
59
60    /// Checks how many times a label exists.
61    pub fn label_count(&self, label: Label) -> usize {
62        self.labels.get(&label).len()
63    }
64
65    /// Enriches an existing introspector with HTML IDs, which were assigned
66    /// to the DOM in a post-processing step.
67    pub fn set_html_ids(&mut self, html_ids: FxHashMap<Location, EcoString>) {
68        self.html_ids = html_ids;
69    }
70
71    /// Retrieves the element with the given index.
72    #[track_caller]
73    fn get_by_idx(&self, idx: usize) -> &Content {
74        &self.elems[idx].0
75    }
76
77    /// Retrieves the position of the element with the given index.
78    #[track_caller]
79    fn get_pos_by_idx(&self, idx: usize) -> Position {
80        self.elems[idx].1
81    }
82
83    /// Retrieves an element by its location.
84    fn get_by_loc(&self, location: &Location) -> Option<&Content> {
85        self.locations.get(location).map(|&idx| self.get_by_idx(idx))
86    }
87
88    /// Retrieves the position of the element with the given index.
89    fn get_pos_by_loc(&self, location: &Location) -> Option<Position> {
90        self.locations.get(location).map(|&idx| self.get_pos_by_idx(idx))
91    }
92
93    /// Performs a binary search for `elem` among the `list`.
94    fn binary_search(&self, list: &[Content], elem: &Content) -> Result<usize, usize> {
95        list.binary_search_by_key(&self.elem_index(elem), |elem| self.elem_index(elem))
96    }
97
98    /// Gets the index of this element.
99    fn elem_index(&self, elem: &Content) -> usize {
100        self.loc_index(&elem.location().unwrap())
101    }
102
103    /// Gets the index of the element with this location among all.
104    fn loc_index(&self, location: &Location) -> usize {
105        self.locations.get(location).copied().unwrap_or(usize::MAX)
106    }
107}
108
109#[comemo::track]
110impl Introspector {
111    /// Query for all matching elements.
112    pub fn query(&self, selector: &Selector) -> EcoVec<Content> {
113        let hash = typst_utils::hash128(selector);
114        if let Some(output) = self.queries.get(hash) {
115            return output;
116        }
117
118        let output = match selector {
119            Selector::Elem(..) => self
120                .all()
121                .filter(|elem| selector.matches(elem, None))
122                .cloned()
123                .collect(),
124            Selector::Location(location) => {
125                self.get_by_loc(location).cloned().into_iter().collect()
126            }
127            Selector::Label(label) => self
128                .labels
129                .get(label)
130                .iter()
131                .map(|&idx| self.get_by_idx(idx).clone())
132                .collect(),
133            Selector::Or(selectors) => selectors
134                .iter()
135                .flat_map(|sel| self.query(sel))
136                .map(|elem| self.elem_index(&elem))
137                .collect::<BTreeSet<usize>>()
138                .into_iter()
139                .map(|idx| self.get_by_idx(idx).clone())
140                .collect(),
141            Selector::And(selectors) => {
142                let mut results: Vec<_> =
143                    selectors.iter().map(|sel| self.query(sel)).collect();
144
145                // Extract the smallest result list and then keep only those
146                // elements in the smallest list that are also in all other
147                // lists.
148                results
149                    .iter()
150                    .enumerate()
151                    .min_by_key(|(_, vec)| vec.len())
152                    .map(|(i, _)| i)
153                    .map(|i| results.swap_remove(i))
154                    .iter()
155                    .flatten()
156                    .filter(|candidate| {
157                        results
158                            .iter()
159                            .all(|other| self.binary_search(other, candidate).is_ok())
160                    })
161                    .cloned()
162                    .collect()
163            }
164            Selector::Before { selector, end, inclusive } => {
165                let mut list = self.query(selector);
166                if let Some(end) = self.query_first(end) {
167                    // Determine which elements are before `end`.
168                    let split = match self.binary_search(&list, &end) {
169                        // Element itself is contained.
170                        Ok(i) => i + *inclusive as usize,
171                        // Element itself is not contained.
172                        Err(i) => i,
173                    };
174                    list = list[..split].into();
175                }
176                list
177            }
178            Selector::After { selector, start, inclusive } => {
179                let mut list = self.query(selector);
180                if let Some(start) = self.query_first(start) {
181                    // Determine which elements are after `start`.
182                    let split = match self.binary_search(&list, &start) {
183                        // Element itself is contained.
184                        Ok(i) => i + !*inclusive as usize,
185                        // Element itself is not contained.
186                        Err(i) => i,
187                    };
188                    list = list[split..].into();
189                }
190                list
191            }
192            // Not supported here.
193            Selector::Can(_) | Selector::Regex(_) => EcoVec::new(),
194        };
195
196        self.queries.insert(hash, output.clone());
197        output
198    }
199
200    /// Query for the first element that matches the selector.
201    pub fn query_first(&self, selector: &Selector) -> Option<Content> {
202        match selector {
203            Selector::Location(location) => self.get_by_loc(location).cloned(),
204            Selector::Label(label) => self
205                .labels
206                .get(label)
207                .first()
208                .map(|&idx| self.get_by_idx(idx).clone()),
209            _ => self.query(selector).first().cloned(),
210        }
211    }
212
213    /// Query for the first element that matches the selector.
214    pub fn query_unique(&self, selector: &Selector) -> StrResult<Content> {
215        match selector {
216            Selector::Location(location) => self
217                .get_by_loc(location)
218                .cloned()
219                .ok_or_else(|| "element does not exist in the document".into()),
220            Selector::Label(label) => self.query_label(*label).cloned(),
221            _ => {
222                let elems = self.query(selector);
223                if elems.len() > 1 {
224                    bail!("selector matches multiple elements",);
225                }
226                elems
227                    .into_iter()
228                    .next()
229                    .ok_or_else(|| "selector does not match any element".into())
230            }
231        }
232    }
233
234    /// Query for a unique element with the label.
235    pub fn query_label(&self, label: Label) -> StrResult<&Content> {
236        match *self.labels.get(&label) {
237            [idx] => Ok(self.get_by_idx(idx)),
238            [] => bail!("label `{}` does not exist in the document", label.repr()),
239            _ => bail!("label `{}` occurs multiple times in the document", label.repr()),
240        }
241    }
242
243    /// This is an optimized version of
244    /// `query(selector.before(end, true).len()` used by counters and state.
245    pub fn query_count_before(&self, selector: &Selector, end: Location) -> usize {
246        // See `query()` for details.
247        let list = self.query(selector);
248        if let Some(end) = self.get_by_loc(&end) {
249            match self.binary_search(&list, end) {
250                Ok(i) => i + 1,
251                Err(i) => i,
252            }
253        } else {
254            list.len()
255        }
256    }
257
258    /// The total number pages.
259    pub fn pages(&self) -> NonZeroUsize {
260        NonZeroUsize::new(self.pages).unwrap_or(NonZeroUsize::ONE)
261    }
262
263    /// Find the page number for the given location.
264    pub fn page(&self, location: Location) -> NonZeroUsize {
265        self.position(location).page
266    }
267
268    /// Find the position for the given location.
269    pub fn position(&self, location: Location) -> Position {
270        self.get_pos_by_loc(&location)
271            .unwrap_or(Position { page: NonZeroUsize::ONE, point: Point::zero() })
272    }
273
274    /// Gets the page numbering for the given location, if any.
275    pub fn page_numbering(&self, location: Location) -> Option<&Numbering> {
276        let page = self.page(location);
277        self.page_numberings
278            .get(page.get() - 1)
279            .and_then(|slot| slot.as_ref())
280    }
281
282    /// Gets the page supplement for the given location, if any.
283    pub fn page_supplement(&self, location: Location) -> Content {
284        let page = self.page(location);
285        self.page_supplements.get(page.get() - 1).cloned().unwrap_or_default()
286    }
287
288    /// Retrieves the ID to link to for this location in HTML export.
289    pub fn html_id(&self, location: Location) -> Option<&EcoString> {
290        self.html_ids.get(&location)
291    }
292
293    /// Try to find a location for an element with the given `key` hash
294    /// that is closest after the `anchor`.
295    ///
296    /// This is used for introspector-assisted location assignment during
297    /// measurement. See the "Dealing with Measurement" section of the
298    /// [`Locator`](crate::introspection::Locator) docs for more details.
299    pub fn locator(&self, key: u128, anchor: Location) -> Option<Location> {
300        let anchor = self.loc_index(&anchor);
301        self.keys
302            .get(&key)
303            .iter()
304            .copied()
305            .min_by_key(|loc| self.loc_index(loc).wrapping_sub(anchor))
306    }
307}
308
309impl Debug for Introspector {
310    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
311        f.pad("Introspector(..)")
312    }
313}
314
315/// A map from one keys to multiple elements.
316#[derive(Clone)]
317struct MultiMap<K, V>(FxHashMap<K, SmallVec<[V; 1]>>);
318
319impl<K, V> MultiMap<K, V>
320where
321    K: Hash + Eq,
322{
323    fn get(&self, key: &K) -> &[V] {
324        self.0.get(key).map_or(&[], |vec| vec.as_slice())
325    }
326
327    fn insert(&mut self, key: K, value: V) {
328        self.0.entry(key).or_default().push(value);
329    }
330
331    fn take(&mut self, key: &K) -> Option<impl Iterator<Item = V> + use<K, V>> {
332        self.0.remove(key).map(|vec| vec.into_iter())
333    }
334}
335
336impl<K, V> Default for MultiMap<K, V> {
337    fn default() -> Self {
338        Self(FxHashMap::default())
339    }
340}
341
342/// Caches queries.
343#[derive(Default)]
344struct QueryCache(RwLock<FxHashMap<u128, EcoVec<Content>>>);
345
346impl QueryCache {
347    fn get(&self, hash: u128) -> Option<EcoVec<Content>> {
348        self.0.read().unwrap().get(&hash).cloned()
349    }
350
351    fn insert(&self, hash: u128, output: EcoVec<Content>) {
352        self.0.write().unwrap().insert(hash, output);
353    }
354}
355
356impl Clone for QueryCache {
357    fn clone(&self) -> Self {
358        Self(RwLock::new(self.0.read().unwrap().clone()))
359    }
360}
361
362/// Builds the introspector.
363#[derive(Default)]
364pub struct IntrospectorBuilder {
365    pub pages: usize,
366    pub page_numberings: Vec<Option<Numbering>>,
367    pub page_supplements: Vec<Content>,
368    pub html_ids: FxHashMap<Location, EcoString>,
369    seen: FxHashSet<Location>,
370    insertions: MultiMap<Location, Vec<Pair>>,
371    keys: MultiMap<u128, Location>,
372    locations: FxHashMap<Location, usize>,
373    labels: MultiMap<Label, usize>,
374}
375
376impl IntrospectorBuilder {
377    /// Create an empty builder.
378    pub fn new() -> Self {
379        Self::default()
380    }
381
382    /// Processes the tags in the frame.
383    pub fn discover_in_frame(
384        &mut self,
385        sink: &mut Vec<Pair>,
386        frame: &Frame,
387        page: NonZeroUsize,
388        ts: Transform,
389    ) {
390        for (pos, item) in frame.items() {
391            match item {
392                FrameItem::Group(group) => {
393                    let ts = ts
394                        .pre_concat(Transform::translate(pos.x, pos.y))
395                        .pre_concat(group.transform);
396
397                    if let Some(parent) = group.parent {
398                        let mut nested = vec![];
399                        self.discover_in_frame(&mut nested, &group.frame, page, ts);
400                        self.register_insertion(parent.location, nested);
401                    } else {
402                        self.discover_in_frame(sink, &group.frame, page, ts);
403                    }
404                }
405                FrameItem::Tag(tag) => {
406                    self.discover_in_tag(
407                        sink,
408                        tag,
409                        Position { page, point: pos.transform(ts) },
410                    );
411                }
412                _ => {}
413            }
414        }
415    }
416
417    /// Handle a tag.
418    pub fn discover_in_tag(
419        &mut self,
420        sink: &mut Vec<Pair>,
421        tag: &Tag,
422        position: Position,
423    ) {
424        match tag {
425            Tag::Start(elem, flags) => {
426                if flags.introspectable {
427                    let loc = elem.location().unwrap();
428                    if self.seen.insert(loc) {
429                        sink.push((elem.clone(), position));
430                    }
431                }
432            }
433            Tag::End(loc, key, flags) => {
434                if flags.introspectable {
435                    self.keys.insert(*key, *loc);
436                }
437            }
438        }
439    }
440
441    /// Saves nested pairs as logically belonging to the `parent`.
442    pub fn register_insertion(&mut self, parent: Location, nested: Vec<Pair>) {
443        self.insertions.insert(parent, nested);
444    }
445
446    /// Build a complete introspector with all acceleration structures from a
447    /// list of top-level pairs.
448    pub fn finalize(mut self, root: Vec<Pair>) -> Introspector {
449        self.locations.reserve(self.seen.len());
450
451        // Save all pairs and their descendants in the correct order.
452        let mut elems = Vec::with_capacity(self.seen.len());
453        for pair in root {
454            self.visit(&mut elems, pair);
455        }
456
457        Introspector {
458            pages: self.pages,
459            page_numberings: self.page_numberings,
460            page_supplements: self.page_supplements,
461            html_ids: self.html_ids,
462            elems,
463            keys: self.keys,
464            locations: self.locations,
465            labels: self.labels,
466            queries: QueryCache::default(),
467        }
468    }
469
470    /// Saves a pair and all its descendants into `elems` and populates the
471    /// acceleration structures.
472    fn visit(&mut self, elems: &mut Vec<Pair>, pair: Pair) {
473        let elem = &pair.0;
474        let loc = elem.location().unwrap();
475        let idx = elems.len();
476
477        // Populate the location acceleration map.
478        self.locations.insert(loc, idx);
479
480        // Populate the label acceleration map.
481        if let Some(label) = elem.label() {
482            self.labels.insert(label, idx);
483        }
484
485        // Save the element.
486        elems.push(pair);
487
488        // Process potential descendants.
489        if let Some(insertions) = self.insertions.take(&loc) {
490            for pair in insertions.flatten() {
491                self.visit(elems, pair);
492            }
493        }
494    }
495}