reflexo_typst2vec/pass/
span2vec.rs

1use core::fmt;
2use std::collections::HashMap;
3use std::sync::atomic::AtomicUsize;
4use std::sync::Arc;
5
6use crossbeam_queue::SegQueue;
7use rayon::iter::{IntoParallelIterator, ParallelIterator};
8use reflexo::error::prelude::*;
9use reflexo::hash::Fingerprint;
10use std::sync::OnceLock;
11
12use crate::debug_loc::{
13    ElementPoint, FileLocation, FlatSourceLocation, SourceSpan, SourceSpanOffset,
14};
15
16/// Represents a node in the source mapping tree.
17pub struct Node {
18    pub kind: u8,
19    pub source_span: Fingerprint,
20    pub children: Vec<Node>,
21}
22
23type SrcVec = Vec<(usize, SourceNodeKind, Fingerprint)>;
24
25#[derive(Debug)]
26struct LazyVec {
27    is_sorted: bool,
28    val: SrcVec,
29}
30
31impl Default for LazyVec {
32    fn default() -> Self {
33        Self::new()
34    }
35}
36
37/// A Enum representing [`SourceNodeKind::Text`] or [`SourceNodeKind::Char`].
38const SOURCE_MAPPING_TYPE_TEXT: u32 = 0;
39/// A Enum representing [`SourceNodeKind::Group`].
40const SOURCE_MAPPING_TYPE_GROUP: u32 = 1;
41/// A Enum representing [`SourceNodeKind::Image`].
42const SOURCE_MAPPING_TYPE_IMAGE: u32 = 2;
43/// A Enum representing [`SourceNodeKind::Shape`].
44const SOURCE_MAPPING_TYPE_SHAPE: u32 = 3;
45/// A Enum representing [`SourceNodeKind::Page`].
46const SOURCE_MAPPING_TYPE_PAGE: u32 = 4;
47/// A Enum representing internal glyph offset of [`SOURCE_MAPPING_TYPE_TEXT`].
48const SOURCE_MAPPING_TYPE_CHAR_INDEX: u32 = 5;
49
50impl LazyVec {
51    fn new() -> Self {
52        Self {
53            is_sorted: false,
54            val: Vec::new(),
55        }
56    }
57
58    fn ensure_sorted(&mut self) {
59        if !self.is_sorted {
60            self.val.sort_by_key(|x| x.0);
61            self.is_sorted = true;
62        }
63    }
64
65    fn get(&mut self, idx: usize) -> Option<&(usize, SourceNodeKind, Fingerprint)> {
66        self.ensure_sorted();
67        self.val.get(idx)
68    }
69}
70
71const SPAN_ROUTING: usize = 63;
72
73/// The unevaluated span info.
74enum RawSpanInfo {
75    /// A region to be inserted into the tree.
76    Region(SourceRegion),
77    /// A belonging relation to children queue to calculate the parents.
78    XContainsY { x: usize, y: usize },
79}
80
81type RawSpanInfoQueue = crossbeam_queue::SegQueue<RawSpanInfo>;
82
83struct LazySpanCollector {
84    val: [RawSpanInfoQueue; SPAN_ROUTING + 1],
85}
86
87impl Default for LazySpanCollector {
88    fn default() -> Self {
89        Self {
90            val: std::array::from_fn(|_| crossbeam_queue::SegQueue::new()),
91        }
92    }
93}
94
95impl LazySpanCollector {
96    fn reset(&mut self) {
97        *self = Self::default();
98    }
99
100    fn shard(&self, region: usize) -> &RawSpanInfoQueue {
101        // lower bits of region.idx is the index of the queue
102        let idx = region & SPAN_ROUTING;
103        &self.val[idx]
104    }
105
106    fn push(&self, region: SourceRegion) {
107        // Inserts XContainsY relation into Y's queue to calculate the parents.
108        match &region.kind {
109            SourceNodeKind::Page { region: ch } | SourceNodeKind::Group { region: ch } => {
110                self.shard(*ch).push(RawSpanInfo::XContainsY {
111                    x: region.region,
112                    y: *ch,
113                });
114            }
115            SourceNodeKind::Char(..)
116            | SourceNodeKind::Text(..)
117            | SourceNodeKind::Image(..)
118            | SourceNodeKind::Shape(..)
119            | SourceNodeKind::Doc => {}
120        }
121
122        // Inserts the region into its own queue.
123        self.shard(region.region).push(RawSpanInfo::Region(region));
124    }
125}
126
127#[derive(Default)]
128struct LazyRegionInfo {
129    /// A map from parent region id to a list of children.
130    children: HashMap<usize, LazyVec>,
131    /// A map from child region id to its parent.
132    parents: HashMap<usize, usize>,
133    /// A map from span to belonging region ids.
134    span_indice: HashMap<SourceSpan, Vec<usize>>,
135}
136
137impl fmt::Debug for LazyRegionInfo {
138    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
139        f.debug_struct("LazyRegionInfo")
140            .field("children_cnt", &self.children.len())
141            .field("parents_cnt", &self.parents.len())
142            .field("span_indice_cnt", &self.span_indice.len())
143            .finish()
144    }
145}
146
147impl From<SegQueue<RawSpanInfo>> for LazyRegionInfo {
148    fn from(value: SegQueue<RawSpanInfo>) -> Self {
149        let mut children = HashMap::new();
150        let mut parents = HashMap::new();
151
152        let mut span_indice = HashMap::new();
153        let mut insert_span = |span: SourceSpan, region: usize| {
154            span_indice
155                .entry(span)
156                .or_insert_with(Vec::new)
157                .push(region);
158        };
159
160        for i in value.into_iter() {
161            match i {
162                RawSpanInfo::XContainsY { x, y } => {
163                    parents.insert(y, x);
164                }
165                RawSpanInfo::Region(region) => {
166                    match &region.kind {
167                        SourceNodeKind::Char((s, _)) => {
168                            insert_span(*s, region.region);
169                        }
170                        SourceNodeKind::Text(chars) => {
171                            for s in chars.iter() {
172                                insert_span(s.0, region.region);
173                            }
174                        }
175                        SourceNodeKind::Image(s) => {
176                            insert_span(*s, region.region);
177                        }
178                        SourceNodeKind::Shape(s) => {
179                            insert_span(*s, region.region);
180                        }
181                        SourceNodeKind::Page { .. }
182                        | SourceNodeKind::Group { .. }
183                        | SourceNodeKind::Doc => {}
184                    }
185
186                    children
187                        .entry(region.region)
188                        .or_insert_with(LazyVec::new)
189                        .val
190                        .push((region.idx as usize, region.kind, region.item));
191                }
192            }
193        }
194
195        Self {
196            children,
197            parents,
198            span_indice,
199        }
200    }
201}
202
203struct LazySpanInfo {
204    /// A lazy ordered tree indexed by a region id.
205    ///
206    /// When one accesses it with a region id, it will
207    /// lazily sort the elements in the region.
208    elem_tree: [LazyRegionInfo; SPAN_ROUTING + 1],
209}
210
211impl LazySpanInfo {
212    fn get_mut(&mut self, doc_region: &usize) -> Option<&mut LazyVec> {
213        let idx = doc_region & SPAN_ROUTING;
214        self.elem_tree[idx].children.get_mut(doc_region)
215    }
216
217    fn get_parent(&self, doc_region: &usize) -> Option<usize> {
218        let idx = doc_region & SPAN_ROUTING;
219        self.elem_tree[idx].parents.get(doc_region).copied()
220    }
221}
222
223impl Default for LazySpanInfo {
224    fn default() -> Self {
225        Self {
226            elem_tree: std::array::from_fn(|_| LazyRegionInfo::default()),
227        }
228    }
229}
230
231impl From<LazySpanCollector> for LazySpanInfo {
232    fn from(collector: LazySpanCollector) -> Self {
233        let val = collector
234            .val
235            .into_par_iter()
236            .map(LazyRegionInfo::from)
237            .collect::<Vec<_>>()
238            .try_into()
239            .unwrap();
240
241        LazySpanInfo { elem_tree: val }
242    }
243}
244
245#[derive(Default)]
246pub struct Span2VecPass {
247    pub should_attach_debug_info: bool,
248
249    region_cnt: AtomicUsize,
250    pub doc_region: AtomicUsize,
251
252    span_tree: OnceLock<LazySpanInfo>,
253    collector: LazySpanCollector,
254}
255
256#[derive(Debug, Clone)]
257pub enum SourceNodeKind {
258    Doc,
259    Page { region: usize },
260    Group { region: usize },
261    Char((SourceSpan, u16)),
262    Text(Arc<[(SourceSpan, u16)]>),
263    Image(SourceSpan),
264    Shape(SourceSpan),
265}
266
267pub struct SourceRegion {
268    pub region: usize,
269    pub idx: u32,
270    pub kind: SourceNodeKind,
271    pub item: Fingerprint,
272}
273
274impl Span2VecPass {
275    pub fn set_should_attach_debug_info(&mut self, should_attach_debug_info: bool) {
276        assert!(
277            (SPAN_ROUTING + 1).is_power_of_two(),
278            "SPAN_ROUTING + 1 must be power of 2"
279        );
280        self.should_attach_debug_info = should_attach_debug_info;
281    }
282
283    pub fn reset(&mut self) {
284        // self.region_cnt = 0;
285        self.region_cnt
286            .store(1, std::sync::atomic::Ordering::SeqCst);
287        self.collector.reset();
288        self.span_tree = OnceLock::new();
289        self.doc_region
290            .store(0, std::sync::atomic::Ordering::SeqCst);
291    }
292
293    pub fn start(&self) -> usize {
294        self.region_cnt
295            .fetch_add(1, std::sync::atomic::Ordering::SeqCst)
296    }
297
298    pub fn push_span(&self, region: SourceRegion) {
299        self.collector.push(region);
300    }
301
302    /// Queries the element paths from the given span offset.
303    ///
304    /// Returns a list of paths, each path is a list of (kind, offset,
305    /// fingerprint)
306    /// + kind: the type of the element
307    /// + offset: the index of the element in its parent
308    /// + fingerprint: the fingerprint of the element
309    pub fn query_element_paths(
310        &mut self,
311        span_offset: SourceSpanOffset,
312    ) -> ZResult<Vec<Vec<ElementPoint>>> {
313        self.span_tree.get_or_init(|| {
314            log::info!("lazy spans are initializing");
315            std::mem::take(&mut self.collector).into()
316        });
317
318        let span_info = self
319            .span_tree
320            .get_mut()
321            .ok_or_else(|| error_once!("span info not initialized"))?;
322
323        let span = span_offset.span;
324
325        // Finds all the regions that contains the span.
326        let mut related_regions: Vec<usize> = span_info
327            .elem_tree
328            .iter_mut()
329            .flat_map(|s| s.span_indice.get(&span))
330            .flatten()
331            .copied()
332            .collect();
333        related_regions.sort();
334        related_regions.dedup();
335
336        // log::info!("pass check related_regions({related_regions:?}");
337
338        let doc_region = *self.doc_region.get_mut();
339        if doc_region == 0 {
340            return Err(error_once!("doc not initialized"));
341        }
342
343        let mut res = vec![];
344        for reg in related_regions {
345            let ch = span_info
346                .get_mut(&reg)
347                .ok_or_else(|| error_once!("related region not found", reg: reg))?;
348            ch.ensure_sorted();
349
350            for (idx, ch) in ch.val.iter().enumerate() {
351                match &ch.1 {
352                    SourceNodeKind::Char((s, _)) => {
353                        // todo: check upper bound
354                        if *s == span {
355                            log::info!("pass cursor char({s:?})");
356                            res.push(vec![(
357                                reg as u32,
358                                ElementPoint {
359                                    kind: SOURCE_MAPPING_TYPE_TEXT,
360                                    index: idx as u32,
361                                    fingerprint: "".to_owned(),
362                                },
363                            )]);
364                        }
365                    }
366                    SourceNodeKind::Text(chars) => {
367                        // log::info!("pass cursor check text({chars:?})");
368                        for (ch_idx, (s, byte_offset)) in chars.iter().enumerate() {
369                            // todo: it may not be monotonic
370                            let next = chars.get(ch_idx + 1);
371                            let byte_range = if matches!(next, Some((next, _)) if next == s) {
372                                (*byte_offset as usize)..(next.unwrap().1 as usize)
373                            } else {
374                                (*byte_offset as usize)..(usize::MAX)
375                            };
376                            if *s == span && byte_range.contains(&span_offset.offset) {
377                                log::info!("pass cursor text({s:?})");
378                                res.push(vec![
379                                    (
380                                        0u32,
381                                        ElementPoint {
382                                            kind: SOURCE_MAPPING_TYPE_CHAR_INDEX,
383                                            index: ch_idx as u32,
384                                            fingerprint: "".to_owned(),
385                                        },
386                                    ),
387                                    (
388                                        reg as u32,
389                                        ElementPoint {
390                                            kind: SOURCE_MAPPING_TYPE_TEXT,
391                                            index: idx as u32,
392                                            fingerprint: "".to_owned(),
393                                        },
394                                    ),
395                                ]);
396                            }
397                        }
398                    }
399                    SourceNodeKind::Image(s) => {
400                        if *s == span {
401                            log::info!("pass cursor image({s:?})");
402                            res.push(vec![(
403                                reg as u32,
404                                ElementPoint {
405                                    kind: SOURCE_MAPPING_TYPE_IMAGE,
406                                    index: idx as u32,
407                                    fingerprint: "".to_owned(),
408                                },
409                            )]);
410                        }
411                    }
412                    SourceNodeKind::Shape(s) => {
413                        if *s == span {
414                            log::info!("pass cursor shape({s:?})");
415                            res.push(vec![(
416                                reg as u32,
417                                ElementPoint {
418                                    kind: SOURCE_MAPPING_TYPE_SHAPE,
419                                    index: idx as u32,
420                                    fingerprint: "".to_owned(),
421                                },
422                            )]);
423                        }
424                    }
425                    SourceNodeKind::Page { .. }
426                    | SourceNodeKind::Group { .. }
427                    | SourceNodeKind::Doc => {}
428                }
429            }
430        }
431
432        log::info!("pass found candidates({res:?}), with root: {doc_region}");
433        for r in res.iter_mut() {
434            let reg = r.last().unwrap().0 as usize;
435            let mut cur = reg;
436            while cur != doc_region {
437                let par = span_info
438                    .get_parent(&cur)
439                    .ok_or_else(|| error_once!("parent not found", cur: cur))?;
440
441                let ch = span_info
442                    .get_mut(&par)
443                    .ok_or_else(|| error_once!("region children not found", reg: par))?;
444                ch.ensure_sorted();
445
446                // log::info!("found parent({cur:?}) -> ({par:?})");
447
448                let mut found = false;
449                for (idx, ch) in ch.val.iter().enumerate() {
450                    match &ch.1 {
451                        SourceNodeKind::Page { region } => {
452                            // log::info!("pass find check page({region:?})");
453                            if *region == cur {
454                                r.push((
455                                    par as u32,
456                                    ElementPoint {
457                                        kind: SOURCE_MAPPING_TYPE_PAGE,
458                                        index: idx as u32,
459                                        fingerprint: "".to_owned(),
460                                    },
461                                ));
462                                found = true;
463                                break;
464                            }
465                        }
466                        SourceNodeKind::Group { region } => {
467                            // log::info!("pass find check group({region:?})");
468                            if *region == cur {
469                                r.push((
470                                    par as u32,
471                                    ElementPoint {
472                                        kind: SOURCE_MAPPING_TYPE_GROUP,
473                                        index: idx as u32,
474                                        fingerprint: "".to_owned(),
475                                    },
476                                ));
477                                found = true;
478                                break;
479                            }
480                        }
481                        SourceNodeKind::Char(..)
482                        | SourceNodeKind::Text(..)
483                        | SourceNodeKind::Image(..)
484                        | SourceNodeKind::Shape(..)
485                        | SourceNodeKind::Doc => {}
486                    }
487                }
488
489                if !found {
490                    break;
491                }
492                cur = par;
493            }
494
495            if cur != doc_region {
496                log::info!("drop candidate({reg:?})");
497                r.clear();
498            } else {
499                r.reverse();
500            }
501        }
502
503        res.retain(|x| !x.is_empty());
504        Ok(res
505            .into_iter()
506            .map(|x| x.into_iter().map(|y| y.1).collect())
507            .collect())
508    }
509
510    pub fn query(
511        &mut self,
512        path: &[ElementPoint],
513    ) -> ZResult<Option<(SourceSpanOffset, SourceSpanOffset)>> {
514        self.span_tree.get_or_init(|| {
515            log::info!("lazy spans are initializing");
516            std::mem::take(&mut self.collector).into()
517        });
518
519        let doc_region = *self.doc_region.get_mut();
520        if doc_region == 0 {
521            return Err(error_once!("doc not initialized"));
522        }
523
524        let span_info = self
525            .span_tree
526            .get_mut()
527            .ok_or_else(|| error_once!("span info not initialized"))?;
528
529        let mut d = span_info
530            .get_mut(&doc_region)
531            .ok_or_else(|| error_once!("not found"))?;
532
533        log::info!("pass check remote path({path:?})");
534
535        let mut candidate: Option<(SourceSpanOffset, SourceSpanOffset)> = None;
536        let mut in_text_indice: Option<Arc<[(SourceSpan, u16)]>> = None;
537        for ElementPoint {
538            kind: remote_kind,
539            index: idx,
540            fingerprint: fg,
541        } in path
542        {
543            // Special case for char index
544            if SOURCE_MAPPING_TYPE_CHAR_INDEX == *remote_kind {
545                log::info!(
546                    "pass check remote_char_index({remote_kind}, {idx}) => {:?}",
547                    in_text_indice
548                );
549                let char_idx = *idx as usize;
550                if let Some(chars) = in_text_indice.as_ref() {
551                    let ch = chars.as_ref().get(char_idx);
552                    let Some(ch) = ch else {
553                        continue;
554                    };
555                    if !ch.0.is_detached() {
556                        candidate = Some(((*ch).into(), (*ch).into()));
557                    }
558                }
559                continue;
560            }
561
562            // Overwrite the previous text indice
563            in_text_indice = None;
564            // Find the child node
565            let ch = d
566                .get(*idx as usize)
567                .ok_or_else(|| error_once!("not found"))?;
568            if !fg.is_empty() && !fg.as_str().contains(&ch.2.as_svg_id("")) {
569                return Err(error_once!("fg not match", fg: fg, ch: ch.2.as_svg_id("")));
570            }
571
572            log::info!("pass check remote({remote_kind}, {idx}) => {:?}", ch.1);
573
574            match (*remote_kind, ch.1.clone()) {
575                (SOURCE_MAPPING_TYPE_PAGE, SourceNodeKind::Page { region }) => {
576                    d = span_info.get_mut(&region).ok_or_else(
577                        || error_once!("region not found", at: remote_kind, at_idx: idx),
578                    )?;
579                }
580                (SOURCE_MAPPING_TYPE_GROUP, SourceNodeKind::Group { region }) => {
581                    d = span_info.get_mut(&region).ok_or_else(
582                        || error_once!("region not found", at: remote_kind, at_idx: idx),
583                    )?;
584                }
585                (SOURCE_MAPPING_TYPE_TEXT, SourceNodeKind::Char(ch)) => {
586                    let is_attached = |x: (SourceSpan, u16)| x.0 != SourceSpan::detached();
587                    let st = is_attached(ch).then_some(ch);
588                    candidate = st.map(From::from).zip(st.map(From::from));
589
590                    // Generates a dynamic text indice here.
591                    //
592                    // We don't wrap it as creating `SourceNodeKind::Char`, because
593                    // it will be used rarely.
594                    //
595                    // This strategy would help us to reduce the time and memory
596                    // usage.
597                    in_text_indice = Some(Arc::new([ch]));
598                }
599                (SOURCE_MAPPING_TYPE_TEXT, SourceNodeKind::Text(chars)) => {
600                    let is_attached = |x: &&(SourceSpan, u16)| x.0 != SourceSpan::detached();
601                    let st = chars.iter().find(is_attached).copied();
602                    let ed = chars.iter().rev().find(is_attached).copied();
603                    candidate = st.map(From::from).zip(ed.map(From::from));
604                    in_text_indice = Some(chars.clone());
605                }
606                (SOURCE_MAPPING_TYPE_IMAGE, SourceNodeKind::Image(s)) => {
607                    candidate = Some((s.into(), s.into()));
608                }
609                (SOURCE_MAPPING_TYPE_SHAPE, SourceNodeKind::Shape(s)) => {
610                    candidate = Some((s.into(), s.into()));
611                }
612                _ => {
613                    return Err(error_once!(
614                        "invalid/mismatch node type",
615                        ty: remote_kind,
616                        actual: format!("{:?}", ch.1),
617                        parent: format!("{:?}", candidate),
618                        child_idx_in_parent: idx,
619                    ))
620                }
621            }
622        }
623
624        Ok(candidate)
625    }
626}
627
628pub struct SourceInfo {
629    pub files: Vec<FileLocation>,
630    pub spans: HashMap<Fingerprint, FlatSourceLocation>,
631}
632
633pub struct SourceInfoCollector {}
634
635impl SourceInfoCollector {
636    pub fn finallize(self) -> SourceInfo {
637        todo!()
638    }
639    pub fn intern(_i: &Node) {
640        todo!()
641    }
642}