metamath_rs/segment_set.rs
//! Incremental multi-file parser and result set.
//!
//! Again, if you're not writing an analysis pass you should be dealing with
//! `database`, not with this module directly.
//!
//! The `parser` handles low-level parsing of slices of text, producing one or
//! more segments; this module handles the generation of those slices,
//! including the recursive loading of files from disk, the autosplit
//! heuristic, and caching.
//!
//! # Caching
//!
//! The caching is the trickiest bit here. Most analysis passes identify work
//! granules by `SegmentId`, then decide whether work needs to be redone by
//! checking if the ID is new or the address of the `Segment` for that ID has
//! changed; since we're responsible for ID assignment, we can't use IDs for
//! caching inside this module.
//!
//! `segment_set` implements two levels of caching:
//!
//! 1. When a file is loaded from disk, its pathname and modification time are
//!    saved, along with all of the segments which it generated. If the file
//!    hasn't changed at all, we can reuse the segments after a single `stat`.
//!
//! 2. After a file is loaded and split, slices are cached by content. This
//!    speeds up operation when local changes are made to large files, or when
//!    modification times cannot be used (such as some language server
//!    scenarios).
//!
//! In either case, some care is needed to retain only data which is relevant
//! in the cache. The caches are discarded on every read, but any data
//! obtained from a cache miss is forwarded immediately to the new cache; and
//! hits in the first cache also copy the data for the second cache. I'm not
//! convinced the logic is right.
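//!
//! A minimal sketch of the probe order on reload, in simplified pseudocode
//! (`split`, `reuse`, and `parse_on_worker` stand in for the real logic in
//! `read` below):
//!
//! ```ignore
//! // 1st cache: keyed by (pathname, mtime); a hit skips reading and parsing.
//! if let Some(file_result) = old_by_time.get(&(path, mtime)) {
//!     return file_result.clone();
//! }
//! // 2nd cache: keyed by slice content; a hit skips parsing one slice.
//! for slice in split(file) {
//!     match old_by_content.get(&LongBuf(slice.clone())) {
//!         Some(segments) => reuse(segments),
//!         None => parse_on_worker(slice),
//!     }
//! }
//! ```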
//!
//! # Diffing
//!
//! After the segments are read and parsed (possibly with cache hits), segment
//! IDs need to be assigned. The rule is that two segment IDs never change
//! relative order; thus adding and removing segments is easy, but if the
//! order has changed we can't always reuse an ID even when we can reuse the
//! `Segment`.
//!
//! The most correct thing to do here would be a longest common subsequence
//! calculation; currently we cheat and renumber all segments except for the
//! common prefix and common suffix. Thus, highly localized changes will keep
//! most IDs the same, but if you make changes at both the beginning and the
//! end, most IDs will change and require new work from analysis passes. A
//! full LCS would make changing the beginning and end at the same time
//! faster, and is attractive future work.
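//!
//! A worked example of the prefix/suffix trim (letters stand for reusable
//! `Segment` pointers):
//!
//! ```text
//! old: [A, B, C, D]        new: [A, X, Y, D]
//!       ^-- common prefix        common suffix --^
//! ```
//!
//! `A` and `D` keep their IDs, and the IDs of `B` and `C` are reassigned to
//! `X` and `Y`. With old `[A, B]` and new `[B, A]`, however, neither endpoint
//! matches, so both segments receive fresh IDs even though both `Segment`s
//! are reused.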

use crate::database::{DbOptions, Executor, Promise};
use crate::diag::Diagnostic;
use crate::segment::{Comparer, Segment, SegmentOrder, SegmentRef};
use crate::statement::{SegmentId, StatementAddress};
use crate::util::{find_chapter_header, HashMap, HashSet};
use crate::{parser, Span, StatementRef};
use filetime::FileTime;
use std::collections::VecDeque;
use std::fs::{self, File};
use std::hash::{Hash, Hasher};
use std::io::{self, Read};
use std::mem;
use std::ops::RangeBounds;
use std::str;
use std::sync::Arc;

/// Memory buffer wrapper which hashes by length.
///
/// We don't want to repeatedly hash tens of MBs of source text; but the
/// segments are long enough that their lengths are likely to contain enough
/// entropy already.
#[derive(Eq, Clone, Debug)]
struct LongBuf(Arc<Vec<u8>>);

impl Hash for LongBuf {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.0.len().hash(state);
    }
}

impl PartialEq for LongBuf {
    fn eq(&self, other: &Self) -> bool {
        self.0 == other.0
    }
}
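
// A minimal sketch of the contract above (`hash_of` is a hypothetical helper,
// not part of this module): buffers of equal length collide in the hash and
// are only distinguished by the full byte comparison in `eq`, so a `HashMap`
// keyed by `LongBuf` pays the O(len) comparison only on bucket collisions.
//
//     use std::collections::hash_map::DefaultHasher;
//     fn hash_of<T: Hash>(t: &T) -> u64 {
//         let mut h = DefaultHasher::new();
//         t.hash(&mut h);
//         h.finish()
//     }
//     let a = LongBuf(Arc::new(b"abc".to_vec()));
//     let b = LongBuf(Arc::new(b"xyz".to_vec()));
//     assert_eq!(hash_of(&a), hash_of(&b)); // same length => same hash
//     assert_ne!(a, b);                     // different bytes => not equal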

/// Tracks source information for a Segment that can be used by the diagnostic
/// printer.
///
/// By separating this data from the segment itself, the segment can be reused
/// even if its filename or (more likely) byte offset within a sliced file
/// changes.
///
/// _This is likely to change when line number calculation is added._
#[derive(Debug)]
pub struct SourceInfo {
    /// Name of the source file as loaded.
    pub(crate) name: String,
    /// Reference to the full unsliced source buffer.
    pub(crate) text: Arc<Vec<u8>>,
    /// Span of the parser input within the file; all spans reported by the
    /// parser are relative to this.
    pub(crate) span: Span,
}

impl SourceInfo {
    /// Build a new `SourceInfo` from a text buffer.
    #[must_use]
    pub fn new(name: String, text: Arc<Vec<u8>>) -> Self {
        Self {
            span: Span::new(0, text.len()),
            name,
            text,
        }
    }
}
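
// A sketch of the invariant `new` establishes (the buffer is hypothetical):
// the span covers the whole text, so spans reported by the parser relative to
// this `SourceInfo` coincide with absolute file offsets for an unsliced
// source.
//
//     let text = Arc::new(b"$c wff $.".to_vec());
//     let info = SourceInfo::new("set.mm".to_string(), text);
//     assert_eq!(info.span, Span::new(0, 9));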

/// The result of parsing one or more segments from a single slice of a source
/// file: a segment collection, a source file reference, and possibly text to
/// use as a key for insertion into the second cache. If this parse result
/// stems from an I/O error, it is not inserted into the second cache, because
/// the "source" (an empty buffer) is not discriminating in that case.
#[derive(Debug, Clone)]
struct SliceSR(Option<LongBuf>, Vec<Arc<Segment>>, Arc<SourceInfo>);
/// The result of parsing an actual source file: one or more slice results,
/// and a key for the first cache if successful.
#[derive(Debug, Clone)]
struct FileSR(Option<(String, FileTime)>, Vec<SliceSR>);

/// `SegmentSet` is a container for parsed databases.
///
/// If you're not writing an analysis pass, you will want to handle this
/// through `Database`; if you are, aliasing prevents using `Database` and
/// you'll be using this type directly.
///
/// The `SegmentSet` tracks all of the segments reachable from the current
/// start file, with parsing results and cache information to allow fast
/// incremental reloads. It maintains the ordering of the segments, and
/// provides a `SegmentOrder` object which can be used to compare segment IDs.
/// It is currently also a convenient access point for analysis passes to get
/// the option block and the work queue executor, although those
/// responsibilities may be moved.
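///
/// A sketch of typical analysis-pass usage (`set` is a hypothetical
/// `SegmentSet`; the methods are the ones defined below):
///
/// ```ignore
/// for sref in set.segments(..) {
///     // A work granule: sref.id, plus the address of the segment behind it.
/// }
/// let ord = set.order.cmp(&id1, &id2); // total order on segment IDs
/// ```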
#[derive(Debug, Clone)]
pub(crate) struct SegmentSet {
    /// The option block controlling this database.
    pub(crate) options: Arc<DbOptions>,
    /// The work queue for use with this database.
    pub(crate) exec: Executor,
    /// Order structure which records the relative order of segment IDs
    /// created by the `SegmentSet`.
    pub(crate) order: Arc<SegmentOrder>,
    /// Track segment and source info in parallel so they can be updated
    /// independently in the slicing case and if a file is renamed.
    segments: HashMap<SegmentId, (Arc<Segment>, Arc<SourceInfo>)>,
    /// First cache as described in the module comment.
    file_cache: HashMap<(String, FileTime), FileSR>,
    /// Second cache as described in the module comment.
    parse_cache: HashMap<LongBuf, Vec<Arc<Segment>>>,
}

impl SegmentSet {
    /// Start a new empty segment set in the context of an option block and
    /// executor which were previously created by the `Database`.
    pub(crate) fn new(opts: Arc<DbOptions>, exec: &Executor) -> Self {
        SegmentSet {
            options: opts,
            exec: exec.clone(),
            order: Arc::default(),
            segments: HashMap::default(),
            parse_cache: HashMap::default(),
            file_cache: HashMap::default(),
        }
    }

    /// Reset the segment set to the empty state.
    pub(crate) fn clear(&mut self) {
        *Arc::make_mut(&mut self.order) = SegmentOrder::default();
        self.segments = HashMap::default();
        self.parse_cache = HashMap::default();
        self.file_cache = HashMap::default();
    }

    /// Iterates over all loaded segments in logical order.
    pub(crate) fn segments(
        &self,
        range: impl RangeBounds<SegmentId>,
    ) -> impl DoubleEndedIterator<Item = SegmentRef<'_>> + Clone {
        // Silently skips IDs which no longer have a segment.
        self.order.range(range).filter_map(move |id| {
            Some(SegmentRef {
                id,
                segment: &self.segments.get(&id)?.0,
            })
        })
    }

    /// Fetch a handle to a loaded segment given its ID.
    pub(crate) fn segment(&self, seg_id: SegmentId) -> SegmentRef<'_> {
        SegmentRef {
            id: seg_id,
            segment: &self.segments[&seg_id].0,
        }
    }

    /// Fetch a handle to a loaded segment given a possibly stale ID.
    pub(crate) fn segment_opt(&self, seg_id: SegmentId) -> Option<SegmentRef<'_>> {
        self.segments
            .get(&seg_id)
            .map(|(seg, _srcinfo)| SegmentRef {
                id: seg_id,
                segment: seg,
            })
    }

    /// Fetch source information for a loaded segment given its ID.
    pub(crate) fn source_info(&self, seg_id: SegmentId) -> &Arc<SourceInfo> {
        &self.segments[&seg_id].1
    }

    /// Fetches a handle to a statement given a global address.
    pub(crate) fn statement(&self, addr: StatementAddress) -> StatementRef<'_> {
        self.segment(addr.segment_id).statement(addr.index)
    }

    /// Fetches a handle to a statement given a global address, or returns a
    /// dummy statement when the index is `NO_STATEMENT`.
    pub(crate) fn statement_or_dummy(&self, addr: StatementAddress) -> StatementRef<'_> {
        self.segment(addr.segment_id).statement_or_dummy(addr.index)
    }

    /// Reports any parse errors associated with loaded segments.
    pub(crate) fn parse_diagnostics(&self) -> Vec<(StatementAddress, Diagnostic)> {
        let mut out = Vec::new();
        for sref in self.segments(..) {
            for &(ix, ref d) in &sref.diagnostics {
                out.push((StatementAddress::new(sref.id, ix), d.clone()));
            }
        }
        out
    }

    /// Replaces the content of the `SegmentSet` with data loaded from disk
    /// files or memory.
    ///
    /// Each element of `data` intercedes for (shadows) a file of the same
    /// name for the purposes of file inclusion statements. If a name is not
    /// matched in `data`, it is read as a file relative to the current
    /// directory.
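    ///
    /// For example (a sketch; the names are hypothetical):
    /// `read("a.mm".into(), vec![("a.mm".into(), buf)])` parses `buf` in
    /// place of the file `a.mm`, while a `$[ b.mm $]` inclusion not matched
    /// in `data` is still read from disk.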
    pub(crate) fn read(&mut self, path: String, data: Vec<(String, Vec<u8>)>) {
        // data which is kept during the recursive load process, which does
        // _not_ have access to the SegmentSet
        struct RecState {
            options: Arc<DbOptions>,
            /// second cache from the last load
            old_by_content: HashMap<LongBuf, Vec<Arc<Segment>>>,
            /// second cache which will be saved after this load is done
            new_by_content: HashMap<LongBuf, Vec<Arc<Segment>>>,
            /// first cache from the last load
            old_by_time: HashMap<(String, FileTime), FileSR>,
            /// first cache which will be saved after this load is done
            new_by_time: HashMap<(String, FileTime), FileSR>,
            /// segments which have been placed in the order so far
            segments: SegList,
            /// pathnames which have already been read once during this load
            included: HashSet<String>,
            /// in-memory data which intercedes for files of the same name
            preload: HashMap<String, Vec<u8>>,
            exec: Executor,
        }

        /// one or more segments with provenance; the flattened output of a
        /// parse result or cache hit
        type SegList = Vec<(Arc<Segment>, Arc<SourceInfo>)>;

        /// Given a buffer of data from a file, split it and queue jobs to do
        /// the parsing.
        fn split_and_parse(
            state: &RecState,
            path: String,
            timestamp: Option<FileTime>,
            buf: Vec<u8>,
        ) -> Promise<FileSR> {
            let mut parts = Vec::new();
            let buf = Arc::new(buf);
            // See if we need to parse this file in multiple slices. The
            // slicing is a slight incompatibility (no chapter headers are
            // allowed inside groups) but is needed for full parallelism.
            if state.options.autosplit && buf.len() > 1_048_576 {
                let mut sstart = 0;
                loop {
                    if let Some(chap) = find_chapter_header(&buf[sstart..]) {
                        parts.push(sstart..sstart + chap);
                        sstart += chap;
                    } else {
                        parts.push(sstart..buf.len());
                        break;
                    }
                }
            } else {
                parts.push(0..buf.len());
            }

            let mut promises = Vec::new();
            for range in parts {
                let partbuf = if range == (0..buf.len()) {
                    buf.clone()
                } else {
                    Arc::new(buf[range.clone()].to_owned())
                };

                let srcinfo = Arc::new(SourceInfo {
                    name: path.clone(),
                    text: buf.clone(),
                    span: Span::new(range.start, range.end),
                });

                let cachekey = LongBuf(partbuf.clone());

                // Probe the old second cache.
                if let Some(eseg) = state.old_by_content.get(&cachekey) {
                    // Hit, don't queue a job; keep the key so that the result
                    // gets reinserted into the new second cache.
                    let sres = SliceSR(Some(cachekey), eseg.clone(), srcinfo);
                    promises.push(Promise::new(sres));
                } else {
                    let trace = state.options.trace_recalc;
                    // Parse it on a worker thread.
                    promises.push(state.exec.exec(partbuf.len(), move || {
                        if trace {
                            println!("parse({:?})", parser::guess_buffer_name(&partbuf));
                        }
                        SliceSR(Some(cachekey), parser::parse_segments(&partbuf), srcinfo)
                    }));
                }
            }
            // Make a wrapper promise to build the proper file result. This
            // does _not_ run on a worker thread.
            Promise::join(promises)
                .map(move |srlist| FileSR(timestamp.map(move |s| (path, s)), srlist))
        }

        // Read a file from disk (intercessions have already been checked, but
        // the first cache has not) and split/parse it; returns an `Err` on
        // I/O error.
        fn canonicalize_and_read(state: &RecState, path: String) -> io::Result<Promise<FileSR>> {
            let metadata = fs::metadata(&path)?;
            let time = FileTime::from_last_modification_time(&metadata);

            // Probe the first cache.
            if let Some(old_fsr) = state.old_by_time.get(&(path.clone(), time)) {
                Ok(Promise::new(old_fsr.clone()))
            } else {
                // Miss, but we have the file size, so try to read it in one
                // call into a buffer we won't have to move.
                let mut fh = File::open(&path)?;
                let mut buf = Vec::with_capacity(metadata.len() as usize + 1);
                // note: File's read_to_end uses the buffer capacity to choose
                // how much to read
                fh.read_to_end(&mut buf)?;

                Ok(split_and_parse(state, path, Some(time), buf))
            }
        }

        // We have a filename and an incomplete database in the RecState; read
        // the file and queue tasks to parse it.
        fn read_and_parse(state: &mut RecState, path: String) -> Promise<FileSR> {
            // THIS IS WRONG: https://github.com/sorear/smetamath-rs/issues/22

            // We do need to avoid issuing multiple parses for the same file,
            // but catching it here leads to misassociation.
            if !state.included.insert(path.clone()) {
                return Promise::new(FileSR(None, Vec::new()));
            }
            // Check intercessions.
            match state.preload.get(&path).cloned() {
                None => {
                    // Read from the filesystem.
                    canonicalize_and_read(state, path.clone()).unwrap_or_else(|cerr| {
                        // The read failed; insert a bogus segment so we have
                        // a place to hang the errors.
                        let sinfo = SourceInfo {
                            name: path.clone(),
                            text: Arc::new(Vec::new()),
                            span: Span::NULL,
                        };
                        let seg = parser::dummy_segment(From::from(cerr));
                        // Cache keys are None, so this won't pollute any caches.
                        Promise::new(FileSR(
                            None,
                            vec![SliceSR(None, vec![seg], Arc::new(sinfo))],
                        ))
                    })
                }
                Some(data) => split_and_parse(state, path, None, data),
            }
        }

        // File data has come back from the worker thread; make sure it's in
        // the first and second caches as appropriate, even if it came from a
        // hit earlier.
        fn flat(state: &mut RecState, inp: FileSR) -> SegList {
            let mut out = Vec::new();
            if let Some(key) = inp.0.clone() {
                // Insert into the first cache.
                state.new_by_time.insert(key, inp.clone());
            }
            for SliceSR(keyo, vec, sinfo) in inp.1 {
                if let Some(key) = keyo {
                    // Insert into the second cache.
                    state.new_by_content.insert(key, vec.clone());
                }
                for aseg in vec {
                    // We have a contextualized segment.
                    out.push((aseg, sinfo.clone()));
                }
            }
            out
        }

        // The main recursive parsing driver: we've already parsed one file;
        // now incorporate it into the database under construction while
        // parsing includes.
        fn recurse(state: &mut RecState, segments: SegList) {
            let mut promises = VecDeque::new();

            for seg in &segments {
                if seg.0.next_file != Span::NULL {
                    let chain = str::from_utf8(seg.0.next_file.as_ref(&seg.0.buffer))
                        .expect("parser verified ASCII")
                        .to_owned();
                    // Parse this include in the background.
                    promises.push_back(read_and_parse(state, chain));
                }
            }
            for seg in segments {
                let has_next = seg.0.next_file != Span::NULL;
                state.segments.push(seg);
                if has_next {
                    // Wait for the include to be done parsing, then
                    // incorporate it and recurse.
                    let pp = promises.pop_front().unwrap().wait();
                    let nsegs = flat(state, pp);
                    recurse(state, nsegs);
                }
            }
        }
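
        // A worked example of the ordering `recurse` produces (hypothetical
        // files): if `a.mm` parses to segments [A1, A2] with a `$[ b.mm $]`
        // inclusion ending A1, and `b.mm` parses to [B], the segments are
        // pushed in the order [A1, B, A2].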

        // Note that we clear out the caches immediately, and only copy
        // forward things that are actually used, to avoid memory bloat.

        let mut state = RecState {
            options: self.options.clone(),
            old_by_content: mem::take(&mut self.parse_cache),
            new_by_content: HashMap::default(),
            old_by_time: mem::take(&mut self.file_cache),
            new_by_time: HashMap::default(),
            segments: Vec::new(),
            included: HashSet::default(),
            preload: data.into_iter().collect(),
            exec: self.exec.clone(),
        };

        // Parse and recursively incorporate the initial file.
        let isegs = read_and_parse(&mut state, path);
        let isegs = flat(&mut state, isegs.wait());
        recurse(&mut state, isegs);

        // We now have the new and old segment lists; time for ID allocation.
        // See the module comment about LCS options.
        let mut old_segs = Vec::new();
        for (&seg_id, seg) in &self.segments {
            old_segs.push((seg_id, seg.clone()));
        }
        old_segs.sort_by(|x, y| self.order.cmp(&x.0, &y.0));
        let mut new_segs = state.segments;

        let mut old_r = 0..old_segs.len();
        let mut new_r = 0..new_segs.len();

        // LCS lite: trim the matching common prefix...
        while old_r.start < old_r.end
            && new_r.start < new_r.end
            && Arc::ptr_eq(&(old_segs[old_r.start].1).0, &new_segs[new_r.start].0)
        {
            old_r.start += 1;
            new_r.start += 1;
        }

        // ...and the matching common suffix.
        while old_r.start < old_r.end
            && new_r.start < new_r.end
            && Arc::ptr_eq(&(old_segs[old_r.end - 1].1).0, &new_segs[new_r.end - 1].0)
        {
            old_r.end -= 1;
            new_r.end -= 1;
        }

        // Save the caches rebuilt during this load.
        self.parse_cache = state.new_by_content;
        self.file_cache = state.new_by_time;

        // Reuse as many IDs as possible even if the segment isn't exactly the
        // same. Hopefully the corresponding new and old segments are
        // _similar_, and later passes will be able to leverage that
        // similarity.
        while old_r.start < old_r.end && new_r.start < new_r.end {
            self.segments
                .insert(old_segs[old_r.start].0, new_segs[new_r.start].clone());
            new_r.start += 1;
            old_r.start += 1;
        }

        // Now actually reassign the IDs: free the IDs of unmatched old
        // segments...
        let order = Arc::make_mut(&mut self.order);
        for &(id, _) in &old_segs[old_r.clone()] {
            order.free_id(id);
            self.segments.remove(&id);
        }

        // ...and allocate fresh IDs for the unmatched new segments, inserting
        // before the first segment of the common suffix (or before the START
        // sentinel if there is no suffix).
        let before = if old_r.end == old_segs.len() {
            SegmentOrder::START
        } else {
            old_segs[old_r.end].0
        };

        for seg in new_segs.drain(new_r) {
            let id = order.new_before(before);
            self.segments.insert(id, seg);
        }
    }
}