metamath_rs/segment_set.rs
//! Incremental multi-file parser and result set.
//!
//! Again, if you're not writing an analysis pass you should be dealing with
//! `database`, not with this module directly.
//!
//! The `parser` handles low-level parsing of slices of text, producing one or
//! more segments; this module handles the generation of those slices,
//! including the recursive loading of files from disk, the autosplit
//! heuristic, and caching.
//!
//! # Caching
//!
//! The caching is the trickiest bit here. Most analysis passes identify work
//! granules by `SegmentId`, then decide whether work needs to be redone by
//! checking if the ID is new or the address of the `Segment` for that ID has
//! changed; since we're responsible for ID assignment, we can't use IDs for
//! caching inside this module.
//!
//! `segment_set` implements two levels of caching:
//!
//! 1. When a file is loaded from disk, its pathname and modification time are
//!    saved, along with all of the segments which it generated. If the file
//!    hasn't changed at all, we can reuse the segments after a single `stat`.
//!
//! 2. After a file is loaded and split, slices are cached by content. This
//!    speeds up operation when local changes are made to large files, or when
//!    modification times cannot be used (such as some language server
//!    scenarios).
//!
//! In either case, some care is needed to retain only data which is relevant
//! in the cache. The caches are discarded on every read, but any data
//! obtained from a cache miss is forwarded immediately to the new cache; and
//! hits in the first cache also copy the data for the second cache. I'm not
//! convinced the logic is right.
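//!
//! A minimal sketch of the probe order on reload, in simplified pseudocode
//! (`split`, `reuse`, and `parse_on_worker` stand in for the real logic in
//! `read` below):
//!
//! ```ignore
//! // 1st cache: keyed by (pathname, mtime); a hit skips reading and parsing.
//! if let Some(file_result) = old_by_time.get(&(path, mtime)) {
//!     return file_result.clone();
//! }
//! // 2nd cache: keyed by slice content; a hit skips parsing one slice.
//! for slice in split(file) {
//!     match old_by_content.get(&LongBuf(slice.clone())) {
//!         Some(segments) => reuse(segments),
//!         None => parse_on_worker(slice),
//!     }
//! }
//! ```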
//!
//! # Diffing
//!
//! After the segments are read and parsed (possibly with cache hits), segment
//! IDs need to be assigned. The rule is that two segment IDs never change
//! relative order; thus adding and removing segments is easy, but if the
//! order has changed we can't always reuse an ID even when we can reuse the
//! `Segment`.
//!
//! The most correct thing to do here would be a longest common subsequence
//! calculation; currently we cheat and renumber all segments except for the
//! common prefix and common suffix. Thus, highly localized changes will keep
//! most IDs the same, but if you make changes at both the beginning and the
//! end, most IDs will change and require new work from analysis passes. A
//! full LCS would make changing the beginning and end at the same time
//! faster, and is attractive future work.
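//!
//! A worked example of the prefix/suffix trim (letters stand for reusable
//! `Segment` pointers):
//!
//! ```text
//! old: [A, B, C, D]        new: [A, X, Y, D]
//!       ^-- common prefix        common suffix --^
//! ```
//!
//! `A` and `D` keep their IDs, and the IDs of `B` and `C` are reassigned to
//! `X` and `Y`. With old `[A, B]` and new `[B, A]`, however, neither endpoint
//! matches, so both segments receive fresh IDs even though both `Segment`s
//! are reused.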

use crate::database::{DbOptions, Executor, Promise};
use crate::diag::Diagnostic;
use crate::segment::{Comparer, Segment, SegmentOrder, SegmentRef};
use crate::statement::{SegmentId, StatementAddress};
use crate::util::{find_chapter_header, HashMap, HashSet};
use crate::{parser, Span, StatementRef};
use filetime::FileTime;
use std::collections::VecDeque;
use std::fs::{self, File};
use std::hash::{Hash, Hasher};
use std::io::{self, Read};
use std::mem;
use std::ops::RangeBounds;
use std::str;
use std::sync::Arc;

/// Memory buffer wrapper which hashes by length.
///
/// We don't want to repeatedly hash tens of MBs of source text; but the
/// segments are long enough that their lengths are likely to contain enough
/// entropy already.
#[derive(Eq, Clone, Debug)]
struct LongBuf(Arc<Vec<u8>>);

impl Hash for LongBuf {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.0.len().hash(state);
    }
}

impl PartialEq for LongBuf {
    fn eq(&self, other: &Self) -> bool {
        self.0 == other.0
    }
}
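
// A minimal sketch of the contract above (`hash_of` is a hypothetical helper,
// not part of this module): buffers of equal length collide in the hash and
// are only distinguished by the full byte comparison in `eq`, so a `HashMap`
// keyed by `LongBuf` pays the O(len) comparison only on bucket collisions.
//
//     use std::collections::hash_map::DefaultHasher;
//     fn hash_of<T: Hash>(t: &T) -> u64 {
//         let mut h = DefaultHasher::new();
//         t.hash(&mut h);
//         h.finish()
//     }
//     let a = LongBuf(Arc::new(b"abc".to_vec()));
//     let b = LongBuf(Arc::new(b"xyz".to_vec()));
//     assert_eq!(hash_of(&a), hash_of(&b)); // same length => same hash
//     assert_ne!(a, b);                     // different bytes => not equal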

/// Tracks source information for a Segment that can be used by the diagnostic
/// printer.
///
/// By separating this data from the segment itself, the segment can be reused
/// even if its filename or (more likely) byte offset within a sliced file
/// changes.
///
/// _This is likely to change when line number calculation is added._
#[derive(Debug)]
pub struct SourceInfo {
    /// Name of the source file as loaded.
    pub(crate) name: String,
    /// Reference to the full unsliced source buffer.
    pub(crate) text: Arc<Vec<u8>>,
    /// Span of the parser input within the file; all spans reported by the
    /// parser are relative to this.
    pub(crate) span: Span,
}

impl SourceInfo {
    /// Build a new `SourceInfo` from a text buffer.
    #[must_use]
    pub fn new(name: String, text: Arc<Vec<u8>>) -> Self {
        Self {
            span: Span::new(0, text.len()),
            name,
            text,
        }
    }
}
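
// A sketch of the invariant `new` establishes (the buffer is hypothetical):
// the span covers the whole text, so spans reported by the parser relative to
// this `SourceInfo` coincide with absolute file offsets for an unsliced
// source.
//
//     let text = Arc::new(b"$c wff $.".to_vec());
//     let info = SourceInfo::new("set.mm".to_string(), text);
//     assert_eq!(info.span, Span::new(0, 9));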

/// The result of parsing one or more segments from a single slice of a source
/// file: a segment collection, a source file reference, and possibly text to
/// use as a key for insertion into the second cache. If this parse result
/// stems from an I/O error, it is not inserted into the second cache, because
/// the "source" (an empty buffer) is not discriminating in that case.
#[derive(Debug, Clone)]
struct SliceSR(Option<LongBuf>, Vec<Arc<Segment>>, Arc<SourceInfo>);
/// The result of parsing an actual source file: one or more slice results,
/// and a key for the first cache if successful.
#[derive(Debug, Clone)]
struct FileSR(Option<(String, FileTime)>, Vec<SliceSR>);

/// `SegmentSet` is a container for parsed databases.
///
/// If you're not writing an analysis pass, you will want to handle this
/// through `Database`; if you are, aliasing prevents using `Database` and
/// you'll be using this type directly.
///
/// The `SegmentSet` tracks all of the segments reachable from the current
/// start file, with parsing results and cache information to allow fast
/// incremental reloads. It maintains the ordering of the segments, and
/// provides a `SegmentOrder` object which can be used to compare segment IDs.
/// It is currently also a convenient access point for analysis passes to get
/// the option block and the work queue executor, although those
/// responsibilities may be moved.
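///
/// A sketch of typical analysis-pass usage (`set` is a hypothetical
/// `SegmentSet`; the methods are the ones defined below):
///
/// ```ignore
/// for sref in set.segments(..) {
///     // A work granule: sref.id, plus the address of the segment behind it.
/// }
/// let ord = set.order.cmp(&id1, &id2); // total order on segment IDs
/// ```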
#[derive(Debug, Clone)]
pub(crate) struct SegmentSet {
    /// The option block controlling this database.
    pub(crate) options: Arc<DbOptions>,
    /// The work queue for use with this database.
    pub(crate) exec: Executor,
    /// Order structure which records the relative order of segment IDs
    /// created by the `SegmentSet`.
    pub(crate) order: Arc<SegmentOrder>,
    /// Track segment and source info in parallel so they can be updated
    /// independently in the slicing case and if a file is renamed.
    segments: HashMap<SegmentId, (Arc<Segment>, Arc<SourceInfo>)>,
    /// First cache as described in the module comment.
    file_cache: HashMap<(String, FileTime), FileSR>,
    /// Second cache as described in the module comment.
    parse_cache: HashMap<LongBuf, Vec<Arc<Segment>>>,
}

impl SegmentSet {
    /// Start a new empty segment set in the context of an option block and
    /// executor which were previously created by the `Database`.
    pub(crate) fn new(opts: Arc<DbOptions>, exec: &Executor) -> Self {
        SegmentSet {
            options: opts,
            exec: exec.clone(),
            order: Arc::default(),
            segments: HashMap::default(),
            parse_cache: HashMap::default(),
            file_cache: HashMap::default(),
        }
    }

    /// Reset the segment set to the empty state.
    pub(crate) fn clear(&mut self) {
        *Arc::make_mut(&mut self.order) = SegmentOrder::default();
        self.segments = HashMap::default();
        self.parse_cache = HashMap::default();
        self.file_cache = HashMap::default();
    }

    /// Iterates over all loaded segments in logical order.
    pub(crate) fn segments(
        &self,
        range: impl RangeBounds<SegmentId>,
    ) -> impl DoubleEndedIterator<Item = SegmentRef<'_>> + Clone {
        // Silently skips IDs which no longer have a segment.
        self.order.range(range).filter_map(move |id| {
            Some(SegmentRef {
                id,
                segment: &self.segments.get(&id)?.0,
            })
        })
    }

    /// Fetch a handle to a loaded segment given its ID.
    pub(crate) fn segment(&self, seg_id: SegmentId) -> SegmentRef<'_> {
        SegmentRef {
            id: seg_id,
            segment: &self.segments[&seg_id].0,
        }
    }

    /// Fetch a handle to a loaded segment given a possibly stale ID.
    pub(crate) fn segment_opt(&self, seg_id: SegmentId) -> Option<SegmentRef<'_>> {
        self.segments
            .get(&seg_id)
            .map(|(seg, _srcinfo)| SegmentRef {
                id: seg_id,
                segment: seg,
            })
    }

    /// Fetch source information for a loaded segment given its ID.
    pub(crate) fn source_info(&self, seg_id: SegmentId) -> &Arc<SourceInfo> {
        &self.segments[&seg_id].1
    }

    /// Fetches a handle to a statement given a global address.
    pub(crate) fn statement(&self, addr: StatementAddress) -> StatementRef<'_> {
        self.segment(addr.segment_id).statement(addr.index)
    }

    /// Fetches a handle to a statement given a global address, or returns a
    /// dummy statement when the index is `NO_STATEMENT`.
    pub(crate) fn statement_or_dummy(&self, addr: StatementAddress) -> StatementRef<'_> {
        self.segment(addr.segment_id).statement_or_dummy(addr.index)
    }

    /// Reports any parse errors associated with loaded segments.
    pub(crate) fn parse_diagnostics(&self) -> Vec<(StatementAddress, Diagnostic)> {
        let mut out = Vec::new();
        for sref in self.segments(..) {
            for &(ix, ref d) in &sref.diagnostics {
                out.push((StatementAddress::new(sref.id, ix), d.clone()));
            }
        }
        out
    }

    /// Replaces the content of the `SegmentSet` with data loaded from disk
    /// files or memory.
    ///
    /// Each element of `data` intercedes for (shadows) a file of the same
    /// name for the purposes of file inclusion statements. If a name is not
    /// matched in `data`, it is read as a file relative to the current
    /// directory.
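    ///
    /// For example (a sketch; the names are hypothetical):
    /// `read("a.mm".into(), vec![("a.mm".into(), buf)])` parses `buf` in
    /// place of the file `a.mm`, while a `$[ b.mm $]` inclusion not matched
    /// in `data` is still read from disk.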
    pub(crate) fn read(&mut self, path: String, data: Vec<(String, Vec<u8>)>) {
        // data which is kept during the recursive load process, which does
        // _not_ have access to the SegmentSet
        struct RecState {
            options: Arc<DbOptions>,
            /// second cache from the last load
            old_by_content: HashMap<LongBuf, Vec<Arc<Segment>>>,
            /// second cache which will be saved after this load is done
            new_by_content: HashMap<LongBuf, Vec<Arc<Segment>>>,
            /// first cache from the last load
            old_by_time: HashMap<(String, FileTime), FileSR>,
            /// first cache which will be saved after this load is done
            new_by_time: HashMap<(String, FileTime), FileSR>,
            /// segments which have been placed in the order so far
            segments: SegList,
            /// pathnames which have already been read once during this load
            included: HashSet<String>,
            /// in-memory data which intercedes for files of the same name
            preload: HashMap<String, Vec<u8>>,
            exec: Executor,
        }

        /// one or more segments with provenance; the flattened output of a
        /// parse result or cache hit
        type SegList = Vec<(Arc<Segment>, Arc<SourceInfo>)>;

        /// Given a buffer of data from a file, split it and queue jobs to do
        /// the parsing.
        fn split_and_parse(
            state: &RecState,
            path: String,
            timestamp: Option<FileTime>,
            buf: Vec<u8>,
        ) -> Promise<FileSR> {
            let mut parts = Vec::new();
            let buf = Arc::new(buf);
            // See if we need to parse this file in multiple slices. The
            // slicing is a slight incompatibility (no chapter headers are
            // allowed inside groups) but is needed for full parallelism.
            if state.options.autosplit && buf.len() > 1_048_576 {
                let mut sstart = 0;
                loop {
                    if let Some(chap) = find_chapter_header(&buf[sstart..]) {
                        parts.push(sstart..sstart + chap);
                        sstart += chap;
                    } else {
                        parts.push(sstart..buf.len());
                        break;
                    }
                }
            } else {
                parts.push(0..buf.len());
            }

            let mut promises = Vec::new();
            for range in parts {
                let partbuf = if range == (0..buf.len()) {
                    buf.clone()
                } else {
                    Arc::new(buf[range.clone()].to_owned())
                };

                let srcinfo = Arc::new(SourceInfo {
                    name: path.clone(),
                    text: buf.clone(),
                    span: Span::new(range.start, range.end),
                });

                let cachekey = LongBuf(partbuf.clone());

                // Probe the old second cache.
                if let Some(eseg) = state.old_by_content.get(&cachekey) {
                    // Hit, don't queue a job; keep the key so that the result
                    // gets reinserted into the new second cache.
                    let sres = SliceSR(Some(cachekey), eseg.clone(), srcinfo);
                    promises.push(Promise::new(sres));
                } else {
                    let trace = state.options.trace_recalc;
                    // Parse it on a worker thread.
                    promises.push(state.exec.exec(partbuf.len(), move || {
                        if trace {
                            println!("parse({:?})", parser::guess_buffer_name(&partbuf));
                        }
                        SliceSR(Some(cachekey), parser::parse_segments(&partbuf), srcinfo)
                    }));
                }
            }
            // Make a wrapper promise to build the proper file result. This
            // does _not_ run on a worker thread.
            Promise::join(promises)
                .map(move |srlist| FileSR(timestamp.map(move |s| (path, s)), srlist))
        }

        // Read a file from disk (intercessions have already been checked, but
        // the first cache has not) and split/parse it; returns an `Err` on
        // I/O error.
        fn canonicalize_and_read(state: &RecState, path: String) -> io::Result<Promise<FileSR>> {
            let metadata = fs::metadata(&path)?;
            let time = FileTime::from_last_modification_time(&metadata);

            // Probe the first cache.
            if let Some(old_fsr) = state.old_by_time.get(&(path.clone(), time)) {
                Ok(Promise::new(old_fsr.clone()))
            } else {
                // Miss, but we have the file size, so try to read it in one
                // call into a buffer we won't have to move.
                let mut fh = File::open(&path)?;
                let mut buf = Vec::with_capacity(metadata.len() as usize + 1);
                // note: File's read_to_end uses the buffer capacity to choose
                // how much to read
                fh.read_to_end(&mut buf)?;

                Ok(split_and_parse(state, path, Some(time), buf))
            }
        }

        // We have a filename and an incomplete database in the RecState; read
        // the file and queue tasks to parse it.
        fn read_and_parse(state: &mut RecState, path: String) -> Promise<FileSR> {
            // THIS IS WRONG: https://github.com/sorear/smetamath-rs/issues/22

            // We do need to avoid issuing multiple parses for the same file,
            // but catching it here leads to misassociation.
            if !state.included.insert(path.clone()) {
                return Promise::new(FileSR(None, Vec::new()));
            }
            // Check intercessions.
            match state.preload.get(&path).cloned() {
                None => {
                    // Read from the filesystem.
                    canonicalize_and_read(state, path.clone()).unwrap_or_else(|cerr| {
                        // The read failed; insert a bogus segment so we have
                        // a place to hang the errors.
                        let sinfo = SourceInfo {
                            name: path.clone(),
                            text: Arc::new(Vec::new()),
                            span: Span::NULL,
                        };
                        let seg = parser::dummy_segment(From::from(cerr));
                        // Cache keys are None, so this won't pollute any caches.
                        Promise::new(FileSR(
                            None,
                            vec![SliceSR(None, vec![seg], Arc::new(sinfo))],
                        ))
                    })
                }
                Some(data) => split_and_parse(state, path, None, data),
            }
        }

        // File data has come back from the worker thread; make sure it's in
        // the first and second caches as appropriate, even if it came from a
        // hit earlier.
        fn flat(state: &mut RecState, inp: FileSR) -> SegList {
            let mut out = Vec::new();
            if let Some(key) = inp.0.clone() {
                // Insert into the first cache.
                state.new_by_time.insert(key, inp.clone());
            }
            for SliceSR(keyo, vec, sinfo) in inp.1 {
                if let Some(key) = keyo {
                    // Insert into the second cache.
                    state.new_by_content.insert(key, vec.clone());
                }
                for aseg in vec {
                    // We have a contextualized segment.
                    out.push((aseg, sinfo.clone()));
                }
            }
            out
        }

        // The main recursive parsing driver: we've already parsed one file;
        // now incorporate it into the database under construction while
        // parsing includes.
        fn recurse(state: &mut RecState, segments: SegList) {
            let mut promises = VecDeque::new();

            for seg in &segments {
                if seg.0.next_file != Span::NULL {
                    let chain = str::from_utf8(seg.0.next_file.as_ref(&seg.0.buffer))
                        .expect("parser verified ASCII")
                        .to_owned();
                    // Parse this include in the background.
                    promises.push_back(read_and_parse(state, chain));
                }
            }
            for seg in segments {
                let has_next = seg.0.next_file != Span::NULL;
                state.segments.push(seg);
                if has_next {
                    // Wait for the include to be done parsing, then
                    // incorporate it and recurse.
                    let pp = promises.pop_front().unwrap().wait();
                    let nsegs = flat(state, pp);
                    recurse(state, nsegs);
                }
            }
        }
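
        // A worked example of the ordering `recurse` produces (hypothetical
        // files): if `a.mm` parses to segments [A1, A2] with a `$[ b.mm $]`
        // inclusion ending A1, and `b.mm` parses to [B], the segments are
        // pushed in the order [A1, B, A2].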

        // Note that we clear out the caches immediately, and only copy
        // forward things that are actually used, to avoid memory bloat.

        let mut state = RecState {
            options: self.options.clone(),
            old_by_content: mem::take(&mut self.parse_cache),
            new_by_content: HashMap::default(),
            old_by_time: mem::take(&mut self.file_cache),
            new_by_time: HashMap::default(),
            segments: Vec::new(),
            included: HashSet::default(),
            preload: data.into_iter().collect(),
            exec: self.exec.clone(),
        };

        // Parse and recursively incorporate the initial file.
        let isegs = read_and_parse(&mut state, path);
        let isegs = flat(&mut state, isegs.wait());
        recurse(&mut state, isegs);

        // We now have the new and old segment lists; time for ID allocation.
        // See the module comment about LCS options.
        let mut old_segs = Vec::new();
        for (&seg_id, seg) in &self.segments {
            old_segs.push((seg_id, seg.clone()));
        }
        old_segs.sort_by(|x, y| self.order.cmp(&x.0, &y.0));
        let mut new_segs = state.segments;

        let mut old_r = 0..old_segs.len();
        let mut new_r = 0..new_segs.len();

        // LCS lite: trim the matching common prefix...
        while old_r.start < old_r.end
            && new_r.start < new_r.end
            && Arc::ptr_eq(&(old_segs[old_r.start].1).0, &new_segs[new_r.start].0)
        {
            old_r.start += 1;
            new_r.start += 1;
        }

        // ...and the matching common suffix.
        while old_r.start < old_r.end
            && new_r.start < new_r.end
            && Arc::ptr_eq(&(old_segs[old_r.end - 1].1).0, &new_segs[new_r.end - 1].0)
        {
            old_r.end -= 1;
            new_r.end -= 1;
        }

        // Save the caches rebuilt during this load.
        self.parse_cache = state.new_by_content;
        self.file_cache = state.new_by_time;

        // Reuse as many IDs as possible even if the segment isn't exactly the
        // same. Hopefully the corresponding new and old segments are
        // _similar_, and later passes will be able to leverage that
        // similarity.
        while old_r.start < old_r.end && new_r.start < new_r.end {
            self.segments
                .insert(old_segs[old_r.start].0, new_segs[new_r.start].clone());
            new_r.start += 1;
            old_r.start += 1;
        }

        // Now actually reassign the IDs: free the IDs of unmatched old
        // segments...
        let order = Arc::make_mut(&mut self.order);
        for &(id, _) in &old_segs[old_r.clone()] {
            order.free_id(id);
            self.segments.remove(&id);
        }

        // ...and allocate fresh IDs for the unmatched new segments, inserting
        // before the first segment of the common suffix (or before the START
        // sentinel if there is no suffix).
        let before = if old_r.end == old_segs.len() {
            SegmentOrder::START
        } else {
            old_segs[old_r.end].0
        };

        for seg in new_segs.drain(new_r) {
            let id = order.new_before(before);
            self.segments.insert(id, seg);
        }
    }
}