metamath_rs/
segment.rs

1//! The `Segment` data structure, which holds most of the actual data in the file.
2
3use std::{
4    cmp::Ordering,
5    ops::{Bound, Deref, RangeBounds},
6    sync::Arc,
7};
8
9use crate::{
10    diag::Diagnostic,
11    statement::{
12        Command, FloatDef, GlobalDv, HeadingDef, LabelDef, LocalVarDef, SegmentId, Span, Statement,
13        StatementAddress, StatementIndex, StatementIter, SymbolDef, TokenAddress, DUMMY_STATEMENT,
14        NO_STATEMENT,
15    },
16    Database, StatementRef,
17};
18
19/// Data structure which tracks the logical order of segment IDs, since they are
20/// not intrinsically ordered.
21///
22/// This is an example of an "order-maintenance data structure", actually a very
23/// simple one. We can plug in the Dietz & Sleator 1987 algorithm if this gets
24/// too slow.
25///
26/// IDs are never reused after being released from the order, so they can be
27/// used safely as part of change-tracking structures.
28///
29/// `SegmentOrder` implements the [`Comparer`] trait, allowing it to be used
30/// polymorphically with the `cmp` method to order lists of segments,
31/// statements, or tokens.
32#[derive(Clone, Debug)]
33pub(crate) struct SegmentOrder {
34    high_water: u32,
35    order: Vec<SegmentId>,
36    reverse: Vec<usize>,
37}
38
39impl Default for SegmentOrder {
40    fn default() -> Self {
41        // pre-assign 1 as "start".  (think "cyclic order")
42        SegmentOrder {
43            high_water: 2,
44            order: vec![SegmentId(1)],
45            reverse: vec![0; 2],
46        }
47    }
48}
49
50impl SegmentOrder {
51    /// Each segment ordering has a single ID which will not be used otherwise;
52    /// pass this to `new_before` to get an ID larger than all created IDs.
53    pub(crate) const START: SegmentId = SegmentId(1);
54
55    fn alloc_id(&mut self) -> SegmentId {
56        let index = self.high_water;
57        assert!(index < u32::MAX);
58        self.high_water += 1;
59        SegmentId(index)
60    }
61
62    fn reindex(&mut self) {
63        self.reverse = vec![0; self.high_water as usize];
64        for (ordinal, &id) in self.order.iter().enumerate() {
65            self.reverse[id.0 as usize] = ordinal;
66        }
67    }
68
69    /// Indicates that an ID will no longer be used, allowing some memory to be
70    /// freed.
71    ///
72    /// The ID itself will not be reissued.
73    pub(crate) fn free_id(&mut self, id: SegmentId) {
74        self.order.remove(self.reverse[id.0 as usize]);
75        self.reindex();
76    }
77
78    /// Gets a new ID, and adds it to the order before the named ID, or at the
79    /// end if you pass `start()`.
80    pub(crate) fn new_before(&mut self, after: SegmentId) -> SegmentId {
81        let id = self.alloc_id();
82        self.order.insert(self.reverse[after.0 as usize], id);
83        self.reindex();
84        id
85    }
86
87    pub(crate) fn range(&self, range: impl RangeBounds<SegmentId>) -> SegmentIter<'_> {
88        let start = match range.start_bound() {
89            Bound::Included(id) => self.reverse[id.0 as usize],
90            Bound::Excluded(id) => self.reverse[id.0 as usize] + 1,
91            Bound::Unbounded => 0,
92        };
93        let end = match range.end_bound() {
94            Bound::Included(id) => self.reverse[id.0 as usize] + 1,
95            Bound::Excluded(id) => self.reverse[id.0 as usize],
96            Bound::Unbounded => self.order.len(),
97        };
98        SegmentIter(self.order[start..end].iter())
99    }
100}
101
102#[derive(Clone)]
103pub(crate) struct SegmentIter<'a>(std::slice::Iter<'a, SegmentId>);
104
105impl<'a> Iterator for SegmentIter<'a> {
106    type Item = SegmentId;
107
108    fn next(&mut self) -> Option<Self::Item> {
109        self.0.next().copied()
110    }
111
112    fn size_hint(&self) -> (usize, Option<usize>) {
113        (self.len(), Some(self.len()))
114    }
115}
116
117impl<'a> ExactSizeIterator for SegmentIter<'a> {
118    fn len(&self) -> usize {
119        self.0.len()
120    }
121}
122
123impl<'a> DoubleEndedIterator for SegmentIter<'a> {
124    fn next_back(&mut self) -> Option<Self::Item> {
125        self.0.next_back().copied()
126    }
127}
128
129/// A trait for objects which can be used to order other datatypes.
130pub trait Comparer<T> {
131    /// Compares two objects, like `Ord::cmp`, but with additional state data
132    /// from the comparer that can be used for the ordering.
133    fn cmp(&self, left: &T, right: &T) -> Ordering;
134
135    /// Returns true if the `left` argument is (strictly) less than the `right` argument
136    #[inline]
137    fn lt(&self, left: &T, right: &T) -> bool {
138        self.cmp(left, right) == Ordering::Less
139    }
140
141    /// Returns true if the `left` argument is less or equal to the `right` argument
142    #[inline]
143    fn le(&self, left: &T, right: &T) -> bool {
144        self.cmp(left, right) != Ordering::Greater
145    }
146}
147
148impl Comparer<SegmentId> for SegmentOrder {
149    fn cmp(&self, left: &SegmentId, right: &SegmentId) -> Ordering {
150        self.reverse[left.0 as usize].cmp(&self.reverse[right.0 as usize])
151    }
152}
153
154impl Comparer<StatementAddress> for SegmentOrder {
155    fn cmp(&self, left: &StatementAddress, right: &StatementAddress) -> Ordering {
156        self.cmp(&left.segment_id, &right.segment_id)
157            .then_with(|| left.index.cmp(&right.index))
158    }
159}
160
161impl Comparer<TokenAddress> for SegmentOrder {
162    fn cmp(&self, left: &TokenAddress, right: &TokenAddress) -> Ordering {
163        self.cmp(&left.statement, &right.statement)
164            .then_with(|| left.token_index.cmp(&right.token_index))
165    }
166}
167
168impl Comparer<SegmentId> for Database {
169    fn cmp(&self, left: &SegmentId, right: &SegmentId) -> Ordering {
170        self.parse_result().order.cmp(left, right)
171    }
172}
173
174impl Comparer<StatementAddress> for Database {
175    fn cmp(&self, left: &StatementAddress, right: &StatementAddress) -> Ordering {
176        self.parse_result().order.cmp(left, right)
177    }
178}
179
180impl Comparer<TokenAddress> for Database {
181    fn cmp(&self, left: &TokenAddress, right: &TokenAddress) -> Ordering {
182        self.parse_result().order.cmp(left, right)
183    }
184}
185
186impl<'a, T, C: Comparer<T>> Comparer<T> for &'a C {
187    fn cmp(&self, left: &T, right: &T) -> Ordering {
188        (*self).cmp(left, right)
189    }
190}
191
192/// Shared reference to a buffer which will be parsed.
193///
194/// We use `u8` throughout the parser because Metamath databases are limited to
195/// US-ASCII by the spec.  Since math symbols are used as tokens, if we wanted
196/// to allow UTF-8 in the future it would be best to continue using `u8`,
197/// although there would need to be a validity check (valid UTF-8 encodings are
198/// always canonical) in `Scanner::get_raw` and the eighth-bit hack in
199/// `scan_expression` would need to be reverted.
200pub(crate) type BufferRef = Arc<Vec<u8>>;
201
202/// A parsed segment, containing parsed statement data and some extractions.
203///
204/// This is the main result of the parse which is provided to `segment_set`,
205/// although most users will want to use a `SegmentRef` instead.
206///
207/// This is a "dense" segment, which must be fully rebuilt in order to make any
208/// change.  We may in the future have an "unpacked" segment which is used for
209/// active editing, as well as a "lazy" or "mmap" segment type for fast
210/// incremental startup.
211#[derive(Debug)]
212pub struct Segment {
213    /// The original string used to construct this segment.
214    ///
215    /// Don't assume the segment is wholly determined by this string.
216    pub buffer: BufferRef,
217    /// List of statements in this segment; can be indexed with any
218    /// `StatementIndex`.
219    pub(crate) statements: Vec<Statement>,
220    /// All math strings and proof strings in this segment, concatenated for
221    /// fewer allocations.
222    pub(crate) span_pool: Vec<Span>,
223    /// Any errors detected while parsing this segment.
224    pub diagnostics: Vec<(StatementIndex, Diagnostic)>,
225    /// The file, if any, which is included at the end of this segment, as a
226    /// reference into the buffer.
227    pub next_file: Span,
228    /// Global `$d` statements extracted for nameck.
229    pub global_dvs: Vec<GlobalDv>,
230    /// Global `$v` and `$c` statements extracted for nameck.
231    pub symbols: Vec<SymbolDef>,
232    /// Local `$v` statements extracted for nameck.
233    pub local_vars: Vec<LocalVarDef>,
234    /// Global labelled statements extracted for nameck.
235    pub labels: Vec<LabelDef>,
236    /// Global `$f` statements extracted for nameck.
237    pub floats: Vec<FloatDef>,
238    /// Top-level headings extracted for outline
239    pub outline: Vec<HeadingDef>,
240    /// Parser commands provided in $t typesetting comments.
241    pub t_commands: Vec<(StatementIndex, Command)>,
242    /// Parser commands provided in $j additional information comments.
243    pub j_commands: Vec<(StatementIndex, Command)>,
244}
245
246/// A pointer to a segment which knows its identity.
247///
248/// `SegmentRef` objects are constructed from outside by the `segment_set`.
249#[derive(Copy, Clone, Debug)]
250pub struct SegmentRef<'a> {
251    /// The underlying segment from the parser.
252    pub segment: &'a Arc<Segment>,
253    /// The global ID which has been assigned to the segment.
254    pub id: SegmentId,
255}
256
257impl<'a> Deref for SegmentRef<'a> {
258    type Target = Arc<Segment>;
259
260    #[inline]
261    fn deref(&self) -> &Arc<Segment> {
262        self.segment
263    }
264}
265
266impl<'a> SegmentRef<'a> {
267    /// Fetch a single statement from this segment by its local index.
268    #[must_use]
269    pub fn statement(self, index: StatementIndex) -> StatementRef<'a> {
270        StatementRef {
271            segment: self,
272            statement: &self.segment.statements[index as usize],
273            index,
274        }
275    }
276
277    /// Fetch a single statement from this segment by its local index,
278    /// or return a dummy statement when the index is `NO_STATEMENT`.
279    #[must_use]
280    pub(crate) fn statement_or_dummy(self, index: StatementIndex) -> StatementRef<'a> {
281        StatementRef {
282            segment: self,
283            statement: if index == NO_STATEMENT {
284                &DUMMY_STATEMENT
285            } else {
286                &self.segment.statements[index as usize]
287            },
288            index,
289        }
290    }
291
292    /// Returns the source size of the segment, a proxy for computational
293    /// difficulty which drives the `database::Executor` bin-packing heuristics.
294    #[must_use]
295    pub fn bytes(self) -> usize {
296        self.buffer.len()
297    }
298
299    /// Returns a new empty statement iterator.
300    fn empty(self) -> StatementIter<'a> {
301        StatementIter {
302            slice_iter: [].iter(),
303            segment: self,
304            index: 0,
305        }
306    }
307
308    /// Returns an iterator over a range of statement indices in the current segment.
309    #[must_use]
310    pub fn range(self, range: impl RangeBounds<StatementIndex>) -> StatementIter<'a> {
311        let start = match range.start_bound() {
312            Bound::Included(&i) => i,
313            Bound::Excluded(&i) => i + 1,
314            Bound::Unbounded => 0,
315        };
316        let end = match range.end_bound() {
317            Bound::Included(&i) => i as usize + 1,
318            Bound::Excluded(&i) => i as usize,
319            Bound::Unbounded => self.segment.statements.len(),
320        };
321        StatementIter {
322            slice_iter: self.segment.statements[start as usize..end].iter(),
323            segment: self,
324            index: start,
325        }
326    }
327
328    /// Returns an iterator over a range of statement addresses in the current segment.
329    #[must_use]
330    pub(crate) fn range_address(
331        self,
332        order: &SegmentOrder,
333        range: impl RangeBounds<StatementAddress>,
334    ) -> StatementIter<'a> {
335        let start = match range.start_bound() {
336            Bound::Included(addr) => match order.cmp(&addr.segment_id, &self.id) {
337                Ordering::Less => Bound::Unbounded,
338                Ordering::Equal => Bound::Included(addr.index),
339                Ordering::Greater => return self.empty(),
340            },
341            Bound::Excluded(addr) => match order.cmp(&addr.segment_id, &self.id) {
342                Ordering::Less => Bound::Unbounded,
343                Ordering::Equal => Bound::Excluded(addr.index),
344                Ordering::Greater => return self.empty(),
345            },
346            Bound::Unbounded => Bound::Unbounded,
347        };
348        let end = match range.end_bound() {
349            Bound::Included(addr) => match order.cmp(&addr.segment_id, &self.id) {
350                Ordering::Less => return self.empty(),
351                Ordering::Equal => Bound::Included(addr.index),
352                Ordering::Greater => Bound::Unbounded,
353            },
354            Bound::Excluded(addr) => match order.cmp(&addr.segment_id, &self.id) {
355                Ordering::Less => return self.empty(),
356                Ordering::Equal => Bound::Excluded(addr.index),
357                Ordering::Greater => Bound::Unbounded,
358            },
359            Bound::Unbounded => Bound::Unbounded,
360        };
361        self.range((start, end))
362    }
363}
364
365impl<'a> IntoIterator for SegmentRef<'a> {
366    type Item = StatementRef<'a>;
367    type IntoIter = StatementIter<'a>;
368
369    /// An iterator over the statements in this segment.
370    fn into_iter(self) -> Self::IntoIter {
371        StatementIter {
372            slice_iter: self.segment.statements.iter(),
373            segment: self,
374            index: 0,
375        }
376    }
377}
metamath_rs/segment.rs

metamath_rs/
segment.rs