miden_parsing/
source.rs

1use std::char;
2use std::ops::Range;
3use std::path::PathBuf;
4use std::sync::Arc;
5
6use miden_diagnostics::*;
7
8pub type SourceResult<T> = std::result::Result<T, SourceError>;
9
10/// [Source] is an abstraction for files which are read via [Scanner]
11pub trait Source: Sized {
12    /// Create a new implementation of this source from the given [SourceFile]
13    fn new(src: Arc<SourceFile>) -> Self;
14
15    /// Read the next character from the source
16    fn read(&mut self) -> Option<(SourceIndex, char)>;
17
18    /// Peek the next character from the source
19    fn peek(&mut self) -> Option<(SourceIndex, char)>;
20
21    /// Get a [SourceSpan] corresponding to the entire source file
22    fn span(&self) -> SourceSpan;
23
24    /// Get a string slice of the underlying source content from the given range
25    fn slice(&self, span: impl Into<Range<usize>>) -> &str;
26}
27
28#[derive(Debug, thiserror::Error)]
29pub enum SourceError {
30    #[error("error reading {path:?}: {source:?}")]
31    RootFileIO {
32        source: std::io::Error,
33        path: PathBuf,
34    },
35
36    #[error("invalid source path")]
37    InvalidPath { reason: String },
38}
39impl ToDiagnostic for SourceError {
40    fn to_diagnostic(self) -> Diagnostic {
41        match self {
42            SourceError::RootFileIO { source, path: _ } => {
43                Diagnostic::error().with_message(source.to_string())
44            }
45            SourceError::InvalidPath { reason } => {
46                Diagnostic::error().with_message(format!("invalid path: {}", reason))
47            }
48        }
49    }
50}
51
52/// An implementation of [Source] which reads from a [SourceFile]
53pub struct FileMapSource {
54    src: Arc<SourceFile>,
55    bytes: *const [u8],
56    start: SourceIndex,
57    peek: Option<(SourceIndex, char)>,
58    end: usize,
59    pos: usize,
60    eof: bool,
61}
62impl FileMapSource {
63    fn peek_char(&self) -> Option<(SourceIndex, char)> {
64        self.peek
65    }
66
67    fn next_char(&mut self) -> Option<(SourceIndex, char)> {
68        // If we've peeked a char already, return that
69        let result = if self.peek.is_some() {
70            std::mem::replace(&mut self.peek, None)
71        } else {
72            let next = unsafe { self.next_char_internal() };
73            match next {
74                None => {
75                    self.eof = true;
76                    return None;
77                }
78                result => result,
79            }
80        };
81
82        // Reset peek
83        self.peek = unsafe { self.next_char_internal() };
84
85        result
86    }
87
88    #[inline]
89    unsafe fn next_char_internal(&mut self) -> Option<(SourceIndex, char)> {
90        let mut pos = self.pos;
91        let end = self.end;
92        if pos == end {
93            self.eof = true;
94        }
95
96        if self.eof {
97            return None;
98        }
99
100        let start = self.start + pos;
101
102        let bytes: &[u8] = &*self.bytes;
103
104        // Decode UTF-8
105        let x = *bytes.get_unchecked(pos);
106        if x < 128 {
107            self.pos = pos + 1;
108            return Some((start, char::from_u32_unchecked(x as u32)));
109        }
110
111        // Multibyte case follows
112        // Decode from a byte combination out of: [[[x y] z] w]
113        // NOTE: Performance is sensitive to the exact formulation here
114        let init = Self::utf8_first_byte(x, 2);
115
116        pos += 1;
117        let y = if pos == end {
118            0u8
119        } else {
120            *bytes.get_unchecked(pos)
121        };
122        let mut ch = Self::utf8_acc_cont_byte(init, y);
123        if x >= 0xE0 {
124            // [[x y z] w] case
125            // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
126            pos += 1;
127            let z = if pos == end {
128                0u8
129            } else {
130                *bytes.get_unchecked(pos)
131            };
132            let y_z = Self::utf8_acc_cont_byte((y & Self::CONT_MASK) as u32, z);
133            ch = init << 12 | y_z;
134            if x >= 0xF0 {
135                // [x y z w] case
136                // use only the lower 3 bits of `init`
137                pos += 1;
138                let w = if pos == end {
139                    0u8
140                } else {
141                    *bytes.get_unchecked(pos)
142                };
143                ch = (init & 7) << 18 | Self::utf8_acc_cont_byte(y_z, w);
144            }
145        }
146
147        pos += 1;
148        if pos >= end {
149            self.eof = true
150        }
151        self.pos = pos;
152
153        Some((start, char::from_u32_unchecked(ch)))
154    }
155
156    /// Returns the initial codepoint accumulator for the first byte.
157    /// The first byte is special, only want bottom 5 bits for width 2, 4 bits
158    /// for width 3, and 3 bits for width 4.
159    #[inline]
160    fn utf8_first_byte(byte: u8, width: u32) -> u32 {
161        (byte & (0x7F >> width)) as u32
162    }
163
164    /// Returns the value of `ch` updated with continuation byte `byte`.
165    #[inline]
166    fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
167        (ch << 6) | (byte & Self::CONT_MASK) as u32
168    }
169
170    /// Mask of the value bits of a continuation byte.
171    const CONT_MASK: u8 = 0b0011_1111;
172}
173impl Source for FileMapSource {
174    fn new(src: Arc<SourceFile>) -> Self {
175        let start = SourceIndex::new(src.id(), ByteIndex(0));
176        let mut source = Self {
177            src,
178            bytes: &[],
179            peek: None,
180            start,
181            end: 0,
182            pos: 0,
183            eof: false,
184        };
185        let s = source.src.source();
186        let bytes = s.as_bytes();
187        source.end = bytes.len();
188        source.bytes = bytes;
189        source.peek = unsafe { source.next_char_internal() };
190        source
191    }
192
193    #[inline]
194    fn read(&mut self) -> Option<(SourceIndex, char)> {
195        self.next_char()
196    }
197
198    #[inline]
199    fn peek(&mut self) -> Option<(SourceIndex, char)> {
200        self.peek_char()
201    }
202
203    #[inline]
204    fn span(&self) -> SourceSpan {
205        self.src.source_span()
206    }
207
208    #[inline]
209    fn slice(&self, span: impl Into<Range<usize>>) -> &str {
210        self.src.source_slice(span).unwrap()
211    }
212}
213
214impl Iterator for FileMapSource {
215    type Item = (SourceIndex, char);
216
217    fn next(&mut self) -> Option<Self::Item> {
218        self.read()
219    }
220}
221
#[cfg(test)]
mod test {
    use pretty_assertions::assert_eq;

    use super::*;

    /// Drains `source` and keeps only the characters, discarding positions
    fn read_all_chars(source: FileMapSource) -> Vec<char> {
        source.map(|(_, ch)| ch).collect()
    }

    #[test]
    fn file_source() {
        let codemap = CodeMap::default();

        // Reading to the end yields every character in order
        let expected = vec!['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!'];
        let id1 = codemap.add("nofile", "hello world!".to_string());
        let source1 = FileMapSource::new(codemap.get(id1).unwrap());
        assert_eq!(expected, read_all_chars(source1));

        // Peeking does not consume: peek and the following read agree
        let id2 = codemap.add("nofile", "hello world!".to_string());
        let mut source2 = FileMapSource::new(codemap.get(id2).unwrap());
        let first_h = Some((SourceIndex::new(id2, ByteIndex(0)), 'h'));
        assert_eq!(first_h, source2.peek());
        assert_eq!(first_h, source2.next());

        // Multi-byte characters are reported at the offset of their first byte
        let id3 = codemap.add("nofile", "éé".to_string());
        let mut source3 = FileMapSource::new(codemap.get(id3).unwrap());
        let first_e = Some((SourceIndex::new(id3, ByteIndex(0)), 'é'));
        let second_e = Some((SourceIndex::new(id3, ByteIndex(2)), 'é'));
        assert_eq!(first_e, source3.peek());
        assert_eq!(first_e, source3.next());
        assert_eq!(second_e, source3.peek());
        assert_eq!(second_e, source3.next());
    }
}