punktf_lib/template/
source.rs

//! This module holds structures related to source files, ways to analyze them
2//! and to convert between [byte positions](`super::span::BytePos`) and
3//! [character positions](`super::span::CharPos`).
4
5use std::ops::Deref;
6use std::path::Path;
7use std::{fmt, vec};
8
9use unicode_width::UnicodeWidthChar;
10
11use super::span::{BytePos, ByteSpan, CharPos, Pos};
12
/// A human oriented position inside a source file, made up of a line and a
/// column. Lines are counted starting at 1, columns starting at 0.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Location {
	/// Line number, the first line is `1`.
	line: usize,

	/// Column within the line, the first column is `0`.
	column: usize,
}

impl Location {
	/// Returns the line number (the first line is `1`).
	pub const fn line(&self) -> usize {
		self.line
	}

	/// Returns the column (the first column is `0`).
	pub const fn column(&self) -> usize {
		self.column
	}

	/// Renders the location as `line:column`. To be friendlier for human
	/// readers the column is shifted by one, so both parts are one indexed.
	pub fn display(&self) -> String {
		let Self { line, column } = *self;
		let column = column + 1;

		format!("{line}:{column}")
	}
}
41
42// COPYRIGHT by Rust project contributors
43// <https://github.com/rust-lang/rust/graphs/contributors>
44//
45// Inspired by <https://github.com/rust-lang/rust/blob/362e0f55eb1f36d279e5c4a58fb0fe5f9a2c579d/compiler/rustc_span/src/lib.rs#L273>.
/// Describes where the content of a [`Source`] was taken from.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SourceOrigin<'a> {
	/// The content belongs to the file located at the path.
	File(&'a Path),

	/// The content has no known origin (mainly used for testing).
	Anonymous,
}

impl<'a> fmt::Display for SourceOrigin<'a> {
	/// Writes the path for [`SourceOrigin::File`] and the literal text
	/// `anonymous` for [`SourceOrigin::Anonymous`].
	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
		match *self {
			Self::Anonymous => f.write_str("anonymous"),
			Self::File(path) => {
				let printable = path.display();
				fmt::Display::fmt(&printable, f)
			}
		}
	}
}
64
65// COPYRIGHT by Rust project contributors
66// <https://github.com/rust-lang/rust/graphs/contributors>
67//
68// Copied from <https://github.com/rust-lang/rust/blob/362e0f55eb1f36d279e5c4a58fb0fe5f9a2c579d/compiler/rustc_span/src/lib.rs#L1059>.
69/// Identifies the [position](`super::span::BytePos`) and the width of a
70/// multi-byte character in a [`Source`].
71#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
72pub struct MultiByteChar {
73	/// Position of the character in the [`Source`].
74	pos: BytePos,
75
76	/// Width in bytes of the character, `>= 2`.
77	bytes: u8,
78}
79
80impl MultiByteChar {
81	/// Returns the [position](`super::span::BytePos`) of the character.
82	pub const fn pos(&self) -> &BytePos {
83		&self.pos
84	}
85
86	/// Returns the width in bytes of the character.
87	pub const fn width(&self) -> u8 {
88		self.bytes
89	}
90}
91
92// COPYRIGHT by Rust project contributors
93// <https://github.com/rust-lang/rust/graphs/contributors>
94//
95// Copied from <https://github.com/rust-lang/rust/blob/362e0f55eb1f36d279e5c4a58fb0fe5f9a2c579d/compiler/rustc_span/src/lib.rs#L1068>.
96/// Identifies the [position](`super::span::BytePos`) of a
97/// special width character in a [`Source`]. Special width in this context
98/// means a character with `byte width != 1`.
99#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
100pub enum SpecialWidthChar {
101	/// A zero width character. These are mostly ASCII control characters.
102	ZeroWidth(BytePos),
103	/// A full width char.
104	Wide(BytePos),
105	/// Tab byte `\t`/`0x09`
106	Tab(BytePos),
107}
108
109impl SpecialWidthChar {
110	/// Tries to create a new `SpecialWidthChar`. The kind is derived from the
111	/// given `width`.
112	///
113	/// # Panics
114	///
115	/// Panics if `width` can not be mapped to a `SpecialWidthChar`.
116	pub fn new(pos: BytePos, width: usize) -> Self {
117		match width {
118			0 => Self::ZeroWidth(pos),
119			2 => Self::Wide(pos),
120			4 => Self::Tab(pos),
121			_ => panic!("Unsupported width for SpecialWidthChar: {width}"),
122		}
123	}
124
125	/// Returns the width in bytes of the character.
126	pub const fn width(&self) -> usize {
127		match self {
128			Self::ZeroWidth(_) => 0,
129			Self::Wide(_) => 2,
130			Self::Tab(_) => 4,
131		}
132	}
133
134	/// Returns the [position](`super::span::BytePos`) of the character.
135	pub const fn pos(&self) -> &BytePos {
136		match self {
137			Self::ZeroWidth(p) | Self::Wide(p) | Self::Tab(p) => p,
138		}
139	}
140}
141
142// COPYRIGHT by Rust project contributors
143// <https://github.com/rust-lang/rust/graphs/contributors>
144//
145// Copied from <https://github.com/rust-lang/rust/blob/362e0f55eb1f36d279e5c4a58fb0fe5f9a2c579d/compiler/rustc_span/src/analyze_source_file.rs#L207> with slight adaptations.
146/// Collects all [`SpecialWidthChar`], [`MultiByteChar`] and
147/// [positions](`super::span::BytePos`) of line breaks contained within
148/// `content`.
149fn analyze_source(content: &'_ str) -> (Vec<BytePos>, Vec<SpecialWidthChar>, Vec<MultiByteChar>) {
150	// start first line at index 0
151	let mut i = 0;
152
153	let mut lines = vec![BytePos::new(0)];
154	let mut special_width_chars = Vec::new();
155	let mut multi_byte_chars = Vec::new();
156
157	while i < content.len() {
158		let byte = content.as_bytes()[i];
159
160		let mut char_len = 1;
161
162		// all chars between 0-31 are ascii control characters
163		if byte < 32 {
164			match byte {
165				b'\n' => lines.push(BytePos::from_usize(i + 1)),
166				b'\t' => special_width_chars.push(SpecialWidthChar::Tab(BytePos::from_usize(i))),
167				_ => special_width_chars.push(SpecialWidthChar::ZeroWidth(BytePos::from_usize(i))),
168			}
169		} else if byte > 127 {
170			// bigger than `DEL`, could be multi-byte char
171			let chr = content[i..].chars().next().expect("A valid char");
172			char_len = chr.len_utf8();
173
174			let pos = BytePos::from_usize(i);
175
176			if char_len > 1 {
177				multi_byte_chars.push(MultiByteChar {
178					pos,
179					bytes: char_len as u8,
180				})
181			}
182
183			let char_width = UnicodeWidthChar::width(chr).unwrap_or(0);
184
185			if char_width != 1 {
186				special_width_chars.push(SpecialWidthChar::new(pos, char_width));
187			}
188		}
189
190		i += char_len;
191	}
192
193	(lines, special_width_chars, multi_byte_chars)
194}
195
196// COPYRIGHT by Rust project contributors
197// <https://github.com/rust-lang/rust/graphs/contributors>
198//
199// Inspired by <https://github.com/rust-lang/rust/blob/362e0f55eb1f36d279e5c4a58fb0fe5f9a2c579d/compiler/rustc_span/src/lib.rs#L1246>.
200/// Holds the contents of a file together with the origins where the content
201/// came from. Besides the origin it also holds some information used in error
202/// reporting.
203#[derive(Debug, Clone, PartialEq, Eq, Hash)]
204pub struct Source<'a> {
205	/// Origin of the source file.
206	pub(crate) origin: SourceOrigin<'a>,
207
208	/// Content of the source file.
209	pub(crate) content: &'a str,
210
211	/// [Positions](`super::span::BytePos`) of all characters which start a new line in [`Source::content`].
212	pub(crate) lines: Vec<BytePos>,
213
214	/// All [`SpecialWidthChar`] contained in [`Source::content`].
215	pub(crate) special_width_chars: Vec<SpecialWidthChar>,
216
217	/// All [`MultiByteChar`] contained in [`Source::content`].
218	pub(crate) multi_byte_chars: Vec<MultiByteChar>,
219}
220
221impl<'a> Source<'a> {
222	/// Creates a new source for the given `origin` and `content`.
223	pub fn new(origin: SourceOrigin<'a>, content: &'a str) -> Self {
224		let (lines, special_width_chars, multi_byte_chars) = analyze_source(content);
225
226		Self {
227			origin,
228			content,
229			lines,
230			special_width_chars,
231			multi_byte_chars,
232		}
233	}
234
235	/// Creates a new source with [`SourceOrigin::Anonymous`] and the given
236	/// `content`.
237	pub fn anonymous(content: &'a str) -> Self {
238		Self::new(SourceOrigin::Anonymous, content)
239	}
240
241	/// Creates a new source with [`SourceOrigin::File`] and the given
242	/// `content`.
243	pub fn file(path: &'a Path, content: &'a str) -> Self {
244		Self::new(SourceOrigin::File(path), content)
245	}
246
247	/// Translates a (byte position)[`super::span::BytePos`] into a (character
248	/// position)[`super::span::CharPos`]. These two positions may diverge to
249	/// to [`MultiByteChar`] or [`SpecialWidthChar`], as they count as a single
250	/// character but are made up of multiple/no bytes.
251	pub fn get_charpos(&self, pos: BytePos) -> CharPos {
252		let mut offset = 0;
253		let mut count = 0;
254
255		for swc in &self.special_width_chars {
256			if swc.pos() < &pos {
257				offset += swc.width();
258				count += 1;
259			} else {
260				// as the pos's are sorted we can abort after the first bigger
261				// pos
262				break;
263			}
264		}
265
266		for mbc in &self.multi_byte_chars {
267			if mbc.pos() < &pos {
268				offset += 1;
269				count += mbc.width() as usize;
270			} else {
271				// as the pos's are sorted we can abort after the first bigger
272				// pos
273				break;
274			}
275		}
276
277		let cpos = CharPos::from_usize((pos.as_usize() + offset) - count);
278
279		log::trace!("Translating pos: {} > {}", pos, cpos,);
280
281		cpos
282	}
283
284	/// Returns the zero indexed line index `pos` is located on.
285	pub fn get_pos_line_idx(&self, pos: BytePos) -> usize {
286		match self.lines.binary_search(&pos) {
287			Ok(idx) => idx,
288			Err(idx) => idx - 1,
289		}
290	}
291
292	/// Converts a [position](`super::span::BytePos`) to a [`Location`].
293	pub fn get_pos_location(&self, pos: BytePos) -> Location {
294		let line_idx = self.get_pos_line_idx(pos);
295		let line_start = self.lines[line_idx];
296
297		let pos_cpos = self.get_charpos(pos);
298		let line_start_cpos = self.get_charpos(line_start);
299
300		Location {
301			line: line_idx + 1,
302			column: (pos_cpos.as_usize() - line_start_cpos.as_usize()),
303		}
304	}
305
306	/// Get's the contents of a line which is located at the zero indexed `idx`.
307	pub fn get_idx_line(&self, idx: usize) -> &'a str {
308		let line_end_idx = self.lines.get(idx + 1);
309
310		let line_start = self.lines[idx];
311
312		// end of the line (-1 to get the last char of the line)
313		let line_end = BytePos::from_usize(
314			line_end_idx.map_or_else(|| self.content.len(), |&idx| idx.as_usize() - 1),
315		);
316
317		&self.content[ByteSpan::new(line_start, line_end)]
318	}
319
320	/// Get's the contents of a line on which `pos` is located on.
321	pub fn get_pos_line(&self, pos: BytePos) -> &'a str {
322		self.get_idx_line(self.get_pos_line_idx(pos))
323	}
324
325	/// Returns the origin of the source.
326	pub const fn origin(&self) -> &SourceOrigin<'_> {
327		&self.origin
328	}
329
330	/// Returns the whole content of the source.
331	pub const fn content(&self) -> &str {
332		self.content
333	}
334}
335
336impl Deref for Source<'_> {
337	type Target = str;
338
339	fn deref(&self) -> &Self::Target {
340		self.content
341	}
342}
343
#[cfg(test)]
mod tests {
	use super::*;

	/// Positions at the start of a line must resolve to column `0` of the
	/// respective line.
	#[test]
	fn location_lines() {
		crate::tests::setup_test_env();

		let src = Source::anonymous("Hello\nWorld\nFoo\nBar");

		let cases = [
			// very first byte of the content
			(BytePos::new(0), Location { line: 1, column: 0 }),
			// first byte after the first line break
			(BytePos::new(6), Location { line: 2, column: 0 }),
		];

		for (pos, expected) in cases {
			assert_eq!(src.get_pos_location(pos), expected);
		}
	}

	/// Tabs count as four columns while `\r` does not take up any column.
	#[test]
	fn location_special() {
		crate::tests::setup_test_env();

		let src = Source::anonymous("\tA\r\n\t\tHello");

		let cases = [
			// the `A` following a single tab
			(BytePos::new(1), Location { line: 1, column: 4 }),
			// the `H` following two tabs on the second line
			(BytePos::new(6), Location { line: 2, column: 8 }),
		];

		for (pos, expected) in cases {
			assert_eq!(src.get_pos_location(pos), expected);
		}
	}
}