solar_interface/source_map/
analyze.rs1use super::MultiByteChar;
2use crate::pos::RelativeBytePos;
3
4pub(crate) fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>) {
10 let lines_upper_bound = 1 + src.len() / 32;
12 let mut lines = Vec::with_capacity(lines_upper_bound);
13 lines.push(RelativeBytePos::from_u32(0));
14
15 let mut multi_byte_chars = vec![];
16
17 analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars);
19
20 if let Some(&last_line_start) = lines.last() {
24 let source_file_end = RelativeBytePos::from_usize(src.len());
25 assert!(source_file_end >= last_line_start);
26 if last_line_start == source_file_end {
27 lines.pop();
28 }
29 }
30
31 (lines, multi_byte_chars)
32}
33
34fn analyze_source_file_dispatch(
35 src: &str,
36 lines: &mut Vec<RelativeBytePos>,
37 multi_byte_chars: &mut Vec<MultiByteChar>,
38) {
39 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
40 if is_x86_feature_detected!("sse2") {
41 unsafe { analyze_source_file_sse2(src, lines, multi_byte_chars) };
42 return;
43 }
44 analyze_source_file_generic(
45 src,
46 src.len(),
47 RelativeBytePos::from_u32(0),
48 lines,
49 multi_byte_chars,
50 );
51}
52
53#[target_feature(enable = "sse2")]
58#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
59unsafe fn analyze_source_file_sse2(
60 src: &str,
61 lines: &mut Vec<RelativeBytePos>,
62 multi_byte_chars: &mut Vec<MultiByteChar>,
63) {
64 #[cfg(target_arch = "x86")]
65 use std::arch::x86::*;
66 #[cfg(target_arch = "x86_64")]
67 use std::arch::x86_64::*;
68
69 const CHUNK_SIZE: usize = 16;
70
71 let (chunks, tail) = src.as_bytes().as_chunks::<CHUNK_SIZE>();
72
73 let mut intra_chunk_offset = 0;
78
79 for (chunk_index, chunk) in chunks.iter().enumerate() {
80 let chunk = unsafe { _mm_loadu_si128(chunk.as_ptr() as *const __m128i) };
83
84 let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0));
87 let multibyte_mask = _mm_movemask_epi8(multibyte_test);
89
90 if multibyte_mask == 0 {
92 assert!(intra_chunk_offset == 0);
93
94 let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8));
96 let mut newlines_mask = _mm_movemask_epi8(newlines_test);
97
98 let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);
99
100 while newlines_mask != 0 {
101 let index = newlines_mask.trailing_zeros();
102
103 lines.push(RelativeBytePos(index) + output_offset);
104
105 newlines_mask &= newlines_mask - 1;
107 }
108 } else {
109 let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
112 intra_chunk_offset = analyze_source_file_generic(
113 &src[scan_start..],
114 CHUNK_SIZE - intra_chunk_offset,
115 RelativeBytePos::from_usize(scan_start),
116 lines,
117 multi_byte_chars,
118 );
119 }
120 }
121
122 let tail_start = src.len() - tail.len() + intra_chunk_offset;
124 if tail_start < src.len() {
125 analyze_source_file_generic(
126 &src[tail_start..],
127 src.len() - tail_start,
128 RelativeBytePos::from_usize(tail_start),
129 lines,
130 multi_byte_chars,
131 );
132 }
133}
134
135fn analyze_source_file_generic(
139 src: &str,
140 scan_len: usize,
141 output_offset: RelativeBytePos,
142 lines: &mut Vec<RelativeBytePos>,
143 multi_byte_chars: &mut Vec<MultiByteChar>,
144) -> usize {
145 assert!(src.len() >= scan_len);
146 let mut i = 0;
147 let src_bytes = src.as_bytes();
148
149 while i < scan_len {
150 let byte = unsafe {
151 *src_bytes.get_unchecked(i)
153 };
154
155 let mut char_len = 1;
158
159 if byte == b'\n' {
160 let pos = RelativeBytePos::from_usize(i) + output_offset;
161 lines.push(pos + RelativeBytePos(1));
162 } else if byte >= 128 {
163 let c = src[i..].chars().next().unwrap();
165 char_len = c.len_utf8();
166
167 let pos = RelativeBytePos::from_usize(i) + output_offset;
168 assert!((2..=4).contains(&char_len));
169 let mbc = MultiByteChar { pos, bytes: char_len as u8 };
170 multi_byte_chars.push(mbc);
171 }
172
173 i += char_len;
174 }
175
176 i - scan_len
177}
178
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `analyze_source_file` on `text` and compares both outputs with
    /// the expected raw line offsets and `(pos, bytes)` pairs.
    #[track_caller]
    fn check(text: &str, lines: Vec<u32>, multi_byte_chars: Vec<(u32, u8)>) {
        let (actual_lines, actual_mbcs) = analyze_source_file(text);

        let mut wanted_lines: Vec<RelativeBytePos> = Vec::with_capacity(lines.len());
        for start in lines {
            wanted_lines.push(RelativeBytePos(start));
        }
        assert_eq!(actual_lines, wanted_lines);

        let mut wanted_mbcs: Vec<MultiByteChar> = Vec::with_capacity(multi_byte_chars.len());
        for (pos, bytes) in multi_byte_chars {
            wanted_mbcs.push(MultiByteChar { pos: RelativeBytePos(pos), bytes });
        }
        assert_eq!(actual_mbcs, wanted_mbcs);
    }

    /// Declares one `#[test]` function per case; the work is done by `check`.
    macro_rules! test {
        (
            case: $test_name:ident,
            text: $text:expr,
            lines: $lines:expr,
            multi_byte_chars: $multi_byte_chars:expr,
        ) => {
            #[test]
            fn $test_name() {
                check($text, $lines, $multi_byte_chars);
            }
        };
    }

    test!(
        case: empty_text,
        text: "",
        lines: vec![],
        multi_byte_chars: vec![],
    );

    test!(
        case: newlines_short,
        text: "a\nc",
        lines: vec![0, 2],
        multi_byte_chars: vec![],
    );

    test!(
        case: newlines_long,
        text: "012345678\nabcdef012345678\na",
        lines: vec![0, 10, 26],
        multi_byte_chars: vec![],
    );

    test!(
        case: newline_and_multi_byte_char_in_same_chunk,
        text: "01234β789\nbcdef0123456789abcdef",
        lines: vec![0, 11],
        multi_byte_chars: vec![(5, 2)],
    );

    test!(
        case: newline_and_control_char_in_same_chunk,
        text: "01234\u{07}6789\nbcdef0123456789abcdef",
        lines: vec![0, 11],
        multi_byte_chars: vec![],
    );

    test!(
        case: multi_byte_char_short,
        text: "aβc",
        lines: vec![0],
        multi_byte_chars: vec![(1, 2)],
    );

    test!(
        case: multi_byte_char_long,
        text: "0123456789abcΔf012345β",
        lines: vec![0],
        multi_byte_chars: vec![(13, 2), (22, 2)],
    );

    test!(
        case: multi_byte_char_across_chunk_boundary,
        text: "0123456789abcdeΔ123456789abcdef01234",
        lines: vec![0],
        multi_byte_chars: vec![(15, 2)],
    );

    test!(
        case: multi_byte_char_across_chunk_boundary_tail,
        text: "0123456789abcdeΔ....",
        lines: vec![0],
        multi_byte_chars: vec![(15, 2)],
    );

    test!(
        case: non_narrow_short,
        text: "0\t2",
        lines: vec![0],
        multi_byte_chars: vec![],
    );

    test!(
        case: non_narrow_long,
        text: "01\t3456789abcdef01234567\u{07}9",
        lines: vec![0],
        multi_byte_chars: vec![],
    );

    test!(
        case: output_offset_all,
        text: "01\t345\n789abcΔf01234567\u{07}9\nbcΔf",
        lines: vec![0, 7, 27],
        multi_byte_chars: vec![(13, 2), (29, 2)],
    );
}