1#![deny(missing_debug_implementations, missing_docs, rust_2018_idioms)]
4
5#[cfg(test)]
6mod tests;
7
8use nohash_hasher::IntMap;
9
10pub use text_size::{TextRange, TextSize};
11
12#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
14pub struct LineCol {
15 pub line: u32,
17 pub col: u32,
19}
20
21#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
23#[non_exhaustive]
24pub enum WideEncoding {
25 Utf16,
27 Utf32,
29}
30
31impl WideEncoding {
32 pub fn measure(&self, text: &str) -> usize {
34 match self {
35 WideEncoding::Utf16 => text.encode_utf16().count(),
36 WideEncoding::Utf32 => text.chars().count(),
37 }
38 }
39}
40
41#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
47pub struct WideLineCol {
48 pub line: u32,
50 pub col: u32,
52}
53
54#[derive(Debug, Clone, Copy, PartialEq, Eq)]
55struct WideChar {
56 start: TextSize,
58 end: TextSize,
60}
61
62impl WideChar {
63 fn len(&self) -> TextSize {
65 self.end - self.start
66 }
67
68 fn wide_len(&self, enc: WideEncoding) -> u32 {
70 match enc {
71 WideEncoding::Utf16 => {
72 if self.len() == TextSize::from(4) {
73 2
74 } else {
75 1
76 }
77 }
78 WideEncoding::Utf32 => 1,
79 }
80 }
81}
82
83#[derive(Debug, Clone, PartialEq, Eq)]
85pub struct LineIndex {
86 newlines: Box<[TextSize]>,
88 line_wide_chars: IntMap<u32, Box<[WideChar]>>,
90 len: TextSize,
92}
93
94impl LineIndex {
95 pub fn new(text: &str) -> LineIndex {
97 let (newlines, line_wide_chars) = analyze_source_file(text);
98 LineIndex {
99 newlines: newlines.into_boxed_slice(),
100 line_wide_chars,
101 len: TextSize::of(text),
102 }
103 }
104
105 pub fn line_col(&self, offset: TextSize) -> LineCol {
111 self.try_line_col(offset).expect("invalid offset")
112 }
113
114 pub fn try_line_col(&self, offset: TextSize) -> Option<LineCol> {
119 if offset > self.len {
120 return None;
121 }
122 let line = self.newlines.partition_point(|&it| it <= offset);
123 let start = self.start_offset(line)?;
124 let col = offset - start;
125 let ret = LineCol { line: line as u32, col: col.into() };
126 self.line_wide_chars
127 .get(&ret.line)
128 .into_iter()
129 .flat_map(|it| it.iter())
130 .all(|it| col <= it.start || it.end <= col)
131 .then_some(ret)
132 }
133
134 pub fn offset(&self, line_col: LineCol) -> Option<TextSize> {
136 self.start_offset(line_col.line as usize).map(|start| start + TextSize::from(line_col.col))
137 }
138
139 fn start_offset(&self, line: usize) -> Option<TextSize> {
140 match line.checked_sub(1) {
141 None => Some(TextSize::from(0)),
142 Some(it) => self.newlines.get(it).copied(),
143 }
144 }
145
146 pub fn to_wide(&self, enc: WideEncoding, line_col: LineCol) -> Option<WideLineCol> {
148 let mut col = line_col.col;
149 if let Some(wide_chars) = self.line_wide_chars.get(&line_col.line) {
150 for c in wide_chars.iter() {
151 if u32::from(c.end) <= line_col.col {
152 col = col.checked_sub(u32::from(c.len()) - c.wide_len(enc))?;
153 } else {
154 break;
157 }
158 }
159 }
160 Some(WideLineCol { line: line_col.line, col })
161 }
162
163 pub fn to_utf8(&self, enc: WideEncoding, line_col: WideLineCol) -> Option<LineCol> {
165 let mut col = line_col.col;
166 if let Some(wide_chars) = self.line_wide_chars.get(&line_col.line) {
167 for c in wide_chars.iter() {
168 if col > u32::from(c.start) {
169 col = col.checked_add(u32::from(c.len()) - c.wide_len(enc))?;
170 } else {
171 break;
174 }
175 }
176 }
177 Some(LineCol { line: line_col.line, col })
178 }
179
180 pub fn line(&self, line: u32) -> Option<TextRange> {
182 let start = self.start_offset(line as usize)?;
183 let next_newline = self.newlines.get(line as usize).copied().unwrap_or(self.len);
184 let line_length = next_newline - start;
185 Some(TextRange::new(start, start + line_length))
186 }
187
188 pub fn lines(&self, range: TextRange) -> impl Iterator<Item = TextRange> + '_ {
192 let lo = self.newlines.partition_point(|&it| it < range.start());
193 let hi = self.newlines.partition_point(|&it| it <= range.end());
194 let all = std::iter::once(range.start())
195 .chain(self.newlines[lo..hi].iter().copied())
196 .chain(std::iter::once(range.end()));
197
198 all.clone()
199 .zip(all.skip(1))
200 .map(|(lo, hi)| TextRange::new(lo, hi))
201 .filter(|it| !it.is_empty())
202 }
203
204 pub fn len(&self) -> TextSize {
206 self.len
207 }
208}
209
210fn analyze_source_file(src: &str) -> (Vec<TextSize>, IntMap<u32, Box<[WideChar]>>) {
212 assert!(src.len() < !0u32 as usize);
213 let mut lines = vec![];
214 let mut line_wide_chars = IntMap::<u32, Vec<WideChar>>::default();
215
216 analyze_source_file_dispatch(src, &mut lines, &mut line_wide_chars);
218
219 (lines, line_wide_chars.into_iter().map(|(k, v)| (k, v.into_boxed_slice())).collect())
220}
221
222#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
223fn analyze_source_file_dispatch(
224 src: &str,
225 lines: &mut Vec<TextSize>,
226 multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
227) {
228 if is_x86_feature_detected!("sse2") {
229 unsafe {
231 analyze_source_file_sse2(src, lines, multi_byte_chars);
232 }
233 } else {
234 analyze_source_file_generic(src, src.len(), TextSize::from(0), lines, multi_byte_chars);
235 }
236}
237
238#[cfg(target_arch = "aarch64")]
239fn analyze_source_file_dispatch(
240 src: &str,
241 lines: &mut Vec<TextSize>,
242 multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
243) {
244 if std::arch::is_aarch64_feature_detected!("neon") {
245 unsafe {
247 analyze_source_file_neon(src, lines, multi_byte_chars);
248 }
249 } else {
250 analyze_source_file_generic(src, src.len(), TextSize::from(0), lines, multi_byte_chars);
251 }
252}
253
254#[target_feature(enable = "sse2")]
259#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
260unsafe fn analyze_source_file_sse2(
261 src: &str,
262 lines: &mut Vec<TextSize>,
263 multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
264) {
265 #[cfg(target_arch = "x86")]
266 use std::arch::x86::*;
267 #[cfg(target_arch = "x86_64")]
268 use std::arch::x86_64::*;
269
270 const CHUNK_SIZE: usize = 16;
271
272 let src_bytes = src.as_bytes();
273
274 let chunk_count = src.len() / CHUNK_SIZE;
275
276 let mut intra_chunk_offset = 0;
281
282 for chunk_index in 0..chunk_count {
283 let ptr = src_bytes.as_ptr() as *const __m128i;
284 let chunk = unsafe { _mm_loadu_si128(ptr.add(chunk_index)) };
287
288 let multibyte_test = unsafe { _mm_cmplt_epi8(chunk, _mm_set1_epi8(0)) };
291 let multibyte_mask = unsafe { _mm_movemask_epi8(multibyte_test) };
293
294 if multibyte_mask == 0 {
296 assert!(intra_chunk_offset == 0);
297
298 let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) };
300 let newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) };
301
302 if newlines_mask != 0 {
303 let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32;
305 let output_offset = TextSize::from((chunk_index * CHUNK_SIZE + 1) as u32);
306
307 loop {
308 let index = newlines_mask.trailing_zeros();
309
310 if index >= CHUNK_SIZE as u32 {
311 break;
313 }
314
315 lines.push(TextSize::from(index) + output_offset);
316
317 newlines_mask &= (!1) << index;
319 }
320 }
321 continue;
322 }
323
324 let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
327 intra_chunk_offset = analyze_source_file_generic(
328 &src[scan_start..],
329 CHUNK_SIZE - intra_chunk_offset,
330 TextSize::from(scan_start as u32),
331 lines,
332 multi_byte_chars,
333 );
334 }
335
336 let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
338 if tail_start < src.len() {
339 analyze_source_file_generic(
340 &src[tail_start..],
341 src.len() - tail_start,
342 TextSize::from(tail_start as u32),
343 lines,
344 multi_byte_chars,
345 );
346 }
347}
348
349#[target_feature(enable = "neon")]
350#[cfg(target_arch = "aarch64")]
351#[inline]
352unsafe fn move_mask(v: std::arch::aarch64::uint8x16_t) -> u64 {
358 use std::arch::aarch64::*;
359
360 let nibble_mask = unsafe { vshrn_n_u16(vreinterpretq_u16_u8(v), 4) };
361 unsafe { vget_lane_u64(vreinterpret_u64_u8(nibble_mask), 0) }
362}
363
364#[target_feature(enable = "neon")]
365#[cfg(target_arch = "aarch64")]
366unsafe fn analyze_source_file_neon(
367 src: &str,
368 lines: &mut Vec<TextSize>,
369 multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
370) {
371 use std::arch::aarch64::*;
372
373 const CHUNK_SIZE: usize = 16;
374
375 let src_bytes = src.as_bytes();
376
377 let chunk_count = src.len() / CHUNK_SIZE;
378
379 let newline = unsafe { vdupq_n_s8(b'\n' as i8) };
380
381 let mut intra_chunk_offset = 0;
386
387 for chunk_index in 0..chunk_count {
388 let ptr = src_bytes.as_ptr() as *const i8;
389 let chunk = unsafe { vld1q_s8(ptr.add(chunk_index * CHUNK_SIZE)) };
390
391 let multibyte_test = unsafe { vcltzq_s8(chunk) };
394 let multibyte_mask = unsafe { move_mask(multibyte_test) };
396
397 if multibyte_mask == 0 {
399 assert!(intra_chunk_offset == 0);
400
401 let newlines_test = unsafe { vceqq_s8(chunk, newline) };
403 let mut newlines_mask = unsafe { move_mask(newlines_test) };
404
405 if newlines_mask != 0 {
407 let output_offset = TextSize::from((chunk_index * CHUNK_SIZE + 1) as u32);
408
409 while newlines_mask != 0 {
410 let trailing_zeros = newlines_mask.trailing_zeros();
411 let index = trailing_zeros / 4;
412
413 lines.push(TextSize::from(index) + output_offset);
414
415 newlines_mask &= (!0xF) << trailing_zeros;
417 }
418 }
419 continue;
420 }
421
422 let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
423 intra_chunk_offset = analyze_source_file_generic(
424 &src[scan_start..],
425 CHUNK_SIZE - intra_chunk_offset,
426 TextSize::from(scan_start as u32),
427 lines,
428 multi_byte_chars,
429 );
430 }
431
432 let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
433 if tail_start < src.len() {
434 analyze_source_file_generic(
435 &src[tail_start..],
436 src.len() - tail_start,
437 TextSize::from(tail_start as u32),
438 lines,
439 multi_byte_chars,
440 );
441 }
442}
443
444#[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
445fn analyze_source_file_dispatch(
447 src: &str,
448 lines: &mut Vec<TextSize>,
449 multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
450) {
451 analyze_source_file_generic(src, src.len(), TextSize::from(0), lines, multi_byte_chars);
452}
453
454fn analyze_source_file_generic(
458 src: &str,
459 scan_len: usize,
460 output_offset: TextSize,
461 lines: &mut Vec<TextSize>,
462 multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
463) -> usize {
464 assert!(src.len() >= scan_len);
465 let mut i = 0;
466 let src_bytes = src.as_bytes();
467
468 while i < scan_len {
469 let byte = unsafe {
470 *src_bytes.get_unchecked(i)
472 };
473
474 let mut char_len = 1;
477
478 if byte == b'\n' {
479 lines.push(TextSize::from(i as u32 + 1) + output_offset);
480 } else if byte >= 127 {
481 let c = src[i..].chars().next().unwrap();
483 char_len = c.len_utf8();
484
485 let pos = TextSize::from(i as u32) + output_offset
488 - lines.last().unwrap_or(&TextSize::default());
489
490 if char_len > 1 {
491 assert!((2..=4).contains(&char_len));
492 let mbc = WideChar { start: pos, end: pos + TextSize::from(char_len as u32) };
493 multi_byte_chars.entry(lines.len() as u32).or_default().push(mbc);
494 }
495 }
496
497 i += char_len;
498 }
499
500 i - scan_len
501}