string_offsets/lib.rs
1//! Converts string offsets between UTF-8 bytes, UTF-16 code units, Unicode code points, and lines.
2//!
3//! # Example
4//!
5//! ```
6//! use string_offsets::StringOffsets;
7//!
8//! let s = "☀️hello\n🗺️world\n";
9//! let offsets: StringOffsets = StringOffsets::new(s);
10//!
11//! // Find offsets where lines begin and end.
12//! assert_eq!(offsets.line_to_utf8s(0), 0..12); // note: 0-based line numbers
13//!
14//! // Translate string offsets between UTF-8 and other encodings.
15//! // This map emoji is 7 UTF-8 bytes...
16//! assert_eq!(&s[12..19], "🗺️");
17//! // ...but only 3 UTF-16 code units...
18//! assert_eq!(offsets.utf8_to_utf16(12), 8);
19//! assert_eq!(offsets.utf8_to_utf16(19), 11);
20//! // ...and only 2 Unicode code points.
21//! assert_eq!(offsets.utf8s_to_chars(12..19), 8..10);
22//! ```
23//!
24//! See [`StringOffsets`] for details.
25#![deny(missing_docs)]
26
27use std::{marker::PhantomData, ops::Range};
28
29#[cfg(feature = "wasm")]
30use wasm_bindgen::prelude::*;
31
32mod bitrank;
33mod config;
34#[cfg(feature = "wasm")]
35mod wasm;
36
37use bitrank::{BitRank, BitRankBuilder};
38use config::{Bool, ConfigType, True};
39
40pub use config::{AllConfig, OnlyLines};
41
42/// Converts positions within a given string between UTF-8 byte offsets (the usual in Rust), UTF-16
43/// code units, Unicode code points, and line numbers.
44///
45/// Rust strings are UTF-8, but JavaScript has UTF-16 strings, and in Python, strings are sequences
46/// of Unicode code points. It's therefore necessary to adjust string offsets when communicating
47/// across programming language boundaries. [`StringOffsets`] does these adjustments.
48///
49/// Each `StringOffsets` instance contains offset information for a single string. [Building the
50/// data structure](StringOffsets::new) takes O(n) time and memory, but then most conversions are
51/// O(1).
52///
53/// ["UTF-8 Conversions with BitRank"](https://adaptivepatchwork.com/2023/07/10/utf-conversion/)
54/// is a blog post explaining the implementation.
55///
56/// ## Converting offsets
57///
58/// The conversion methods follow a naming scheme that uses these terms for different kinds of
59/// offsets:
60///
61/// - `utf8` - UTF-8 byte offsets (Rust style).
62/// - `utf16` - UTF-16 code unit offsets (JavaScript style).
63/// - `char` - Count of Unicode scalar values (Python style).
64/// - `utf16_pos` - Zero-based line number and `utf16` offset within the line.
65/// - `char_pos` - Zero-based line number and `char` offset within the line.
66///
67/// For example, [`StringOffsets::utf8_to_utf16`] converts a Rust byte offset to a number that will
68/// index to the same position in a JavaScript string. Offsets are expressed as `usize` or [`Pos`]
69/// values.
70///
71/// All methods accept arguments that are past the end of the string, interpreting them as pointing
72/// to the end of the string.
73///
74/// ## Converting ranges
75///
76/// Some methods translate position *ranges*. These are expressed as `Range<usize>` except for
77/// `line`, which is a `usize`:
78///
79/// - `line` - Zero-based line numbers. The range a `line` refers to is the whole line, including
80/// the trailing newline character if any.
81/// - `lines` - A range of line numbers.
82/// - `utf8s` - UTF-8 byte ranges.
83/// - `utf16s` - UTF-16 code unit ranges.
84/// - `chars` - Ranges of Unicode scalar values.
85///
86/// When mapping offsets to line ranges, it is important to use a `_to_lines` function in order to
87/// end up with the correct line range. We have these methods because if you tried to do it
88/// yourself you would screw it up; use them! (And see the source code for
89/// [`StringOffsets::utf8s_to_lines`] if you don't believe us.)
90///
91/// ## Complexity
92///
93/// Most operations run in O(1) time. A few require O(log n) time. The memory consumed by this
94/// data structure is typically less than the memory occupied by the actual content. In the best
95/// case, it requires ~45% of the content space.
96/// One can reduce memory requirements further by only requesting the necessary features via the
97/// configuration type.
pub struct StringOffsets<C: ConfigType = AllConfig> {
    /// Vector storing, for every line, the byte position at which the line starts.
    ///
    /// The vector always begins with `0`. When line tracking is enabled, the builder also
    /// guarantees that the final entry equals the total byte length of the content, so the
    /// vector holds one more entry than there are lines. Offsets are stored as `u32`.
    line_begins: Vec<u32>,

    /// Encoded bitrank where the rank of a byte position corresponds to the line number to which
    /// the byte belongs.
    ///
    /// A bit is set at the offset of every `\n` byte, plus one at the last byte of the content
    /// when the content does not end with a newline.
    utf8_to_line: BitRank,

    /// Encoded bitrank where the start of a utf8 code point is marked with a 1 bit.
    /// The rank of a byte position + 1 corresponds to the char position + 1 to which
    /// the byte belongs.
    ///
    /// One additional bit is set at the position equal to the content length, so that the
    /// end-of-string offset can be converted as well.
    utf8_to_char: BitRank,

    /// Encoded bitrank where a multi word utf16 code point is marked with a 1 bit.
    /// Converting a byte position into a utf16 word position is achieved by combining utf8_to_char
    /// and utf8_to_utf16 rank information.
    ///
    /// The marked byte is the one whose upper four bits are all set, i.e. the leading byte of a
    /// four-byte UTF-8 sequence.
    utf8_to_utf16: BitRank,

    /// Marks, for every line, whether it consists only of whitespace characters.
    ///
    /// Whitespace here means tab, carriage return, and space; the terminating newline does not
    /// count against a line.
    whitespace_only: Vec<bool>,

    /// Configuration type.
    ///
    /// Zero-sized marker; it only selects which conversion methods are available.
    _config: PhantomData<C>,
}
122
/// A position in a string, specified by line and column number.
///
/// Both components are zero-based. Values of this type are produced by
/// [`StringOffsets::utf8_to_char_pos`] and [`StringOffsets::utf8_to_utf16_pos`].
#[cfg_attr(feature = "wasm", wasm_bindgen)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Pos {
    /// Zero-indexed line number.
    pub line: usize,
    /// Zero-indexed column number. The units of this field depend on the method that produces the
    /// value. See [`StringOffsets::utf8_to_char_pos`], [`StringOffsets::utf8_to_utf16_pos`].
    pub col: usize,
}
133
134// The actual conversion implementation between utf8, utf16, chars, and line numbers.
135// New methods must follow the existing conventions:
136//
137// - All conversions saturate when the input is out of bounds.
138// - Lines INCLUDE the terminating newline.
139// - Line numbers and column numbers are 0-based.
// - `.xyz_to_lines(range)` methods behave like `.utf8s_to_lines(the corresponding byte range)`.
//
// This last one is tricky, because in these methods, `range.start` "rounds down" to the beginning
// of the line, but `range.end` "rounds up"; and because there are many corner cases.
144//
145// E.g.: The empty character range at the end of one line cannot be distinguished from the empty
146// character range at the start of the subsequent line! This ambiguity is resolved by returning the
147// line which starts with the empty character range.
148//
149// Question: Consider whether we should return an empty line range in this case which would
150// probably be consistent from a mathematical point of view. But then we should also return empty
151// line ranges for empty character ranges in the middle of a line...
152impl<C: ConfigType> StringOffsets<C> {
153 /// Create a new converter to work with offsets into the given string.
154 pub fn new(content: &str) -> Self {
155 new_converter(content.as_bytes())
156 }
157
158 /// Create a new converter to work with offsets into the given byte-string.
159 ///
160 /// If `content` is UTF-8, this is just like [`StringOffsets::new`]. Otherwise, the
161 /// conversion methods will produce unspecified (but memory-safe) results.
162 pub fn from_bytes(content: &[u8]) -> Self {
163 new_converter(content)
164 }
165}
166
167impl<C: ConfigType<HasLines = True>> StringOffsets<C> {
168 /// Returns the number of bytes in the string.
169 pub fn len(&self) -> usize {
170 self.line_begins.last().copied().unwrap_or(0) as usize
171 }
172
173 /// Returns whether there are no bytes in the string.
174 pub fn is_empty(&self) -> bool {
175 self.line_begins.is_empty()
176 }
177
178 /// Returns the number of lines in the string.
179 pub fn lines(&self) -> usize {
180 self.line_begins.len() - 1
181 }
182
183 /// Return the byte offset of the first character on the specified (zero-based) line.
184 ///
185 /// If `line_number` is greater than or equal to the number of lines in the text, this returns
186 /// the length of the string.
187 pub fn line_to_utf8_begin(&self, line_number: usize) -> usize {
188 self.line_begins[line_number.min(self.lines())] as usize
189 }
190
191 /// UTF-8 offset of the first character of a line.
192 pub fn line_to_utf8_end(&self, line_number: usize) -> usize {
193 self.line_to_utf8_begin(line_number + 1)
194 }
195
196 /// Return the zero-based line number of the line containing the specified UTF-8 offset.
197 /// Newline characters count as part of the preceding line.
198 pub fn utf8_to_line(&self, byte_number: usize) -> usize {
199 self.utf8_to_line.rank(byte_number)
200 }
201
202 /// Returns the range of line numbers containing the substring specified by the Rust-style
203 /// range `bytes`. Newline characters count as part of the preceding line.
204 ///
205 /// If `bytes` is an empty range at a position within or at the beginning of a line, this
206 /// returns a nonempty range containing the line number of that one line. An empty range at or
207 /// beyond the end of the string translates to an empty range of line numbers.
208 pub fn utf8s_to_lines(&self, bytes: Range<usize>) -> Range<usize> {
209 // The fiddly parts of this formula are necessary because `bytes.start` rounds down to the
210 // beginning of the line, but `bytes.end` "rounds up" to the end of the line. the final
211 // `+1` is to produce a half-open range.
212 self.utf8_to_line(bytes.start)
213 ..self
214 .lines()
215 .min(self.utf8_to_line(bytes.end.saturating_sub(1).max(bytes.start)) + 1)
216 }
217
218 /// UTF-8 offset one past the end of a line (the offset of the start of the next line).
219 pub fn line_to_utf8s(&self, line_number: usize) -> Range<usize> {
220 self.line_to_utf8_begin(line_number)..self.line_to_utf8_end(line_number)
221 }
222
223 /// UTF-8 offsets for the beginning and end of a range of lines, including the newline if any.
224 pub fn lines_to_utf8s(&self, line_numbers: Range<usize>) -> Range<usize> {
225 self.line_to_utf8_begin(line_numbers.start)..self.line_to_utf8_begin(line_numbers.end)
226 }
227}
228
229impl<C: ConfigType<HasChars = True, HasLines = True>> StringOffsets<C> {
230 /// Returns the number of Unicode characters on the specified line.
231 pub fn line_chars(&self, line_number: usize) -> usize {
232 let r = self.utf8s_to_chars(self.line_to_utf8s(line_number));
233 r.end - r.start
234 }
235
236 /// UTF-32 offset of the first character of a line.
237 ///
238 /// That is, return the offset that would point to the start of that line in a UTF-32
239 /// representation of the source string.
240 pub fn line_to_char_begin(&self, line_number: usize) -> usize {
241 self.utf8_to_char(self.line_to_utf8_begin(line_number))
242 }
243
244 /// UTF-32 offset one past the end of a line (the offset of the start of the next line).
245 pub fn line_to_char_end(&self, line_number: usize) -> usize {
246 self.utf8_to_char(self.line_to_utf8_end(line_number))
247 }
248
249 /// UTF-32 offsets for the beginning and end of a line, including the newline if any.
250 pub fn line_to_chars(&self, line_number: usize) -> Range<usize> {
251 self.utf8s_to_chars(self.line_to_utf8s(line_number))
252 }
253
254 /// UTF-32 offsets for the beginning and end of a range of lines, including the newline if any.
255 pub fn lines_to_chars(&self, line_numbers: Range<usize>) -> Range<usize> {
256 self.utf8s_to_chars(self.lines_to_utf8s(line_numbers))
257 }
258
259 /// Converts a UTF-8 offset to a zero-based line number and UTF-32 offset within the
260 /// line.
261 pub fn utf8_to_char_pos(&self, byte_number: usize) -> Pos {
262 let line = self.utf8_to_line(byte_number);
263 let line_start_char_number = self.line_to_char_begin(line);
264 let char_idx = self.utf8_to_char(byte_number);
265 Pos {
266 line,
267 col: char_idx - line_start_char_number,
268 }
269 }
270
271 /// Returns the range of line numbers containing the substring specified by the UTF-32
272 /// range `chars`. Newline characters count as part of the preceding line.
273 pub fn chars_to_lines(&self, chars: Range<usize>) -> Range<usize> {
274 self.utf8s_to_lines(self.chars_to_utf8s(chars))
275 }
276}
277
278impl<C: ConfigType<HasWhitespace = True>> StringOffsets<C> {
279 /// Returns true if the specified line is empty except for whitespace.
280 pub fn only_whitespaces(&self, line_number: usize) -> bool {
281 self.whitespace_only
282 .get(line_number)
283 .copied()
284 .unwrap_or(true)
285 }
286}
287
288impl<C: ConfigType<HasChars = True>> StringOffsets<C> {
289 /// Converts a UTF-8 offset to a UTF-32 offset.
290 pub fn utf8_to_char(&self, byte_number: usize) -> usize {
291 self.utf8_to_char.rank(byte_number + 1) - 1
292 }
293
294 /// Converts a UTF-32 offset to a UTF-8 offset.
295 pub fn char_to_utf8(&self, char_number: usize) -> usize {
296 let mut byte_number = char_number;
297 for _ in 0..128 {
298 let char_number2 = self.utf8_to_char(byte_number);
299 if char_number2 == char_number {
300 return byte_number;
301 }
302 byte_number += char_number - char_number2;
303 }
304 // If we couldn't find the char within 128 steps, then the char_number might be invalid!
305 // This does not usually happen. For consistency with the rest of the code, we simply return
306 // the max utf8 position in this case.
307 if char_number >= self.utf8_to_char.max_rank() {
308 return self
309 .line_begins
310 .last()
311 .copied()
312 .expect("last entry represents the length of the file!")
313 as usize;
314 }
315 let limit = *self.line_begins.last().expect("no line begins") as usize;
316 // Otherwise, we keep searching, but are a bit more careful and add a check that we don't run into an infinite loop.
317 loop {
318 let char_number2 = self.utf8_to_char(byte_number);
319 if char_number2 == char_number {
320 return byte_number;
321 }
322 byte_number += char_number - char_number2;
323 assert!(byte_number < limit);
324 }
325 }
326
327 /// Converts a UTF-8 offset range to a UTF-32 offset range.
328 pub fn utf8s_to_chars(&self, bytes: Range<usize>) -> Range<usize> {
329 self.utf8_to_char(bytes.start)..self.utf8_to_char(bytes.end)
330 }
331
332 /// Converts a UTF-32 offset range to a UTF-8 offset range.
333 pub fn chars_to_utf8s(&self, chars: Range<usize>) -> Range<usize> {
334 self.char_to_utf8(chars.start)..self.char_to_utf8(chars.end)
335 }
336}
337
338impl<C: ConfigType<HasChars = True, HasUtf16 = True>> StringOffsets<C> {
339 /// Converts a UTF-8 offset to a UTF-16 offset.
340 pub fn utf8_to_utf16(&self, byte_number: usize) -> usize {
341 self.utf8_to_char(byte_number) + self.utf8_to_utf16.rank(byte_number)
342 }
343}
344
345impl<C: ConfigType<HasChars = True, HasLines = True, HasUtf16 = True>> StringOffsets<C> {
346 /// UTF-16 offset of the first character of a line.
347 ///
348 /// That is, return the offset that would point to the start of that line in a UTF-16
349 /// representation of the source string.
350 pub fn line_to_utf16_begin(&self, line_number: usize) -> usize {
351 self.utf8_to_utf16(self.line_to_utf8_begin(line_number))
352 }
353
354 /// UTF-16 offset one past the end of a line (the offset of the start of the next line).
355 pub fn line_to_utf16_end(&self, line_number: usize) -> usize {
356 self.utf8_to_utf16(self.line_to_utf8_end(line_number))
357 }
358
359 /// Converts a UTF-8 offset to a zero-based line number and UTF-16 offset within the
360 /// line.
361 pub fn utf8_to_utf16_pos(&self, byte_number: usize) -> Pos {
362 let line = self.utf8_to_line(byte_number);
363 let line_start_char_number = self.line_to_utf16_begin(line);
364 let char_idx = self.utf8_to_utf16(byte_number);
365 Pos {
366 line,
367 col: char_idx - line_start_char_number,
368 }
369 }
370}
371
372fn new_converter<C: ConfigType>(content: &[u8]) -> StringOffsets<C> {
373 let n = content.len();
374 let mut utf8_builder =
375 BitRankBuilder::with_capacity(if C::HasChars::VALUE { n + 1 } else { 0 });
376 let mut utf16_builder = BitRankBuilder::with_capacity(if C::HasUtf16::VALUE { n } else { 0 });
377 let mut line_builder = BitRankBuilder::with_capacity(if C::HasLines::VALUE { n } else { 0 });
378 let mut line_begins = vec![0];
379 let mut whitespace_only = vec![];
380 let mut only_whitespaces = true; // true if all characters in the current line are whitespaces.
381 for (i, &c) in content.iter().enumerate() {
382 // Note: We expect here proper utf8 encoded strings! Otherwise, the conversion will have undefined behaviour.
383 if C::HasChars::VALUE && is_char_boundary(c) {
384 utf8_builder.push(i);
385 }
386 if C::HasUtf16::VALUE && two_utf16(c) {
387 utf16_builder.push(i);
388 }
389 if c == b'\n' {
390 if C::HasWhitespace::VALUE {
391 whitespace_only.push(only_whitespaces);
392 only_whitespaces = true; // reset for next line.
393 }
394 if C::HasLines::VALUE {
395 line_begins.push(i as u32 + 1);
396 line_builder.push(i);
397 }
398 } else if C::HasWhitespace::VALUE {
399 only_whitespaces = only_whitespaces && matches!(c, b'\t' | b'\r' | b' ');
400 }
401 }
402 if C::HasChars::VALUE {
403 utf8_builder.push(n);
404 }
405 if line_begins.last() != Some(&(n as u32)) {
406 if C::HasWhitespace::VALUE {
407 whitespace_only.push(only_whitespaces);
408 }
409 if C::HasLines::VALUE {
410 line_begins.push(n as u32);
411 line_builder.push(n - 1);
412 }
413 }
414
415 StringOffsets {
416 line_begins,
417 utf8_to_line: line_builder.finish(),
418 whitespace_only,
419 utf8_to_char: utf8_builder.finish(),
420 utf8_to_utf16: utf16_builder.finish(),
421 _config: PhantomData,
422 }
423}
424
/// Returns true if, in a UTF-8 string, `b` indicates the first byte of a character.
///
/// Only continuation bytes (bit pattern `10xx_xxxx`) are rejected.
fn is_char_boundary(b: u8) -> bool {
    (b & 0b1100_0000) != 0b1000_0000
}
429
/// Returns true if the upper four bits of `c` are all set, which in UTF-8 is the leading byte of
/// a four-byte sequence (a character that needs two UTF-16 code units).
fn two_utf16(c: u8) -> bool {
    c >= 0b1111_0000
}
433
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the number of bytes a UTF-8 char occupies, given the first byte of the UTF-8 encoding.
    /// Returns 0 if the byte is not a valid first byte of a UTF-8 char.
    fn utf8_width(c: u8) -> usize {
        // Every nibble represents the utf8 length given the first 4 bits of a utf8 encoded byte.
        // The lowest nibble belongs to the bit prefix 0x0, the highest one to 0xF.
        const UTF8_WIDTH: u64 = 0x4322_0000_1111_1111;
        ((UTF8_WIDTH >> ((c >> 4) * 4)) & 0xf) as usize
    }

    /// Returns the number of UTF-16 code units needed for the character whose UTF-8 encoding
    /// starts at `content[0]`, or 0 if that byte cannot start a character.
    fn utf8_to_utf16_width(content: &[u8]) -> usize {
        let len = utf8_width(content[0]);
        match len {
            0 => 0,
            1..=3 => 1,
            4 => 2,
            _ => panic!("invalid utf8 char width: {}", len),
        }
    }

    /// Checks `utf8_width` against the standard library for every `char`, and checks that it
    /// agrees with `is_char_boundary` for every possible byte.
    #[test]
    fn test_utf8_char_width() {
        for c in '\0'..=char::MAX {
            let mut dst = [0; 4];
            let len = c.encode_utf8(&mut dst).len();
            assert_eq!(len, utf8_width(dst[0]), "char: {:?} {len}", dst[0] >> 4);
        }

        for b in 0..=255u8 {
            if !is_char_boundary(b) {
                assert_eq!(utf8_width(b), 0, "char: {:?}", b >> 4);
            } else {
                assert!(utf8_width(b) > 0, "char: {:?}", b >> 4);
            }
        }
    }

    /// Checks `utf8_to_utf16_width` against `char::len_utf16` for every `char`.
    #[test]
    fn test_utf8_to_utf16_len() {
        for c in '\0'..=char::MAX {
            let mut dst = [0; 4];
            let _len = c.encode_utf8(&mut dst).len();
            assert_eq!(utf8_to_utf16_width(&dst), c.len_utf16());
        }

        for b in 0..=255u8 {
            if !is_char_boundary(b) {
                assert_eq!(utf8_to_utf16_width(&[b]), 0);
            }
        }
    }

    /// Line/byte conversions on pure ASCII content whose last line has no terminating newline.
    #[test]
    fn test_line_map() {
        let content = r#"a short line.
followed by another one.
no terminating newline!"#;
        let lines: StringOffsets = StringOffsets::new(content);
        assert_eq!(lines.line_to_utf8s(0), 0..14);
        assert_eq!(&content[0..14], "a short line.\n");
        assert_eq!(lines.line_to_utf8s(1), 14..39);
        assert_eq!(&content[14..39], "followed by another one.\n");
        assert_eq!(lines.line_to_utf8s(2), 39..62);
        assert_eq!(&content[39..62], "no terminating newline!");
        assert_eq!(lines.utf8_to_line(0), 0);
        assert_eq!(lines.utf8_to_line(13), 0);
        assert_eq!(lines.utf8_to_line(14), 1);
        assert_eq!(lines.utf8_to_line(38), 1);
        assert_eq!(lines.utf8_to_line(39), 2);
        assert_eq!(lines.utf8_to_line(61), 2);
        assert_eq!(lines.utf8_to_line(62), 3); // <<-- this character is beyond the content.
        assert_eq!(lines.utf8_to_line(100), 3);
        assert_eq!(lines.utf8s_to_chars(4..10), 4..10);
        assert_eq!(lines.chars_to_utf8s(4..10), 4..10);

        assert_eq!(content.len(), 62);
        assert_eq!(lines.lines_to_utf8s(2..3), 39..62);
        assert_eq!(lines.lines_to_utf8s(2..4), 39..62);
        assert_eq!(lines.lines_to_chars(2..4), 39..62);
        assert_eq!(lines.utf8s_to_lines(39..62), 2..3);
        assert_eq!(lines.utf8s_to_lines(39..63), 2..3); // The "invalid" utf8 position results in a valid line position.
        assert_eq!(lines.char_to_utf8(62), 62);
        assert_eq!(lines.char_to_utf8(63), 62); // char 63 doesn't exist, so we map to the closest valid utf8 position.

        // Empty ranges: inside the content they select the line they sit on; at or past the end
        // of the content they select no line at all.
        assert_eq!(lines.utf8s_to_lines(0..0), 0..1);
        assert_eq!(lines.utf8s_to_lines(13..13), 0..1);
        assert_eq!(lines.utf8s_to_lines(14..14), 1..2);
        assert_eq!(lines.utf8s_to_lines(38..38), 1..2);
        assert_eq!(lines.utf8s_to_lines(39..39), 2..3);
        assert_eq!(lines.utf8s_to_lines(61..61), 2..3);
        assert_eq!(lines.utf8s_to_lines(62..62), 3..3);
        assert_eq!(lines.utf8s_to_lines(63..63), 3..3);
    }

    /// Shorthand constructor for the expected values below.
    fn pos(line: usize, col: usize) -> Pos {
        Pos { line, col }
    }

    /// For ASCII content, byte offsets and char columns coincide within each line.
    #[test]
    fn test_convert_ascii() {
        let content = r#"line0
line1"#;
        let lines: StringOffsets = StringOffsets::new(content);
        assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0));
        assert_eq!(lines.utf8_to_char_pos(1), pos(0, 1));
        assert_eq!(lines.utf8_to_char_pos(6), pos(1, 0));
        assert_eq!(lines.utf8_to_char_pos(7), pos(1, 1));
    }

    /// Position conversions on multi-line content that contains multi-byte characters.
    #[test]
    fn test_convert_unicode() {
        // Byte layout: line 0 starts at byte 0, line 1 at byte 13, line 2 at byte 19.
        let content = r#"❤️ line0
line1
✅ line2"#;
        let lines: StringOffsets = StringOffsets::new(content);
        assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0)); // ❤️ takes 6 bytes to represent in utf8 (2 code points)
        assert_eq!(lines.utf8_to_char_pos(1), pos(0, 0));
        assert_eq!(lines.utf8_to_char_pos(2), pos(0, 0));
        assert_eq!(lines.utf8_to_char_pos(3), pos(0, 1));
        assert_eq!(lines.utf8_to_char_pos(4), pos(0, 1));
        assert_eq!(lines.utf8_to_char_pos(5), pos(0, 1));

        assert_eq!(lines.utf8_to_char_pos(6), pos(0, 2)); // <space>
        assert_eq!(lines.utf8_to_char_pos(7), pos(0, 3)); // the `l` of "line0"

        assert_eq!(lines.utf8_to_char_pos(13), pos(1, 0)); // the `l` of "line1"

        assert_eq!(lines.utf8_to_char_pos(19), pos(2, 0)); // ✅ takes 3 bytes to represent in utf8 (1 code point)
        assert_eq!(lines.utf8_to_char_pos(20), pos(2, 0));
        assert_eq!(lines.utf8_to_char_pos(21), pos(2, 0));

        assert_eq!(lines.utf8_to_char_pos(22), pos(2, 1)); // <space>

        assert_eq!(lines.utf8_to_utf16_pos(0), pos(0, 0)); // ❤️ takes 4 bytes to represent in utf16 (2 code units, one per code point)
        assert_eq!(lines.utf8_to_utf16_pos(1), pos(0, 0));
        assert_eq!(lines.utf8_to_utf16_pos(2), pos(0, 0));
        assert_eq!(lines.utf8_to_utf16_pos(3), pos(0, 1));
    }

    /// Walks every char boundary of a short string and compares the converted offsets with
    /// counts accumulated from the standard library.
    #[test]
    fn test_small() {
        // Á - 2 bytes utf8
        let content = r#"❤️ line0 ❤️Á 👋"#;
        let lines: StringOffsets = StringOffsets::new(content);
        let mut utf16_index = 0;
        let mut char_index = 0;
        for (byte_index, char) in content.char_indices() {
            assert_eq!(lines.utf8_to_char(byte_index), char_index);
            assert_eq!(lines.utf8_to_utf16(byte_index), utf16_index);
            char_index += 1;
            utf16_index += char.len_utf16();
        }
        assert_eq!(lines.utf8_to_char(content.len()), char_index);
        assert_eq!(lines.utf8_to_utf16(content.len()), utf16_index);
    }

    /// Exercises characters of every UTF-8 width (1 to 4 bytes) on a single line.
    #[test]
    fn test_variable_lengths() {
        let content = r#"❤️Á 👋"#;
        // Byte layout of `content` (13 bytes in total):
        //   ❤️   bytes 0..6  - utf8: 2 chars, 3 bytes ea., utf16: 2 code units
        //   Á    bytes 6..8  - utf8: 1 char, 2 bytes, utf16: 1 code unit
        //   ' '  bytes 8..9  - utf8: 1 char, 1 byte, utf16: 1 code unit
        //   👋   bytes 9..13 - utf8: 1 char, 4 bytes, utf16: 2 code units
        let lines: StringOffsets = StringOffsets::new(content);

        // UTF-16 positions
        assert_eq!(lines.utf8_to_utf16_pos(0), pos(0, 0)); // ❤️
        assert_eq!(lines.utf8_to_utf16_pos(1), pos(0, 0));
        assert_eq!(lines.utf8_to_utf16_pos(2), pos(0, 0));
        assert_eq!(lines.utf8_to_utf16_pos(3), pos(0, 1));
        assert_eq!(lines.utf8_to_utf16_pos(5), pos(0, 1));
        assert_eq!(lines.utf8_to_utf16_pos(4), pos(0, 1));
        assert_eq!(lines.utf8_to_utf16_pos(6), pos(0, 2)); // Á
        assert_eq!(lines.utf8_to_utf16_pos(7), pos(0, 2));
        assert_eq!(lines.utf8_to_utf16_pos(8), pos(0, 3)); // <space>
        assert_eq!(lines.utf8_to_utf16_pos(9), pos(0, 4)); // 👋

        // These middle utf8 byte positions don't have valid mappings:
        // assert_eq!(lines.utf8_to_utf16_pos(10), pos(0, 4));
        // assert_eq!(lines.utf8_to_utf16_pos(11), pos(0, 5));
        //
        // 👋 in utf16: 0xd83d 0xdc4b
        // 👋 in utf8: 0xf0 0x9f 0x91 0x8b
        //
        // It's not really defined where these inner bytes map to and it
        // doesn't matter because we would never report those byte offset as
        // they are in the middle of a character and therefore invalid.

        // NOTE(review): byte 12 is still inside 👋 (its last byte); this pins the value the
        // current implementation produces for it.
        assert_eq!(lines.utf8_to_utf16_pos(12), pos(0, 5));

        // Char (code point) positions, queried by UTF-8 byte offset
        assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0)); // ❤️
        assert_eq!(lines.utf8_to_char_pos(1), pos(0, 0));
        assert_eq!(lines.utf8_to_char_pos(2), pos(0, 0));
        assert_eq!(lines.utf8_to_char_pos(3), pos(0, 1));
        assert_eq!(lines.utf8_to_char_pos(4), pos(0, 1));
        assert_eq!(lines.utf8_to_char_pos(5), pos(0, 1));
        assert_eq!(lines.utf8_to_char_pos(6), pos(0, 2)); // Á
        assert_eq!(lines.utf8_to_char_pos(7), pos(0, 2));
        assert_eq!(lines.utf8_to_char_pos(8), pos(0, 3)); // <space>
        assert_eq!(lines.utf8_to_char_pos(9), pos(0, 4)); // 👋
        assert_eq!(lines.utf8_to_char_pos(10), pos(0, 4));
        assert_eq!(lines.utf8_to_char_pos(11), pos(0, 4));
        assert_eq!(lines.utf8_to_char_pos(12), pos(0, 4));
    }

    /// Queries the end-of-content offset of a 16384-byte input without any newline.
    // NOTE(review): presumably 16384 coincides with an internal `BitRank` block boundary —
    // confirm against the `bitrank` module.
    #[test]
    fn test_critical_input_len() {
        let content = [b'a'; 16384];
        let lines: StringOffsets = StringOffsets::from_bytes(&content);
        // The offset one past the single line is reported as the start of line 1.
        assert_eq!(lines.utf8_to_utf16_pos(16384), pos(1, 0));
    }
}
652}