string_offsets/
lib.rs

1//! Converts string offsets between UTF-8 bytes, UTF-16 code units, Unicode code points, and lines.
2//!
3//! # Example
4//!
5//! ```
6//! use string_offsets::StringOffsets;
7//!
8//! let s = "☀️hello\n🗺️world\n";
9//! let offsets: StringOffsets = StringOffsets::new(s);
10//!
11//! // Find offsets where lines begin and end.
12//! assert_eq!(offsets.line_to_utf8s(0), 0..12);  // note: 0-based line numbers
13//!
14//! // Translate string offsets between UTF-8 and other encodings.
15//! // This map emoji is 7 UTF-8 bytes...
16//! assert_eq!(&s[12..19], "🗺️");
17//! // ...but only 3 UTF-16 code units...
18//! assert_eq!(offsets.utf8_to_utf16(12), 8);
19//! assert_eq!(offsets.utf8_to_utf16(19), 11);
20//! // ...and only 2 Unicode code points.
21//! assert_eq!(offsets.utf8s_to_chars(12..19), 8..10);
22//! ```
23//!
24//! See [`StringOffsets`] for details.
25#![deny(missing_docs)]
26
27use std::{marker::PhantomData, ops::Range};
28
29#[cfg(feature = "wasm")]
30use wasm_bindgen::prelude::*;
31
32mod bitrank;
33mod config;
34#[cfg(feature = "wasm")]
35mod wasm;
36
37use bitrank::{BitRank, BitRankBuilder};
38use config::{Bool, ConfigType, True};
39
40pub use config::{AllConfig, OnlyLines};
41
42/// Converts positions within a given string between UTF-8 byte offsets (the usual in Rust), UTF-16
43/// code units, Unicode code points, and line numbers.
44///
45/// Rust strings are UTF-8, but JavaScript has UTF-16 strings, and in Python, strings are sequences
46/// of Unicode code points. It's therefore necessary to adjust string offsets when communicating
47/// across programming language boundaries. [`StringOffsets`] does these adjustments.
48///
49/// Each `StringOffsets` instance contains offset information for a single string. [Building the
50/// data structure](StringOffsets::new) takes O(n) time and memory, but then most conversions are
51/// O(1).
52///
53/// ["UTF-8 Conversions with BitRank"](https://adaptivepatchwork.com/2023/07/10/utf-conversion/)
54/// is a blog post explaining the implementation.
55///
56/// ## Converting offsets
57///
58/// The conversion methods follow a naming scheme that uses these terms for different kinds of
59/// offsets:
60///
61/// - `utf8` - UTF-8 byte offsets (Rust style).
62/// - `utf16` - UTF-16 code unit offsets (JavaScript style).
63/// - `char` - Count of Unicode scalar values (Python style).
64/// - `utf16_pos` - Zero-based line number and `utf16` offset within the line.
65/// - `char_pos` - Zero-based line number and `char` offset within the line.
66///
67/// For example, [`StringOffsets::utf8_to_utf16`] converts a Rust byte offset to a number that will
68/// index to the same position in a JavaScript string. Offsets are expressed as `usize` or [`Pos`]
69/// values.
70///
71/// All methods accept arguments that are past the end of the string, interpreting them as pointing
72/// to the end of the string.
73///
74/// ## Converting ranges
75///
76/// Some methods translate position *ranges*. These are expressed as `Range<usize>` except for
77/// `line`, which is a `usize`:
78///
79/// - `line` - Zero-based line numbers. The range a `line` refers to is the whole line, including
80///   the trailing newline character if any.
81/// - `lines` - A range of line numbers.
82/// - `utf8s` - UTF-8 byte ranges.
83/// - `utf16s` - UTF-16 code unit ranges.
84/// - `chars` - Ranges of Unicode scalar values.
85///
86/// When mapping offsets to line ranges, it is important to use a `_to_lines` function in order to
87/// end up with the correct line range. We have these methods because if you tried to do it
88/// yourself you would screw it up; use them! (And see the source code for
89/// [`StringOffsets::utf8s_to_lines`] if you don't believe us.)
90///
91/// ## Complexity
92///
93/// Most operations run in O(1) time. A few require O(log n) time. The memory consumed by this
94/// data structure is typically less than the memory occupied by the actual content. In the best
95/// case, it requires ~45% of the content space.
96/// One can reduce memory requirements further by only requesting the necessary features via the
97/// configuration type.
98pub struct StringOffsets<C: ConfigType = AllConfig> {
99    /// Vector storing, for every line, the byte position at which the line starts.
100    line_begins: Vec<u32>,
101
102    /// Encoded bitrank where the rank of a byte position corresponds to the line number to which
103    /// the byte belongs.
104    utf8_to_line: BitRank,
105
106    /// Encoded bitrank where the start of a utf8 code point is marked with a 1 bit.
107    /// The rank of a byte position + 1 corresponds to the char position + 1 to which
108    /// the byte belongs.
109    utf8_to_char: BitRank,
110
111    /// Encoded bitrank where a multi word utf16 code point is marked with a 1 bit.
112    /// Converting a byte position into a utf16 word position is achieved by combining utf8_to_char
113    /// and utf8_to_utf16 rank information.
114    utf8_to_utf16: BitRank,
115
116    /// Marks, for every line, whether it consists only of whitespace characters.
117    whitespace_only: Vec<bool>,
118
119    /// Configuration type.
120    _config: PhantomData<C>,
121}
122
/// A position in a string, specified by line and column number.
///
/// Positions order by `line` first and `col` second, so sorting a collection of `Pos` values
/// that share the same column unit yields text order. The default value is line 0, column 0.
#[cfg_attr(feature = "wasm", wasm_bindgen)]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Pos {
    /// Zero-indexed line number.
    pub line: usize,
    /// Zero-indexed column number. The units of this field depend on the method that produces the
    /// value. See [`StringOffsets::utf8_to_char_pos`], [`StringOffsets::utf8_to_utf16_pos`].
    pub col: usize,
}
133
// The actual conversion implementation between utf8, utf16, chars, and line numbers.
// New methods must follow the existing conventions:
//
// - All conversions saturate when the input is out of bounds.
// - Lines INCLUDE the terminating newline.
// - Line numbers and column numbers are 0-based.
// - `.xyz_to_lines(range)` methods behave like `.utf8s_to_lines(the corresponding byte range)`.
//
// This last one is tricky, because in these methods, `range.start` "rounds down" to the beginning
// of the line, but `range.end` "rounds up"; and because there are many corner cases.
//
// E.g.: The empty character range at the end of one line cannot be distinguished from the empty
// character range at the start of the subsequent line! This ambiguity is resolved by returning the
// line which starts with the empty character range.
//
// Question: Should we return an empty line range in this case? That would probably be more
// consistent from a mathematical point of view, but then we should also return empty line ranges
// for empty character ranges in the middle of a line...
152impl<C: ConfigType> StringOffsets<C> {
153    /// Create a new converter to work with offsets into the given string.
154    pub fn new(content: &str) -> Self {
155        new_converter(content.as_bytes())
156    }
157
158    /// Create a new converter to work with offsets into the given byte-string.
159    ///
160    /// If `content` is UTF-8, this is just like [`StringOffsets::new`]. Otherwise, the
161    /// conversion methods will produce unspecified (but memory-safe) results.
162    pub fn from_bytes(content: &[u8]) -> Self {
163        new_converter(content)
164    }
165}
166
167impl<C: ConfigType<HasLines = True>> StringOffsets<C> {
168    /// Returns the number of bytes in the string.
169    pub fn len(&self) -> usize {
170        self.line_begins.last().copied().unwrap_or(0) as usize
171    }
172
173    /// Returns whether there are no bytes in the string.
174    pub fn is_empty(&self) -> bool {
175        self.line_begins.is_empty()
176    }
177
178    /// Returns the number of lines in the string.
179    pub fn lines(&self) -> usize {
180        self.line_begins.len() - 1
181    }
182
183    /// Return the byte offset of the first character on the specified (zero-based) line.
184    ///
185    /// If `line_number` is greater than or equal to the number of lines in the text, this returns
186    /// the length of the string.
187    pub fn line_to_utf8_begin(&self, line_number: usize) -> usize {
188        self.line_begins[line_number.min(self.lines())] as usize
189    }
190
191    /// UTF-8 offset of the first character of a line.
192    pub fn line_to_utf8_end(&self, line_number: usize) -> usize {
193        self.line_to_utf8_begin(line_number + 1)
194    }
195
196    /// Return the zero-based line number of the line containing the specified UTF-8 offset.
197    /// Newline characters count as part of the preceding line.
198    pub fn utf8_to_line(&self, byte_number: usize) -> usize {
199        self.utf8_to_line.rank(byte_number)
200    }
201
202    /// Returns the range of line numbers containing the substring specified by the Rust-style
203    /// range `bytes`. Newline characters count as part of the preceding line.
204    ///
205    /// If `bytes` is an empty range at a position within or at the beginning of a line, this
206    /// returns a nonempty range containing the line number of that one line. An empty range at or
207    /// beyond the end of the string translates to an empty range of line numbers.
208    pub fn utf8s_to_lines(&self, bytes: Range<usize>) -> Range<usize> {
209        // The fiddly parts of this formula are necessary because `bytes.start` rounds down to the
210        // beginning of the line, but `bytes.end` "rounds up" to the end of the line. the final
211        // `+1` is to produce a half-open range.
212        self.utf8_to_line(bytes.start)
213            ..self
214                .lines()
215                .min(self.utf8_to_line(bytes.end.saturating_sub(1).max(bytes.start)) + 1)
216    }
217
218    /// UTF-8 offset one past the end of a line (the offset of the start of the next line).
219    pub fn line_to_utf8s(&self, line_number: usize) -> Range<usize> {
220        self.line_to_utf8_begin(line_number)..self.line_to_utf8_end(line_number)
221    }
222
223    /// UTF-8 offsets for the beginning and end of a range of lines, including the newline if any.
224    pub fn lines_to_utf8s(&self, line_numbers: Range<usize>) -> Range<usize> {
225        self.line_to_utf8_begin(line_numbers.start)..self.line_to_utf8_begin(line_numbers.end)
226    }
227}
228
229impl<C: ConfigType<HasChars = True, HasLines = True>> StringOffsets<C> {
230    /// Returns the number of Unicode characters on the specified line.
231    pub fn line_chars(&self, line_number: usize) -> usize {
232        let r = self.utf8s_to_chars(self.line_to_utf8s(line_number));
233        r.end - r.start
234    }
235
236    /// UTF-32 offset of the first character of a line.
237    ///
238    /// That is, return the offset that would point to the start of that line in a UTF-32
239    /// representation of the source string.
240    pub fn line_to_char_begin(&self, line_number: usize) -> usize {
241        self.utf8_to_char(self.line_to_utf8_begin(line_number))
242    }
243
244    /// UTF-32 offset one past the end of a line (the offset of the start of the next line).
245    pub fn line_to_char_end(&self, line_number: usize) -> usize {
246        self.utf8_to_char(self.line_to_utf8_end(line_number))
247    }
248
249    /// UTF-32 offsets for the beginning and end of a line, including the newline if any.
250    pub fn line_to_chars(&self, line_number: usize) -> Range<usize> {
251        self.utf8s_to_chars(self.line_to_utf8s(line_number))
252    }
253
254    /// UTF-32 offsets for the beginning and end of a range of lines, including the newline if any.
255    pub fn lines_to_chars(&self, line_numbers: Range<usize>) -> Range<usize> {
256        self.utf8s_to_chars(self.lines_to_utf8s(line_numbers))
257    }
258
259    /// Converts a UTF-8 offset to a zero-based line number and UTF-32 offset within the
260    /// line.
261    pub fn utf8_to_char_pos(&self, byte_number: usize) -> Pos {
262        let line = self.utf8_to_line(byte_number);
263        let line_start_char_number = self.line_to_char_begin(line);
264        let char_idx = self.utf8_to_char(byte_number);
265        Pos {
266            line,
267            col: char_idx - line_start_char_number,
268        }
269    }
270
271    /// Returns the range of line numbers containing the substring specified by the UTF-32
272    /// range `chars`. Newline characters count as part of the preceding line.
273    pub fn chars_to_lines(&self, chars: Range<usize>) -> Range<usize> {
274        self.utf8s_to_lines(self.chars_to_utf8s(chars))
275    }
276}
277
278impl<C: ConfigType<HasWhitespace = True>> StringOffsets<C> {
279    /// Returns true if the specified line is empty except for whitespace.
280    pub fn only_whitespaces(&self, line_number: usize) -> bool {
281        self.whitespace_only
282            .get(line_number)
283            .copied()
284            .unwrap_or(true)
285    }
286}
287
288impl<C: ConfigType<HasChars = True>> StringOffsets<C> {
289    /// Converts a UTF-8 offset to a UTF-32 offset.
290    pub fn utf8_to_char(&self, byte_number: usize) -> usize {
291        self.utf8_to_char.rank(byte_number + 1) - 1
292    }
293
294    /// Converts a UTF-32 offset to a UTF-8 offset.
295    pub fn char_to_utf8(&self, char_number: usize) -> usize {
296        let mut byte_number = char_number;
297        for _ in 0..128 {
298            let char_number2 = self.utf8_to_char(byte_number);
299            if char_number2 == char_number {
300                return byte_number;
301            }
302            byte_number += char_number - char_number2;
303        }
304        // If we couldn't find the char within 128 steps, then the char_number might be invalid!
305        // This does not usually happen. For consistency with the rest of the code, we simply return
306        // the max utf8 position in this case.
307        if char_number >= self.utf8_to_char.max_rank() {
308            return self
309                .line_begins
310                .last()
311                .copied()
312                .expect("last entry represents the length of the file!")
313                as usize;
314        }
315        let limit = *self.line_begins.last().expect("no line begins") as usize;
316        // Otherwise, we keep searching, but are a bit more careful and add a check that we don't run into an infinite loop.
317        loop {
318            let char_number2 = self.utf8_to_char(byte_number);
319            if char_number2 == char_number {
320                return byte_number;
321            }
322            byte_number += char_number - char_number2;
323            assert!(byte_number < limit);
324        }
325    }
326
327    /// Converts a UTF-8 offset range to a UTF-32 offset range.
328    pub fn utf8s_to_chars(&self, bytes: Range<usize>) -> Range<usize> {
329        self.utf8_to_char(bytes.start)..self.utf8_to_char(bytes.end)
330    }
331
332    /// Converts a UTF-32 offset range to a UTF-8 offset range.
333    pub fn chars_to_utf8s(&self, chars: Range<usize>) -> Range<usize> {
334        self.char_to_utf8(chars.start)..self.char_to_utf8(chars.end)
335    }
336}
337
338impl<C: ConfigType<HasChars = True, HasUtf16 = True>> StringOffsets<C> {
339    /// Converts a UTF-8 offset to a UTF-16 offset.
340    pub fn utf8_to_utf16(&self, byte_number: usize) -> usize {
341        self.utf8_to_char(byte_number) + self.utf8_to_utf16.rank(byte_number)
342    }
343}
344
345impl<C: ConfigType<HasChars = True, HasLines = True, HasUtf16 = True>> StringOffsets<C> {
346    /// UTF-16 offset of the first character of a line.
347    ///
348    /// That is, return the offset that would point to the start of that line in a UTF-16
349    /// representation of the source string.
350    pub fn line_to_utf16_begin(&self, line_number: usize) -> usize {
351        self.utf8_to_utf16(self.line_to_utf8_begin(line_number))
352    }
353
354    /// UTF-16 offset one past the end of a line (the offset of the start of the next line).
355    pub fn line_to_utf16_end(&self, line_number: usize) -> usize {
356        self.utf8_to_utf16(self.line_to_utf8_end(line_number))
357    }
358
359    /// Converts a UTF-8 offset to a zero-based line number and UTF-16 offset within the
360    /// line.
361    pub fn utf8_to_utf16_pos(&self, byte_number: usize) -> Pos {
362        let line = self.utf8_to_line(byte_number);
363        let line_start_char_number = self.line_to_utf16_begin(line);
364        let char_idx = self.utf8_to_utf16(byte_number);
365        Pos {
366            line,
367            col: char_idx - line_start_char_number,
368        }
369    }
370}
371
372fn new_converter<C: ConfigType>(content: &[u8]) -> StringOffsets<C> {
373    let n = content.len();
374    let mut utf8_builder =
375        BitRankBuilder::with_capacity(if C::HasChars::VALUE { n + 1 } else { 0 });
376    let mut utf16_builder = BitRankBuilder::with_capacity(if C::HasUtf16::VALUE { n } else { 0 });
377    let mut line_builder = BitRankBuilder::with_capacity(if C::HasLines::VALUE { n } else { 0 });
378    let mut line_begins = vec![0];
379    let mut whitespace_only = vec![];
380    let mut only_whitespaces = true; // true if all characters in the current line are whitespaces.
381    for (i, &c) in content.iter().enumerate() {
382        // Note: We expect here proper utf8 encoded strings! Otherwise, the conversion will have undefined behaviour.
383        if C::HasChars::VALUE && is_char_boundary(c) {
384            utf8_builder.push(i);
385        }
386        if C::HasUtf16::VALUE && two_utf16(c) {
387            utf16_builder.push(i);
388        }
389        if c == b'\n' {
390            if C::HasWhitespace::VALUE {
391                whitespace_only.push(only_whitespaces);
392                only_whitespaces = true; // reset for next line.
393            }
394            if C::HasLines::VALUE {
395                line_begins.push(i as u32 + 1);
396                line_builder.push(i);
397            }
398        } else if C::HasWhitespace::VALUE {
399            only_whitespaces = only_whitespaces && matches!(c, b'\t' | b'\r' | b' ');
400        }
401    }
402    if C::HasChars::VALUE {
403        utf8_builder.push(n);
404    }
405    if line_begins.last() != Some(&(n as u32)) {
406        if C::HasWhitespace::VALUE {
407            whitespace_only.push(only_whitespaces);
408        }
409        if C::HasLines::VALUE {
410            line_begins.push(n as u32);
411            line_builder.push(n - 1);
412        }
413    }
414
415    StringOffsets {
416        line_begins,
417        utf8_to_line: line_builder.finish(),
418        whitespace_only,
419        utf8_to_char: utf8_builder.finish(),
420        utf8_to_utf16: utf16_builder.finish(),
421        _config: PhantomData,
422    }
423}
424
/// Returns true if, in a UTF-8 string, `b` indicates the first byte of a character.
///
/// Continuation bytes have the bit pattern `10xx_xxxx`; every other byte value counts as the
/// start of a character.
fn is_char_boundary(b: u8) -> bool {
    (b & 0b1100_0000) != 0b1000_0000
}
429
/// Returns true if the four high bits of `c` are all set (`0xF0..=0xFF`). In UTF-8 such a
/// byte leads a four-byte sequence, i.e. a character that takes two UTF-16 code units.
fn two_utf16(c: u8) -> bool {
    c >> 4 == 0b1111
}
433
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building an expected [`Pos`].
    fn pos(line: usize, col: usize) -> Pos {
        Pos { line, col }
    }

    /// Returns the number of bytes a UTF-8 char occupies, given the first byte of the UTF-8 encoding.
    /// Returns 0 if the byte is not a valid first byte of a UTF-8 char.
    fn utf8_width(c: u8) -> usize {
        // Lookup table packed into a u64: nibble `k` holds the UTF-8 length for a first byte
        // whose high four bits equal `k`.
        const UTF8_WIDTH: u64 = 0x4322_0000_1111_1111;
        let shift = (c >> 4) * 4;
        ((UTF8_WIDTH >> shift) & 0xf) as usize
    }

    /// Number of UTF-16 code units needed for the char whose UTF-8 encoding starts at
    /// `content[0]`, or 0 if that byte cannot start a char.
    fn utf8_to_utf16_width(content: &[u8]) -> usize {
        match utf8_width(content[0]) {
            0 => 0,
            1..=3 => 1,
            4 => 2,
            len => panic!("invalid utf8 char width: {}", len),
        }
    }

    #[test]
    fn test_utf8_char_width() {
        // Only the first byte of the buffer is inspected, and every encoding overwrites it.
        let mut buf = [0u8; 4];
        for c in '\0'..=char::MAX {
            let len = c.encode_utf8(&mut buf).len();
            assert_eq!(len, utf8_width(buf[0]), "char: {:?} {len}", buf[0] >> 4);
        }

        for b in 0..=255u8 {
            let width = utf8_width(b);
            if is_char_boundary(b) {
                assert!(width > 0, "char: {:?}", b >> 4);
            } else {
                assert_eq!(width, 0, "char: {:?}", b >> 4);
            }
        }
    }

    #[test]
    fn test_utf8_to_utf16_len() {
        let mut buf = [0u8; 4];
        for c in '\0'..=char::MAX {
            let _ = c.encode_utf8(&mut buf);
            assert_eq!(utf8_to_utf16_width(&buf), c.len_utf16());
        }

        for b in (0..=255u8).filter(|&b| !is_char_boundary(b)) {
            assert_eq!(utf8_to_utf16_width(&[b]), 0);
        }
    }

    #[test]
    fn test_line_map() {
        let content = "a short line.\nfollowed by another one.\nno terminating newline!";
        let lines: StringOffsets = StringOffsets::new(content);
        assert_eq!(content.len(), 62);

        // Byte range of every line, and the text it selects.
        let expected_lines = [
            (0..14, "a short line.\n"),
            (14..39, "followed by another one.\n"),
            (39..62, "no terminating newline!"),
        ];
        for (line_number, (range, text)) in expected_lines.into_iter().enumerate() {
            assert_eq!(lines.line_to_utf8s(line_number), range);
            assert_eq!(&content[range], text);
        }

        // Byte offset -> line number. Offsets 62 and 100 are beyond the content.
        for (byte, line) in [
            (0, 0),
            (13, 0),
            (14, 1),
            (38, 1),
            (39, 2),
            (61, 2),
            (62, 3),
            (100, 3),
        ] {
            assert_eq!(lines.utf8_to_line(byte), line);
        }

        // Pure ASCII content: byte and char offsets coincide.
        assert_eq!(lines.utf8s_to_chars(4..10), 4..10);
        assert_eq!(lines.chars_to_utf8s(4..10), 4..10);

        assert_eq!(lines.lines_to_utf8s(2..3), 39..62);
        assert_eq!(lines.lines_to_utf8s(2..4), 39..62);
        assert_eq!(lines.lines_to_chars(2..4), 39..62);
        assert_eq!(lines.utf8s_to_lines(39..62), 2..3);
        // The "invalid" utf8 position results in a valid line position.
        assert_eq!(lines.utf8s_to_lines(39..63), 2..3);
        assert_eq!(lines.char_to_utf8(62), 62);
        // char 63 doesn't exist, so we map to the closest valid utf8 position.
        assert_eq!(lines.char_to_utf8(63), 62);

        // Empty ranges
        for (at, expected) in [
            (0, 0..1),
            (13, 0..1),
            (14, 1..2),
            (38, 1..2),
            (39, 2..3),
            (61, 2..3),
            (62, 3..3),
            (63, 3..3),
        ] {
            assert_eq!(lines.utf8s_to_lines(at..at), expected);
        }
    }

    #[test]
    fn test_convert_ascii() {
        let content = "line0\nline1";
        let lines: StringOffsets = StringOffsets::new(content);
        for (byte, expected) in [(0, pos(0, 0)), (1, pos(0, 1)), (6, pos(1, 0)), (7, pos(1, 1))] {
            assert_eq!(lines.utf8_to_char_pos(byte), expected);
        }
    }

    #[test]
    fn test_convert_unicode() {
        // U+2764 U+FE0F (red heart emoji): 2 code points, 6 bytes in utf8, 2 code units in utf16.
        // U+2705 (check mark emoji): 1 code point, 3 bytes in utf8.
        let content = "\u{2764}\u{FE0F} line0\nline1\n\u{2705} line2";
        let lines: StringOffsets = StringOffsets::new(content);

        for (byte, expected) in [
            // The heart occupies bytes 0..6.
            (0, pos(0, 0)),
            (1, pos(0, 0)),
            (2, pos(0, 0)),
            (3, pos(0, 1)),
            (4, pos(0, 1)),
            (5, pos(0, 1)),
            // <space>, then the `l` of "line0".
            (6, pos(0, 2)),
            (7, pos(0, 3)),
            // The `l` of "line1".
            (13, pos(1, 0)),
            // The check mark occupies bytes 19..22.
            (19, pos(2, 0)),
            (20, pos(2, 0)),
            (21, pos(2, 0)),
            // <space>
            (22, pos(2, 1)),
        ] {
            assert_eq!(lines.utf8_to_char_pos(byte), expected);
        }

        // The heart takes 4 bytes to represent in utf16 (2 code points).
        for (byte, expected) in [(0, pos(0, 0)), (1, pos(0, 0)), (2, pos(0, 0)), (3, pos(0, 1))] {
            assert_eq!(lines.utf8_to_utf16_pos(byte), expected);
        }
    }

    #[test]
    fn test_small() {
        // U+00C1 (A with acute) takes 2 bytes in utf8; U+1F44B (waving hand) takes 4.
        let content = "\u{2764}\u{FE0F} line0 \u{2764}\u{FE0F}\u{C1} \u{1F44B}";
        let lines: StringOffsets = StringOffsets::new(content);
        let (mut chars_seen, mut utf16_seen) = (0, 0);
        for (byte_index, ch) in content.char_indices() {
            assert_eq!(lines.utf8_to_char(byte_index), chars_seen);
            assert_eq!(lines.utf8_to_utf16(byte_index), utf16_seen);
            chars_seen += 1;
            utf16_seen += ch.len_utf16();
        }
        assert_eq!(lines.utf8_to_char(content.len()), chars_seen);
        assert_eq!(lines.utf8_to_utf16(content.len()), utf16_seen);
    }

    #[test]
    fn test_variable_lengths() {
        // Byte layout of the content:
        //   0..6   U+2764 U+FE0F  utf8: 2 chars, 3 bytes ea., utf16: 2 code units
        //   6..8   U+00C1         utf8: 1 char, 2 bytes, utf16: 1 code unit
        //   8..9   <space>        utf8: 1 char, 1 byte, utf16: 1 code unit
        //   9..13  U+1F44B        utf8: 1 char, 4 bytes, utf16: 2 code units
        let content = "\u{2764}\u{FE0F}\u{C1} \u{1F44B}";
        let lines: StringOffsets = StringOffsets::new(content);

        // UTF-16 positions
        //
        // Bytes 10 and 11 are deliberately left out: they sit in the middle of the waving hand
        // (utf8: 0xf0 0x9f 0x91 0x8b, utf16: 0xd83d 0xdc4b). It's not really defined where
        // these inner bytes map to, and it doesn't matter because we would never report those
        // byte offsets: they are in the middle of a character and therefore invalid.
        for (byte, expected) in [
            (0, pos(0, 0)),
            (1, pos(0, 0)),
            (2, pos(0, 0)),
            (3, pos(0, 1)),
            (5, pos(0, 1)),
            (4, pos(0, 1)),
            (6, pos(0, 2)),
            (7, pos(0, 2)),
            (8, pos(0, 3)),
            (9, pos(0, 4)),
            (12, pos(0, 5)),
        ] {
            assert_eq!(lines.utf8_to_utf16_pos(byte), expected);
        }

        // Char positions
        for (byte, expected) in [
            (0, pos(0, 0)),
            (1, pos(0, 0)),
            (2, pos(0, 0)),
            (3, pos(0, 1)),
            (4, pos(0, 1)),
            (5, pos(0, 1)),
            (6, pos(0, 2)),
            (7, pos(0, 2)),
            (8, pos(0, 3)),
            (9, pos(0, 4)),
            (10, pos(0, 4)),
            (11, pos(0, 4)),
            (12, pos(0, 4)),
        ] {
            assert_eq!(lines.utf8_to_char_pos(byte), expected);
        }
    }

    #[test]
    fn test_critical_input_len() {
        let content = [b'a'; 16384];
        let lines: StringOffsets = StringOffsets::from_bytes(&content);
        assert_eq!(lines.utf8_to_utf16_pos(16384), pos(1, 0));
    }
}
string_offsets/lib.rs

string_offsets/
lib.rs