#![deny(missing_docs)]
use std::{marker::PhantomData, ops::Range};
#[cfg(feature = "wasm")]
use wasm_bindgen::prelude::*;
mod bitrank;
mod config;
#[cfg(feature = "wasm")]
mod wasm;
use bitrank::{BitRank, BitRankBuilder};
use config::{Bool, ConfigType, True};
pub use config::{AllConfig, OnlyLines};
/// Converts between UTF-8 byte offsets, Unicode character indices, UTF-16 code unit offsets
/// and line numbers of a piece of text.
///
/// The configuration type `C` selects which of the lookup tables are filled in by the
/// constructors and therefore which conversion methods are available.
pub struct StringOffsets<C: ConfigType = AllConfig> {
    /// Byte offset at which every line begins. The first entry is always `0`; when line
    /// information is enabled the final entry equals the byte length of the content.
    line_begins: Vec<u32>,

    /// Bit set with one bit per line terminator (and one on the last byte of an unterminated
    /// final line). The rank of a byte position is the number of the line containing it.
    utf8_to_line: BitRank,

    /// Bit set with one bit on every byte that starts a UTF-8 encoded character, plus one bit
    /// at the position just past the end of the content.
    utf8_to_char: BitRank,

    /// Bit set with one bit on every lead byte of a four byte UTF-8 sequence, i.e. on every
    /// character that needs two UTF-16 code units.
    utf8_to_utf16: BitRank,

    /// One flag per line telling whether the line holds nothing but tabs, carriage returns
    /// and spaces.
    whitespace_only: Vec<bool>,

    /// Ties the value to its configuration without storing any data.
    _config: PhantomData<C>,
}
/// A zero-based position inside a text, expressed as a line and a column within that line.
///
/// The unit of `col` depends on the method that produced the value: Unicode characters for
/// `utf8_to_char_pos`, UTF-16 code units for `utf8_to_utf16_pos`.
#[cfg_attr(feature = "wasm", wasm_bindgen)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Pos {
    /// Zero-based line number.
    pub line: usize,
    /// Zero-based column, counted from the beginning of `line`.
    pub col: usize,
}
impl<C: ConfigType> StringOffsets<C> {
    /// Builds the offset tables for the given string.
    pub fn new(content: &str) -> Self {
        Self::from_bytes(content.as_bytes())
    }

    /// Builds the offset tables for the given bytes.
    ///
    /// The bytes are interpreted as UTF-8, but no validation is performed here.
    pub fn from_bytes(content: &[u8]) -> Self {
        new_converter::<C>(content)
    }
}
impl<C: ConfigType<HasLines = True>> StringOffsets<C> {
    /// Returns the number of bytes in the content.
    ///
    /// This is the last entry of `line_begins`, which marks the end of the final line.
    pub fn len(&self) -> usize {
        self.line_begins.last().copied().unwrap_or(0) as usize
    }

    /// Returns `true` when the content holds no bytes at all.
    pub fn is_empty(&self) -> bool {
        // `line_begins` is seeded with the offset `0` and therefore never empty itself;
        // asking the vector would answer `false` for every input, so compare the length.
        self.len() == 0
    }

    /// Returns the number of lines. A final line without a terminating newline counts as a
    /// line; empty content has zero lines.
    pub fn lines(&self) -> usize {
        self.line_begins.len() - 1
    }

    /// Returns the byte offset of the first byte of the given line.
    ///
    /// Line numbers past the last line are clamped, which yields the content length.
    pub fn line_to_utf8_begin(&self, line_number: usize) -> usize {
        self.line_begins[line_number.min(self.lines())] as usize
    }

    /// Returns the byte offset just past the given line, including its newline if present.
    pub fn line_to_utf8_end(&self, line_number: usize) -> usize {
        // Saturate so that `usize::MAX` is clamped like any other out-of-range line instead
        // of overflowing.
        self.line_to_utf8_begin(line_number.saturating_add(1))
    }

    /// Returns the zero-based number of the line that contains the given byte offset.
    pub fn utf8_to_line(&self, byte_number: usize) -> usize {
        self.utf8_to_line.rank(byte_number)
    }

    /// Returns the range of lines touched by the given byte range.
    ///
    /// An empty byte range still reports the line on which it is located, unless it lies at
    /// or past the end of the content.
    pub fn utf8s_to_lines(&self, bytes: Range<usize>) -> Range<usize> {
        let first_line = self.utf8_to_line(bytes.start);
        // Look at the last byte inside the range, but never before its start, so that empty
        // ranges are handled too.
        let last_byte = bytes.end.saturating_sub(1).max(bytes.start);
        // Lines are zero-based, hence the `+ 1` for the exclusive end; clamp to the line count.
        let end_line = self.lines().min(self.utf8_to_line(last_byte) + 1);
        first_line..end_line
    }

    /// Returns the byte range covered by the given line, including its newline if present.
    pub fn line_to_utf8s(&self, line_number: usize) -> Range<usize> {
        self.line_to_utf8_begin(line_number)..self.line_to_utf8_end(line_number)
    }

    /// Returns the byte range covered by the given range of lines.
    pub fn lines_to_utf8s(&self, line_numbers: Range<usize>) -> Range<usize> {
        self.line_to_utf8_begin(line_numbers.start)..self.line_to_utf8_begin(line_numbers.end)
    }
}
impl<C: ConfigType<HasChars = True, HasLines = True>> StringOffsets<C> {
    /// Returns how many Unicode characters the given line holds, newline included.
    pub fn line_chars(&self, line_number: usize) -> usize {
        let Range { start, end } = self.line_to_chars(line_number);
        end - start
    }

    /// Returns the character index of the first character of the given line.
    pub fn line_to_char_begin(&self, line_number: usize) -> usize {
        let first_byte = self.line_to_utf8_begin(line_number);
        self.utf8_to_char(first_byte)
    }

    /// Returns the character index just past the given line.
    pub fn line_to_char_end(&self, line_number: usize) -> usize {
        let end_byte = self.line_to_utf8_end(line_number);
        self.utf8_to_char(end_byte)
    }

    /// Returns the range of character indices covered by the given line.
    pub fn line_to_chars(&self, line_number: usize) -> Range<usize> {
        let bytes = self.line_to_utf8s(line_number);
        self.utf8s_to_chars(bytes)
    }

    /// Returns the range of character indices covered by the given range of lines.
    pub fn lines_to_chars(&self, line_numbers: Range<usize>) -> Range<usize> {
        let bytes = self.lines_to_utf8s(line_numbers);
        self.utf8s_to_chars(bytes)
    }

    /// Converts a byte offset into a zero-based line and a column counted in characters.
    pub fn utf8_to_char_pos(&self, byte_number: usize) -> Pos {
        let line = self.utf8_to_line(byte_number);
        // The column is the distance, in characters, from the start of the line.
        let col = self.utf8_to_char(byte_number) - self.line_to_char_begin(line);
        Pos { line, col }
    }

    /// Returns the range of lines touched by the given range of character indices.
    pub fn chars_to_lines(&self, chars: Range<usize>) -> Range<usize> {
        let bytes = self.chars_to_utf8s(chars);
        self.utf8s_to_lines(bytes)
    }
}
impl<C: ConfigType<HasWhitespace = True>> StringOffsets<C> {
    /// Returns whether the given line consists of whitespace only.
    ///
    /// Lines beyond the end of the content are reported as whitespace-only.
    pub fn only_whitespaces(&self, line_number: usize) -> bool {
        match self.whitespace_only.get(line_number) {
            Some(&blank) => blank,
            None => true,
        }
    }
}
impl<C: ConfigType<HasChars = True>> StringOffsets<C> {
    /// Returns the index of the Unicode character that contains the given byte offset.
    ///
    /// Offsets at or past the end of the content yield the total number of characters.
    pub fn utf8_to_char(&self, byte_number: usize) -> usize {
        // Counting the character starts up to and including `byte_number` gives the one-based
        // character number. Saturate so that `usize::MAX` is clamped rather than overflowing.
        self.utf8_to_char.rank(byte_number.saturating_add(1)) - 1
    }

    /// Returns the byte offset at which the character with the given index starts.
    ///
    /// Character indices past the last character are clamped to the end of the content.
    ///
    /// # Panics
    ///
    /// Panics if the search fails to converge before reaching the end of the content.
    pub fn char_to_utf8(&self, char_number: usize) -> usize {
        let content_end = self
            .line_begins
            .last()
            .copied()
            .expect("last entry represents the length of the file!") as usize;
        // Reject out-of-range indices before searching. The search below can never succeed
        // for them, and with very large indices its arithmetic would overflow.
        if char_number >= self.utf8_to_char.max_rank() {
            return content_end;
        }
        // Every character occupies at least one byte, so the character index is a lower
        // bound for the byte offset. Each step advances by the number of characters that
        // are still missing.
        let mut byte_number = char_number;
        for _ in 0..128 {
            let char_number2 = self.utf8_to_char(byte_number);
            if char_number2 == char_number {
                return byte_number;
            }
            byte_number += char_number - char_number2;
        }
        // Not expected to be reached; keep searching, but guard against running forever.
        loop {
            let char_number2 = self.utf8_to_char(byte_number);
            if char_number2 == char_number {
                return byte_number;
            }
            byte_number += char_number - char_number2;
            assert!(byte_number < content_end);
        }
    }

    /// Converts a range of byte offsets into the corresponding range of character indices.
    pub fn utf8s_to_chars(&self, bytes: Range<usize>) -> Range<usize> {
        self.utf8_to_char(bytes.start)..self.utf8_to_char(bytes.end)
    }

    /// Converts a range of character indices into the corresponding range of byte offsets.
    pub fn chars_to_utf8s(&self, chars: Range<usize>) -> Range<usize> {
        self.char_to_utf8(chars.start)..self.char_to_utf8(chars.end)
    }
}
impl<C: ConfigType<HasChars = True, HasUtf16 = True>> StringOffsets<C> {
    /// Converts a byte offset into an offset counted in UTF-16 code units.
    pub fn utf8_to_utf16(&self, byte_number: usize) -> usize {
        // Every character contributes one code unit ...
        let chars_before = self.utf8_to_char(byte_number);
        // ... and every character flagged as needing a second code unit contributes one more.
        let extra_units_before = self.utf8_to_utf16.rank(byte_number);
        chars_before + extra_units_before
    }
}
impl<C: ConfigType<HasChars = True, HasLines = True, HasUtf16 = True>> StringOffsets<C> {
    /// Returns the UTF-16 code unit offset at which the given line begins.
    pub fn line_to_utf16_begin(&self, line_number: usize) -> usize {
        let first_byte = self.line_to_utf8_begin(line_number);
        self.utf8_to_utf16(first_byte)
    }

    /// Returns the UTF-16 code unit offset just past the given line.
    pub fn line_to_utf16_end(&self, line_number: usize) -> usize {
        let end_byte = self.line_to_utf8_end(line_number);
        self.utf8_to_utf16(end_byte)
    }

    /// Converts a byte offset into a zero-based line and a column counted in UTF-16 code
    /// units.
    pub fn utf8_to_utf16_pos(&self, byte_number: usize) -> Pos {
        let line = self.utf8_to_line(byte_number);
        // The column is the distance, in code units, from the start of the line.
        let col = self.utf8_to_utf16(byte_number) - self.line_to_utf16_begin(line);
        Pos { line, col }
    }
}
/// Scans `content` once and fills in every lookup table that the configuration `C` enables.
///
/// The bytes are expected to be UTF-8; nothing is validated here, each byte is classified
/// on its own by `is_char_boundary` and `two_utf16`.
fn new_converter<C: ConfigType>(content: &[u8]) -> StringOffsets<C> {
    let want_chars = C::HasChars::VALUE;
    let want_utf16 = C::HasUtf16::VALUE;
    let want_lines = C::HasLines::VALUE;
    let want_whitespace = C::HasWhitespace::VALUE;

    let total = content.len();
    // Tables that are switched off are created with zero capacity and stay empty.
    let mut char_bits = BitRankBuilder::with_capacity(if want_chars { total + 1 } else { 0 });
    let mut utf16_bits = BitRankBuilder::with_capacity(if want_utf16 { total } else { 0 });
    let mut line_bits = BitRankBuilder::with_capacity(if want_lines { total } else { 0 });
    let mut line_begins = vec![0];
    let mut whitespace_only = Vec::new();
    // Stays `true` while the current line has shown nothing but whitespace.
    let mut blank_so_far = true;

    for (offset, &byte) in content.iter().enumerate() {
        if want_chars && is_char_boundary(byte) {
            char_bits.push(offset);
        }
        if want_utf16 && two_utf16(byte) {
            utf16_bits.push(offset);
        }
        if byte != b'\n' {
            if want_whitespace {
                blank_so_far &= matches!(byte, b'\t' | b'\r' | b' ');
            }
            continue;
        }
        // A newline closes the current line.
        if want_whitespace {
            whitespace_only.push(blank_so_far);
            blank_so_far = true;
        }
        if want_lines {
            line_begins.push(offset as u32 + 1);
            line_bits.push(offset);
        }
    }

    // Mark the position just past the content so that it maps to the total character count.
    if want_chars {
        char_bits.push(total);
    }

    // Close a final line that is not terminated by a newline.
    let content_end = total as u32;
    if line_begins.last().copied() != Some(content_end) {
        if want_whitespace {
            whitespace_only.push(blank_so_far);
        }
        if want_lines {
            line_begins.push(content_end);
            line_bits.push(total - 1);
        }
    }

    StringOffsets {
        line_begins,
        utf8_to_line: line_bits.finish(),
        whitespace_only,
        utf8_to_char: char_bits.finish(),
        utf8_to_utf16: utf16_bits.finish(),
        _config: PhantomData,
    }
}
/// Returns `true` unless `b` is a UTF-8 continuation byte (bit pattern `10xx_xxxx`), i.e.
/// when `b` can be the first byte of an encoded character.
fn is_char_boundary(b: u8) -> bool {
    (b & 0b1100_0000) != 0b1000_0000
}
/// Returns `true` when the four most significant bits of `c` are all set, which is the
/// pattern of the lead byte of a four byte UTF-8 sequence. Such characters take two UTF-16
/// code units.
fn two_utf16(c: u8) -> bool {
    c >= 0xF0
}
// Unit tests for the offset conversions and the byte classification helpers.
#[cfg(test)]
mod tests {
use super::*;
// Returns the length in bytes of the UTF-8 sequence introduced by `c`, or 0 when `c` is a
// continuation byte.
fn utf8_width(c: u8) -> usize {
// Lookup table with one 4-bit entry per value of the high nibble of the byte:
// 0x0..=0x7 -> 1, 0x8..=0xB -> 0, 0xC..=0xD -> 2, 0xE -> 3, 0xF -> 4.
const UTF8_WIDTH: u64 = 0x4322_0000_1111_1111;
((UTF8_WIDTH >> ((c >> 4) * 4)) & 0xf) as usize
}
// Returns how many UTF-16 code units the character introduced by `content[0]` needs, or 0
// when that byte is a continuation byte.
fn utf8_to_utf16_width(content: &[u8]) -> usize {
let len = utf8_width(content[0]);
match len {
0 => 0,
1..=3 => 1,
4 => 2,
_ => panic!("invalid utf8 char width: {}", len),
}
}
// Checks `utf8_width` against the standard library for every char and against
// `is_char_boundary` for every byte value.
#[test]
fn test_utf8_char_width() {
for c in '\0'..=char::MAX {
let mut dst = [0; 4];
let len = c.encode_utf8(&mut dst).len();
assert_eq!(len, utf8_width(dst[0]), "char: {:?} {len}", dst[0] >> 4);
}
for b in 0..=255u8 {
if !is_char_boundary(b) {
assert_eq!(utf8_width(b), 0, "char: {:?}", b >> 4);
} else {
assert!(utf8_width(b) > 0, "char: {:?}", b >> 4);
}
}
}
// Checks `utf8_to_utf16_width` against `char::len_utf16` for every char.
#[test]
fn test_utf8_to_utf16_len() {
for c in '\0'..=char::MAX {
let mut dst = [0; 4];
let _len = c.encode_utf8(&mut dst).len();
assert_eq!(utf8_to_utf16_width(&dst), c.len_utf16());
}
for b in 0..=255u8 {
if !is_char_boundary(b) {
assert_eq!(utf8_to_utf16_width(&[b]), 0);
}
}
}
// Line/byte/char conversions on ASCII text whose last line has no terminating newline.
#[test]
fn test_line_map() {
let content = r#"a short line.
followed by another one.
no terminating newline!"#;
let lines: StringOffsets = StringOffsets::new(content);
assert_eq!(lines.line_to_utf8s(0), 0..14);
assert_eq!(&content[0..14], "a short line.\n");
assert_eq!(lines.line_to_utf8s(1), 14..39);
assert_eq!(&content[14..39], "followed by another one.\n");
assert_eq!(lines.line_to_utf8s(2), 39..62);
assert_eq!(&content[39..62], "no terminating newline!");
assert_eq!(lines.utf8_to_line(0), 0);
assert_eq!(lines.utf8_to_line(13), 0);
assert_eq!(lines.utf8_to_line(14), 1);
assert_eq!(lines.utf8_to_line(38), 1);
assert_eq!(lines.utf8_to_line(39), 2);
assert_eq!(lines.utf8_to_line(61), 2);
// Offsets at or past the end of the content map to the line count.
assert_eq!(lines.utf8_to_line(62), 3); assert_eq!(lines.utf8_to_line(100), 3);
assert_eq!(lines.utf8s_to_chars(4..10), 4..10);
assert_eq!(lines.chars_to_utf8s(4..10), 4..10);
assert_eq!(content.len(), 62);
assert_eq!(lines.lines_to_utf8s(2..3), 39..62);
assert_eq!(lines.lines_to_utf8s(2..4), 39..62);
assert_eq!(lines.lines_to_chars(2..4), 39..62);
assert_eq!(lines.utf8s_to_lines(39..62), 2..3);
// Byte ranges and char indices reaching past the end are clamped.
assert_eq!(lines.utf8s_to_lines(39..63), 2..3); assert_eq!(lines.char_to_utf8(62), 62);
assert_eq!(lines.char_to_utf8(63), 62);
// Empty byte ranges still report the line they are located on.
assert_eq!(lines.utf8s_to_lines(0..0), 0..1);
assert_eq!(lines.utf8s_to_lines(13..13), 0..1);
assert_eq!(lines.utf8s_to_lines(14..14), 1..2);
assert_eq!(lines.utf8s_to_lines(38..38), 1..2);
assert_eq!(lines.utf8s_to_lines(39..39), 2..3);
assert_eq!(lines.utf8s_to_lines(61..61), 2..3);
assert_eq!(lines.utf8s_to_lines(62..62), 3..3);
assert_eq!(lines.utf8s_to_lines(63..63), 3..3);
}
// Shorthand for building the expected `Pos` values.
fn pos(line: usize, col: usize) -> Pos {
Pos { line, col }
}
// Byte offset to line/column conversion on ASCII text.
#[test]
fn test_convert_ascii() {
let content = r#"line0
line1"#;
let lines: StringOffsets = StringOffsets::new(content);
assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0));
assert_eq!(lines.utf8_to_char_pos(1), pos(0, 1));
assert_eq!(lines.utf8_to_char_pos(6), pos(1, 0));
assert_eq!(lines.utf8_to_char_pos(7), pos(1, 1));
}
// Byte offset to line/column conversion on text containing multi-byte characters.
#[test]
fn test_convert_unicode() {
let content = r#"❤️ line0
line1
✅ line2"#;
let lines: StringOffsets = StringOffsets::new(content);
// Offsets inside a multi-byte character map to the column of that character.
assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0)); assert_eq!(lines.utf8_to_char_pos(1), pos(0, 0));
assert_eq!(lines.utf8_to_char_pos(2), pos(0, 0));
assert_eq!(lines.utf8_to_char_pos(3), pos(0, 1));
assert_eq!(lines.utf8_to_char_pos(4), pos(0, 1));
assert_eq!(lines.utf8_to_char_pos(5), pos(0, 1));
assert_eq!(lines.utf8_to_char_pos(6), pos(0, 2)); assert_eq!(lines.utf8_to_char_pos(7), pos(0, 3));
assert_eq!(lines.utf8_to_char_pos(13), pos(1, 0));
assert_eq!(lines.utf8_to_char_pos(19), pos(2, 0)); assert_eq!(lines.utf8_to_char_pos(20), pos(2, 0));
assert_eq!(lines.utf8_to_char_pos(21), pos(2, 0));
assert_eq!(lines.utf8_to_char_pos(22), pos(2, 1));
assert_eq!(lines.utf8_to_utf16_pos(0), pos(0, 0)); assert_eq!(lines.utf8_to_utf16_pos(1), pos(0, 0));
assert_eq!(lines.utf8_to_utf16_pos(2), pos(0, 0));
assert_eq!(lines.utf8_to_utf16_pos(3), pos(0, 1));
}
// Compares char and UTF-16 offsets with the values derived from `str::char_indices`.
#[test]
fn test_small() {
let content = r#"❤️ line0 ❤️Á 👋"#;
let lines: StringOffsets = StringOffsets::new(content);
let mut utf16_index = 0;
let mut char_index = 0;
for (byte_index, char) in content.char_indices() {
assert_eq!(lines.utf8_to_char(byte_index), char_index);
assert_eq!(lines.utf8_to_utf16(byte_index), utf16_index);
char_index += 1;
utf16_index += char.len_utf16();
}
assert_eq!(lines.utf8_to_char(content.len()), char_index);
assert_eq!(lines.utf8_to_utf16(content.len()), utf16_index);
}
// Mixes characters of different encoded lengths on a single line; the last character needs
// two UTF-16 code units.
#[test]
fn test_variable_lengths() {
let content = r#"❤️Á 👋"#;
let lines: StringOffsets = StringOffsets::new(content);
assert_eq!(lines.utf8_to_utf16_pos(0), pos(0, 0)); assert_eq!(lines.utf8_to_utf16_pos(1), pos(0, 0));
assert_eq!(lines.utf8_to_utf16_pos(2), pos(0, 0));
assert_eq!(lines.utf8_to_utf16_pos(3), pos(0, 1));
assert_eq!(lines.utf8_to_utf16_pos(5), pos(0, 1));
assert_eq!(lines.utf8_to_utf16_pos(4), pos(0, 1));
assert_eq!(lines.utf8_to_utf16_pos(6), pos(0, 2)); assert_eq!(lines.utf8_to_utf16_pos(7), pos(0, 2));
assert_eq!(lines.utf8_to_utf16_pos(8), pos(0, 3)); assert_eq!(lines.utf8_to_utf16_pos(9), pos(0, 4));
assert_eq!(lines.utf8_to_utf16_pos(12), pos(0, 5));
assert_eq!(lines.utf8_to_char_pos(0), pos(0, 0)); assert_eq!(lines.utf8_to_char_pos(1), pos(0, 0));
assert_eq!(lines.utf8_to_char_pos(2), pos(0, 0));
assert_eq!(lines.utf8_to_char_pos(3), pos(0, 1));
assert_eq!(lines.utf8_to_char_pos(4), pos(0, 1));
assert_eq!(lines.utf8_to_char_pos(5), pos(0, 1));
assert_eq!(lines.utf8_to_char_pos(6), pos(0, 2)); assert_eq!(lines.utf8_to_char_pos(7), pos(0, 2));
assert_eq!(lines.utf8_to_char_pos(8), pos(0, 3)); assert_eq!(lines.utf8_to_char_pos(9), pos(0, 4)); assert_eq!(lines.utf8_to_char_pos(10), pos(0, 4));
assert_eq!(lines.utf8_to_char_pos(11), pos(0, 4));
assert_eq!(lines.utf8_to_char_pos(12), pos(0, 4));
}
// Queries the position just past the end of a 16384 byte input.
// NOTE(review): presumably this length coincides with an internal block size of `BitRank`;
// confirm against the bitrank module.
#[test]
fn test_critical_input_len() {
let content = [b'a'; 16384];
let lines: StringOffsets = StringOffsets::from_bytes(&content);
assert_eq!(lines.utf8_to_utf16_pos(16384), pos(1, 0));
}
}