use std::str::from_utf8_unchecked;
use memchr::memchr;
use crate::TokenSource;
pub fn lines(data: &str) -> Lines<'_> {
Lines(ByteLines(data.as_bytes()))
}
/// Creates a tokenizer that yields the lines of `data` as byte slices.
///
/// No copy is made; the returned [`ByteLines`] borrows `data` for its whole
/// lifetime and hands out sub-slices of it.
pub fn byte_lines(data: &[u8]) -> ByteLines<'_> {
    let unread = data;
    ByteLines(unread)
}
/// Treats a string slice as a token source whose tokens are its lines.
impl<'a> TokenSource for &'a str {
    type Token = &'a str;
    type Tokenizer = Lines<'a>;

    /// Returns a [`Lines`] iterator over the borrowed string.
    fn tokenize(&self) -> Self::Tokenizer {
        // `self` is `&&'a str`; copy the inner reference out so the tokenizer
        // borrows the text for the full `'a` lifetime.
        let text: &'a str = *self;
        lines(text)
    }

    /// Forwards to the estimate computed by the [`Lines`] tokenizer.
    fn estimate_tokens(&self) -> u32 {
        let tokenizer = lines(*self);
        tokenizer.estimate_tokens()
    }
}
/// Treats a byte slice as a token source whose tokens are its lines.
impl<'a> TokenSource for &'a [u8] {
    type Token = &'a [u8];
    type Tokenizer = ByteLines<'a>;

    /// Returns a [`ByteLines`] iterator over the borrowed bytes.
    fn tokenize(&self) -> Self::Tokenizer {
        // `self` is `&&'a [u8]`; copy the inner reference out so the tokenizer
        // borrows the data for the full `'a` lifetime.
        let bytes: &'a [u8] = *self;
        byte_lines(bytes)
    }

    /// Forwards to the estimate computed by the [`ByteLines`] tokenizer.
    fn estimate_tokens(&self) -> u32 {
        let tokenizer = byte_lines(*self);
        tokenizer.estimate_tokens()
    }
}
/// Iterator over the lines of a string, yielding each line as a `&str`.
///
/// This is a thin wrapper around [`ByteLines`]: the underlying byte iterator
/// does the splitting and every item is converted back to `&str`.
/// The wrapper is `Copy`, so it can be duplicated cheaply to restart
/// iteration from the same position.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct Lines<'a>(ByteLines<'a>);
impl<'a> Iterator for Lines<'a> {
    type Item = &'a str;

    /// Returns the next line as a `&str`, or `None` once the input is used up.
    fn next(&mut self) -> Option<Self::Item> {
        let raw_line = self.0.next()?;
        // SAFETY: the wrapped `ByteLines` is expected to have been built from
        // the bytes of a `&str` (the `lines` constructor in this file does
        // exactly that, and the field is private). `ByteLines` only ever cuts
        // directly after a `b'\n'` byte, which is a complete one-byte code
        // point in UTF-8 and never part of a multi-byte sequence, so every
        // piece it yields is itself valid UTF-8.
        Some(unsafe { from_utf8_unchecked(raw_line) })
    }
}
impl<'a> TokenSource for Lines<'a> {
type Token = &'a str;
type Tokenizer = Self;
fn tokenize(&self) -> Self::Tokenizer {
*self
}
fn estimate_tokens(&self) -> u32 {
self.0.estimate_tokens()
}
}
/// Iterator over the lines of a byte slice, yielding each line as `&[u8]`.
///
/// The wrapped slice is the part of the input that has not been yielded yet.
/// The type is `Copy`, so it can be duplicated cheaply to restart iteration
/// from the same position.
///
/// `Debug` is derived so the type can be used with `assert_eq!`, `dbg!` and
/// inside other `#[derive(Debug)]` types.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ByteLines<'a>(&'a [u8]);
impl<'a> Iterator for ByteLines<'a> {
    type Item = &'a [u8];

    /// Returns the next line, including its trailing `\n` if it has one.
    ///
    /// A final line without a terminating newline is returned as-is. Once no
    /// bytes remain, `None` is returned.
    fn next(&mut self) -> Option<Self::Item> {
        let remaining = self.0;
        if remaining.is_empty() {
            return None;
        }

        // The line ends one past the first newline so the terminator stays
        // attached; with no newline left, the rest of the input is the line.
        let end = match memchr(b'\n', remaining) {
            Some(newline_idx) => newline_idx + 1,
            None => remaining.len(),
        };

        let (head, tail) = remaining.split_at(end);
        self.0 = tail;
        Some(head)
    }
}
/// Lets an already constructed [`ByteLines`] iterator act as its own token
/// source.
impl<'a> TokenSource for ByteLines<'a> {
    type Token = &'a [u8];
    type Tokenizer = Self;

    /// Returns a copy of this iterator; the original is left untouched.
    fn tokenize(&self) -> Self::Tokenizer {
        *self
    }

    /// Estimates the number of remaining lines from a small sample.
    ///
    /// The byte length of up to the first 20 lines is measured and the
    /// estimate is `remaining_bytes * 20 / sampled_bytes`. If the sample
    /// contains no bytes, a fixed fallback of 100 is returned. Because the
    /// factor is always 20, an input with fewer than 20 lines is estimated
    /// as 20.
    ///
    /// The computation is done in `u64` with a saturating multiply and the
    /// result is clamped to `u32::MAX`, so very large inputs can neither
    /// overflow `usize` on 32-bit targets nor be silently truncated by the
    /// final conversion.
    fn estimate_tokens(&self) -> u32 {
        const SAMPLE_LINES: usize = 20;
        const FALLBACK_ESTIMATE: u32 = 100;

        // Iterate over a copy so that sampling does not advance `self`.
        let sampled_bytes: usize = (*self)
            .take(SAMPLE_LINES)
            .map(|line| line.len())
            .sum();
        if sampled_bytes == 0 {
            return FALLBACK_ESTIMATE;
        }

        let scaled = (self.0.len() as u64).saturating_mul(SAMPLE_LINES as u64);
        let estimate = scaled / sampled_bytes as u64;
        // Clamp first, so the narrowing cast below is lossless.
        estimate.min(u64::from(u32::MAX)) as u32
    }
}