1use std::str::from_utf8_unchecked;
2
3use memchr::memchr;
4
5use crate::TokenSource;
6
7pub fn lines(data: &str) -> Lines<'_> {
12 Lines(ByteLines(data.as_bytes()))
13}
14
15pub fn byte_lines(data: &[u8]) -> ByteLines<'_> {
20 ByteLines(data)
21}
22
23impl<'a> TokenSource for &'a str {
25 type Token = &'a str;
26
27 type Tokenizer = Lines<'a>;
28
29 fn tokenize(&self) -> Self::Tokenizer {
30 lines(self)
31 }
32
33 fn estimate_tokens(&self) -> u32 {
34 lines(self).estimate_tokens()
35 }
36}
37
38impl<'a> TokenSource for &'a [u8] {
40 type Token = Self;
41 type Tokenizer = ByteLines<'a>;
42
43 fn tokenize(&self) -> Self::Tokenizer {
44 byte_lines(self)
45 }
46
47 fn estimate_tokens(&self) -> u32 {
48 byte_lines(self).estimate_tokens()
49 }
50}
51
52#[derive(Clone, Copy, PartialEq, Eq)]
55pub struct Lines<'a>(ByteLines<'a>);
56
57impl<'a> Iterator for Lines<'a> {
58 type Item = &'a str;
59
60 fn next(&mut self) -> Option<Self::Item> {
61 self.0.next().map(|it| unsafe { from_utf8_unchecked(it) })
64 }
65}
66
67impl<'a> TokenSource for Lines<'a> {
69 type Token = &'a str;
70
71 type Tokenizer = Self;
72
73 fn tokenize(&self) -> Self::Tokenizer {
74 *self
75 }
76
77 fn estimate_tokens(&self) -> u32 {
78 self.0.estimate_tokens()
79 }
80}
81
/// Iterator over the lines of a byte slice.
///
/// The wrapped slice is the part of the input that has not been yielded yet.
/// The type is `Copy`, so a copy can be iterated without advancing the
/// original.
#[derive(Copy, Clone, Eq, PartialEq)]
pub struct ByteLines<'a>(&'a [u8]);
86
87impl<'a> Iterator for ByteLines<'a> {
88 type Item = &'a [u8];
89
90 fn next(&mut self) -> Option<Self::Item> {
91 if self.0.is_empty() {
92 return None;
93 }
94 let line_len = memchr(b'\n', self.0).map_or(self.0.len(), |len| len + 1);
95 let (line, rem) = self.0.split_at(line_len);
96 self.0 = rem;
97 Some(line)
98 }
99}
100
101impl<'a> TokenSource for ByteLines<'a> {
103 type Token = &'a [u8];
104
105 type Tokenizer = Self;
106
107 fn tokenize(&self) -> Self::Tokenizer {
108 *self
109 }
110
111 fn estimate_tokens(&self) -> u32 {
112 let len: usize = self.take(20).map(|line| line.len()).sum();
113 if len == 0 {
114 100
115 } else {
116 (self.0.len() * 20 / len) as u32
117 }
118 }
119}