1use crate::{
8 buffer::GapBuffer,
9 dot::Range,
10 regex::{Match, Regex},
11 syntax::{ByteRange, LineIter, SyntaxRange},
12};
13
/// Regex-driven syntax highlighting state: a set of tagged regexes plus the
/// byte ranges they matched within the most recently tokenized window.
#[derive(Debug)]
pub struct ReState {
    // Compiled regexes, searched together for the leftmost match
    mr: MultiRegex,
    // Tag names, index-aligned with the regexes in `mr` (SyntaxRange::cap_idx
    // indexes into this vec)
    names: Vec<String>,
    // Match ranges produced by the most recent call to `update`
    ranges: Vec<SyntaxRange>,
}
20
21impl ReState {
22 pub fn new(tagged_re: &[(impl AsRef<str>, impl AsRef<str>)]) -> Result<Self, String> {
23 let mut names = Vec::with_capacity(tagged_re.len());
24 let mut re = Vec::with_capacity(tagged_re.len());
25
26 for (name, re_str) in tagged_re.iter() {
27 names.push(name.as_ref().to_string());
28 re.push(
29 Regex::compile(re_str.as_ref())
30 .map_err(|err| format!("invalid regex ({:?}): {err:?}", re_str.as_ref()))?,
31 );
32 }
33
34 Ok(Self {
35 mr: MultiRegex { re },
36 names,
37 ranges: Vec::new(),
38 })
39 }
40
41 pub fn update(&mut self, gb: &GapBuffer, from: usize, n_rows: usize) {
42 self.ranges.clear();
43
44 let pos = gb.line_to_char(from);
45 let ch_to = if from + n_rows + 1 < gb.len_lines() {
46 gb.line_to_char(from + n_rows + 1)
47 } else {
48 gb.len_chars()
49 };
50
51 self.ranges.extend(Tokenizer {
52 mr: &mut self.mr,
53 gb,
54 pos,
55 ch_to,
56 });
57 }
58
59 #[inline]
60 pub fn iter_tokenized_lines_from<'a>(
61 &'a self,
62 line: usize,
63 gb: &'a GapBuffer,
64 dot_range: Range,
65 load_exec_range: Option<(bool, Range)>,
66 ) -> LineIter<'a> {
67 LineIter::new(
68 line,
69 gb,
70 dot_range,
71 load_exec_range,
72 &self.names,
73 &self.ranges,
74 )
75 }
76}
77
/// Iterator that scans a char range of a GapBuffer, yielding the leftmost
/// match across all regexes in `mr` as a `SyntaxRange` on each step.
#[derive(Debug)]
struct Tokenizer<'a> {
    mr: &'a mut MultiRegex,
    gb: &'a GapBuffer,
    pos: usize,   // current search position (in chars)
    ch_to: usize, // exclusive end of the region to tokenize (in chars)
}
85
86impl Iterator for Tokenizer<'_> {
87 type Item = SyntaxRange;
88
89 fn next(&mut self) -> Option<Self::Item> {
90 if self.pos >= self.ch_to {
91 return None;
92 }
93
94 let (i, m) = self.mr.match_gb_from(self.gb, self.pos)?;
95 let (mfrom, mto) = m.loc();
96 self.pos = mto;
97
98 let from = self.gb.char_to_byte(mfrom);
99 let to = self.gb.offset_char_to_byte(mto, from, mfrom);
100
101 Some(SyntaxRange {
102 cap_idx: Some(i),
103 r: ByteRange { from, to },
104 })
105 }
106}
107
/// A set of compiled regexes that are searched together for the single
/// leftmost match; on ties, the regex with the lower index wins.
#[derive(Debug, Clone)]
struct MultiRegex {
    re: Vec<Regex>,
}
114
115impl MultiRegex {
116 pub fn match_gb_from(&mut self, gb: &GapBuffer, ch_from: usize) -> Option<(usize, Match)> {
121 let mut leftmost: Option<(usize, Match)> = None;
122
123 for (i, re) in self.re.iter_mut().enumerate() {
124 let m = re.find_from(gb, ch_from);
125 match (leftmost.as_ref(), m) {
126 (Some((_, prev)), Some(m)) if &m < prev => leftmost = Some((i, m)),
127 (None, Some(m)) => leftmost = Some((i, m)),
128 _ => (),
129 }
130 }
131
132 leftmost
133 }
134}
135
#[cfg(test)]
mod tests {
    use super::*;

    const PLUMB_FILE: &str = "
# this is a comment
data matches foo
data from running some command
attr add action=showdata filename=/+synthetic
plumb to edit";

    #[test]
    fn tokenizing_works() {
        let gb = GapBuffer::from(PLUMB_FILE);
        let tagged_re = vec![
            ("comment", "#.*"),
            ("keyword", r"\b(data|attr|plumb|arg)\b"),
            (
                "function",
                r"\b(matches|narrows|from|add|to|start|isfile|isdir)\b",
            ),
        ];

        let mut state = ReState::new(&tagged_re).expect("valid regex patterns");
        state.update(&gb, 0, 6);

        // Collect (tag, text) pairs for every token on every line
        let mut tokens = Vec::new();
        for line in state.iter_tokenized_lines_from(0, &gb, Range::BOF, None) {
            for tok in line {
                let text = tok.as_slice(&gb).to_string();
                tokens.push((tok.tag.to_string(), text));
            }
        }

        let expected: Vec<(String, String)> = [
            ("dot", ""),
            ("comment", "# this is a comment"),
            ("keyword", "data"),
            ("default", " "),
            ("function", "matches"),
            ("default", " foo"),
            ("keyword", "data"),
            ("default", " "),
            ("function", "from"),
            ("default", " running some command"),
            ("keyword", "attr"),
            ("default", " "),
            ("function", "add"),
            ("default", " action=showdata filename=/+synthetic"),
            ("keyword", "plumb"),
            ("default", " "),
            ("function", "to"),
            ("default", " edit"),
        ]
        .iter()
        .map(|(tag, s)| (tag.to_string(), s.to_string()))
        .collect();

        assert_eq!(tokens, expected);
    }
}