ad_editor/syntax/re.rs

//! Simple regex-based highlighting by line.
//!
//! This should only be used as a fallback for simple highlighting of file formats that do not have
//! a tree-sitter grammar available. It will almost certainly produce incorrect results for complex
//! or overlapping regex sets, but it's better than nothing if all you need is some quick and
//! simple rules for highlighting regions of a known file format.
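//!
//! A rough sketch of intended usage (`gb`, `dot` and `viewport_rows` here are placeholders for
//! state owned by the caller):
//!
//! ```ignore
//! let mut state = ReState::new(&[("comment", "#.*"), ("keyword", r"\bfn\b")])
//!     .expect("valid regex patterns");
//! state.update(&gb, 0, viewport_rows);
//! for line in state.iter_tokenized_lines_from(0, &gb, dot, None) {
//!     // render each token in the line using its tag
//! }
//! ```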
use crate::{
    buffer::GapBuffer,
    dot::Range,
    regex::{Match, Regex},
    syntax::{ByteRange, LineIter, SyntaxRange},
};

#[derive(Debug)]
pub struct ReState {
    /// Compiled patterns to match against the buffer
    mr: MultiRegex,
    /// The tag name associated with each pattern in `mr`
    names: Vec<String>,
    /// Ranges matched by the most recent call to [ReState::update]
    ranges: Vec<SyntaxRange>,
}

impl ReState {
    /// Construct highlighting state from a list of (tag name, regex) pairs.
    pub fn new(tagged_re: &[(impl AsRef<str>, impl AsRef<str>)]) -> Result<Self, String> {
        let mut names = Vec::with_capacity(tagged_re.len());
        let mut re = Vec::with_capacity(tagged_re.len());

        for (name, re_str) in tagged_re.iter() {
            names.push(name.as_ref().to_string());
            re.push(
                Regex::compile(re_str.as_ref())
                    .map_err(|err| format!("invalid regex ({:?}): {err:?}", re_str.as_ref()))?,
            );
        }

        Ok(Self {
            mr: MultiRegex { re },
            names,
            ranges: Vec::new(),
        })
    }

    /// Recompute match ranges for the `n_rows` lines starting at line `from`.
    pub fn update(&mut self, gb: &GapBuffer, from: usize, n_rows: usize) {
        self.ranges.clear();

        let pos = gb.line_to_char(from);
        // Tokenize one line past the requested window, clamping to the end of the buffer
        let ch_to = if from + n_rows + 1 < gb.len_lines() {
            gb.line_to_char(from + n_rows + 1)
        } else {
            gb.len_chars()
        };

        self.ranges.extend(Tokenizer {
            mr: &mut self.mr,
            gb,
            pos,
            ch_to,
        });
    }

    #[inline]
    pub fn iter_tokenized_lines_from<'a>(
        &'a self,
        line: usize,
        gb: &'a GapBuffer,
        dot_range: Range,
        load_exec_range: Option<(bool, Range)>,
    ) -> LineIter<'a> {
        LineIter::new(
            line,
            gb,
            dot_range,
            load_exec_range,
            &self.names,
            &self.ranges,
        )
    }
}

#[derive(Debug)]
struct Tokenizer<'a> {
    mr: &'a mut MultiRegex,
    gb: &'a GapBuffer,
    pos: usize,
    ch_to: usize,
}

impl Iterator for Tokenizer<'_> {
    type Item = SyntaxRange;

    fn next(&mut self) -> Option<Self::Item> {
        if self.pos >= self.ch_to {
            return None;
        }

        let (i, m) = self.mr.match_gb_from(self.gb, self.pos)?;
        let (mfrom, mto) = m.loc();
        self.pos = mto;

        // Matches are located in char offsets but SyntaxRange works in byte offsets
        let from = self.gb.char_to_byte(mfrom);
        let to = self.gb.offset_char_to_byte(mto, from, mfrom);

        Some(SyntaxRange {
            cap_idx: Some(i),
            r: ByteRange { from, to },
        })
    }
}

/// A set of [Regex] that will report the first match encountered out of any of the inner
/// patterns.
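///
/// A rough sketch of the behaviour (`0` is the char offset to search from):
///
/// ```ignore
/// let mut mr = MultiRegex {
///     re: vec![Regex::compile("bbb").unwrap(), Regex::compile("aaa").unwrap()],
/// };
/// // In "aaabbb" the second pattern matches earlier in the input, so its index (1) is
/// // reported alongside the match.
/// let (i, _m) = mr.match_gb_from(&GapBuffer::from("aaabbb"), 0).unwrap();
/// assert_eq!(i, 1);
/// ```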
#[derive(Debug, Clone)]
struct MultiRegex {
    re: Vec<Regex>,
}

impl MultiRegex {
    // FIXME: for now this is just a dumb way to run multiple regex patterns over the same input.
    // Really this should live in the regex module and run the VMs in parallel, returning the
    // first match, rather than running them in series like this.

    pub fn match_gb_from(&mut self, gb: &GapBuffer, ch_from: usize) -> Option<(usize, Match)> {
        let mut leftmost: Option<(usize, Match)> = None;

        for (i, re) in self.re.iter_mut().enumerate() {
            let m = re.find_from(gb, ch_from);
            match (leftmost.as_ref(), m) {
                (Some((_, prev)), Some(m)) if &m < prev => leftmost = Some((i, m)),
                (None, Some(m)) => leftmost = Some((i, m)),
                _ => (),
            }
        }

        leftmost
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    const PLUMB_FILE: &str = "
# this is a comment
data matches foo
data from running some command
attr add action=showdata filename=/+synthetic
plumb to edit";

    #[test]
    fn tokenizing_works() {
        let gb = GapBuffer::from(PLUMB_FILE);
        let tagged_re = vec![
            ("comment", "#.*"),
            ("keyword", r"\b(data|attr|plumb|arg)\b"),
            (
                "function",
                r"\b(matches|narrows|from|add|to|start|isfile|isdir)\b",
            ),
        ];
        let expected: Vec<(String, String)> = [
            ("dot", ""),
            ("comment", "# this is a comment"),
            ("keyword", "data"),
            ("default", " "),
            ("function", "matches"),
            ("default", " foo"),
            ("keyword", "data"),
            ("default", " "),
            ("function", "from"),
            ("default", " running some command"),
            ("keyword", "attr"),
            ("default", " "),
            ("function", "add"),
            ("default", " action=showdata filename=/+synthetic"),
            ("keyword", "plumb"),
            ("default", " "),
            ("function", "to"),
            ("default", " edit"),
        ]
        .iter()
        .map(|(tag, s)| (tag.to_string(), s.to_string()))
        .collect();

        let mut state = ReState::new(&tagged_re).expect("valid regex patterns");
        state.update(&gb, 0, 6);

        let mut toks = Vec::new();
        for line in state.iter_tokenized_lines_from(0, &gb, Range::BOF, None) {
            for tok in line {
                let s = tok.as_slice(&gb).to_string();
                toks.push((tok.tag.to_string(), s));
            }
        }

        assert_eq!(toks, expected);
    }
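
    // A minimal sketch of the windowing behaviour in [ReState::update]: requesting more rows
    // than the buffer holds should clamp the tokenized region to len_chars rather than panic
    // inside line_to_char.
    #[test]
    fn update_past_end_of_buffer_is_clamped() {
        let gb = GapBuffer::from(PLUMB_FILE);
        let mut state = ReState::new(&[("comment", "#.*")]).expect("valid regex patterns");
        state.update(&gb, 0, 10_000);

        // PLUMB_FILE contains a single comment line so we expect exactly one match range
        assert_eq!(state.ranges.len(), 1);
    }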
}