ad_editor/regex/
matches.rs

1use super::vm::{Regex, N_SLOTS};
2use crate::buffer::{GapBuffer, IdxChars};
3use std::{
4    iter::{Enumerate, Skip},
5    rc::Rc,
6    str::Chars,
7};
8
9/// The match location of a Regex against a given input.
10///
11/// The sub-match indices are relative to the input used to run the original match.
12#[derive(Debug, Clone, PartialEq, Eq)]
13pub struct Match {
14    pub(super) sub_matches: [usize; N_SLOTS],
15    pub(super) submatch_names: Rc<[String]>,
16}
17
18impl Match {
19    pub(crate) fn synthetic(from: usize, to: usize) -> Self {
20        let mut sub_matches = [0; N_SLOTS];
21        sub_matches[0] = from;
22        sub_matches[1] = to;
23        Self {
24            sub_matches,
25            submatch_names: Rc::new([]),
26        }
27    }
28
29    pub(crate) fn apply_offset(&mut self, offset: isize) {
30        for i in 0..N_SLOTS {
31            if i > 0 && self.sub_matches[i] == 0 {
32                continue;
33            }
34            self.sub_matches[i] = (self.sub_matches[i] as isize + offset) as usize;
35        }
36    }
37
38    /// Extract this match from the given string
39    pub fn str_match_text(&self, s: &str) -> String {
40        let (a, b) = self.loc();
41        s.chars().skip(a).take(b - a).collect()
42    }
43
44    /// The start and end of this match in terms of byte offsets
45    ///
46    /// use loc for character offsets
47    #[inline]
48    pub fn str_loc_bytes(&self, s: &str) -> (usize, usize) {
49        let (a, b) = self.loc();
50        let mut it = s.char_indices().skip(a);
51        let (first, _) = it.next().unwrap();
52        let (last, _) = it.take(b - a - 1).last().unwrap_or((first, ' '));
53
54        (first, last)
55    }
56
57    /// The start and end of the nth submatch in terms of byte offsets
58    #[inline]
59    pub fn str_sub_loc_bytes(&self, n: usize, s: &str) -> Option<(usize, usize)> {
60        let (a, b) = self.sub_loc(n)?;
61        let mut it = s.char_indices().skip(a);
62        let (first, _) = it.next().unwrap();
63        let (last, _) = it.take(b - a - 1).last().unwrap_or((first, ' '));
64
65        Some((first, last))
66    }
67
68    // FIXME: this is a terrible way to do this but used for testing at the moment
69
70    /// The names of each submatch
71    pub fn named_matches(&self) -> Vec<&str> {
72        let mut matches = Vec::new();
73        for name in self.submatch_names.iter() {
74            if self.sub_loc_by_name(name).is_some() {
75                matches.push(name.as_str());
76            }
77        }
78
79        matches
80    }
81
82    /// The start and end of a named submatch in terms of byte offsets
83    #[inline]
84    pub fn str_sub_loc_bytes_by_name(&self, name: &str, s: &str) -> Option<(usize, usize)> {
85        let (a, b) = self.sub_loc_by_name(name)?;
86        let mut it = s.char_indices().skip(a);
87        let (first, _) = it.next().unwrap();
88        let (last, _) = it.take(b - a - 1).last().unwrap_or((first, ' '));
89
90        Some((first, last))
91    }
92
93    /// The contents of a named submatch
94    pub fn str_sub_loc_text_ref_by_name<'a>(&self, name: &str, s: &'a str) -> Option<&'a str> {
95        let (first, last) = self.str_sub_loc_bytes_by_name(name, s)?;
96
97        Some(&s[first..=last])
98    }
99
100    /// The full match as applied to s
101    pub fn str_match_text_ref<'a>(&self, s: &'a str) -> &'a str {
102        let (first, last) = self.str_loc_bytes(s);
103
104        &s[first..=last]
105    }
106
107    pub(crate) fn str_match_text_ref_with_byte_offsets<'a>(
108        &self,
109        s: &'a str,
110    ) -> (usize, usize, &'a str) {
111        let (first, last) = self.str_loc_bytes(s);
112
113        (first, last, &s[first..=last])
114    }
115
116    /// The numbered submatch match as applied to s
117    pub fn str_submatch_text(&self, n: usize, s: &str) -> Option<String> {
118        let (a, b) = self.sub_loc(n)?;
119        Some(s.chars().skip(a).take(b - a).collect())
120    }
121
122    /// The start and end of this match in terms of character offsets
123    ///
124    /// use str_loc_bytes for byte offsets
125    pub fn loc(&self) -> (usize, usize) {
126        let (start, end) = (self.sub_matches[0], self.sub_matches[1]);
127
128        assert!(
129            start <= end,
130            "invalid match: {start} > {end}: {:?}",
131            self.sub_matches
132        );
133
134        (start, end)
135    }
136
137    fn sub_loc_by_name(&self, name: &str) -> Option<(usize, usize)> {
138        let n = self.submatch_names.iter().position(|s| s == name)?;
139        self.sub_loc(n + 1)
140    }
141
142    pub(crate) fn sub_loc(&self, n: usize) -> Option<(usize, usize)> {
143        if 2 * n + 1 >= N_SLOTS {
144            return None;
145        }
146        let (start, end) = (self.sub_matches[2 * n], self.sub_matches[2 * n + 1]);
147        if n > 0 && start == 0 && end == 0 {
148            return None;
149        }
150
151        assert!(
152            start <= end,
153            "invalid match: {start} > {end}: {:?}",
154            self.sub_matches
155        );
156
157        Some((start, end))
158    }
159}
160
/// A source of characters that can be iterated from an arbitrary starting character offset.
pub trait IndexedChars {
    /// The iterator handed back by `iter_from`, yielding `(usize, char)` pairs.
    type I: Iterator<Item = (usize, char)>;

    /// Begin iterating at character offset `from`.
    ///
    /// The implementations in this module return `None` when `from` is at or beyond the
    /// number of characters available.
    fn iter_from(&self, from: usize) -> Option<Self::I>;
}
165
166impl<'a> IndexedChars for &'a str {
167    type I = Skip<Enumerate<Chars<'a>>>;
168
169    fn iter_from(&self, from: usize) -> Option<Self::I> {
170        // This is not at all efficient but we only really make use of strings in test cases where
171        // the length of the string is small. For the "real" impls using GapBuffers, checking the number
172        // of chars in the buffer is O(1) as we cache it.
173        if from >= self.chars().count() {
174            None
175        } else {
176            Some(self.chars().enumerate().skip(from))
177        }
178    }
179}
180
181impl<'a> IndexedChars for &'a GapBuffer {
182    type I = IdxChars<'a>;
183
184    fn iter_from(&self, from: usize) -> Option<Self::I> {
185        if from >= self.len_chars() {
186            None
187        } else {
188            Some(
189                self.slice(from, self.len_chars())
190                    .indexed_chars(from, false),
191            )
192        }
193    }
194}
195
196/// An iterator over sequential, non overlapping matches of a Regex
197/// against a given input
198#[derive(Debug)]
199pub struct MatchIter<'a, I>
200where
201    I: IndexedChars,
202{
203    pub(super) it: I,
204    pub(super) r: &'a mut Regex,
205    pub(super) from: usize,
206}
207
208impl<'a, I> Iterator for MatchIter<'a, I>
209where
210    I: IndexedChars,
211{
212    type Item = Match;
213
214    fn next(&mut self) -> Option<Self::Item> {
215        let m = self
216            .r
217            .match_iter(&mut self.it.iter_from(self.from)?, self.from)?;
218
219        let (_, from) = m.loc();
220        if from == self.from {
221            self.from += 1;
222        } else {
223            self.from = from;
224        }
225
226        Some(m)
227    }
228}