ad_editor/regex/
matches.rs

1use super::vm::{Regex, N_SLOTS};
2use crate::buffer::{GapBuffer, IdxChars};
3use std::{
4    iter::{Enumerate, Skip},
5    rc::Rc,
6    str::Chars,
7};
8
9/// The match location of a Regex against a given input.
10///
11/// The sub-match indices are relative to the input used to run the original match.
12#[derive(Debug, Clone, PartialEq, Eq)]
13pub struct Match {
14    pub(super) sub_matches: [usize; N_SLOTS],
15    pub(super) submatch_names: Rc<[String]>,
16}
17
18impl Match {
19    pub(crate) fn synthetic(from: usize, to: usize) -> Self {
20        let mut sub_matches = [0; N_SLOTS];
21        sub_matches[0] = from;
22        sub_matches[1] = to;
23        Self {
24            sub_matches,
25            submatch_names: Rc::new([]),
26        }
27    }
28
29    pub(crate) fn apply_offset(&mut self, offset: isize) {
30        for i in 0..N_SLOTS {
31            if i > 0 && self.sub_matches[i] == 0 {
32                continue;
33            }
34            self.sub_matches[i] = (self.sub_matches[i] as isize + offset) as usize;
35        }
36    }
37
38    /// Extract this match from the given string
39    pub fn str_match_text(&self, s: &str) -> String {
40        let (a, b) = self.loc();
41        s.chars().skip(a).take(b - a).collect()
42    }
43
44    /// The start and end of this match in terms of byte offsets
45    ///
46    /// use loc for character offsets
47    #[inline]
48    pub fn str_loc_bytes(&self, s: &str) -> (usize, usize) {
49        let (a, b) = self.loc();
50        let mut it = s.char_indices().skip(a);
51        let (first, _) = it.next().unwrap();
52        let (last, _) = it.take(b - a - 1).last().unwrap_or((first, ' '));
53
54        (first, last)
55    }
56
57    /// The start and end of the nth submatch in terms of byte offsets
58    #[inline]
59    pub fn str_sub_loc_bytes(&self, n: usize, s: &str) -> Option<(usize, usize)> {
60        let (a, b) = self.sub_loc(n)?;
61        let mut it = s.char_indices().skip(a);
62        let (first, _) = it.next().unwrap();
63        let (last, _) = it.take(b - a - 1).last().unwrap_or((first, ' '));
64
65        Some((first, last))
66    }
67
68    // FIXME: this is a terrible way to do this but used for testing at the moment
69
70    /// The names of each submatch
71    pub fn named_matches(&self) -> Vec<&str> {
72        let mut matches = Vec::new();
73        for name in self.submatch_names.iter() {
74            if self.sub_loc_by_name(name).is_some() {
75                matches.push(name.as_str());
76            }
77        }
78
79        matches
80    }
81
82    /// The start and end of a named submatch in terms of byte offsets
83    #[inline]
84    pub fn str_sub_loc_bytes_by_name(&self, name: &str, s: &str) -> Option<(usize, usize)> {
85        let (a, b) = self.sub_loc_by_name(name)?;
86        let mut it = s.char_indices().skip(a);
87        let (first, _) = it.next().unwrap();
88        let (last, _) = it.take(b - a - 1).last().unwrap_or((first, ' '));
89
90        Some((first, last))
91    }
92
93    /// The contents of a named submatch
94    pub fn str_sub_loc_text_ref_by_name<'a>(&self, name: &str, s: &'a str) -> Option<&'a str> {
95        let (first, last) = self.str_sub_loc_bytes_by_name(name, s)?;
96
97        Some(&s[first..=last])
98    }
99
100    /// The full match as applied to s
101    pub fn str_match_text_ref<'a>(&self, s: &'a str) -> &'a str {
102        let (first, last) = self.str_loc_bytes(s);
103
104        &s[first..=last]
105    }
106
107    /// The numbered submatch match as applied to s
108    pub fn str_submatch_text(&self, n: usize, s: &str) -> Option<String> {
109        let (a, b) = self.sub_loc(n)?;
110        Some(s.chars().skip(a).take(b - a).collect())
111    }
112
113    /// The start and end of this match in terms of character offsets
114    ///
115    /// use str_loc_bytes for byte offsets
116    pub fn loc(&self) -> (usize, usize) {
117        let (start, end) = (self.sub_matches[0], self.sub_matches[1]);
118
119        assert!(
120            start <= end,
121            "invalid match: {start} > {end}: {:?}",
122            self.sub_matches
123        );
124
125        (start, end)
126    }
127
128    fn sub_loc_by_name(&self, name: &str) -> Option<(usize, usize)> {
129        let n = self.submatch_names.iter().position(|s| s == name)?;
130        self.sub_loc(n + 1)
131    }
132
133    pub(crate) fn sub_loc(&self, n: usize) -> Option<(usize, usize)> {
134        if 2 * n + 1 >= N_SLOTS {
135            return None;
136        }
137        let (start, end) = (self.sub_matches[2 * n], self.sub_matches[2 * n + 1]);
138        if n > 0 && start == 0 && end == 0 {
139            return None;
140        }
141
142        assert!(
143            start <= end,
144            "invalid match: {start} > {end}: {:?}",
145            self.sub_matches
146        );
147
148        Some((start, end))
149    }
150}
151
/// An input that can hand out its characters, paired with an index, starting from a given
/// offset.
pub trait IndexedChars {
    /// The iterator of `(index, char)` pairs produced by [`IndexedChars::iter_from`].
    type I: Iterator<Item = (usize, char)>;

    /// Begin iterating at offset `from`.
    ///
    /// The implementations in this file return `None` when `from` is at or beyond the number
    /// of characters available.
    fn iter_from(&self, from: usize) -> Option<Self::I>;
}
156
157impl<'a> IndexedChars for &'a str {
158    type I = Skip<Enumerate<Chars<'a>>>;
159
160    fn iter_from(&self, from: usize) -> Option<Self::I> {
161        // This is not at all efficient but we only really make use of strings in test cases where
162        // the length of the string is small. For the "real" impls using GapBuffers, checking the number
163        // of chars in the buffer is O(1) as we cache it.
164        if from >= self.chars().count() {
165            None
166        } else {
167            Some(self.chars().enumerate().skip(from))
168        }
169    }
170}
171
172impl<'a> IndexedChars for &'a GapBuffer {
173    type I = IdxChars<'a>;
174
175    fn iter_from(&self, from: usize) -> Option<Self::I> {
176        if from >= self.len_chars() {
177            None
178        } else {
179            Some(
180                self.slice(from, self.len_chars())
181                    .indexed_chars(from, false),
182            )
183        }
184    }
185}
186
187/// An iterator over sequential, non overlapping matches of a Regex
188/// against a given input
189#[derive(Debug)]
190pub struct MatchIter<'a, I>
191where
192    I: IndexedChars,
193{
194    pub(super) it: I,
195    pub(super) r: &'a mut Regex,
196    pub(super) from: usize,
197}
198
199impl<I> Iterator for MatchIter<'_, I>
200where
201    I: IndexedChars,
202{
203    type Item = Match;
204
205    fn next(&mut self) -> Option<Self::Item> {
206        let m = self
207            .r
208            .match_iter(&mut self.it.iter_from(self.from)?, self.from)?;
209
210        let (_, from) = m.loc();
211        if from == self.from {
212            self.from += 1;
213        } else {
214            self.from = from;
215        }
216
217        Some(m)
218    }
219}