codeowners_rs/patternset/
builder.rs

1use super::{
2    nfa::{Nfa, StateId, Transition},
3    Matcher,
4};
5
6/// Builder for a patternset [`Matcher`]. Calling [`Builder::build`] will
7/// consume the builder.
8#[derive(Clone)]
9pub struct Builder {
10    nfa: Nfa,
11    next_pattern_id: usize,
12}
13
14impl Builder {
15    /// Create a new `Builder`.
16    pub fn new() -> Self {
17        Self {
18            nfa: Nfa::new(),
19            next_pattern_id: 0,
20        }
21    }
22
23    /// Build the `Matcher` from the patterns added to the builder. This will
24    /// consume the builder.    
25    pub fn build(self) -> Matcher {
26        Matcher::new(self.nfa)
27    }
28
29    /// Add a pattern to the builder.
30    pub fn add(&mut self, pattern: &str) -> usize {
31        let pattern_id = self.next_pattern_id;
32        self.next_pattern_id += 1;
33
34        let mut start_state_id = Nfa::START_STATE;
35
36        // Remove the leading slash if present. It forces left-anchoring so we
37        // need to remember whether it was present or not.
38        let (pattern, leading_slash) = match pattern.strip_prefix('/') {
39            Some(pattern) => (pattern, true),
40            None => (pattern, false),
41        };
42
43        // We currently only match files (as opposed to directories), so the trailing slash
44        // has no effect except adding an extra empty path component at the end.
45        let (pattern, trailing_slash) = match pattern.strip_suffix('/') {
46            Some(pattern) => (pattern, true),
47            None => (pattern, false),
48        };
49
50        // CODEOWNERS files use Unix path separators.
51        let segments = pattern.split('/').collect::<Vec<_>>();
52
53        // All patterns are left-anchored unless they're a single component with
54        // no leading slash (but a trailing slash is permitted).
55        if !leading_slash && segments.len() == 1 {
56            start_state_id = self.add_epsilon_transition(Nfa::START_STATE);
57        }
58
59        // Add states and transitions for each of the pattern components.
60        let mut end_state_id =
61            segments
62                .iter()
63                .fold(start_state_id, |from_id, segment| match *segment {
64                    "**" => self.add_epsilon_transition(from_id),
65                    _ => self.add_transition(from_id, segment),
66                });
67
68        // If the pattern ends with a trailing slash or /**, we match everything
69        // under the directory, but not the directory itself, so we need one
70        // more segment
71        if trailing_slash || segments.last() == Some(&"**") {
72            end_state_id = self.add_transition(end_state_id, "*");
73        }
74
75        // Most patterns are all prefix-matched, which effectively means they end in
76        // a /**, so we need to add a self loop to the final state. The exception is
77        // patterns that end with a single wildcard, which we handle separately, which
78        // don't match recursively. This appears to be a discrepancy between the
79        // CODEOWNERS globbing rules and the .gitignore rules.
80        if let Some(&last_segment) = segments.last() {
81            if last_segment != "*" {
82                end_state_id = self.add_epsilon_transition(end_state_id);
83            }
84        }
85
86        // Mark the final state as the terminal state for this pattern
87        self.nfa
88            .state_mut(end_state_id)
89            .mark_as_terminal(pattern_id);
90
91        pattern_id
92    }
93
94    // Add a regular (non-epsilon) transition from a given state via the
95    // provided path segment.
96    fn add_transition(&mut self, from_id: StateId, segment: &str) -> StateId {
97        let existing_transition = self
98            .nfa
99            .transitions_from(from_id)
100            .find(|t| t.path_segment == segment && t.target != from_id);
101        if let Some(t) = existing_transition {
102            t.target
103        } else {
104            let state_id = self.nfa.add_state();
105            self.nfa
106                .state_mut(from_id)
107                .add_transition(Transition::new(segment.to_owned(), state_id));
108            state_id
109        }
110    }
111
112    // Add an epsilon transition from a given state to a new state. If an epsilon transition
113    // already exists, return the id of that transition.
114    fn add_epsilon_transition(&mut self, from_id: StateId) -> StateId {
115        // Double star segments match zero or more of anything, so there's never a need to
116        // have multiple consecutive double star states. Multiple consecutive double star
117        // states mean we require multiple path segments, which violoates the gitignore spec
118        let has_existing_transition = self
119            .nfa
120            .transitions_from(from_id)
121            .any(|t| t.path_segment == "*" && t.target == from_id);
122        if has_existing_transition {
123            return from_id;
124        }
125
126        match self.nfa.state(from_id).epsilon_transition {
127            // If there's already an epsilon transition, don't create a new one
128            // as multiple epsilon transitions coalesce into one
129            Some(to_id) => to_id,
130            // Otherwise, add a new state and an epsilon transition to it
131            None => {
132                let state_id = self.nfa.add_state();
133                self.nfa
134                    .state_mut(state_id)
135                    .add_transition(Transition::new("*".to_owned(), state_id));
136                self.nfa.state_mut(from_id).epsilon_transition = Some(state_id);
137                state_id
138            }
139        }
140    }
141}
142
143impl Default for Builder {
144    fn default() -> Self {
145        Self::new()
146    }
147}
148
149#[cfg(test)]
150mod tests {
151    use super::*;
152
153    #[test]
154    fn test_nfa_builder() {
155        let mut builder = Builder::new();
156
157        builder.add("/foo/*");
158        assert_eq!(
159            transitions_for(&builder.nfa),
160            vec![(0, "foo".to_owned(), 1), (1, "*".to_owned(), 2)]
161        );
162
163        builder.add("/foo/bar");
164        assert_eq!(
165            transitions_for(&builder.nfa),
166            vec![
167                (0, "foo".to_owned(), 1),
168                (1, "*".to_owned(), 2),
169                (1, "bar".to_owned(), 3),
170                (4, "*".to_owned(), 4)
171            ]
172        );
173    }
174
175    #[test]
176    fn test_thing() {
177        let pat = "/modules/thanos-*/**";
178        let mut builder = Builder::new();
179        builder.add(pat);
180        println!("{}", generate_dot(&builder.nfa));
181    }
182
183    #[allow(dead_code)]
184    fn generate_dot(nfa: &Nfa) -> String {
185        let mut dot = String::from("digraph G {\n  rankdir=\"LR\"\n");
186        for (state_id, state) in nfa.states_iter().enumerate() {
187            if state.terminal_for_patterns.is_some() {
188                dot.push_str(&format!("  s{} [shape=doublecircle];\n", state_id));
189            }
190            for transition in state.transitions.iter() {
191                dot.push_str(&format!(
192                    "  s{} -> s{} [label=\"{}\"];\n",
193                    state_id, transition.target.0, transition.path_segment
194                ));
195            }
196            if let Some(next_state_id) = nfa.state(StateId(state_id as u32)).epsilon_transition {
197                dot.push_str(&format!(
198                    "  s{} -> s{} [label=\"ε\"];\n",
199                    state_id, next_state_id.0
200                ));
201            }
202        }
203        dot.push_str("}\n");
204        dot
205    }
206
207    fn transitions_for(nfa: &Nfa) -> Vec<(usize, String, usize)> {
208        nfa.states_iter()
209            .enumerate()
210            .flat_map(|(idx, s)| {
211                s.transitions
212                    .iter()
213                    .map(|t| (idx, t.path_segment.clone(), t.target.0 as usize))
214                    .collect::<Vec<_>>()
215            })
216            .collect()
217    }
218}