tree_splicer/
splice.rs

1#![allow(dead_code)]
2use std::collections::{HashMap, HashSet};
3
4use rand::{prelude::StdRng, Rng, SeedableRng};
5use tree_sitter::{Language, Node, Tree};
6
7use tree_sitter_edit::Editor;
8
9use crate::node_types::NodeTypes;
10
11#[derive(Debug, Default)]
12struct Edits(HashMap<usize, Vec<u8>>);
13
14impl Editor for Edits {
15    fn has_edit(&self, _tree: &Tree, node: &Node<'_>) -> bool {
16        self.0.contains_key(&node.id())
17    }
18
19    fn edit(&self, _source: &[u8], tree: &Tree, node: &Node<'_>) -> Vec<u8> {
20        debug_assert!(self.has_edit(tree, node));
21        self.0.get(&node.id()).unwrap().clone()
22    }
23}
24
25#[derive(Debug)]
26struct Branches<'a>(HashMap<&'static str, Vec<&'a [u8]>>);
27
28impl<'a> Branches<'a> {
29    fn new(trees: Vec<(&'a [u8], &'a Tree)>) -> Self {
30        let mut branches = HashMap::with_capacity(trees.len()); // min
31        for (text, tree) in trees {
32            let mut nodes = vec![tree.root_node()];
33            while !nodes.is_empty() {
34                let mut children = Vec::with_capacity(nodes.len()); // guesstimate
35                for node in nodes {
36                    branches
37                        .entry(node.kind())
38                        .or_insert_with(|| HashSet::with_capacity(1))
39                        .insert(&text[node.byte_range()]);
40                    let mut i = 0;
41                    while let Some(child) = node.child(i) {
42                        children.push(child);
43                        i += 1;
44                    }
45                }
46                nodes = children;
47            }
48        }
49        Branches(
50            branches
51                .into_iter()
52                .map(|(k, s)| (k, s.iter().copied().collect()))
53                .collect(),
54        )
55    }
56
57    fn possible(&self) -> usize {
58        let mut possible_mutations = 0;
59        for s in self.0.values() {
60            possible_mutations += s.len() - 1;
61        }
62        possible_mutations
63    }
64}
65
66fn parse(language: &Language, code: &str) -> Tree {
67    let mut parser = tree_sitter::Parser::new();
68    parser
69        .set_language(language)
70        .expect("Failed to set tree-sitter parser language");
71    parser.parse(code, None).expect("Failed to parse code")
72}
73
74/// Splicing configuration
75#[derive(Debug)]
76pub struct Config {
77    /// Percent chance to perform chaotic mutation
78    ///
79    /// Chaotic mutations may result in invalid syntax.
80    pub chaos: u8,
81    /// Percent chance to perform a deletion.
82    ///
83    /// By default, deletes optional nodes. Chaotic deletions delete any node.
84    pub deletions: u8,
85    pub language: Language,
86    // pub intra_splices: usize,
87    /// Perform anywhere from zero to this many inter-file splices per test.
88    pub inter_splices: usize,
89    /// Approximate maximum file size to produce (bytes)
90    ///
91    /// Some of the input tests should be below this size.
92    pub max_size: usize,
93    pub node_types: NodeTypes,
94    /// Re-parse the file after this many mutations.
95    ///
96    /// When this is more than `inter_splices`, never re-parse.
97    pub reparse: usize,
98    pub seed: u64,
99}
100
101#[derive(Debug)]
102pub struct Splicer<'a> {
103    pub language: Language,
104    branches: Branches<'a>,
105    chaos: u8,
106    deletions: u8,
107    kinds: Vec<&'static str>,
108    // intra_splices: usize,
109    inter_splices: usize,
110    max_size: usize,
111    node_types: NodeTypes,
112    trees: Vec<(&'a [u8], &'a Tree)>,
113    reparse: usize,
114    rng: StdRng,
115}
116
117impl<'a> Splicer<'a> {
118    fn delta(node: Node<'_>, replace: &[u8]) -> isize {
119        let range = node.byte_range();
120        isize::try_from(replace.len()).unwrap_or_default()
121            - isize::try_from(range.end - range.start).unwrap_or_default()
122    }
123
124    #[must_use]
125    pub fn new(config: Config, files: &'a HashMap<String, (Vec<u8>, Tree)>) -> Option<Self> {
126        let trees: Vec<_> = files
127            .iter()
128            .map(|(_, (txt, tree))| (txt.as_ref(), tree))
129            .collect();
130
131        let mut all_empty = true;
132        for (_bytes, tree) in files.values() {
133            if tree.root_node().child_count() != 0 {
134                all_empty = false;
135                break;
136            }
137        }
138        if all_empty {
139            return None;
140        }
141
142        let branches = Branches::new(
143            files
144                .iter()
145                .map(|(_, (txt, tree))| (txt.as_ref(), tree))
146                .collect(),
147        );
148        let rng = StdRng::seed_from_u64(config.seed);
149        let kinds = branches.0.keys().copied().collect();
150        Some(Splicer {
151            chaos: config.chaos,
152            deletions: config.deletions,
153            language: config.language,
154            branches,
155            kinds,
156            // intra_splices: config.intra_splices,
157            inter_splices: config.inter_splices,
158            max_size: config.max_size,
159            node_types: config.node_types,
160            reparse: config.reparse,
161            rng,
162            trees,
163        })
164    }
165
166    fn pick_usize(&mut self, n: usize) -> usize {
167        self.rng.random_range(0..n)
168    }
169
170    fn pick_idx<T>(&mut self, v: &[T]) -> usize {
171        self.pick_usize(v.len())
172    }
173
174    fn all_nodes(tree: &Tree) -> Vec<Node<'_>> {
175        let mut all = Vec::with_capacity(16); // min
176        let root = tree.root_node();
177        let mut cursor = tree.walk();
178        let mut nodes: HashSet<_> = root.children(&mut cursor).collect();
179        while !nodes.is_empty() {
180            let mut next = HashSet::new();
181            for node in nodes {
182                debug_assert!(!next.contains(&node));
183                all.push(node);
184                let mut child_cursor = tree.walk();
185                for child in node.children(&mut child_cursor) {
186                    debug_assert!(child.id() != node.id());
187                    debug_assert!(!next.contains(&child));
188                    next.insert(child);
189                }
190            }
191            nodes = next;
192        }
193        all
194    }
195
196    fn pick_node<'b>(&mut self, tree: &'b Tree) -> Node<'b> {
197        let nodes = Self::all_nodes(tree);
198        if nodes.is_empty() {
199            return tree.root_node();
200        }
201        *nodes.get(self.pick_idx(nodes.as_slice())).unwrap()
202    }
203
204    fn delete_node(&mut self, _text: &[u8], tree: &Tree) -> (usize, Vec<u8>, isize) {
205        let chaotic = self.rng.random_range(0..100) < self.chaos;
206        if chaotic {
207            let node = self.pick_node(tree);
208            return (node.id(), Vec::new(), Self::delta(node, &[]));
209        }
210        let nodes = Self::all_nodes(tree);
211        if nodes.iter().all(|n| !self.node_types.optional_node(n)) {
212            let node = self.pick_node(tree);
213            return (node.id(), Vec::new(), Self::delta(node, &[]));
214        }
215        let mut node = nodes.get(self.pick_idx(nodes.as_slice())).unwrap();
216        while !self.node_types.optional_node(node) {
217            node = nodes.get(self.pick_idx(nodes.as_slice())).unwrap();
218        }
219        (node.id(), Vec::new(), Self::delta(*node, &[]))
220    }
221
222    fn splice_node(&mut self, text: &[u8], tree: &Tree) -> (usize, Vec<u8>, isize) {
223        let chaotic = self.rng.random_range(0..100) < self.chaos;
224
225        let mut node = tree.root_node();
226        let mut candidates = Vec::new();
227        // When modified trees are re-parsed, their nodes may have novel kinds
228        // not in Branches (candidates.len() == 0). Also, avoid not mutating
229        // (candidates.len() == 1).
230        while candidates.len() <= 1 {
231            node = self.pick_node(tree);
232            candidates = if chaotic {
233                let kind_idx = self.rng.random_range(0..self.kinds.len());
234                let kind = self.kinds.get(kind_idx).unwrap();
235                self.branches.0.get(kind).unwrap().clone()
236            } else {
237                self.branches
238                    .0
239                    .get(node.kind())
240                    .cloned()
241                    .unwrap_or_default()
242            };
243        }
244
245        let idx = self.rng.random_range(0..candidates.len());
246        let mut candidate = candidates.get(idx).unwrap();
247        // Try to avoid not mutating
248        let node_text = &text[node.byte_range()];
249        while candidates.len() > 1 && candidate == &node_text {
250            let idx = self.rng.random_range(0..candidates.len());
251            candidate = candidates.get(idx).unwrap();
252        }
253        // eprintln!(
254        //     "Replacing '{}' with '{}'",
255        //     std::str::from_utf8(&text[node.byte_range()]).unwrap(),
256        //     std::str::from_utf8(candidate).unwrap(),
257        // );
258        let replace = Vec::from(*candidate);
259        let delta = Self::delta(node, replace.as_slice());
260        (node.id(), replace, delta)
261    }
262
263    pub fn splice_tree(&mut self, text0: &[u8], mut tree: Tree) -> Option<Vec<u8>> {
264        // TODO: Assert that text0 and tree.root_node() are the same length?
265        let mut edits = Edits::default();
266        if self.inter_splices == 0 {
267            return None;
268        }
269        let splices = self.rng.random_range(1..self.inter_splices);
270        let mut text = Vec::from(text0);
271        let mut sz = isize::try_from(text.len()).unwrap_or_default();
272        for i in 0..splices {
273            let (id, bytes, delta) = if self.rng.random_range(0..100) < self.deletions {
274                self.delete_node(text.as_slice(), &tree)
275            } else {
276                self.splice_node(text.as_slice(), &tree)
277            };
278            sz += delta;
279            let sized_out = usize::try_from(sz).unwrap_or_default() >= self.max_size;
280            edits.0.insert(id, bytes);
281            if i % self.reparse == 0 || i + 1 == splices || sized_out {
282                let mut result = Vec::with_capacity(usize::try_from(sz).unwrap_or_default());
283                tree_sitter_edit::render(&mut result, &tree, text.as_slice(), &edits).ok()?;
284                text = result.clone();
285                tree = parse(&self.language, &String::from_utf8_lossy(text.as_slice()));
286                edits = Edits::default();
287            }
288            if sized_out {
289                break;
290            }
291        }
292        Some(text)
293    }
294}
295
296impl Iterator for Splicer<'_> {
297    type Item = Vec<u8>;
298
299    fn next(&mut self) -> Option<Self::Item> {
300        let mut tree_idx: usize = self.pick_usize(self.trees.len());
301        let (mut text, mut tree) = *self.trees.get(tree_idx).unwrap();
302        while text.len() > self.max_size {
303            tree_idx = self.pick_usize(self.trees.len());
304            (text, tree) = *self.trees.get(tree_idx).unwrap();
305        }
306        self.splice_tree(text, tree.clone())
307    }
308}