Skip to main content

big_code_analysis/
preproc.rs

1// Per-language metric and AST modules deliberately consume the macro-
2// generated tree-sitter token enums via `use crate::*` and `use Foo::*`
3// inside match expressions — explicit imports would list dozens of
4// variants per arm and obscure the per-language token sets that are the
5// point of these files. Allowed at the module level rather than per
6// function so the per-language impl blocks stay readable.
7#![allow(
8    clippy::enum_glob_use,
9    clippy::if_not_else,
10    clippy::too_many_lines,
11    clippy::wildcard_imports
12)]
13
14use std::collections::{HashMap, HashSet, hash_map};
15use std::path::{Path, PathBuf};
16
17use petgraph::{
18    Direction, algo::kosaraju_scc, graph::NodeIndex, stable_graph::StableGraph, visit::Dfs,
19};
20use serde::{Deserialize, Serialize};
21
22use crate::c_langs_macros::is_specials;
23
24use crate::langs::*;
25use crate::languages::language_preproc::*;
26use crate::tools::*;
27use crate::traits::*;
28
29/// Preprocessor data of a `C/C++` file.
30#[derive(Debug, Default, Deserialize, Serialize)]
31pub struct PreprocFile {
32    /// The set of include directives explicitly written in a file
33    pub direct_includes: HashSet<String>,
34    /// The set of include directives implicitly imported in a file
35    /// from other files
36    pub indirect_includes: HashSet<String>,
37    /// The set of macros of a file
38    pub macros: HashSet<String>,
39}
40
41/// Preprocessor data of a series of `C/C++` files.
42#[derive(Debug, Default, Deserialize, Serialize)]
43pub struct PreprocResults {
44    /// The preprocessor data of each `C/C++` file
45    pub files: HashMap<PathBuf, PreprocFile>,
46}
47
48impl PreprocFile {
49    /// Adds new macros to the set of macro of a file.
50    #[must_use]
51    pub fn new_macros(macros: &[&str]) -> Self {
52        let mut pf = Self::default();
53        for m in macros {
54            pf.macros.insert((*m).to_string());
55        }
56        pf
57    }
58}
59
60/// Returns the macros contained in a `C/C++` file.
61pub fn get_macros<S: ::std::hash::BuildHasher>(
62    file: &Path,
63    files: &HashMap<PathBuf, PreprocFile, S>,
64) -> HashSet<String> {
65    let mut macros = HashSet::new();
66    if let Some(pf) = files.get(file) {
67        for m in &pf.macros {
68            macros.insert(m.clone());
69        }
70        for f in &pf.indirect_includes {
71            if let Some(pf) = files.get(&PathBuf::from(f)) {
72                for m in &pf.macros {
73                    macros.insert(m.clone());
74                }
75            }
76        }
77    }
78    macros
79}
80
81/// Constructs a dependency graph of the include directives
82/// in a `C/C++` file.
83///
84/// The dependency graph is built using both preprocessor data and not
85/// extracted from the considered `C/C++` files.
86///
87/// # Panics
88///
89/// Panics if any of the lockstep invariants between the include graph
90/// `g`, the `nodes` map, and the `scc_map` is violated at runtime —
91/// specifically: an SCC component node missing from the graph, a graph
92/// node weight without a `nodes` map entry, a DFS-visited node without
93/// a stored weight, or an empty-path replacement node without a
94/// `scc_map` entry. These data structures are built in lockstep by
95/// this function, so all four conditions represent unrecoverable
96/// programmer errors rather than reachable input failures.
97pub fn fix_includes<S: ::std::hash::BuildHasher>(
98    files: &mut HashMap<PathBuf, PreprocFile, S>,
99    all_files: &HashMap<String, Vec<PathBuf>, S>,
100) {
101    let mut nodes: HashMap<PathBuf, NodeIndex> = HashMap::new();
102    // Since we'll remove strong connected components we need to have a stable graph
103    // in order to use the nodes we've in the nodes HashMap.
104    let mut g = StableGraph::new();
105
106    // First we build a graph of include dependencies
107    for (file, pf) in files.iter() {
108        let node = match nodes.entry(file.clone()) {
109            hash_map::Entry::Occupied(l) => *l.get(),
110            hash_map::Entry::Vacant(p) => *p.insert(g.add_node(file.clone())),
111        };
112        let direct_includes = &pf.direct_includes;
113        for i in direct_includes {
114            let possibilities = guess_file(file, i, all_files);
115            for i in possibilities {
116                if &i != file {
117                    let i = match nodes.entry(i.clone()) {
118                        hash_map::Entry::Occupied(l) => *l.get(),
119                        hash_map::Entry::Vacant(p) => *p.insert(g.add_node(i)),
120                    };
121                    g.add_edge(node, i, 0);
122                } else {
123                    // TODO: add an option to display warning
124                    eprintln!("Warning: possible self inclusion {}", file.display());
125                }
126            }
127        }
128    }
129
130    // In order to walk in the graph without issues due to cycles
131    // we replace strong connected components by a unique node
132    // All the paths in a scc finally represents a kind of unique file containing
133    // all the files in the scc.
134    let mut scc = kosaraju_scc(&g);
135    let mut scc_map: HashMap<NodeIndex, HashSet<String>> = HashMap::new();
136    for component in &mut scc {
137        if component.len() > 1 {
138            // For Firefox, there are only few scc and all of them are pretty small
139            // So no need to take a hammer here (for 'contains' stuff).
140            // TODO: in some case a hammer can be useful: check perf Vec vs HashSet
141            let mut incoming = Vec::new();
142            let mut outgoing = Vec::new();
143            let mut paths = HashSet::new();
144
145            for c in component.iter() {
146                for i in g.neighbors_directed(*c, Direction::Incoming) {
147                    if !component.contains(&i) && !incoming.contains(&i) {
148                        incoming.push(i);
149                    }
150                }
151                for o in g.neighbors_directed(*c, Direction::Outgoing) {
152                    if !component.contains(&o) && !outgoing.contains(&o) {
153                        outgoing.push(o);
154                    }
155                }
156            }
157
158            let replacement = g.add_node(PathBuf::from(""));
159            for i in incoming.drain(..) {
160                g.add_edge(i, replacement, 0);
161            }
162            for o in outgoing.drain(..) {
163                g.add_edge(replacement, o, 0);
164            }
165            for c in component.drain(..) {
166                let path = g
167                    .remove_node(c)
168                    .expect("invariant: SCC component node must exist in graph");
169                if let Some(s) = path.to_str() {
170                    paths.insert(s.to_string());
171                } else {
172                    eprintln!(
173                        "warning: skipping non-UTF-8 path in include cycle: {}",
174                        path.display()
175                    );
176                }
177                *nodes
178                    .get_mut(&path)
179                    .expect("invariant: every graph node must have a nodes map entry") =
180                    replacement;
181            }
182
183            eprintln!("Warning: possible include cycle:");
184            for p in &paths {
185                // Explicit quotes preserve whitespace visibility for
186                // paths that contain spaces — important when the cycle
187                // warning is the only signal a user gets.
188                eprintln!("  - \"{p}\"");
189            }
190            eprintln!();
191
192            scc_map.insert(replacement, paths);
193        }
194    }
195
196    for (path, node) in nodes {
197        let mut dfs = Dfs::new(&g, node);
198        if let Some(pf) = files.get_mut(&path) {
199            let x_inc = &mut pf.indirect_includes;
200            while let Some(node) = dfs.next(&g) {
201                let w = g
202                    .node_weight(node)
203                    .expect("invariant: DFS-visited node must have weight in graph");
204                if w == &PathBuf::from("") {
205                    let paths = scc_map.get(&node);
206                    if let Some(paths) = paths {
207                        for p in paths {
208                            x_inc.insert(p.clone());
209                        }
210                    } else {
211                        unreachable!(
212                            "every empty-path node is an SCC replacement and must have a scc_map entry"
213                        );
214                    }
215                } else {
216                    let Some(s) = w.to_str() else {
217                        eprintln!(
218                            "warning: skipping non-UTF-8 indirect include path: {}",
219                            w.display()
220                        );
221                        continue;
222                    };
223                    x_inc.insert(s.to_string());
224                }
225            }
226        } else {
227            eprintln!(
228                "Warning: included file which has not been preprocessed: {}",
229                path.display()
230            );
231        }
232    }
233}
234
235/// Extracts preprocessor data from a `C/C++` file
236/// and inserts these data in a [`PreprocResults`] object.
237///
238///
239/// [`PreprocResults`]: struct.PreprocResults.html
240pub fn preprocess(parser: &PreprocParser, path: &Path, results: &mut PreprocResults) {
241    let node = parser.get_root();
242    let mut cursor = node.cursor();
243    let mut stack = Vec::new();
244    let code = parser.get_code();
245    let mut file_result = PreprocFile::default();
246
247    stack.push(node);
248
249    while let Some(node) = stack.pop() {
250        cursor.reset(&node);
251        if cursor.goto_first_child() {
252            loop {
253                stack.push(cursor.node());
254                if !cursor.goto_next_sibling() {
255                    break;
256                }
257            }
258        }
259
260        let id = Preproc::from(node.kind_id());
261        match id {
262            Preproc::Define | Preproc::Undef => {
263                cursor.reset(&node);
264                cursor.goto_first_child();
265                let identifier = cursor.node();
266
267                if identifier.kind_id() == Preproc::Identifier {
268                    let Some(macro_text) = identifier.utf8_text(code) else {
269                        continue;
270                    };
271                    if !is_specials(macro_text) {
272                        file_result.macros.insert(macro_text.to_string());
273                    }
274                }
275            }
276            Preproc::PreprocInclude => {
277                cursor.reset(&node);
278                cursor.goto_first_child();
279                let file = cursor.node();
280
281                if file.kind_id() == Preproc::StringLiteral {
282                    // remove the starting/ending double quote
283                    let file = &code[file.start_byte() + 1..file.end_byte() - 1];
284                    let Some(start) = file.iter().position(|&c| c != b' ' && c != b'\t') else {
285                        continue;
286                    };
287                    let Some(end) = file.iter().rposition(|&c| c != b' ' && c != b'\t') else {
288                        continue;
289                    };
290                    let file = &file[start..=end];
291                    let Ok(file) = std::str::from_utf8(file) else {
292                        continue;
293                    };
294                    file_result.direct_includes.insert(file.to_string());
295                }
296            }
297            _ => {}
298        }
299    }
300
301    results.files.insert(path.to_path_buf(), file_result);
302}
303
304#[cfg(test)]
305#[allow(
306    clippy::float_cmp,
307    clippy::cast_precision_loss,
308    clippy::cast_possible_truncation,
309    clippy::cast_sign_loss,
310    clippy::similar_names,
311    clippy::doc_markdown,
312    clippy::needless_raw_string_hashes,
313    clippy::too_many_lines
314)]
315mod tests {
316    use super::*;
317
318    fn parse(source: &str) -> PreprocParser {
319        PreprocParser::new(source.as_bytes().to_vec(), &PathBuf::from("test.h"), None)
320    }
321
322    /// Empty include strings (`#include ""`) must not panic — earlier
323    /// implementations called `unwrap()` on `position`/`rposition` of the
324    /// trimmed slice, which returns `None` for an all-whitespace or empty
325    /// payload.
326    #[test]
327    fn preprocess_empty_include_does_not_panic() {
328        let parser = parse("#include \"\"\n");
329        let mut results = PreprocResults::default();
330        preprocess(&parser, &PathBuf::from("test.h"), &mut results);
331        let pf = results
332            .files
333            .get(&PathBuf::from("test.h"))
334            .expect("file entry must be inserted");
335        assert!(pf.direct_includes.is_empty());
336    }
337
338    /// Whitespace-only include strings (`#include "   "`) must not panic —
339    /// `position` returns `None` because no non-whitespace byte exists.
340    #[test]
341    fn preprocess_whitespace_only_include_does_not_panic() {
342        let parser = parse("#include \"   \"\n");
343        let mut results = PreprocResults::default();
344        preprocess(&parser, &PathBuf::from("test.h"), &mut results);
345        let pf = results
346            .files
347            .get(&PathBuf::from("test.h"))
348            .expect("file entry must be inserted");
349        assert!(pf.direct_includes.is_empty());
350    }
351
352    /// A well-formed include is still recorded with surrounding whitespace
353    /// stripped.
354    #[test]
355    fn preprocess_valid_include_is_recorded() {
356        let parser = parse("#include \"  foo.h  \"\n");
357        let mut results = PreprocResults::default();
358        preprocess(&parser, &PathBuf::from("test.h"), &mut results);
359        let pf = results
360            .files
361            .get(&PathBuf::from("test.h"))
362            .expect("file entry must be inserted");
363        assert!(pf.direct_includes.contains("foo.h"));
364    }
365
366    /// `#define` of a normal identifier records the macro name.
367    #[test]
368    fn preprocess_define_records_macro() {
369        let parser = parse("#define FOO 1\n");
370        let mut results = PreprocResults::default();
371        preprocess(&parser, &PathBuf::from("test.h"), &mut results);
372        let pf = results
373            .files
374            .get(&PathBuf::from("test.h"))
375            .expect("file entry must be inserted");
376        assert!(pf.macros.contains("FOO"));
377    }
378
379    /// `fix_includes` collapses a 2-file include cycle into one SCC replacement
380    /// node and propagates every member of that SCC into the `indirect_includes`
381    /// of *both* files symmetrically. Also exercises the `let-else` /
382    /// `expect`-with-invariant paths added in the panic-safety refactor (#72).
383    #[test]
384    fn fix_includes_handles_simple_cycle() {
385        let mut files: HashMap<PathBuf, PreprocFile> = HashMap::new();
386        let mut a = PreprocFile::default();
387        a.direct_includes.insert("b.h".to_string());
388        let mut b = PreprocFile::default();
389        b.direct_includes.insert("a.h".to_string());
390        files.insert(PathBuf::from("a.h"), a);
391        files.insert(PathBuf::from("b.h"), b);
392
393        let mut all_files: HashMap<String, Vec<PathBuf>> = HashMap::new();
394        all_files.insert("a.h".to_string(), vec![PathBuf::from("a.h")]);
395        all_files.insert("b.h".to_string(), vec![PathBuf::from("b.h")]);
396
397        fix_includes(&mut files, &all_files);
398
399        // After resolving the cycle each file's indirect_includes should
400        // contain both members of the SCC.
401        let a = files
402            .get(&PathBuf::from("a.h"))
403            .expect("a.h must be retained");
404        assert!(a.indirect_includes.contains("a.h"));
405        assert!(a.indirect_includes.contains("b.h"));
406
407        let b = files
408            .get(&PathBuf::from("b.h"))
409            .expect("b.h must be retained");
410        assert!(b.indirect_includes.contains("a.h"));
411        assert!(b.indirect_includes.contains("b.h"));
412    }
413}