Skip to main content

kaizen/metrics/
analyze.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2//! Multi-language file analyzers. Tree-sitter when supported.
3
4use crate::metrics::types::{RepoAnalysis, SymbolFact};
5use anyhow::Result;
6use std::path::Path;
7use tree_sitter::{Language, Node, Parser};
8
9pub trait CodeAnalyzer {
10    fn analyze(&self, rel_path: &str, source: &str) -> Result<RepoAnalysis>;
11}
12
13pub fn analyzer_for(path: &Path) -> Box<dyn CodeAnalyzer> {
14    language_spec(path)
15        .map(|spec| Box::new(TreeSitterAnalyzer { spec }) as Box<dyn CodeAnalyzer>)
16        .unwrap_or_else(|| Box::new(GenericAnalyzer))
17}
18
19pub struct GenericAnalyzer;
20
21impl CodeAnalyzer for GenericAnalyzer {
22    fn analyze(&self, rel_path: &str, source: &str) -> Result<RepoAnalysis> {
23        let lines: Vec<&str> = source.lines().collect();
24        Ok(RepoAnalysis {
25            path: rel_path.into(),
26            language: language_name(Path::new(rel_path)).into(),
27            bytes: source.len() as u64,
28            loc: lines.len() as u32,
29            sloc: lines.iter().filter(|line| !line.trim().is_empty()).count() as u32,
30            complexity_total: 0,
31            max_fn_complexity: 0,
32            imports: vec![],
33            symbols: vec![],
34        })
35    }
36}
37
38struct TreeSitterAnalyzer {
39    spec: &'static LanguageSpec,
40}
41
42impl CodeAnalyzer for TreeSitterAnalyzer {
43    fn analyze(&self, rel_path: &str, source: &str) -> Result<RepoAnalysis> {
44        let mut parser = Parser::new();
45        parser.set_language(&(self.spec.language)())?;
46        let tree = parser.parse(source, None).expect("tree-sitter parse");
47        let root = tree.root_node();
48        let bytes = source.as_bytes();
49        let mut symbols = vec![];
50        collect_symbols(root, bytes, self.spec, &mut symbols);
51        let imports = collect_kind_text(root, bytes, self.spec.import_kinds)
52            .into_iter()
53            .flat_map(|raw| extract_import_targets(&raw))
54            .collect::<Vec<_>>();
55        let symbol_ranges = symbols
56            .iter()
57            .map(|s| (s.start_byte, s.end_byte))
58            .collect::<Vec<_>>();
59        let top_level = count_top_level_complexity(root, bytes, self.spec, &symbol_ranges);
60        let sum_symbols = symbols.iter().map(|s| s.complexity).sum::<u32>();
61        Ok(RepoAnalysis {
62            path: rel_path.into(),
63            language: self.spec.name.into(),
64            bytes: source.len() as u64,
65            loc: source.lines().count() as u32,
66            sloc: source
67                .lines()
68                .filter(|line| !line.trim().is_empty())
69                .count() as u32,
70            complexity_total: sum_symbols + top_level,
71            max_fn_complexity: symbols.iter().map(|s| s.complexity).max().unwrap_or(0),
72            imports,
73            symbols,
74        })
75    }
76}
77
78struct LanguageSpec {
79    name: &'static str,
80    language: fn() -> Language,
81    symbol_kinds: &'static [&'static str],
82    import_kinds: &'static [&'static str],
83    call_kinds: &'static [&'static str],
84    branch_kinds: &'static [&'static str],
85}
86
87fn language_spec(path: &Path) -> Option<&'static LanguageSpec> {
88    static SPECS: &[LanguageSpec] = &[
89        LanguageSpec {
90            name: "rust",
91            language: || tree_sitter_rust::LANGUAGE.into(),
92            symbol_kinds: &[
93                "function_item",
94                "struct_item",
95                "enum_item",
96                "impl_item",
97                "trait_item",
98            ],
99            import_kinds: &["use_declaration"],
100            call_kinds: &["call_expression", "macro_invocation"],
101            branch_kinds: &[
102                "if_expression",
103                "for_expression",
104                "loop_expression",
105                "while_expression",
106                "match_expression",
107            ],
108        },
109        LanguageSpec {
110            name: "typescript",
111            language: || tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
112            symbol_kinds: &[
113                "function_declaration",
114                "method_definition",
115                "class_declaration",
116                "interface_declaration",
117            ],
118            import_kinds: &["import_statement"],
119            call_kinds: &["call_expression"],
120            branch_kinds: &[
121                "if_statement",
122                "for_statement",
123                "for_in_statement",
124                "while_statement",
125                "switch_statement",
126                "ternary_expression",
127                "logical_expression",
128            ],
129        },
130        LanguageSpec {
131            name: "javascript",
132            language: || tree_sitter_javascript::LANGUAGE.into(),
133            symbol_kinds: &[
134                "function_declaration",
135                "method_definition",
136                "class_declaration",
137            ],
138            import_kinds: &["import_statement"],
139            call_kinds: &["call_expression"],
140            branch_kinds: &[
141                "if_statement",
142                "for_statement",
143                "for_in_statement",
144                "while_statement",
145                "switch_statement",
146                "ternary_expression",
147                "logical_expression",
148            ],
149        },
150        LanguageSpec {
151            name: "python",
152            language: || tree_sitter_python::LANGUAGE.into(),
153            symbol_kinds: &["function_definition", "class_definition"],
154            import_kinds: &["import_statement", "import_from_statement"],
155            call_kinds: &["call"],
156            branch_kinds: &[
157                "if_statement",
158                "for_statement",
159                "while_statement",
160                "conditional_expression",
161            ],
162        },
163        LanguageSpec {
164            name: "go",
165            language: || tree_sitter_go::LANGUAGE.into(),
166            symbol_kinds: &[
167                "function_declaration",
168                "method_declaration",
169                "type_declaration",
170            ],
171            import_kinds: &["import_declaration"],
172            call_kinds: &["call_expression"],
173            branch_kinds: &[
174                "if_statement",
175                "for_statement",
176                "expression_switch_statement",
177                "type_switch_statement",
178                "select_statement",
179            ],
180        },
181        LanguageSpec {
182            name: "java",
183            language: || tree_sitter_java::LANGUAGE.into(),
184            symbol_kinds: &[
185                "method_declaration",
186                "class_declaration",
187                "interface_declaration",
188                "enum_declaration",
189            ],
190            import_kinds: &["import_declaration"],
191            call_kinds: &["method_invocation"],
192            branch_kinds: &[
193                "if_statement",
194                "for_statement",
195                "enhanced_for_statement",
196                "while_statement",
197                "switch_expression",
198                "switch_block",
199            ],
200        },
201    ];
202    let name = language_name(path);
203    SPECS.iter().find(|spec| spec.name == name)
204}
205
/// Maps the file extension of `path` to a language label.
///
/// Extensions are compared ASCII case-insensitively, so `Main.RS` and
/// `main.rs` are both `"rust"`. Paths without an extension, with a non-UTF-8
/// extension, or with an unrecognized extension map to `"generic"`.
fn language_name(path: &Path) -> &'static str {
    // (extension, language label)
    const EXTENSIONS: &[(&str, &str)] = &[
        ("rs", "rust"),
        ("ts", "typescript"),
        ("tsx", "typescript"),
        ("js", "javascript"),
        ("jsx", "javascript"),
        ("mjs", "javascript"),
        ("cjs", "javascript"),
        ("py", "python"),
        ("go", "go"),
        ("java", "java"),
    ];

    let Some(ext) = path.extension().and_then(|ext| ext.to_str()) else {
        return "generic";
    };
    EXTENSIONS
        .iter()
        .find(|(known, _)| known.eq_ignore_ascii_case(ext))
        .map_or("generic", |&(_, language)| language)
}
217
218fn collect_symbols(node: Node<'_>, source: &[u8], spec: &LanguageSpec, out: &mut Vec<SymbolFact>) {
219    if spec.symbol_kinds.iter().any(|kind| *kind == node.kind()) {
220        let calls = collect_kind_text(node, source, spec.call_kinds)
221            .into_iter()
222            .filter_map(|raw| call_name(&raw))
223            .collect::<Vec<_>>();
224        out.push(SymbolFact {
225            path: String::new(),
226            name: symbol_name(node, source),
227            kind: node.kind().into(),
228            complexity: 1 + count_complexity(node, spec),
229            calls,
230            start_byte: node.start_byte(),
231            end_byte: node.end_byte(),
232        });
233    }
234    let mut cursor = node.walk();
235    for child in node.children(&mut cursor) {
236        collect_symbols(child, source, spec, out);
237    }
238}
239
240fn symbol_name(node: Node<'_>, source: &[u8]) -> String {
241    let mut cursor = node.walk();
242    for child in node.children(&mut cursor) {
243        if (child.kind().contains("identifier") || child.kind().ends_with("name"))
244            && let Ok(text) = child.utf8_text(source)
245        {
246            return text.trim().to_string();
247        }
248    }
249    node.kind().into()
250}
251
252fn collect_kind_text(node: Node<'_>, source: &[u8], kinds: &[&str]) -> Vec<String> {
253    let mut out = vec![];
254    collect_kind_text_inner(node, source, kinds, &mut out);
255    out
256}
257
258fn collect_kind_text_inner(node: Node<'_>, source: &[u8], kinds: &[&str], out: &mut Vec<String>) {
259    if kinds.iter().any(|kind| *kind == node.kind())
260        && let Ok(text) = node.utf8_text(source)
261    {
262        out.push(text.to_string());
263    }
264    let mut cursor = node.walk();
265    for child in node.children(&mut cursor) {
266        collect_kind_text_inner(child, source, kinds, out);
267    }
268}
269
270fn count_complexity(node: Node<'_>, spec: &LanguageSpec) -> u32 {
271    let mut count = if spec.branch_kinds.iter().any(|kind| *kind == node.kind()) {
272        1
273    } else {
274        0
275    };
276    let mut cursor = node.walk();
277    for child in node.children(&mut cursor) {
278        count += count_complexity(child, spec);
279    }
280    count
281}
282
283fn count_top_level_complexity(
284    root: Node<'_>,
285    _source: &[u8],
286    spec: &LanguageSpec,
287    symbol_ranges: &[(usize, usize)],
288) -> u32 {
289    collect_kind_nodes(root, spec.branch_kinds)
290        .into_iter()
291        .filter(|node| !inside_symbol(node.start_byte(), symbol_ranges))
292        .count() as u32
293}
294
295fn collect_kind_nodes<'a>(node: Node<'a>, kinds: &[&str]) -> Vec<Node<'a>> {
296    let mut out = vec![];
297    collect_kind_nodes_inner(node, kinds, &mut out);
298    out
299}
300
301fn collect_kind_nodes_inner<'a>(node: Node<'a>, kinds: &[&str], out: &mut Vec<Node<'a>>) {
302    if kinds.iter().any(|kind| *kind == node.kind()) {
303        out.push(node);
304    }
305    let mut cursor = node.walk();
306    for child in node.children(&mut cursor) {
307        collect_kind_nodes_inner(child, kinds, out);
308    }
309}
310
/// Reports whether `byte` lies inside any of `ranges`.
///
/// Each range is `(start, end)` with `start` inclusive and `end` exclusive.
fn inside_symbol(byte: usize, ranges: &[(usize, usize)]) -> bool {
    for &(start, end) in ranges {
        if (start..end).contains(&byte) {
            return true;
        }
    }
    false
}
316
/// Extracts the imported module or path names from the raw text of one import
/// statement.
///
/// Two shapes are recognized:
///
/// * **Quoted specifiers.** When the text contains a `"` (checked first) or a
///   `'`, the targets are the quoted strings. A parenthesized group opened
///   before the first quote yields every quoted string in it; otherwise only
///   the first quoted string is taken, so strings in a trailing clause are not
///   mistaken for targets. Empty strings are dropped.
/// * **Bare paths.** Otherwise the text after the first `use`, `import` or
///   `from` keyword is used, which also skips anything written before the
///   keyword such as a visibility modifier. A plain `import` statement may
///   list several comma-separated targets and each one is returned. A leading
///   `static` modifier is skipped in `;`-terminated statements. A brace group
///   is cut off so that `a::b::{X, Y}` yields `a::b`, and stray `{`, `}`, `,`
///   and `;` are trimmed.
///
/// Returns an empty vector when no target can be found.
fn extract_import_targets(raw: &str) -> Vec<String> {
    for quote in ['"', '\''] {
        if let Some((head, tail)) = raw.split_once(quote) {
            // `tail` starts inside the first literal, so the literals are the
            // even-numbered pieces when splitting on the quote character.
            let limit = if head.contains('(') { usize::MAX } else { 1 };
            return tail
                .split(quote)
                .step_by(2)
                .take(limit)
                .filter(|literal| !literal.is_empty())
                .map(str::to_string)
                .collect();
        }
    }

    let words: Vec<&str> = raw.split_whitespace().collect();
    let keyword_at = words
        .iter()
        .position(|word| matches!(*word, "use" | "import" | "from"));
    let rest = match keyword_at {
        Some(index) => &words[index + 1..],
        None => &words[..],
    };
    let body = rest.join(" ");

    // Only a plain `import a, b` lists several targets; in `from x import a, b`
    // and in brace groups the commas separate imported items, not modules.
    let is_plain_import = keyword_at.map(|index| words[index]) == Some("import");
    let clauses: Vec<&str> = if is_plain_import && !body.contains('{') {
        body.split(',').collect()
    } else {
        vec![body.as_str()]
    };
    let terminated = raw.trim_end().ends_with(';');

    clauses
        .into_iter()
        .filter_map(|clause| {
            let mut tokens = clause.split_whitespace();
            let mut token = tokens.next()?;
            // `static` is a modifier only in `;`-terminated statements;
            // elsewhere it is kept, since it may be the module's own name.
            if token == "static" && terminated {
                if let Some(next) = tokens.next() {
                    token = next;
                }
            }
            let path = match token.split_once('{') {
                Some((prefix, _)) if !prefix.is_empty() => prefix.trim_end_matches(':'),
                _ => token,
            };
            let target = path.trim_matches(|c: char| matches!(c, '{' | '}' | ',' | ';'));
            (!target.is_empty()).then(|| target.to_string())
        })
        .collect()
}
342
/// Reduces the raw text of a call or macro invocation to the called name.
///
/// For text ending in `)`, the name is taken from the callee of the
/// *outermost* call, so `a.b(x).c(y)` yields `c`, not `b`. Generic argument
/// lists are ignored (`Vec::<u8>::new()` yields `new`, `it.collect::<T>()`
/// yields `collect`). The name is the last segment after `.` or `:`.
///
/// When the outermost call cannot be determined (unbalanced parentheses, for
/// instance caused by a quoted parenthesis) or yields no name, the text before
/// the first `(` is used instead. Text that does not end in `)` is cut at the
/// first `(`, `[` or `{`, which keeps `vec![...]` down to `vec!`.
///
/// Returns `None` when no non-empty name can be extracted.
fn call_name(raw: &str) -> Option<String> {
    /// Returns the text between the previous top-level `)` and the `(` that
    /// opens the final top-level argument list, or `None` if parentheses or
    /// string literals do not balance.
    fn outermost_callee(text: &str) -> Option<&str> {
        let mut depth = 0usize;
        let mut open_at = None;
        let mut callee_start = 0usize;
        let mut after_last_close = 0usize;
        let mut string_delim: Option<char> = None;
        let mut escaped = false;

        for (index, ch) in text.char_indices() {
            if let Some(delim) = string_delim {
                // Inside a string literal: parentheses do not count.
                if escaped {
                    escaped = false;
                } else if ch == '\\' {
                    escaped = true;
                } else if ch == delim {
                    string_delim = None;
                }
                continue;
            }
            match ch {
                // Single quotes are not treated as string delimiters because
                // they are ambiguous in some languages; a quoted parenthesis
                // then unbalances the count and triggers the caller's fallback.
                '"' | '`' => string_delim = Some(ch),
                '(' => {
                    if depth == 0 {
                        open_at = Some(index);
                        callee_start = after_last_close;
                    }
                    depth += 1;
                }
                ')' => {
                    depth = depth.checked_sub(1)?;
                    if depth == 0 {
                        after_last_close = index + 1;
                    }
                }
                _ => {}
            }
        }

        if depth != 0 || string_delim.is_some() {
            return None;
        }
        open_at.map(|index| &text[callee_start..index])
    }

    /// Drops `<...>` groups from `callee` and returns its last `.`/`:` segment.
    fn last_path_segment(callee: &str) -> Option<String> {
        let mut depth = 0usize;
        let mut path = String::with_capacity(callee.len());
        for ch in callee.chars() {
            match ch {
                '<' => depth += 1,
                '>' => depth = depth.saturating_sub(1),
                _ if depth == 0 => path.push(ch),
                _ => {}
            }
        }
        // Removing a turbofish leaves a dangling `::` behind.
        let path = path.trim().trim_end_matches(':');
        let name = path.rsplit(|c: char| c == '.' || c == ':').next()?.trim();
        if name.is_empty() {
            None
        } else {
            Some(name.to_string())
        }
    }

    let text = raw.trim();
    if text.ends_with(')') {
        outermost_callee(text)
            .and_then(last_path_segment)
            .or_else(|| text.split('(').next().and_then(last_path_segment))
    } else {
        text.split(|c: char| matches!(c, '(' | '[' | '{'))
            .next()
            .and_then(last_path_segment)
    }
}
350}