Skip to main content

semantic/analysis/
analysis_classify.rs

1// SPDX-License-Identifier: Apache-2.0
2//! Change classification engine — determines whether a file modification is
3//! logic, formatting, imports-only, comments-only, or mixed.
4
5use std::path::Path;
6
7use objects::object::{ChangeImportance, ModificationKind};
8
9use super::analysis_similarity::{SimilarityMethod, compute_similarity};
10use crate::parser::{Language, ParsedFile};
11
12/// Classification result: kind, importance, and confidence.
13pub type ClassificationResult = (ModificationKind, ChangeImportance, f64);
14
15/// Classify what kind of modification happened to a file and its review importance.
16///
17/// This is the core engine behind "147 files changed → 11 things worth reviewing":
18/// it separates noise (formatting, imports, comments) from signal (logic changes).
19///
20/// Returns (kind, importance, confidence) where confidence is 0.0–1.0.
21/// AST-backed classification gets high confidence (0.9+), token-fallback gets medium (0.6–0.7).
22pub fn classify_modification(
23    path: &Path,
24    old_content: &str,
25    new_content: &str,
26) -> (ModificationKind, ChangeImportance) {
27    let (kind, importance, _confidence) =
28        classify_modification_with_confidence(path, old_content, new_content);
29    (kind, importance)
30}
31
32/// Like `classify_modification` but also returns a confidence score.
33pub fn classify_modification_with_confidence(
34    path: &Path,
35    old_content: &str,
36    new_content: &str,
37) -> ClassificationResult {
38    // Identical content should not reach here, but handle it gracefully.
39    if old_content == new_content {
40        return (
41            ModificationKind::WhitespaceOnly,
42            ChangeImportance::Noise,
43            1.0,
44        );
45    }
46
47    // --- Check 1: Token-identical means formatting/whitespace only ---
48    let token_sim = compute_similarity(old_content, new_content, SimilarityMethod::Tokens);
49    if token_sim >= 1.0 {
50        // Tokens are identical but raw text differs → pure formatting/whitespace.
51        // High confidence: token identity is a strong signal.
52        return (
53            ModificationKind::FormattingOnly,
54            ChangeImportance::Noise,
55            0.95,
56        );
57    }
58
59    let language = Language::from_path(path);
60
61    // Try AST-based classification. Falls back to token-level if parsing fails.
62    let old_parsed = ParsedFile::parse(old_content, language);
63    let new_parsed = ParsedFile::parse(new_content, language);
64
65    match (&old_parsed, &new_parsed) {
66        (Some(old_ast), Some(new_ast)) => {
67            classify_with_ast(old_content, new_content, old_ast, new_ast)
68        }
69        _ => {
70            // Parse failed — fall back to token-level heuristics (lower confidence).
71            classify_without_ast(old_content, new_content, token_sim)
72        }
73    }
74}
75
76/// AST-backed classification — the most accurate path.
77fn classify_with_ast(
78    old_content: &str,
79    new_content: &str,
80    old_ast: &ParsedFile,
81    new_ast: &ParsedFile,
82) -> ClassificationResult {
83    let old_funcs = old_ast.extract_functions();
84    let new_funcs = new_ast.extract_functions();
85    let old_imports = old_ast.extract_imports();
86    let new_imports = new_ast.extract_imports();
87
88    let funcs_identical = are_functions_identical(&old_funcs, &new_funcs);
89    let imports_identical = old_imports.len() == new_imports.len()
90        && old_imports
91            .iter()
92            .zip(new_imports.iter())
93            .all(|(a, b)| a.raw == b.raw);
94
95    // Check comments-only: strip comments from both and compare.
96    let old_stripped = strip_comments(old_ast);
97    let new_stripped = strip_comments(new_ast);
98    let non_comment_identical = old_stripped == new_stripped;
99
100    if non_comment_identical {
101        return (ModificationKind::CommentsOnly, ChangeImportance::Low, 0.92);
102    }
103
104    if funcs_identical && !imports_identical {
105        // Functions haven't changed, only imports differ.
106        // Double-check that non-import, non-function code is also identical.
107        let old_body = strip_imports_and_functions(old_ast);
108        let new_body = strip_imports_and_functions(new_ast);
109        if old_body == new_body {
110            return (ModificationKind::ImportsOnly, ChangeImportance::Low, 0.93);
111        }
112    }
113
114    // Check if token-equivalent (formatting only) but AST was parseable.
115    let token_sim = compute_similarity(old_content, new_content, SimilarityMethod::Tokens);
116    if token_sim >= 1.0 {
117        return (
118            ModificationKind::FormattingOnly,
119            ChangeImportance::Noise,
120            0.97,
121        );
122    }
123
124    // If functions changed but formatting also changed, it's mixed.
125    // Heuristic: compute line similarity to detect formatting noise alongside logic.
126    let line_sim = compute_similarity(old_content, new_content, SimilarityMethod::Lines);
127    if token_sim > 0.9 && line_sim < 0.7 {
128        // High token overlap but low line overlap → mostly formatting with some logic.
129        return (ModificationKind::Mixed, ChangeImportance::Medium, 0.75);
130    }
131
132    // Default: real logic change. AST-backed so reasonably confident.
133    (ModificationKind::Logic, ChangeImportance::High, 0.85)
134}
135
136/// Token-level fallback when tree-sitter parsing fails (lower confidence).
137fn classify_without_ast(
138    old_content: &str,
139    new_content: &str,
140    token_sim: f64,
141) -> ClassificationResult {
142    if token_sim >= 1.0 {
143        return (
144            ModificationKind::FormattingOnly,
145            ChangeImportance::Noise,
146            0.9,
147        );
148    }
149
150    let line_sim = compute_similarity(old_content, new_content, SimilarityMethod::Lines);
151
152    // High token similarity + low line similarity → mostly formatting.
153    if token_sim > 0.95 && line_sim < 0.8 {
154        return (
155            ModificationKind::FormattingOnly,
156            ChangeImportance::Noise,
157            0.7,
158        );
159    }
160
161    if token_sim > 0.9 {
162        return (ModificationKind::Mixed, ChangeImportance::Medium, 0.6);
163    }
164
165    // Token-level fallback — lower confidence since we can't parse the AST.
166    (ModificationKind::Logic, ChangeImportance::High, 0.5)
167}
168
169/// Compare function lists for identity (same names, same content).
170fn are_functions_identical(
171    old_funcs: &[crate::parser::FunctionDef],
172    new_funcs: &[crate::parser::FunctionDef],
173) -> bool {
174    if old_funcs.len() != new_funcs.len() {
175        return false;
176    }
177    // Sort by name for stable comparison.
178    let mut old_sorted: Vec<_> = old_funcs.iter().collect();
179    let mut new_sorted: Vec<_> = new_funcs.iter().collect();
180    old_sorted.sort_by_key(|f| &f.name);
181    new_sorted.sort_by_key(|f| &f.name);
182
183    old_sorted
184        .iter()
185        .zip(new_sorted.iter())
186        .all(|(a, b)| a.name == b.name && a.content == b.content)
187}
188
189/// Walk the AST and collect text of all non-comment nodes.
190fn strip_comments(parsed: &ParsedFile) -> String {
191    let mut result = String::new();
192    collect_non_comment_text(parsed.root_node(), &parsed.source, &mut result);
193    result
194}
195
196fn collect_non_comment_text(node: tree_sitter::Node<'_>, source: &str, out: &mut String) {
197    let mut stack = vec![node];
198
199    while let Some(current) = stack.pop() {
200        if is_comment_node(current.kind()) {
201            continue;
202        }
203
204        if current.child_count() == 0 {
205            out.push_str(&source[current.byte_range()]);
206            out.push(' ');
207            continue;
208        }
209
210        let child_count = current.child_count();
211        for index in (0..child_count).rev() {
212            if let Some(child) = current.child(index as u32) {
213                stack.push(child);
214            }
215        }
216    }
217}
218
/// Return `true` when `kind` is one of the node kinds treated as a comment.
///
/// The match is exact and case-sensitive.
fn is_comment_node(kind: &str) -> bool {
    const COMMENT_KINDS: [&str; 5] = [
        "comment",
        "line_comment",
        "block_comment",
        "doc_comment",
        "string_comment",
    ];

    COMMENT_KINDS.iter().any(|&known| known == kind)
}
225
226/// Strip imports and function bodies, return remaining "scaffold" text.
227fn strip_imports_and_functions(parsed: &ParsedFile) -> String {
228    let mut result = String::new();
229    let root = parsed.root_node();
230    for i in 0..root.child_count() {
231        if let Some(child) = root.child(i as u32) {
232            let kind = child.kind();
233            // Skip imports.
234            if matches!(
235                kind,
236                "use_declaration"
237                    | "extern_crate_declaration"
238                    | "import_statement"
239                    | "import_from_statement"
240                    | "import_declaration"
241            ) {
242                continue;
243            }
244            // Skip function definitions.
245            if ParsedFile::is_function_kind(kind, parsed.language) {
246                continue;
247            }
248            result.push_str(&parsed.source[child.byte_range()]);
249            result.push('\n');
250        }
251    }
252    result
253}
254
#[cfg(test)]
mod tests {
    use super::*;

    /// Classify the change `old` → `new` for a file named `file_name` and
    /// assert both the resulting kind and importance.
    #[track_caller]
    fn assert_classified(
        file_name: &str,
        old: &str,
        new: &str,
        expected_kind: ModificationKind,
        expected_importance: ChangeImportance,
    ) {
        let (kind, importance) = classify_modification(Path::new(file_name), old, new);
        assert_eq!(kind, expected_kind);
        assert_eq!(importance, expected_importance);
    }

    #[test]
    fn test_whitespace_only() {
        // Only the indentation of the body differs.
        assert_classified(
            "test.rs",
            "fn foo() {\n    bar();\n}\n",
            "fn foo() {\n        bar();\n}\n",
            ModificationKind::FormattingOnly,
            ChangeImportance::Noise,
        );
    }

    #[test]
    fn test_logic_change() {
        // The returned literal changes.
        assert_classified(
            "test.rs",
            "fn foo() -> i32 {\n    42\n}\n",
            "fn foo() -> i32 {\n    43\n}\n",
            ModificationKind::Logic,
            ChangeImportance::High,
        );
    }

    #[test]
    fn test_comments_only() {
        // Only the leading comment text changes.
        assert_classified(
            "test.rs",
            "// old comment\nfn foo() {\n    bar();\n}\n",
            "// new comment\nfn foo() {\n    bar();\n}\n",
            ModificationKind::CommentsOnly,
            ChangeImportance::Low,
        );
    }

    #[test]
    fn test_imports_only() {
        // A second `use` line is added; the function is untouched.
        assert_classified(
            "test.rs",
            "use std::io;\n\nfn foo() {\n    bar();\n}\n",
            "use std::io;\nuse std::fs;\n\nfn foo() {\n    bar();\n}\n",
            ModificationKind::ImportsOnly,
            ChangeImportance::Low,
        );
    }

    #[test]
    fn test_parse_error_fallback() {
        // Unknown language — falls back to token-level classification, which
        // should report Logic because the tokens differ.
        assert_classified(
            "test.xyz",
            "some content here\n",
            "some content here\nwith additions\n",
            ModificationKind::Logic,
            ChangeImportance::High,
        );
    }

    #[test]
    fn test_formatting_only_unknown_lang() {
        // Same tokens, different spacing, on an unknown language.
        assert_classified(
            "test.xyz",
            "foo bar baz\n",
            "foo  bar  baz\n",
            ModificationKind::FormattingOnly,
            ChangeImportance::Noise,
        );
    }
}
315}