garbage_code_hunter/language/adapter/
python.rs

1//! PythonAdapter — Python language adapter.
2
3use super::{
4    count_block_ancestors, count_dead_code_with, count_duplicate_imports_with, count_nested_blocks,
5    count_params, is_boolean_or_null, is_common_safe_number, is_inside_declaration,
6    is_repeating_chars, max_scope_depth, FunctionNode, LanguageAdapter,
7};
8use crate::language::Language;
9use crate::treesitter::engine::ParsedFile;
10use crate::treesitter::query::QueryCapture;
11use regex::Regex;
12use std::sync::LazyLock;
13
/// Double-underscore names that are part of Python's documented protocols.
///
/// A function whose name starts and ends with `__` but is not listed here is
/// reported as a non-standard "magic method" by `count_python_from_batch`.
/// The list is matched by exact string comparison.
const STANDARD_DUNDERS: &[&str] = &[
    // Construction, destruction and representation
    "__init__",
    "__new__",
    "__del__",
    "__repr__",
    "__str__",
    "__bytes__",
    "__format__",
    // Rich comparison, hashing and truthiness
    "__lt__",
    "__le__",
    "__eq__",
    "__ne__",
    "__gt__",
    "__ge__",
    "__hash__",
    "__bool__",
    // Attribute access
    "__getattr__",
    "__getattribute__",
    "__setattr__",
    "__delattr__",
    // Callables and containers
    "__call__",
    "__len__",
    "__getitem__",
    "__setitem__",
    "__delitem__",
    "__iter__",
    "__next__",
    "__reversed__",
    "__contains__",
    // Context managers and async protocols
    "__enter__",
    "__exit__",
    "__aenter__",
    "__aexit__",
    "__await__",
    "__aiter__",
    "__anext__",
    // Binary arithmetic
    "__add__",
    "__sub__",
    "__mul__",
    "__truediv__",
    "__floordiv__",
    "__mod__",
    "__divmod__",
    "__pow__",
    "__lshift__",
    "__rshift__",
    "__and__",
    "__xor__",
    "__or__",
    // Reflected arithmetic
    "__radd__",
    "__rsub__",
    "__rmul__",
    "__rtruediv__",
    "__rfloordiv__",
    "__rmod__",
    "__rdivmod__",
    "__rpow__",
    "__rlshift__",
    "__rrshift__",
    "__rand__",
    "__rxor__",
    "__ror__",
    // In-place arithmetic
    "__iadd__",
    "__isub__",
    "__imul__",
    "__itruediv__",
    "__ifloordiv__",
    "__imod__",
    "__ipow__",
    "__ilshift__",
    "__irshift__",
    "__iand__",
    "__ixor__",
    "__ior__",
    // Unary operators and numeric conversion
    "__neg__",
    "__pos__",
    "__abs__",
    "__invert__",
    "__complex__",
    "__int__",
    "__float__",
    "__round__",
    "__index__",
    // Copying, sizing and pickling
    "__copy__",
    "__deepcopy__",
    "__sizeof__",
    "__reduce__",
    "__reduce_ex__",
    "__getnewargs__",
    "__getstate__",
    "__setstate__",
    // Introspection and class machinery
    "__dir__",
    "__class__",
    "__subclasshook__",
    "__init_subclass__",
    "__instancecheck__",
    "__subclasscheck__",
    "__fspath__",
    "__prepare__",
    "__slots__",
    // Matrix multiplication operators
    "__matmul__",
    "__rmatmul__",
    "__imatmul__",
    // Descriptor protocol
    "__get__",
    "__set__",
    "__delete__",
    "__set_name__",
    // Further container, generic-alias and class-creation hooks
    "__missing__",
    "__length_hint__",
    "__class_getitem__",
    "__mro_entries__",
    // Further numeric rounding hooks
    "__trunc__",
    "__floor__",
    "__ceil__",
    // Pickle, dataclass and buffer protocol hooks
    "__getnewargs_ex__",
    "__post_init__",
    "__buffer__",
    "__release_buffer__",
];
115
/// Top-level module names treated as Python standard library.
///
/// The import-order check in `count_python_from_batch` compares the first
/// dotted component of an imported module against this list; anything not
/// listed is treated as third-party. The list is not exhaustive, so it errs on
/// the side of covering the commonly imported modules.
const PYTHON_STDLIB_MODULES: &[&str] = &[
    "os",
    "sys",
    "re",
    "json",
    "math",
    "datetime",
    "time",
    "collections",
    "functools",
    "itertools",
    "typing",
    "pathlib",
    "io",
    "abc",
    "copy",
    "enum",
    "dataclasses",
    "logging",
    "unittest",
    "argparse",
    "subprocess",
    "threading",
    "multiprocessing",
    "socket",
    "http",
    "urllib",
    "email",
    "html",
    "xml",
    "csv",
    "hashlib",
    "hmac",
    "secrets",
    "base64",
    "struct",
    "pickle",
    "shelve",
    "sqlite3",
    "gzip",
    "zipfile",
    "tarfile",
    "shutil",
    "tempfile",
    "glob",
    "fnmatch",
    "contextlib",
    "textwrap",
    "string",
    "operator",
    "bisect",
    "heapq",
    "array",
    "weakref",
    "types",
    "pprint",
    "warnings",
    "traceback",
    "inspect",
    "importlib",
    "pkgutil",
    "pdb",
    "profile",
    "timeit",
    "dis",
    "ast",
    "token",
    "tokenize",
    "keyword",
    "platform",
    "ctypes",
    "concurrent",
    "asyncio",
    "signal",
    "mmap",
    "codecs",
    "locale",
    "gettext",
    "unicodedata",
    "difflib",
    // Frequently imported standard-library modules that were previously
    // misclassified as third-party.
    "__future__",
    "atexit",
    "binascii",
    "builtins",
    "bz2",
    "calendar",
    "cmath",
    "configparser",
    "contextvars",
    "decimal",
    "doctest",
    "errno",
    "fractions",
    "gc",
    "getpass",
    "graphlib",
    "ipaddress",
    "linecache",
    "lzma",
    "mimetypes",
    "numbers",
    "queue",
    "random",
    "sched",
    "select",
    "selectors",
    "shlex",
    "smtplib",
    "socketserver",
    "ssl",
    "stat",
    "statistics",
    "sysconfig",
    "tkinter",
    "tomllib",
    "tracemalloc",
    "uuid",
    "venv",
    "zipimport",
    "zlib",
    "zoneinfo",
];
197
/// Modules for which `from <module> import *` is tolerated.
///
/// `count_python_from_batch` skips a wildcard import when its source line
/// contains the exact text `from <module> import *` for one of these names.
#[rustfmt::skip]
const ACCEPTABLE_WILDCARD_MODULES: &[&str] = &[
    // Animation, numerics and plotting
    "manim", "numpy", "matplotlib",
    // Testing
    "pytest",
    // Machine learning
    "tensorflow", "torch",
    // GUI toolkits
    "tkinter", "PyQt5", "PySide6", "gi.repository",
];
210
/// Query patterns evaluated against Python sources.
///
/// Capture names carry a two-letter prefix that tells the `*_from_batch`
/// methods which check a capture belongs to:
/// `pc_` broad `except` clauses, `py_` function definitions and wildcard
/// imports, `nv_` naming checks, `dp_` debug calls, `ep_` parameter lists and
/// `mn_` numeric literals.
const PYTHON_PATTERNS: &[&str] = &[
    // pc_: every `except` clause; breadth is decided in code.
    r"(except_clause) @pc_clause",
    // py_: function definitions (whole node plus its name).
    r"[(function_definition name: (identifier) @py_name) @py_fn]",
    // nv_: assignments to a single lowercase letter.
    r#"(assignment left: (identifier) @nv_var (#match? @nv_var "^[a-z]$"))"#,
    // nv_: every plain identifier on the left of an assignment.
    r"(assignment left: (identifier) @nv_name)",
    // nv_: class names.
    r"(class_definition name: (identifier) @nv_cls)",
    // dp_: calls to the built-in `print`.
    r#"(call function: (identifier) @dp_fn (#eq? @dp_fn "print"))"#,
    // ep_: parameter lists of function definitions.
    r"(function_definition parameters: (parameters) @ep_params)",
    // mn_: integer and float literals.
    r"[(integer) @mn_num (float) @mn_num]",
    // py_: `from module import *`.
    r"(wildcard_import) @py_wi",
];
231
/// Stateless language adapter for Python sources.
///
/// The type carries no data; all behaviour lives in its `LanguageAdapter`
/// implementation, so it is free to copy and to construct via `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct PythonAdapter;
233
234impl LanguageAdapter for PythonAdapter {
235    fn language(&self) -> Language {
236        Language::Python
237    }
238
239    fn query_patterns(&self) -> &[&str] {
240        PYTHON_PATTERNS
241    }
242
243    fn count_panic_calls(&self, file: &ParsedFile) -> usize {
244        self.count_panic_from_batch(file, &self.batch_captures(file))
245    }
246
247    fn extract_functions(&self, file: &ParsedFile) -> Vec<FunctionNode> {
248        self.extract_functions_from_batch(file, &self.batch_captures(file))
249    }
250
251    fn max_nesting_depth(&self, file: &ParsedFile) -> usize {
252        max_scope_depth(file.root_node(), 0)
253    }
254
255    fn count_naming_violations(&self, file: &ParsedFile) -> usize {
256        self.count_naming_from_batch(file, &self.batch_captures(file))
257    }
258
259    fn count_deeply_nested_blocks(&self, file: &ParsedFile) -> usize {
260        let threshold = 5;
261        let mut count = 0;
262        count_nested_blocks(file.root_node(), 0, threshold, &mut count);
263        count
264    }
265
266    fn count_debug_calls(&self, file: &ParsedFile) -> usize {
267        self.count_debug_from_batch(file, &self.batch_captures(file))
268    }
269
270    fn count_excessive_params(&self, file: &ParsedFile, threshold: usize) -> usize {
271        self.count_excessive_from_batch_with(file, &self.batch_captures(file), threshold)
272    }
273
274    fn count_magic_numbers(&self, file: &ParsedFile) -> usize {
275        self.count_magic_from_batch(file, &self.batch_captures(file))
276    }
277
278    fn count_dead_code(&self, file: &ParsedFile) -> usize {
279        count_dead_code_with(
280            file,
281            &["return", "return None", "raise", "break", "continue"],
282            &["return ", "raise ", "sys.exit(", "exit(", "quit("],
283            "#",
284        )
285    }
286
287    fn count_duplicate_imports(&self, file: &ParsedFile) -> usize {
288        count_duplicate_imports_with(file, &["import ", "from "])
289    }
290
291    fn count_python_issues(&self, file: &ParsedFile) -> usize {
292        self.count_python_from_batch(file, &self.batch_captures(file))
293    }
294
295    // -- _from_batch overrides --
296
297    fn count_panic_from_batch<'a>(
298        &self,
299        _file: &ParsedFile,
300        batch: &[Vec<QueryCapture<'a>>],
301    ) -> usize {
302        let mut count = 0;
303        for m in batch {
304            for c in m {
305                if c.name == "pc_clause" {
306                    if let Some(value) = c.node.child_by_field_name("value") {
307                        if let Ok(vtext) = value.utf8_text(_file.content.as_bytes()) {
308                            if vtext == "BaseException" || vtext == "Exception" {
309                                count += 1;
310                            }
311                        }
312                    } else {
313                        // bare except — no value child
314                        count += 1;
315                    }
316                }
317            }
318        }
319        count
320    }
321
322    fn extract_functions_from_batch<'a>(
323        &self,
324        _file: &ParsedFile,
325        batch: &[Vec<QueryCapture<'a>>],
326    ) -> Vec<FunctionNode> {
327        let mut functions = Vec::new();
328        for m in batch {
329            let has_py = m.iter().any(|c| c.name.starts_with("py_"));
330            if !has_py {
331                continue;
332            }
333            let mut name = String::new();
334            let mut start_line = 0usize;
335            let mut end_line = 0usize;
336            for c in m {
337                match c.name.as_str() {
338                    "py_name" => name = c.text.to_string(),
339                    "py_fn" => {
340                        start_line = c.node.start_position().row + 1;
341                        end_line = c.node.end_position().row + 1;
342                    }
343                    _ => {}
344                }
345            }
346            if !name.is_empty() {
347                let nesting_depth = count_block_ancestors(m);
348                functions.push(FunctionNode {
349                    name,
350                    start_line,
351                    end_line,
352                    nesting_depth,
353                });
354            }
355        }
356        functions
357    }
358
359    fn count_naming_from_batch<'a>(
360        &self,
361        _file: &ParsedFile,
362        batch: &[Vec<QueryCapture<'a>>],
363    ) -> usize {
364        let mut count = 0usize;
365        let idiomatic_single: &[&str] = &["e", "x", "i", "j", "k", "f"];
366
367        static TERRIBLE_RE: LazyLock<Option<Regex>> = LazyLock::new(|| {
368            Regex::new(
369                r"^(data|info|temp|tmp|val|value|thing|stuff|obj|object|manager|handler|helper|util|utils)(\d+)?$",
370            ).ok()
371        });
372        let terrible_re = TERRIBLE_RE.as_ref();
373        let meaningless: &[&str] = &[
374            "foo", "bar", "baz", "qux", "quux", "quuz", "aaa", "bbb", "ccc", "ddd", "eee", "xxx",
375            "yyy", "zzz", "test1", "test2", "test3",
376        ];
377
378        for m in batch {
379            for c in m {
380                match c.name.as_str() {
381                    "nv_var" if !idiomatic_single.contains(&c.text) => {
382                        count += 1;
383                    }
384                    "nv_name" => {
385                        let name = c.text;
386                        let name_lower = name.to_lowercase();
387                        if let Some(re) = terrible_re {
388                            if re.is_match(&name_lower) {
389                                count += 1;
390                                continue;
391                            }
392                        }
393                        if meaningless.contains(&name) || is_repeating_chars(name) {
394                            count += 1;
395                        }
396                    }
397                    "nv_cls" if c.text.chars().next().is_some_and(|ch| ch.is_lowercase()) => {
398                        count += 1;
399                    }
400                    "py_name" => {
401                        // snake_case function names with uppercase = naming violation
402                        if count > 2000 {
403                            continue;
404                        }
405                        let name = c.text;
406                        if name.starts_with("__") || name.starts_with('_') {
407                            continue;
408                        }
409                        if name.chars().any(|ch| ch.is_uppercase()) {
410                            count += 1;
411                        }
412                    }
413                    _ => {}
414                }
415            }
416        }
417        count
418    }
419
420    fn count_debug_from_batch<'a>(
421        &self,
422        _file: &ParsedFile,
423        batch: &[Vec<QueryCapture<'a>>],
424    ) -> usize {
425        batch
426            .iter()
427            .filter(|m| m.iter().any(|c| c.name == "dp_fn"))
428            .count()
429    }
430
431    fn count_excessive_from_batch<'a>(
432        &self,
433        _file: &ParsedFile,
434        batch: &[Vec<QueryCapture<'a>>],
435    ) -> usize {
436        self.count_excessive_from_batch_with(_file, batch, 5)
437    }
438
439    fn count_magic_from_batch<'a>(
440        &self,
441        _file: &ParsedFile,
442        batch: &[Vec<QueryCapture<'a>>],
443    ) -> usize {
444        let mut count = 0;
445        for m in batch {
446            for c in m {
447                if c.name == "mn_num" && !is_inside_declaration(c.node) {
448                    let text = c.text;
449                    if text != "0"
450                        && text != "1"
451                        && !is_common_safe_number(text)
452                        && !is_boolean_or_null(text)
453                    {
454                        count += 1;
455                    }
456                }
457            }
458        }
459        count
460    }
461
462    fn count_python_from_batch<'a>(
463        &self,
464        file: &ParsedFile,
465        batch: &[Vec<QueryCapture<'a>>],
466    ) -> usize {
467        let mut count = 0;
468
469        for m in batch {
470            for c in m {
471                match c.name.as_str() {
472                    // wildcard-import: from module import * (excluding idiomatic libraries)
473                    "py_wi" => {
474                        let line = c.node.start_position().row;
475                        let acceptable = file.content.lines().nth(line).is_some_and(|src_line| {
476                            ACCEPTABLE_WILDCARD_MODULES
477                                .iter()
478                                .any(|m| src_line.contains(&format!("from {} import *", m)))
479                        });
480                        if !acceptable {
481                            count += 1;
482                        }
483                    }
484                    // python-magic-method: non-standard __dunder__ methods
485                    "py_name" => {
486                        let name = c.text;
487                        if name.starts_with("__")
488                            && name.ends_with("__")
489                            && !STANDARD_DUNDERS.contains(&name)
490                        {
491                            count += 1;
492                        }
493                    }
494                    _ => {}
495                }
496            }
497        }
498
499        // compared-to-bool, not-is-none, type-ignore, fstring (text scanning)
500        for line in file.content.lines() {
501            let trimmed = line.trim();
502            if trimmed.starts_with('#') {
503                continue;
504            }
505            if (trimmed.contains("== True") || trimmed.contains("== False"))
506                && !trimmed.contains("is True")
507                && !trimmed.contains("is False")
508            {
509                count += 1;
510            }
511            if trimmed.contains("== None") && !trimmed.contains("is None") {
512                count += 1;
513            }
514            if trimmed.contains("!= None") && !trimmed.contains("is not None") {
515                count += 1;
516            }
517            if trimmed.contains("# type: ignore") {
518                count += 1;
519            }
520            if !trimmed.starts_with('#')
521                && !trimmed.starts_with("\"")
522                && !trimmed.starts_with("'")
523                && trimmed.contains(".format(")
524                && !trimmed.contains("f-string")
525            {
526                count += 1;
527            }
528            if trimmed.matches('%').count() >= 2
529                && !trimmed.contains("'%")
530                && !trimmed.contains("\"%")
531                && (trimmed.contains("%s") || trimmed.contains("%d") || trimmed.contains("%r"))
532            {
533                count += 1;
534            }
535        }
536
537        // python-import-order: stdlib after third-party
538        let mut seen_third_party = false;
539        for line in file.content.lines() {
540            let trimmed = line.trim();
541            if trimmed.is_empty() || trimmed.starts_with('#') {
542                continue;
543            }
544            if !trimmed.starts_with("import ") && !trimmed.starts_with("from ") {
545                if !trimmed.is_empty() {
546                    seen_third_party = false;
547                }
548                continue;
549            }
550            let module = if trimmed.starts_with("from ") {
551                trimmed
552                    .strip_prefix("from ")
553                    .unwrap_or("")
554                    .split_whitespace()
555                    .next()
556                    .unwrap_or("")
557            } else {
558                trimmed
559                    .strip_prefix("import ")
560                    .unwrap_or("")
561                    .split_whitespace()
562                    .next()
563                    .unwrap_or("")
564            };
565            if module.starts_with('.') {
566                continue;
567            }
568            let top_module = module.split('.').next().unwrap_or(module);
569            if !PYTHON_STDLIB_MODULES.contains(&top_module) {
570                seen_third_party = true;
571            } else if seen_third_party {
572                count += 1;
573            }
574        }
575
576        count
577    }
578}
579
580impl PythonAdapter {
581    fn count_excessive_from_batch_with<'a>(
582        &self,
583        _file: &ParsedFile,
584        batch: &[Vec<QueryCapture<'a>>],
585        threshold: usize,
586    ) -> usize {
587        let mut count = 0;
588        for m in batch {
589            for c in m {
590                if c.name == "ep_params" && count_params(c.text) > threshold {
591                    count += 1;
592                }
593            }
594        }
595        count
596    }
597}
598
#[cfg(test)]
mod tests {
    use super::super::parse_code;
    use super::*;

    /// Parses `source` as a Python file, panicking if parsing fails.
    fn parsed(source: &str) -> ParsedFile {
        parse_code(source, "test.py").expect("parse")
    }

    // -- broad except clauses --

    #[test]
    fn test_python_count_panic_calls_bare_except() {
        let source = r#"
try:
    do_something()
except:
    pass
"#;
        let broad = PythonAdapter.count_panic_calls(&parsed(source));
        assert_eq!(broad, 1, "bare except = 1");
    }

    #[test]
    fn test_python_count_panic_calls_base_exception() {
        let source = r#"
try:
    do_something()
except BaseException:
    pass
"#;
        let broad = PythonAdapter.count_panic_calls(&parsed(source));
        assert_eq!(broad, 1, "except BaseException = 1");
    }

    #[test]
    fn test_python_count_panic_calls_specific_ok() {
        let source = r#"
try:
    do_something()
except ValueError:
    pass
"#;
        let broad = PythonAdapter.count_panic_calls(&parsed(source));
        assert_eq!(broad, 0, "specific except = 0");
    }

    // -- naming --

    #[test]
    fn test_python_naming_single_letter() {
        let tree = parsed("a = 1\nb = 2\n");
        assert_eq!(PythonAdapter.count_naming_violations(&tree), 2, "a and b");
    }

    #[test]
    fn test_python_naming_camel_case_fn() {
        let tree = parsed("def getData(): pass\n");
        assert_eq!(
            PythonAdapter.count_naming_violations(&tree),
            1,
            "camelCase fn"
        );
    }

    // -- debug calls --

    #[test]
    fn test_python_debug_print() {
        let source = r#"
print("hello")
print(x)
"#;
        let tree = parsed(source);
        assert_eq!(PythonAdapter.count_debug_calls(&tree), 2, "two print calls");
    }

    #[test]
    fn test_python_debug_clean() {
        let tree = parsed("result = add(1, 2)\n");
        assert_eq!(PythonAdapter.count_debug_calls(&tree), 0, "no debug calls");
    }

    // -- functions and parameters --

    #[test]
    fn test_python_extract_functions() {
        let tree = parsed("def foo(): pass\ndef bar(x): return x\n");
        let found = PythonAdapter.extract_functions(&tree);
        assert_eq!(found.len(), 2, "2 functions");
        assert_eq!(found[0].name, "foo");
        assert_eq!(found[1].name, "bar");
    }

    #[test]
    fn test_python_excessive_params() {
        let tree = parsed("def process(a, b, c, d, e, f): pass\n");
        assert_eq!(PythonAdapter.count_excessive_params(&tree, 5), 1, "6 > 5");
    }

    // -- magic numbers --

    #[test]
    fn test_python_magic_numbers() {
        let tree = parsed("foo(42)\nbar(100)\n");
        assert_eq!(PythonAdapter.count_magic_numbers(&tree), 2);
    }

    #[test]
    fn test_python_magic_numbers_skips_trivial() {
        let tree = parsed("x = 1 + 0\n");
        assert_eq!(
            PythonAdapter.count_magic_numbers(&tree),
            0,
            "0 and 1 skipped"
        );
    }

    // -- dead code --

    #[test]
    fn test_python_dead_code_after_return() {
        let source = r#"
def foo():
    return 42
    print("dead")
"#;
        assert_eq!(PythonAdapter.count_dead_code(&parsed(source)), 1);
    }

    #[test]
    fn test_python_dead_code_after_raise() {
        let source = r#"
def foo():
    raise ValueError("bad")
    x = 1
"#;
        assert_eq!(PythonAdapter.count_dead_code(&parsed(source)), 1);
    }

    // -- imports --

    #[test]
    fn test_python_duplicate_imports() {
        let tree = parsed("import os\nimport sys\nimport os\n");
        assert_eq!(PythonAdapter.count_duplicate_imports(&tree), 1);
    }
}
garbage_code_hunter/language/adapter/python.rs

garbage_code_hunter/language/adapter/
python.rs