Skip to main content

tldr_cli/commands/remaining/
api_check.rs

1//! API Check command - Detect API misuse patterns
2//!
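//! Analyzes source code for common API misuse patterns. Regex rule tables in
//! this file cover Go, Java, JavaScript, TypeScript, C, C++, Ruby, PHP, Kotlin,
//! and Swift, among others. Examples of the Python checks: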
4//! - Timeout issues (requests.get without timeout)
5//! - Bare except clauses (catching all exceptions)
6//! - Weak crypto (MD5, SHA1 for security purposes)
7//! - Unclosed resources (files not using context managers)
8//!
9//! # Example
10//!
11//! ```bash
12//! tldr api-check src/
13//! tldr api-check src/main.py --category crypto
14//! tldr api-check src/ --severity high --format text
15//! ```
16
17use std::collections::HashMap;
18use std::fs;
19use std::path::{Path, PathBuf};
20
21use anyhow::Result;
22use clap::Args;
23use regex::Regex;
24use tldr_core::walker::walk_project;
25use tldr_core::Language;
26
27use super::error::RemainingError;
28use super::types::{
29    APICheckReport, APICheckSummary, APIRule, MisuseCategory, MisuseFinding, MisuseSeverity,
30};
31
32use crate::output::OutputWriter;
33
34// =============================================================================
35// Constants
36// =============================================================================
37
/// Upper bound on how many files are analyzed for a directory target.
const MAX_DIRECTORY_FILES: u32 = 1_000;

/// Largest file, in bytes, that will be analyzed (10 MiB).
const MAX_FILE_SIZE: u64 = 10 << 20;
43
/// Source languages the API checker tells apart.
///
/// [`rule_applies_to_language`] maps every variant to the rule-id prefix it
/// accepts; the prefix is noted on each variant below.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum ApiLanguage {
    /// Python — rule ids prefixed `PY`.
    Python,
    /// Rust — rule ids prefixed `RS`.
    Rust,
    /// Go — rule ids prefixed `GO`.
    Go,
    /// Java — rule ids prefixed `JV`.
    Java,
    /// JavaScript — rule ids prefixed `JS`.
    JavaScript,
    /// TypeScript — rule ids prefixed `TS`.
    TypeScript,
    /// C — rule ids prefixed `C` (followed directly by a digit).
    C,
    /// C++ — rule ids prefixed `CPP`.
    Cpp,
    /// Ruby — rule ids prefixed `RB`.
    Ruby,
    /// PHP — rule ids prefixed `PH`.
    Php,
    /// Kotlin — rule ids prefixed `KT`.
    Kotlin,
    /// Swift — rule ids prefixed `SW`.
    Swift,
    /// C# — rule ids prefixed `CS`.
    CSharp,
    /// Scala — rule ids prefixed `SC`.
    Scala,
    /// Elixir — rule ids prefixed `EX`.
    Elixir,
    /// Lua — rule ids prefixed `LU` (shared with [`ApiLanguage::Luau`]).
    Lua,
    /// Luau — rule ids prefixed `LU` (shared with [`ApiLanguage::Lua`]).
    Luau,
    /// OCaml — rule ids prefixed `OC`.
    Ocaml,
}
65
66#[derive(Clone, Copy)]
67struct RegexRuleSpec {
68    id: &'static str,
69    name: &'static str,
70    category: MisuseCategory,
71    severity: MisuseSeverity,
72    description: &'static str,
73    correct_usage: &'static str,
74    pattern: &'static str,
75    api_call: &'static str,
76    message: &'static str,
77    fix_suggestion: &'static str,
78}
79
80impl RegexRuleSpec {
81    fn rule(self) -> APIRule {
82        APIRule {
83            id: self.id.to_string(),
84            name: self.name.to_string(),
85            category: self.category,
86            severity: self.severity,
87            description: self.description.to_string(),
88            correct_usage: self.correct_usage.to_string(),
89        }
90    }
91}
92
93/// Per-rule language applicability (api-check-and-patterns-accuracy-v1,
94/// P11.BUG-AGG-6). Each rule id is tied to the language(s) for which the
95/// rule's pattern is meaningful. The scanner gates `check_regex_rule` and
96/// `check_rule` calls through [`rule_applies_to_language`] so a JS rule
97/// (e.g. `JS003 JSON.parse`) cannot fire against a `.cpp` file even if the
98/// rule list were ever cross-wired by mistake. The per-file `detect_language`
99/// dispatch (in [`ApiCheckArgs::run`]) is the primary gate; this is a
100/// defense-in-depth backstop documented declaratively.
101fn rule_applies_to_language(rule_id: &str, language: ApiLanguage) -> bool {
102    // Rule-id naming follows the constants in this file (`C00x`, `CPP00x`,
103    // `JS00x`, etc). Matching is exact prefix + numeric suffix to avoid
104    // confusing siblings: `C` must NOT match `CPP*`/`CS*`, `LU` must NOT
105    // match `LUA*` (no such id exists, but the digit-suffix rule keeps the
106    // matcher robust to future renames).
107    let prefix_lang: &[&str] = match language {
108        ApiLanguage::Python => &["PY"],
109        ApiLanguage::Rust => &["RS"],
110        ApiLanguage::Go => &["GO"],
111        ApiLanguage::Java => &["JV"],
112        ApiLanguage::JavaScript => &["JS"],
113        ApiLanguage::TypeScript => &["TS"],
114        ApiLanguage::C => &["C"],
115        ApiLanguage::Cpp => &["CPP"],
116        ApiLanguage::Ruby => &["RB"],
117        ApiLanguage::Php => &["PH"],
118        ApiLanguage::Kotlin => &["KT"],
119        ApiLanguage::Swift => &["SW"],
120        ApiLanguage::CSharp => &["CS"],
121        ApiLanguage::Scala => &["SC"],
122        ApiLanguage::Elixir => &["EX"],
123        ApiLanguage::Lua | ApiLanguage::Luau => &["LU"],
124        ApiLanguage::Ocaml => &["OC"],
125    };
126    for prefix in prefix_lang {
127        if let Some(rest) = rule_id.strip_prefix(prefix) {
128            // Require digit immediately after prefix so "C" doesn't
129            // match "CPP001"/"CS001".
130            if rest.chars().next().is_some_and(|c| c.is_ascii_digit()) {
131                return true;
132            }
133        }
134    }
135    false
136}
137
138const GO_RULE_SPECS: &[RegexRuleSpec] = &[
139    RegexRuleSpec {
140        id: "GO001",
141        name: "deprecated-ioutil-readfile",
142        category: MisuseCategory::Resources,
143        severity: MisuseSeverity::Low,
144        description: "ioutil.ReadFile is deprecated and encourages unbounded whole-file reads",
145        correct_usage: "Use os.ReadFile or stream with bufio.Scanner/Reader",
146        pattern: r"\bioutil\.ReadFile\s*\(",
147        api_call: "ioutil.ReadFile",
148        message: "ioutil.ReadFile is deprecated and can load unbounded content into memory",
149        fix_suggestion: "Use os.ReadFile for simple reads or bufio.Reader for bounded streaming",
150    },
151    RegexRuleSpec {
152        id: "GO002",
153        name: "http-get-without-timeout",
154        category: MisuseCategory::Parameters,
155        severity: MisuseSeverity::Medium,
156        description: "http.Get uses the default client and provides no call-specific timeout",
157        correct_usage: "Use an http.Client with Timeout or context-aware requests",
158        pattern: r"\bhttp\.Get\s*\(",
159        api_call: "http.Get",
160        message: "http.Get without an explicit timeout can hang indefinitely",
161        fix_suggestion: "Use an http.Client{Timeout: ...} or NewRequestWithContext",
162    },
163    RegexRuleSpec {
164        id: "GO003",
165        name: "exec-command",
166        category: MisuseCategory::Security,
167        severity: MisuseSeverity::High,
168        description: "exec.Command is risky when arguments or executable names come from input",
169        correct_usage: "Prefer direct library APIs or strictly validate allowed commands",
170        pattern: r"\bexec\.Command\s*\(",
171        api_call: "exec.Command",
172        message: "exec.Command can enable command injection when fed user-controlled values",
173        fix_suggestion: "Validate commands against an allowlist and avoid shell-like execution",
174    },
175    RegexRuleSpec {
176        id: "GO004",
177        name: "template-html-cast",
178        category: MisuseCategory::Security,
179        severity: MisuseSeverity::High,
180        description: "template.HTML bypasses html/template escaping guarantees",
181        correct_usage: "Pass plain strings to templates and let html/template escape them",
182        pattern: r"\btemplate\.HTML\s*\(",
183        api_call: "template.HTML",
184        message: "template.HTML disables escaping and can introduce XSS",
185        fix_suggestion: "Remove the cast and rely on html/template auto-escaping",
186    },
187    RegexRuleSpec {
188        id: "GO005",
189        name: "sql-query-without-context",
190        category: MisuseCategory::CallOrder,
191        severity: MisuseSeverity::Medium,
192        description:
193            "sql.DB.Query lacks cancellation and timeout propagation compared with QueryContext",
194        correct_usage: "Use db.QueryContext(ctx, query, args...)",
195        pattern: r"\bsql\.Query\s*\(",
196        api_call: "sql.Query",
197        message: "sql.Query omits context-driven cancellation and timeout handling",
198        fix_suggestion: "Use QueryContext/ExecContext with a bounded context",
199    },
200];
201
202const JAVA_RULE_SPECS: &[RegexRuleSpec] = &[
203    RegexRuleSpec {
204        id: "JV001",
205        name: "string-comparison-with-double-equals",
206        category: MisuseCategory::CallOrder,
207        severity: MisuseSeverity::Medium,
208        description: "Using == on strings compares references instead of values",
209        correct_usage: "Use value.equals(other) or Objects.equals(a, b)",
210        pattern: r#"(?:".*"|\b\w+\b)\s*==\s*(?:".*"|\b\w+\b)"#,
211        api_call: "==",
212        message: "String comparison with == checks reference identity, not value equality",
213        fix_suggestion: "Use .equals(...) or Objects.equals(...) for string values",
214    },
215    RegexRuleSpec {
216        id: "JV002",
217        name: "runtime-exec",
218        category: MisuseCategory::Security,
219        severity: MisuseSeverity::High,
220        description: "Runtime.exec is dangerous with dynamic input and hard to sandbox correctly",
221        correct_usage: "Use structured APIs or a ProcessBuilder with validated arguments",
222        pattern: r"\bRuntime\.getRuntime\(\)\.exec\s*\(",
223        api_call: "Runtime.exec",
224        message: "Runtime.exec is a common command injection footgun",
225        fix_suggestion: "Prefer library APIs or tightly validated ProcessBuilder arguments",
226    },
227    RegexRuleSpec {
228        id: "JV003",
229        name: "objectinputstream-deserialization",
230        category: MisuseCategory::Security,
231        severity: MisuseSeverity::High,
232        description:
233            "ObjectInputStream on untrusted data can trigger unsafe deserialization gadgets",
234        correct_usage: "Use safer formats like JSON with explicit schemas",
235        pattern: r"\bnew\s+ObjectInputStream\s*\(",
236        api_call: "ObjectInputStream",
237        message: "ObjectInputStream enables unsafe native Java deserialization",
238        fix_suggestion: "Replace native object deserialization with a schema-driven format",
239    },
240    RegexRuleSpec {
241        id: "JV004",
242        name: "create-statement",
243        category: MisuseCategory::Security,
244        severity: MisuseSeverity::Medium,
245        description:
246            "createStatement often leads to string-built SQL instead of prepared statements",
247        correct_usage: "Use prepareStatement with placeholders",
248        pattern: r"\bcreateStatement\s*\(",
249        api_call: "createStatement",
250        message: "createStatement encourages dynamic SQL and weak parameter handling",
251        fix_suggestion: "Use prepareStatement with bound parameters",
252    },
253    RegexRuleSpec {
254        id: "JV005",
255        name: "system-gc-call",
256        category: MisuseCategory::Resources,
257        severity: MisuseSeverity::Low,
258        description: "System.gc() is usually a performance smell and not a reliable memory fix",
259        correct_usage: "Remove manual GC triggers and profile allocations instead",
260        pattern: r"\bSystem\.gc\s*\(",
261        api_call: "System.gc",
262        message: "System.gc() is an unreliable manual GC hint and often harms latency",
263        fix_suggestion: "Remove the call and fix the underlying allocation or lifetime issue",
264    },
265];
266
/// Regex-driven API-misuse rules for JavaScript sources (`JS001`–`JS005`).
///
/// Each entry couples rule metadata with the regular expression that flags a
/// call site. The patterns are textual heuristics, not a JavaScript parser.
const JAVASCRIPT_RULE_SPECS: &[RegexRuleSpec] = &[
    RegexRuleSpec {
        id: "JS001",
        name: "loose-equality",
        category: MisuseCategory::CallOrder,
        severity: MisuseSeverity::Medium,
        description: "Loose equality allows coercions that frequently hide correctness bugs",
        correct_usage: "Use === / !== except in deliberately reviewed coercion cases",
        // Requires whitespace on both sides of the operator, which is what
        // keeps `===` / `!==` from matching; unspaced `a==b` is not matched.
        pattern: r"\s==\s|\s!=\s",
        api_call: "==",
        message: "Loose equality can coerce values unexpectedly",
        fix_suggestion: "Use === or !== and handle explicit type conversion",
    },
    RegexRuleSpec {
        id: "JS002",
        name: "parseint-without-radix",
        category: MisuseCategory::Parameters,
        severity: MisuseSeverity::Low,
        description: "parseInt without a radix is ambiguous and less explicit than required",
        correct_usage: "Use parseInt(value, 10)",
        // Matches when no comma appears between `parseInt(` and the first `)`.
        // NOTE(review): a nested call such as `parseInt(f(x), 10)` also
        // matches, because the first `)` closes the inner call.
        pattern: r"\bparseInt\s*\(\s*[^,\)]+\)",
        api_call: "parseInt",
        message: "parseInt called without an explicit radix",
        fix_suggestion: "Pass a radix explicitly, usually parseInt(value, 10)",
    },
    RegexRuleSpec {
        id: "JS003",
        name: "json-parse-without-guard",
        category: MisuseCategory::ErrorHandling,
        severity: MisuseSeverity::Low,
        description: "JSON.parse throws on malformed input and should usually be guarded",
        correct_usage: "Wrap JSON.parse in try/catch when input is not fully trusted",
        // Flags every `JSON.parse(` call; the pattern itself does not check
        // for an enclosing try/catch.
        pattern: r"\bJSON\.parse\s*\(",
        api_call: "JSON.parse",
        message: "JSON.parse can throw and should be guarded for untrusted input",
        fix_suggestion: "Use try/catch or validated parsing for untrusted payloads",
    },
    RegexRuleSpec {
        id: "JS004",
        name: "document-write",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "document.write is legacy, brittle, and can inject unsanitized HTML",
        correct_usage: "Use DOM APIs like textContent/appendChild instead",
        // The optional `ln` covers both `document.write(` and
        // `document.writeln(`.
        pattern: r"\bdocument\.write(?:ln)?\s*\(",
        api_call: "document.write",
        message: "document.write is unsafe and can enable XSS",
        fix_suggestion: "Use safe DOM APIs instead of writing raw HTML strings",
    },
    RegexRuleSpec {
        id: "JS005",
        name: "eval-call",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "eval executes dynamic code and should be avoided",
        correct_usage: "Use structured data parsing or explicit dispatch tables",
        // `\b` keeps identifiers that merely end in `eval` (e.g. `retrieval(`)
        // from matching.
        pattern: r"\beval\s*\(",
        api_call: "eval",
        message: "eval executes dynamic code and creates major security risk",
        fix_suggestion: "Replace eval with data parsing or explicit function dispatch",
    },
];
329
/// Regex-driven API-misuse rules for TypeScript sources (`TS001`–`TS005`).
///
/// The entries mirror `JAVASCRIPT_RULE_SPECS` field for field; only the rule
/// ids differ (`TS` prefix instead of `JS`), so each language gates on its own
/// id prefix. The patterns are textual heuristics, not a TypeScript parser.
const TYPESCRIPT_RULE_SPECS: &[RegexRuleSpec] = &[
    RegexRuleSpec {
        id: "TS001",
        name: "loose-equality",
        category: MisuseCategory::CallOrder,
        severity: MisuseSeverity::Medium,
        description: "Loose equality allows coercions that frequently hide correctness bugs",
        correct_usage: "Use === / !== except in deliberately reviewed coercion cases",
        // Requires whitespace on both sides of the operator, which is what
        // keeps `===` / `!==` from matching; unspaced `a==b` is not matched.
        pattern: r"\s==\s|\s!=\s",
        api_call: "==",
        message: "Loose equality can coerce values unexpectedly",
        fix_suggestion: "Use === or !== and handle explicit type conversion",
    },
    RegexRuleSpec {
        id: "TS002",
        name: "parseint-without-radix",
        category: MisuseCategory::Parameters,
        severity: MisuseSeverity::Low,
        description: "parseInt without a radix is ambiguous and less explicit than required",
        correct_usage: "Use parseInt(value, 10)",
        // Matches when no comma appears between `parseInt(` and the first `)`.
        // NOTE(review): a nested call such as `parseInt(f(x), 10)` also
        // matches, because the first `)` closes the inner call.
        pattern: r"\bparseInt\s*\(\s*[^,\)]+\)",
        api_call: "parseInt",
        message: "parseInt called without an explicit radix",
        fix_suggestion: "Pass a radix explicitly, usually parseInt(value, 10)",
    },
    RegexRuleSpec {
        id: "TS003",
        name: "json-parse-without-guard",
        category: MisuseCategory::ErrorHandling,
        severity: MisuseSeverity::Low,
        description: "JSON.parse throws on malformed input and should usually be guarded",
        correct_usage: "Wrap JSON.parse in try/catch when input is not fully trusted",
        // Flags every `JSON.parse(` call; the pattern itself does not check
        // for an enclosing try/catch.
        pattern: r"\bJSON\.parse\s*\(",
        api_call: "JSON.parse",
        message: "JSON.parse can throw and should be guarded for untrusted input",
        fix_suggestion: "Use try/catch or validated parsing for untrusted payloads",
    },
    RegexRuleSpec {
        id: "TS004",
        name: "document-write",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "document.write is legacy, brittle, and can inject unsanitized HTML",
        correct_usage: "Use DOM APIs like textContent/appendChild instead",
        // The optional `ln` covers both `document.write(` and
        // `document.writeln(`.
        pattern: r"\bdocument\.write(?:ln)?\s*\(",
        api_call: "document.write",
        message: "document.write is unsafe and can enable XSS",
        fix_suggestion: "Use safe DOM APIs instead of writing raw HTML strings",
    },
    RegexRuleSpec {
        id: "TS005",
        name: "eval-call",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "eval executes dynamic code and should be avoided",
        correct_usage: "Use structured data parsing or explicit dispatch tables",
        // `\b` keeps identifiers that merely end in `eval` (e.g. `retrieval(`)
        // from matching.
        pattern: r"\beval\s*\(",
        api_call: "eval",
        message: "eval executes dynamic code and creates major security risk",
        fix_suggestion: "Replace eval with data parsing or explicit function dispatch",
    },
];
392
/// Regex-driven API-misuse rules for C sources (`C001`–`C005`).
///
/// Each entry couples rule metadata with the regular expression that flags a
/// call site. Every pattern starts with `\b`, so a longer identifier that
/// merely ends in the flagged name does not match.
const C_RULE_SPECS: &[RegexRuleSpec] = &[
    RegexRuleSpec {
        id: "C001",
        name: "gets-call",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "gets cannot bound input and has been removed from the standard library",
        correct_usage: "Use fgets with an explicit buffer length",
        // The leading `\b` is what keeps the recommended `fgets(` from
        // matching.
        pattern: r"\bgets\s*\(",
        api_call: "gets",
        message: "gets is inherently unsafe and enables buffer overflows",
        fix_suggestion: "Use fgets(buffer, size, stdin) or another bounded API",
    },
    RegexRuleSpec {
        id: "C002",
        name: "strcpy-call",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "strcpy performs unbounded copies and easily overflows buffers",
        correct_usage: "Use snprintf, strlcpy, or explicit bounds checks",
        pattern: r"\bstrcpy\s*\(",
        api_call: "strcpy",
        message: "strcpy performs an unbounded copy",
        fix_suggestion: "Replace strcpy with a bounded copy strategy",
    },
    RegexRuleSpec {
        id: "C003",
        name: "sprintf-call",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "sprintf writes formatted data without a size bound",
        correct_usage: "Use snprintf with the destination buffer size",
        // The leading `\b` is what keeps `snprintf(` / `vsprintf(` from
        // matching.
        pattern: r"\bsprintf\s*\(",
        api_call: "sprintf",
        message: "sprintf can overflow fixed-size buffers",
        fix_suggestion: "Use snprintf(buffer, size, ...) instead",
    },
    RegexRuleSpec {
        id: "C004",
        name: "scanf-string-without-width",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "scanf with %s and no width limit can overflow the destination buffer",
        correct_usage: "Provide a width specifier or use fgets",
        // Only matches when the format string literal *begins* with `%s`;
        // width-limited forms such as `"%63s"` do not match. `fscanf(` and
        // `sscanf(` are excluded by the leading `\b`.
        pattern: r#"\bscanf\s*\(\s*"%s"#,
        api_call: "scanf",
        message: "scanf(\"%s\") reads unbounded input into a buffer",
        fix_suggestion: "Add a width limit or use fgets plus parsing",
    },
    RegexRuleSpec {
        id: "C005",
        name: "system-call",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "system executes a shell command and is dangerous with dynamic input",
        correct_usage: "Use execve-family APIs with validated arguments where possible",
        pattern: r"\bsystem\s*\(",
        api_call: "system",
        message: "system executes a shell and is a common command injection vector",
        fix_suggestion: "Avoid shell execution or tightly validate the command source",
    },
];
455
456const CPP_RULE_SPECS: &[RegexRuleSpec] = &[
457    RegexRuleSpec {
458        id: "CPP001",
459        name: "strcpy-call",
460        category: MisuseCategory::Security,
461        severity: MisuseSeverity::High,
462        description: "strcpy performs unbounded copies and easily overflows buffers",
463        correct_usage: "Use std::string, snprintf, or another bounded copy strategy",
464        pattern: r"\bstrcpy\s*\(",
465        api_call: "strcpy",
466        message: "strcpy performs an unbounded copy",
467        fix_suggestion: "Use std::string or a bounded copy API instead",
468    },
469    RegexRuleSpec {
470        id: "CPP002",
471        name: "sprintf-call",
472        category: MisuseCategory::Security,
473        severity: MisuseSeverity::High,
474        description: "sprintf writes formatted data without a size bound",
475        correct_usage: "Use snprintf or std::format into a bounded container",
476        pattern: r"\bsprintf\s*\(",
477        api_call: "sprintf",
478        message: "sprintf can overflow fixed-size buffers",
479        fix_suggestion: "Use snprintf or a safer formatting abstraction",
480    },
481    RegexRuleSpec {
482        id: "CPP003",
483        name: "auto-ptr",
484        category: MisuseCategory::Resources,
485        severity: MisuseSeverity::Medium,
486        description: "std::auto_ptr is obsolete and has broken transfer semantics",
487        correct_usage: "Use std::unique_ptr or std::shared_ptr",
488        pattern: r"\bstd::auto_ptr\s*<",
489        api_call: "std::auto_ptr",
490        message: "std::auto_ptr is obsolete and unsafe by modern ownership standards",
491        fix_suggestion: "Replace std::auto_ptr with std::unique_ptr or std::shared_ptr",
492    },
493    RegexRuleSpec {
494        id: "CPP004",
495        name: "raw-new",
496        category: MisuseCategory::Resources,
497        severity: MisuseSeverity::Medium,
498        description: "Raw new often leads to leaks and exception-safety issues",
499        correct_usage: "Use std::make_unique or stack allocation where possible",
500        pattern: r"\bnew\s+\w",
501        api_call: "new",
502        message: "Raw new makes ownership and exception safety harder to reason about",
503        fix_suggestion: "Use std::make_unique, containers, or stack allocation",
504    },
505    RegexRuleSpec {
506        id: "CPP005",
507        name: "system-call",
508        category: MisuseCategory::Security,
509        severity: MisuseSeverity::High,
510        description: "system executes a shell command and is dangerous with dynamic input",
511        correct_usage: "Use direct process APIs with validated arguments when possible",
512        pattern: r"(?:\bstd::)?system\s*\(",
513        api_call: "system",
514        message: "system executes a shell and is a common command injection vector",
515        fix_suggestion: "Avoid shell execution or tightly validate all command components",
516    },
517];
518
/// Regex-driven API-misuse rules for Ruby sources (`RB001`–`RB005`).
///
/// Each entry couples rule metadata with the regular expression that flags a
/// call site. Every pattern requires an opening parenthesis after the method
/// name, so paren-less calls (e.g. `system "ls"`) are not matched.
const RUBY_RULE_SPECS: &[RegexRuleSpec] = &[
    RegexRuleSpec {
        id: "RB001",
        name: "eval-call",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "eval executes dynamic Ruby code and should be avoided",
        correct_usage: "Use explicit dispatch or data parsing instead of dynamic code execution",
        // `\b` keeps `instance_eval(` / `class_eval(` from matching.
        pattern: r"\beval\s*\(",
        api_call: "eval",
        message: "eval executes dynamic code and creates major security risk",
        fix_suggestion: "Replace eval with explicit dispatch or structured parsing",
    },
    RegexRuleSpec {
        id: "RB002",
        name: "dynamic-send",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::Medium,
        description: "send can invoke arbitrary methods when fed untrusted method names",
        correct_usage: "Use public_send on a strict allowlist of method names",
        // The literal `.` directly before `send` means only receiver-qualified
        // calls match, and the recommended `.public_send(` does not.
        pattern: r"\.send\s*\(",
        api_call: "send",
        message: "send can dispatch to unsafe or unexpected methods",
        fix_suggestion: "Use public_send with a reviewed allowlist",
    },
    RegexRuleSpec {
        id: "RB003",
        name: "system-call",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "system executes a shell command and is dangerous with interpolated input",
        correct_usage: "Use array-form process APIs with validated arguments",
        pattern: r"\bsystem\s*\(",
        api_call: "system",
        message: "system is a common command injection footgun",
        fix_suggestion: "Avoid shell execution or pass validated argv-style arguments",
    },
    RegexRuleSpec {
        id: "RB004",
        name: "yaml-load",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "YAML.load can instantiate arbitrary objects from untrusted input",
        correct_usage: "Use YAML.safe_load with permitted classes",
        // `YAML.safe_load(` and `YAML.load_file(` do not match: `load` must
        // directly follow `YAML.` and be followed by optional whitespace and
        // `(`.
        pattern: r"\bYAML\.load\s*\(",
        api_call: "YAML.load",
        message: "YAML.load can deserialize unsafe objects",
        fix_suggestion: "Use YAML.safe_load and restrict allowed classes",
    },
    RegexRuleSpec {
        id: "RB005",
        name: "marshal-load",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "Marshal.load on untrusted data is unsafe deserialization",
        correct_usage: "Use JSON or another safe, schema-checked format",
        pattern: r"\bMarshal\.load\s*\(",
        api_call: "Marshal.load",
        message: "Marshal.load performs unsafe native deserialization",
        fix_suggestion: "Replace Marshal.load with a safer serialization format",
    },
];
581
/// Regex-driven API-misuse rules for PHP sources (`PH001`–`PH005`).
///
/// Each entry couples rule metadata with the regular expression that flags a
/// call site. The patterns are textual heuristics, not a PHP parser.
const PHP_RULE_SPECS: &[RegexRuleSpec] = &[
    RegexRuleSpec {
        id: "PH001",
        name: "deprecated-mysql-functions",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "mysql_* APIs are removed and encourage unsafe query construction",
        correct_usage: "Use PDO or mysqli with prepared statements",
        // The underscore must come directly after `mysql`, so `mysqli_*`
        // functions do not match. The name part is lowercase-only.
        pattern: r"\bmysql_[a-z_]+\s*\(",
        api_call: "mysql_*",
        message: "mysql_* functions are removed and unsafe by modern standards",
        fix_suggestion: "Migrate to PDO or mysqli prepared statements",
    },
    RegexRuleSpec {
        id: "PH002",
        name: "extract-call",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::Medium,
        description: "extract pollutes local scope and can overwrite important variables",
        correct_usage: "Read array keys explicitly instead of splatting them into scope",
        pattern: r"\bextract\s*\(",
        api_call: "extract",
        message: "extract can overwrite local variables and hide data flow",
        fix_suggestion: "Assign required keys explicitly instead of using extract",
    },
    RegexRuleSpec {
        id: "PH003",
        name: "eval-call",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "eval executes dynamic PHP code and should be avoided",
        correct_usage: "Use explicit dispatch or data parsing instead of dynamic code execution",
        pattern: r"\beval\s*\(",
        api_call: "eval",
        message: "eval executes dynamic code and creates major security risk",
        fix_suggestion: "Replace eval with explicit dispatch or structured parsing",
    },
    RegexRuleSpec {
        id: "PH004",
        name: "variable-variables",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::Medium,
        description: "Variable variables make scope mutation hard to reason about",
        correct_usage: "Use associative arrays or explicit variables instead",
        // Two literal `$` followed by an identifier-start character; the
        // brace form `${$name}` is not covered by this pattern.
        pattern: r"\$\$[A-Za-z_]",
        api_call: "$$",
        message: "Variable variables obscure data flow and can enable unsafe access patterns",
        fix_suggestion: "Use an array/map or explicit variable names instead",
    },
    RegexRuleSpec {
        id: "PH005",
        name: "unserialize-call",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "unserialize on untrusted data can trigger object injection chains",
        correct_usage: "Use json_decode or a safer schema-checked format",
        pattern: r"\bunserialize\s*\(",
        api_call: "unserialize",
        message: "unserialize enables unsafe object deserialization",
        fix_suggestion: "Replace unserialize with json_decode or a safe serializer",
    },
];
644
645const KOTLIN_RULE_SPECS: &[RegexRuleSpec] = &[
646    RegexRuleSpec {
647        id: "KT001",
648        name: "force-unwrapped-null",
649        category: MisuseCategory::ErrorHandling,
650        severity: MisuseSeverity::Medium,
651        description: "!! converts nullable values into runtime crashes",
652        correct_usage: "Use safe calls, let, requireNotNull, or explicit branching",
653        pattern: r"!!",
654        api_call: "!!",
655        message: "!! will throw NullPointerException on null values",
656        fix_suggestion: "Use safe calls or explicit null handling instead of !!",
657    },
658    RegexRuleSpec {
659        id: "KT002",
660        name: "lateinit-var",
661        category: MisuseCategory::ErrorHandling,
662        severity: MisuseSeverity::Low,
663        description: "lateinit shifts initialization failures to runtime",
664        correct_usage: "Prefer constructor injection or nullable/state wrappers",
665        pattern: r"\blateinit\s+var\b",
666        api_call: "lateinit",
667        message: "lateinit can fail at runtime if the property is read before initialization",
668        fix_suggestion: "Prefer constructor injection or explicit nullable state",
669    },
670    RegexRuleSpec {
671        id: "KT003",
672        name: "globalscope-launch",
673        category: MisuseCategory::Concurrency,
674        severity: MisuseSeverity::Medium,
675        description: "GlobalScope.launch escapes structured concurrency and leaks work",
676        correct_usage: "Launch from a lifecycle-bound CoroutineScope",
677        pattern: r"\bGlobalScope\.launch\s*\(",
678        api_call: "GlobalScope.launch",
679        message: "GlobalScope.launch detaches work from structured concurrency",
680        fix_suggestion: "Use a lifecycle-bound CoroutineScope instead",
681    },
682    RegexRuleSpec {
683        id: "KT004",
684        name: "runtime-exec",
685        category: MisuseCategory::Security,
686        severity: MisuseSeverity::High,
687        description: "Runtime.exec is dangerous with dynamic input and hard to sandbox correctly",
688        correct_usage: "Use structured APIs or strictly validated ProcessBuilder arguments",
689        pattern: r"\bRuntime\.getRuntime\(\)\.exec\s*\(",
690        api_call: "Runtime.exec",
691        message: "Runtime.exec is a common command injection footgun",
692        fix_suggestion: "Prefer library APIs or tightly validated ProcessBuilder arguments",
693    },
694    RegexRuleSpec {
695        id: "KT005",
696        name: "thread-sleep",
697        category: MisuseCategory::Concurrency,
698        severity: MisuseSeverity::Low,
699        description:
700            "Thread.sleep blocks threads directly and is usually wrong in coroutine-based code",
701        correct_usage: "Use delay(...) in coroutines or higher-level scheduling",
702        pattern: r"\bThread\.sleep\s*\(",
703        api_call: "Thread.sleep",
704        message: "Thread.sleep blocks the current thread directly",
705        fix_suggestion: "Use delay(...) or a proper scheduler instead",
706    },
707];
708
709const SWIFT_RULE_SPECS: &[RegexRuleSpec] = &[
710    RegexRuleSpec {
711        id: "SW001",
712        name: "forced-cast",
713        category: MisuseCategory::ErrorHandling,
714        severity: MisuseSeverity::Medium,
715        description: "as! crashes at runtime when the cast fails",
716        correct_usage: "Use as? with conditional handling",
717        pattern: r"\bas!\b",
718        api_call: "as!",
719        message: "Forced casts crash when the runtime type is different",
720        fix_suggestion: "Use as? and handle the nil case explicitly",
721    },
722    RegexRuleSpec {
723        id: "SW002",
724        name: "forced-try",
725        category: MisuseCategory::ErrorHandling,
726        severity: MisuseSeverity::Medium,
727        description: "try! crashes when the call throws",
728        correct_usage: "Use do/catch or try? with explicit fallback",
729        pattern: r"\btry!\b",
730        api_call: "try!",
731        message: "try! crashes the process on thrown errors",
732        fix_suggestion: "Use do/catch or try? and handle failure explicitly",
733    },
734    RegexRuleSpec {
735        id: "SW003",
736        name: "force-unwrap",
737        category: MisuseCategory::ErrorHandling,
738        severity: MisuseSeverity::Medium,
739        description: "Force unwrapping optionals crashes at runtime on nil",
740        correct_usage: "Use if let, guard let, or nil-coalescing",
741        pattern: r"\b[A-Za-z_][A-Za-z0-9_]*!",
742        api_call: "!",
743        message: "Force unwraps crash when the optional is nil",
744        fix_suggestion: "Use optional binding or nil-coalescing instead of force unwraps",
745    },
746    RegexRuleSpec {
747        id: "SW004",
748        name: "nskeyedunarchiver",
749        category: MisuseCategory::Security,
750        severity: MisuseSeverity::High,
751        description: "Legacy NSKeyedUnarchiver APIs on untrusted data are unsafe",
752        correct_usage: "Use secure decoding APIs with requiresSecureCoding",
753        pattern: r"\bNSKeyedUnarchiver\.unarchiveObject",
754        api_call: "NSKeyedUnarchiver",
755        message: "Legacy unarchiving can deserialize unexpected object graphs",
756        fix_suggestion: "Use secure coding APIs and schema-checked decoding",
757    },
758    RegexRuleSpec {
759        id: "SW005",
760        name: "fatalerror-call",
761        category: MisuseCategory::ErrorHandling,
762        severity: MisuseSeverity::Low,
763        description:
764            "fatalError terminates the process and is risky outside clearly impossible states",
765        correct_usage: "Return/throw recoverable errors where possible",
766        pattern: r"\bfatalError\s*\(",
767        api_call: "fatalError",
768        message: "fatalError terminates the process immediately",
769        fix_suggestion: "Use recoverable error handling unless the state is truly unreachable",
770    },
771];
772
/// Regex-driven misuse rules for C# sources (`CS001`–`CS005`).
///
/// Selected by `regex_rule_specs_for_language` for `ApiLanguage::CSharp`.
/// The patterns are purely textual: they carry no type information, so
/// member-access rules fire on any receiver with a matching member name.
const CSHARP_RULE_SPECS: &[RegexRuleSpec] = &[
    RegexRuleSpec {
        id: "CS001",
        name: "binaryformatter",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "BinaryFormatter is insecure and obsolete for untrusted data",
        correct_usage: "Use System.Text.Json or another safe serializer",
        // Whole-word match on the type name wherever it appears.
        pattern: r"\bBinaryFormatter\b",
        api_call: "BinaryFormatter",
        message: "BinaryFormatter is insecure and should not be used",
        fix_suggestion: "Use System.Text.Json or another safe serializer",
    },
    RegexRuleSpec {
        id: "CS002",
        name: "gc-collect",
        category: MisuseCategory::Resources,
        severity: MisuseSeverity::Low,
        description: "GC.Collect is rarely the right fix and often harms latency",
        correct_usage: "Remove manual GC triggers and profile the real allocation issue",
        pattern: r"\bGC\.Collect\s*\(",
        api_call: "GC.Collect",
        message: "GC.Collect is an unreliable manual GC hint and often harms performance",
        fix_suggestion: "Remove the call and fix the underlying allocation issue",
    },
    RegexRuleSpec {
        id: "CS003",
        name: "task-result",
        category: MisuseCategory::Concurrency,
        severity: MisuseSeverity::Medium,
        description: "Task.Result blocks synchronously and can deadlock async flows",
        correct_usage: "Use await instead of blocking on Task.Result",
        // Matches every `.Result` member access, not only those on tasks.
        pattern: r"\.Result\b",
        api_call: "Task.Result",
        message: "Task.Result blocks synchronously and can deadlock async contexts",
        fix_suggestion: "Use await and keep the async chain asynchronous",
    },
    RegexRuleSpec {
        id: "CS004",
        name: "task-wait",
        category: MisuseCategory::Concurrency,
        severity: MisuseSeverity::Medium,
        description: "Task.Wait blocks synchronously and can deadlock async flows",
        correct_usage: "Use await or WhenAll/WhenAny instead of blocking waits",
        // Matches every `.Wait(` call, whatever the receiver type is.
        pattern: r"\.Wait\s*\(",
        api_call: "Task.Wait",
        message: "Task.Wait blocks synchronously and can deadlock async contexts",
        fix_suggestion: "Use await or asynchronous coordination primitives instead",
    },
    RegexRuleSpec {
        id: "CS005",
        name: "process-start",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "Process.Start is dangerous with untrusted paths or arguments",
        correct_usage: "Use strict allowlists and avoid shell execution semantics",
        pattern: r"\bProcess\.Start\s*\(",
        api_call: "Process.Start",
        message: "Process.Start can enable command injection with untrusted inputs",
        fix_suggestion: "Validate executable and arguments against a strict allowlist",
    },
];
835
/// Regex-driven misuse rules for Scala sources (`SC001`–`SC005`).
///
/// Selected by `regex_rule_specs_for_language` for `ApiLanguage::Scala`.
const SCALA_RULE_SPECS: &[RegexRuleSpec] = &[
    RegexRuleSpec {
        id: "SC001",
        name: "null-usage",
        category: MisuseCategory::ErrorHandling,
        severity: MisuseSeverity::Low,
        description: "null bypasses Scala's stronger option-based absence modeling",
        correct_usage: "Use Option instead of null",
        // Whole-word match on `null`; the pattern itself does not tell
        // code apart from comments or string literals.
        pattern: r"\bnull\b",
        api_call: "null",
        message: "null reintroduces runtime absence bugs into Scala code",
        fix_suggestion: "Use Option and explicit pattern matching instead",
    },
    RegexRuleSpec {
        id: "SC002",
        name: "asinstanceof-cast",
        category: MisuseCategory::ErrorHandling,
        severity: MisuseSeverity::Medium,
        description: "asInstanceOf crashes at runtime when the type assumption is wrong",
        correct_usage: "Use pattern matching or TypeTag/ClassTag-aware APIs",
        // Requires the opening `[` of the type argument.
        pattern: r"\basInstanceOf\[",
        api_call: "asInstanceOf",
        message: "asInstanceOf creates unchecked runtime casts",
        fix_suggestion: "Use pattern matching or safer typed abstractions",
    },
    RegexRuleSpec {
        id: "SC003",
        name: "await-result",
        category: MisuseCategory::Concurrency,
        severity: MisuseSeverity::Medium,
        description: "Await.result blocks threads and can collapse asynchronous throughput",
        correct_usage: "Compose futures asynchronously instead of blocking",
        pattern: r"\bAwait\.result\s*\(",
        api_call: "Await.result",
        message: "Await.result blocks threads and can create deadlocks or latency spikes",
        fix_suggestion: "Use map/flatMap/for-comprehensions instead of blocking",
    },
    RegexRuleSpec {
        id: "SC004",
        name: "mutable-collection",
        category: MisuseCategory::Concurrency,
        severity: MisuseSeverity::Low,
        description: "scala.collection.mutable structures are harder to reason about under concurrency",
        correct_usage: "Prefer immutable collections unless mutation is intentionally scoped",
        // Only the fully qualified path (with trailing dot) is matched.
        pattern: r"\bscala\.collection\.mutable\.",
        api_call: "scala.collection.mutable",
        message: "Mutable collections can hide shared-state bugs",
        fix_suggestion: "Prefer immutable collections or encapsulate mutation carefully",
    },
    RegexRuleSpec {
        id: "SC005",
        name: "sys-process",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "sys.process.Process executes external commands and is dangerous with input-derived values",
        correct_usage: "Use library APIs or validate commands and arguments against an allowlist",
        pattern: r"\bsys\.process\.Process\s*\(",
        api_call: "sys.process.Process",
        message: "sys.process.Process can enable command injection with untrusted input",
        fix_suggestion: "Avoid shell-style execution or strictly validate all command parts",
    },
];
898
/// Regex-driven misuse rules for Elixir sources (`EX001`–`EX005`).
///
/// Selected by `regex_rule_specs_for_language` for `ApiLanguage::Elixir`.
/// All call patterns require an opening parenthesis after the function name.
const ELIXIR_RULE_SPECS: &[RegexRuleSpec] = &[
    RegexRuleSpec {
        id: "EX001",
        name: "string-to-atom",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "String.to_atom on untrusted input can exhaust the VM atom table",
        correct_usage: "Use String.to_existing_atom only for reviewed values or keep strings",
        pattern: r"\bString\.to_atom\s*\(",
        api_call: "String.to_atom",
        message: "String.to_atom can permanently grow the atom table from user input",
        fix_suggestion: "Keep values as strings or use a reviewed to_existing_atom path",
    },
    RegexRuleSpec {
        id: "EX002",
        name: "code-eval-string",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "Code.eval_string executes dynamic Elixir code and should be avoided",
        correct_usage: "Use explicit dispatch or data parsing instead of dynamic evaluation",
        pattern: r"\bCode\.eval_string\s*\(",
        api_call: "Code.eval_string",
        message: "Code.eval_string executes dynamic code and is a major security risk",
        fix_suggestion: "Replace dynamic evaluation with explicit dispatch or parsing",
    },
    RegexRuleSpec {
        id: "EX003",
        name: "binary-to-term",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: ":erlang.binary_to_term on untrusted data is unsafe deserialization",
        correct_usage: "Use safe formats like JSON or term_to_binary only for trusted data",
        // No leading `\b`: the pattern starts at the `:` of the Erlang module atom.
        pattern: r":erlang\.binary_to_term\s*\(",
        api_call: ":erlang.binary_to_term",
        message: ":erlang.binary_to_term can deserialize unsafe terms from untrusted input",
        fix_suggestion: "Use a safer serialization format for external input",
    },
    RegexRuleSpec {
        id: "EX004",
        name: "file-read-bang",
        category: MisuseCategory::ErrorHandling,
        severity: MisuseSeverity::Low,
        description: "Bang file APIs raise instead of returning tagged tuples",
        correct_usage: "Prefer File.read/1 with explicit {:ok, data} / {:error, reason} handling",
        pattern: r"\bFile\.read!\s*\(",
        api_call: "File.read!",
        message: "File.read! raises on failure instead of returning a recoverable error",
        fix_suggestion: "Use File.read/1 and handle the returned tuple explicitly",
    },
    RegexRuleSpec {
        id: "EX005",
        name: "task-await-infinity",
        category: MisuseCategory::Concurrency,
        severity: MisuseSeverity::Medium,
        description: "Task.await with :infinity can stall callers indefinitely",
        correct_usage: "Use bounded timeouts and supervised retry/cancellation behavior",
        // `[^,]+` is the first argument, so it must not contain a comma;
        // `:infinity` has to be the final argument before `)`.
        pattern: r"\bTask\.await\s*\([^,]+,\s*:infinity\s*\)",
        api_call: "Task.await",
        message: "Task.await(..., :infinity) can block forever",
        fix_suggestion: "Use a bounded timeout and explicit failure handling",
    },
];
961
/// Regex-driven misuse rules for Lua sources (`LU001`–`LU005`).
///
/// `regex_rule_specs_for_language` returns this table for both
/// `ApiLanguage::Lua` and `ApiLanguage::Luau`.
const LUA_RULE_SPECS: &[RegexRuleSpec] = &[
    RegexRuleSpec {
        id: "LU001",
        name: "implicit-global",
        category: MisuseCategory::CallOrder,
        severity: MisuseSeverity::Low,
        description: "Assigning without local leaks mutable globals and creates hidden coupling",
        correct_usage: "Declare locals explicitly with local name = ...",
        // Anchored with `^` and no leading whitespace allowance, so only an
        // identifier at the very start of the matched text qualifies.
        // NOTE(review): presumably applied per source line — confirm in the matcher.
        pattern: r"^[A-Za-z_][A-Za-z0-9_]*\s*=",
        api_call: "global assignment",
        message: "Implicit global assignment leaks state outside local scope",
        fix_suggestion: "Prefix the binding with local to keep scope explicit",
    },
    RegexRuleSpec {
        id: "LU002",
        name: "dynamic-load",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "load/loadstring execute dynamic Lua code and should be avoided",
        correct_usage: "Use structured parsing or explicit dispatch instead of dynamic evaluation",
        // `\b` also holds after `.` or `:`, so member calls such as
        // `obj.load(` match as well.
        pattern: r"\b(?:loadstring|load)\s*\(",
        api_call: "load",
        message: "Dynamic code loading executes attacker-controlled Lua if fed untrusted input",
        fix_suggestion: "Replace dynamic evaluation with explicit dispatch or parsing",
    },
    RegexRuleSpec {
        id: "LU003",
        name: "os-execute",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "os.execute shells out and is dangerous with dynamic input",
        correct_usage: "Avoid shell execution or validate every command component",
        pattern: r"\bos\.execute\s*\(",
        api_call: "os.execute",
        message: "os.execute can enable command injection with untrusted input",
        fix_suggestion: "Avoid shelling out or strictly validate the command source",
    },
    RegexRuleSpec {
        id: "LU004",
        name: "io-popen",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "io.popen launches shell commands and should be treated as high risk",
        correct_usage: "Use safer process APIs or validate all command components",
        pattern: r"\bio\.popen\s*\(",
        api_call: "io.popen",
        message: "io.popen can enable command injection with untrusted input",
        fix_suggestion: "Avoid shell execution or validate every command component",
    },
    RegexRuleSpec {
        id: "LU005",
        name: "dofile-loadfile",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::Medium,
        description:
            "dofile/loadfile execute external files and are risky with user-controlled paths",
        correct_usage: "Validate file origins strictly before executing them",
        // Both spellings are reported under the single `api_call` label "dofile".
        pattern: r"\b(?:dofile|loadfile)\s*\(",
        api_call: "dofile",
        message: "Executing external files is dangerous when the path is not fully trusted",
        fix_suggestion: "Avoid dynamic file execution or tightly validate trusted origins",
    },
];
1025
/// Regex-driven misuse rules for OCaml sources (`OC001`–`OC005`).
///
/// Selected by `regex_rule_specs_for_language` for `ApiLanguage::Ocaml`.
/// None of these patterns require a following parenthesis; they match the
/// bare qualified name delimited by word boundaries.
const OCAML_RULE_SPECS: &[RegexRuleSpec] = &[
    RegexRuleSpec {
        id: "OC001",
        name: "marshal-from-string",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "Marshal.from_string on untrusted data is unsafe native deserialization",
        correct_usage: "Use a safe, schema-checked serialization format",
        pattern: r"\bMarshal\.from_string\b",
        api_call: "Marshal.from_string",
        message: "Marshal.from_string can deserialize unsafe values from untrusted input",
        fix_suggestion: "Use a safer serialization format for external input",
    },
    RegexRuleSpec {
        id: "OC002",
        name: "marshal-from-channel",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "Marshal.from_channel on untrusted data is unsafe native deserialization",
        correct_usage: "Use a safe, schema-checked serialization format",
        pattern: r"\bMarshal\.from_channel\b",
        api_call: "Marshal.from_channel",
        message: "Marshal.from_channel can deserialize unsafe values from untrusted input",
        fix_suggestion: "Use a safer serialization format for external input",
    },
    RegexRuleSpec {
        id: "OC003",
        name: "sys-command",
        category: MisuseCategory::Security,
        severity: MisuseSeverity::High,
        description: "Sys.command executes a shell command and is dangerous with dynamic input",
        correct_usage: "Prefer direct library APIs or validate allowed commands strictly",
        pattern: r"\bSys\.command\b",
        api_call: "Sys.command",
        message: "Sys.command can enable command injection with untrusted input",
        fix_suggestion: "Avoid shell execution or tightly validate the command source",
    },
    RegexRuleSpec {
        id: "OC004",
        name: "obj-magic",
        category: MisuseCategory::ErrorHandling,
        severity: MisuseSeverity::High,
        description: "Obj.magic bypasses the type system and can produce memory-unsound behavior",
        correct_usage: "Use typed abstractions or explicit variant handling",
        pattern: r"\bObj\.magic\b",
        api_call: "Obj.magic",
        message: "Obj.magic bypasses type safety and can create undefined behavior",
        fix_suggestion: "Refactor to a typed abstraction instead of coercing with Obj.magic",
    },
    RegexRuleSpec {
        id: "OC005",
        name: "open-in-out",
        category: MisuseCategory::Resources,
        severity: MisuseSeverity::Low,
        description: "open_in/open_out require explicit close calls and are easy to leak",
        correct_usage: "Use In_channel.with_open_* or Out_channel.with_open_* helpers",
        // `_` is a word character, so the trailing `\b` keeps longer names
        // such as `open_in_bin` from matching. Both spellings are reported
        // under the single `api_call` label "open_in".
        pattern: r"\b(?:open_in|open_out)\b",
        api_call: "open_in",
        message: "open_in/open_out require explicit close handling and are easy to leak",
        fix_suggestion: "Use with_open_* helpers to scope the channel lifetime",
    },
];
1088
/// Every `ApiLanguage` variant, listed in the enum's declaration order.
///
/// Exposed through `all_api_languages()`; `ApiCheckArgs::run` iterates it to
/// total the rule count reported as `rules_applied`. A variant added to
/// `ApiLanguage` must be appended here by hand, since nothing derives this list.
const ALL_API_LANGUAGES: &[ApiLanguage] = &[
    ApiLanguage::Python,
    ApiLanguage::Rust,
    ApiLanguage::Go,
    ApiLanguage::Java,
    ApiLanguage::JavaScript,
    ApiLanguage::TypeScript,
    ApiLanguage::C,
    ApiLanguage::Cpp,
    ApiLanguage::Ruby,
    ApiLanguage::Php,
    ApiLanguage::Kotlin,
    ApiLanguage::Swift,
    ApiLanguage::CSharp,
    ApiLanguage::Scala,
    ApiLanguage::Elixir,
    ApiLanguage::Lua,
    ApiLanguage::Luau,
    ApiLanguage::Ocaml,
];
1109
1110// =============================================================================
1111// Rule Definitions
1112// =============================================================================
1113
1114/// Built-in Python API misuse rules
1115fn python_rules() -> Vec<APIRule> {
1116    vec![
1117        APIRule {
1118            id: "PY001".to_string(),
1119            name: "missing-timeout".to_string(),
1120            category: MisuseCategory::Parameters,
1121            severity: MisuseSeverity::High,
1122            description: "requests.get/post/etc without timeout parameter can hang indefinitely"
1123                .to_string(),
1124            correct_usage: "requests.get(url, timeout=30)".to_string(),
1125        },
1126        APIRule {
1127            id: "PY002".to_string(),
1128            name: "bare-except".to_string(),
1129            category: MisuseCategory::ErrorHandling,
1130            severity: MisuseSeverity::Medium,
1131            description: "Bare except clause catches all exceptions including KeyboardInterrupt"
1132                .to_string(),
1133            correct_usage: "except Exception as e:".to_string(),
1134        },
1135        APIRule {
1136            id: "PY003".to_string(),
1137            name: "weak-hash-md5".to_string(),
1138            category: MisuseCategory::Crypto,
1139            severity: MisuseSeverity::High,
1140            description: "MD5 is cryptographically broken, don't use for security purposes"
1141                .to_string(),
1142            correct_usage: "hashlib.sha256() or bcrypt for passwords".to_string(),
1143        },
1144        APIRule {
1145            id: "PY004".to_string(),
1146            name: "weak-hash-sha1".to_string(),
1147            category: MisuseCategory::Crypto,
1148            severity: MisuseSeverity::High,
1149            description: "SHA1 is cryptographically weak, don't use for security purposes"
1150                .to_string(),
1151            correct_usage: "hashlib.sha256() or stronger".to_string(),
1152        },
1153        APIRule {
1154            id: "PY005".to_string(),
1155            name: "unclosed-file".to_string(),
1156            category: MisuseCategory::Resources,
1157            severity: MisuseSeverity::Medium,
1158            description: "File opened without context manager may not be properly closed"
1159                .to_string(),
1160            correct_usage: "with open(path) as f:".to_string(),
1161        },
1162        APIRule {
1163            id: "PY006".to_string(),
1164            name: "insecure-random".to_string(),
1165            category: MisuseCategory::Security,
1166            severity: MisuseSeverity::High,
1167            description: "random module is not cryptographically secure".to_string(),
1168            correct_usage: "secrets.token_bytes() or secrets.token_hex()".to_string(),
1169        },
1170    ]
1171}
1172
1173/// Built-in Rust API misuse rules
1174fn rust_rules() -> Vec<APIRule> {
1175    vec![
1176        APIRule {
1177            id: "RS001".to_string(),
1178            name: "mutex-lock-unwrap".to_string(),
1179            category: MisuseCategory::Concurrency,
1180            severity: MisuseSeverity::Medium,
1181            description: "Mutex::lock().unwrap() can panic and amplify lock contention (CWE-833)"
1182                .to_string(),
1183            correct_usage:
1184                "Prefer try_lock()/error handling or explicit poison recovery instead of unwrap()"
1185                    .to_string(),
1186        },
1187        APIRule {
1188            id: "RS002".to_string(),
1189            name: "file-open-without-context".to_string(),
1190            category: MisuseCategory::ErrorHandling,
1191            severity: MisuseSeverity::Low,
1192            description:
1193                "File::open without contextual error mapping makes failures hard to triage"
1194                    .to_string(),
1195            correct_usage:
1196                "File::open(path).with_context(|| format!(\"opening {}\", path.display()))?"
1197                    .to_string(),
1198        },
1199        APIRule {
1200            id: "RS003".to_string(),
1201            name: "unbounded-with-capacity".to_string(),
1202            category: MisuseCategory::Resources,
1203            severity: MisuseSeverity::High,
1204            description:
1205                "Vec::with_capacity fed from unbounded input can cause memory exhaustion (CWE-770)"
1206                    .to_string(),
1207            correct_usage: "Clamp capacity input before allocation (e.g. min(user_len, MAX))"
1208                .to_string(),
1209        },
1210        APIRule {
1211            id: "RS004".to_string(),
1212            name: "detached-tokio-spawn".to_string(),
1213            category: MisuseCategory::Concurrency,
1214            severity: MisuseSeverity::Medium,
1215            description: "tokio::spawn without retaining JoinHandle risks silent task failures"
1216                .to_string(),
1217            correct_usage: "Store JoinHandle values and await/join them".to_string(),
1218        },
1219        APIRule {
1220            id: "RS005".to_string(),
1221            name: "hashmap-order-dependence".to_string(),
1222            category: MisuseCategory::CallOrder,
1223            severity: MisuseSeverity::Low,
1224            description:
1225                "HashMap iteration order is non-deterministic; relying on it can break logic"
1226                    .to_string(),
1227            correct_usage:
1228                "Collect keys and sort them, or use BTreeMap/IndexMap when stable order is required"
1229                    .to_string(),
1230        },
1231        APIRule {
1232            id: "RS006".to_string(),
1233            name: "clone-in-hot-loop".to_string(),
1234            category: MisuseCategory::Resources,
1235            severity: MisuseSeverity::Low,
1236            description: "clone() inside loop bodies can create avoidable allocation pressure"
1237                .to_string(),
1238            correct_usage: "Borrow or move values instead of cloning in tight loops".to_string(),
1239        },
1240    ]
1241}
1242
1243fn regex_rule_specs_for_language(language: ApiLanguage) -> &'static [RegexRuleSpec] {
1244    match language {
1245        ApiLanguage::Python | ApiLanguage::Rust => &[],
1246        ApiLanguage::Go => GO_RULE_SPECS,
1247        ApiLanguage::Java => JAVA_RULE_SPECS,
1248        ApiLanguage::JavaScript => JAVASCRIPT_RULE_SPECS,
1249        ApiLanguage::TypeScript => TYPESCRIPT_RULE_SPECS,
1250        ApiLanguage::C => C_RULE_SPECS,
1251        ApiLanguage::Cpp => CPP_RULE_SPECS,
1252        ApiLanguage::Ruby => RUBY_RULE_SPECS,
1253        ApiLanguage::Php => PHP_RULE_SPECS,
1254        ApiLanguage::Kotlin => KOTLIN_RULE_SPECS,
1255        ApiLanguage::Swift => SWIFT_RULE_SPECS,
1256        ApiLanguage::CSharp => CSHARP_RULE_SPECS,
1257        ApiLanguage::Scala => SCALA_RULE_SPECS,
1258        ApiLanguage::Elixir => ELIXIR_RULE_SPECS,
1259        ApiLanguage::Lua | ApiLanguage::Luau => LUA_RULE_SPECS,
1260        ApiLanguage::Ocaml => OCAML_RULE_SPECS,
1261    }
1262}
1263
/// Return the full list of languages api-check knows about.
///
/// Thin accessor over the `ALL_API_LANGUAGES` constant; the slice is static,
/// so callers may hold on to it freely.
fn all_api_languages() -> &'static [ApiLanguage] {
    ALL_API_LANGUAGES
}
1267
1268// =============================================================================
1269// CLI Arguments
1270// =============================================================================
1271
// NOTE: the `///` comments on this struct and its fields are picked up by
// clap's derive as help text, so they are user-visible output. Maintainer
// notes below use plain `//` comments to leave `--help` untouched.
/// Detect API misuse patterns in code
///
/// Analyzes code for common API misuse patterns like missing timeouts,
/// bare except clauses, weak crypto usage, and unclosed resources.
///
/// # Example
///
/// ```bash
/// tldr api-check src/
/// tldr api-check src/main.py --category crypto
/// tldr api-check src/ --severity high
/// ```
#[derive(Debug, Args)]
pub struct ApiCheckArgs {
    /// File or directory to analyze (path to file or directory)
    // Positional argument. `run` returns `RemainingError::file_not_found`
    // when this path does not exist.
    #[arg(value_name = "path")]
    pub path: PathBuf,

    /// Filter by misuse category
    // Comma-delimited list (`value_delimiter = ','`); handed to
    // `filter_findings` as an optional slice.
    #[arg(long, value_delimiter = ',')]
    pub category: Option<Vec<MisuseCategory>>,

    /// Filter by minimum severity
    // NOTE(review): the help text says "minimum", yet the flag accepts a
    // comma-delimited list of severities — confirm which semantics
    // `filter_findings` implements.
    #[arg(long, value_delimiter = ',')]
    pub severity: Option<Vec<MisuseSeverity>>,

    /// Output file (optional, stdout if not specified)
    // Short flag is upper-case `-O`. When set, `run` writes the report
    // (text or pretty JSON) to this path with `fs::write`.
    #[arg(long, short = 'O')]
    pub output: Option<PathBuf>,
}
1302
1303impl ApiCheckArgs {
1304    /// Run the api-check command
1305    pub fn run(
1306        &self,
1307        format: crate::output::OutputFormat,
1308        quiet: bool,
1309        global_lang: Option<Language>,
1310    ) -> Result<()> {
1311        let writer = OutputWriter::new(format, quiet);
1312
1313        writer.progress(&format!(
1314            "Checking {} for API misuse patterns...",
1315            self.path.display()
1316        ));
1317
1318        // Validate path exists
1319        if !self.path.exists() {
1320            return Err(RemainingError::file_not_found(&self.path).into());
1321        }
1322
1323        // sibling-resolver-gaps-v1 (P14.AGG14-5): the global `-l/--lang`
1324        // flag (defined in `Cli` and honoured by 30+ sibling commands)
1325        // was silently ignored by `api-check`, so
1326        // `tldr api-check --lang luau /tmp/repos/luau-luau` would scan
1327        // every `.cpp`/`.h`/`.lua`/`.luau`/`.py` file in the tree (89
1328        // findings across hundreds of files). P13.AGG13-10 fixed
1329        // `clones` for the same flag; mirror the pattern here. When the
1330        // global lang maps to a known `ApiLanguage`, restrict the
1331        // `detect_language` dispatch to only that language.
1332        let lang_filter: Option<ApiLanguage> = global_lang.and_then(map_language_to_api_language);
1333
1334        let all_rules_count = all_api_languages()
1335            .iter()
1336            .map(|language| rules_for_language(*language).len() as u32)
1337            .sum();
1338
1339        // Collect files to analyze
1340        let files = collect_files(&self.path)?;
1341        writer.progress(&format!("Found {} files to analyze", files.len()));
1342
1343        // Analyze each file
1344        let mut all_findings: Vec<MisuseFinding> = Vec::new();
1345        let mut files_scanned = 0u32;
1346
1347        for file_path in &files {
1348            let Some(language) = detect_language(file_path) else {
1349                continue;
1350            };
1351            // P14.AGG14-5: if user pinned a specific language, skip files
1352            // whose extension resolves to a different ApiLanguage.
1353            if let Some(want) = lang_filter {
1354                if language != want {
1355                    continue;
1356                }
1357            }
1358            let rules = rules_for_language(language);
1359            match analyze_file(file_path, &rules, language) {
1360                Ok(findings) => {
1361                    all_findings.extend(findings);
1362                    files_scanned += 1;
1363                }
1364                Err(e) => {
1365                    writer.progress(&format!(
1366                        "Warning: Failed to analyze {}: {}",
1367                        file_path.display(),
1368                        e
1369                    ));
1370                }
1371            }
1372        }
1373
1374        // Apply filters
1375        let filtered_findings = filter_findings(
1376            all_findings,
1377            self.category.as_deref(),
1378            self.severity.as_deref(),
1379        );
1380
1381        // Build summary
1382        let summary = build_summary(&filtered_findings, files_scanned);
1383
1384        // Build report
1385        let report = APICheckReport {
1386            findings: filtered_findings,
1387            summary,
1388            rules_applied: all_rules_count,
1389        };
1390
1391        // Write output
1392        if let Some(ref output_path) = self.output {
1393            if writer.is_text() {
1394                let text = format_api_check_text(&report);
1395                fs::write(output_path, text)?;
1396            } else {
1397                let json = serde_json::to_string_pretty(&report)?;
1398                fs::write(output_path, json)?;
1399            }
1400        } else if writer.is_text() {
1401            let text = format_api_check_text(&report);
1402            writer.write_text(&text)?;
1403        } else {
1404            writer.write(&report)?;
1405        }
1406
1407        Ok(())
1408    }
1409}
1410
1411// =============================================================================
1412// File Collection
1413// =============================================================================
1414
1415/// Collect supported source files from a path
1416fn collect_files(path: &Path) -> Result<Vec<PathBuf>> {
1417    let mut files = Vec::new();
1418
1419    if path.is_file() {
1420        if is_supported_file(path) {
1421            files.push(path.to_path_buf());
1422        }
1423    } else if path.is_dir() {
1424        for entry in walk_project(path) {
1425            if files.len() >= MAX_DIRECTORY_FILES as usize {
1426                break;
1427            }
1428
1429            let entry_path = entry.path();
1430            if entry_path.is_file() && is_supported_file(entry_path) {
1431                // Check file size
1432                if let Ok(metadata) = fs::metadata(entry_path) {
1433                    if metadata.len() <= MAX_FILE_SIZE {
1434                        files.push(entry_path.to_path_buf());
1435                    }
1436                }
1437            }
1438        }
1439    }
1440
1441    Ok(files)
1442}
1443
1444/// Check if a path has a supported extension.
1445fn is_supported_file(path: &Path) -> bool {
1446    detect_language(path).is_some()
1447}
1448
1449/// sibling-resolver-gaps-v1 (P14.AGG14-5): map the global `Language`
1450/// enum (used by the top-level `--lang/-l` flag) to the
1451/// `ApiLanguage` variant the api-check engine uses internally. Returns
1452/// `None` for languages api-check has no rule pack for, in which case
1453/// the caller should not apply a filter (preserve current behaviour for
1454/// those langs rather than blocking the run).
1455fn map_language_to_api_language(lang: Language) -> Option<ApiLanguage> {
1456    match lang {
1457        Language::Python => Some(ApiLanguage::Python),
1458        Language::Rust => Some(ApiLanguage::Rust),
1459        Language::Go => Some(ApiLanguage::Go),
1460        Language::Java => Some(ApiLanguage::Java),
1461        Language::JavaScript => Some(ApiLanguage::JavaScript),
1462        Language::TypeScript => Some(ApiLanguage::TypeScript),
1463        Language::C => Some(ApiLanguage::C),
1464        Language::Cpp => Some(ApiLanguage::Cpp),
1465        Language::Ruby => Some(ApiLanguage::Ruby),
1466        Language::Php => Some(ApiLanguage::Php),
1467        Language::Kotlin => Some(ApiLanguage::Kotlin),
1468        Language::Swift => Some(ApiLanguage::Swift),
1469        Language::CSharp => Some(ApiLanguage::CSharp),
1470        Language::Scala => Some(ApiLanguage::Scala),
1471        Language::Elixir => Some(ApiLanguage::Elixir),
1472        Language::Lua => Some(ApiLanguage::Lua),
1473        Language::Luau => Some(ApiLanguage::Luau),
1474        Language::Ocaml => Some(ApiLanguage::Ocaml),
1475    }
1476}
1477
1478pub(crate) fn detect_language(path: &Path) -> Option<ApiLanguage> {
1479    match path.extension().and_then(|e| e.to_str()) {
1480        Some("py") => Some(ApiLanguage::Python),
1481        Some("rs") => Some(ApiLanguage::Rust),
1482        Some("go") => Some(ApiLanguage::Go),
1483        Some("java") => Some(ApiLanguage::Java),
1484        Some("js") | Some("jsx") | Some("mjs") | Some("cjs") => Some(ApiLanguage::JavaScript),
1485        Some("ts") | Some("tsx") => Some(ApiLanguage::TypeScript),
1486        Some("c") | Some("h") => Some(ApiLanguage::C),
1487        Some("cpp") | Some("hpp") | Some("cc") | Some("cxx") => Some(ApiLanguage::Cpp),
1488        Some("rb") => Some(ApiLanguage::Ruby),
1489        Some("php") => Some(ApiLanguage::Php),
1490        Some("kt") | Some("kts") => Some(ApiLanguage::Kotlin),
1491        Some("swift") => Some(ApiLanguage::Swift),
1492        Some("cs") => Some(ApiLanguage::CSharp),
1493        Some("scala") => Some(ApiLanguage::Scala),
1494        Some("ex") | Some("exs") => Some(ApiLanguage::Elixir),
1495        Some("lua") => Some(ApiLanguage::Lua),
1496        Some("luau") => Some(ApiLanguage::Luau),
1497        Some("ml") | Some("mli") => Some(ApiLanguage::Ocaml),
1498        _ => None,
1499    }
1500}
1501
1502pub(crate) fn rules_for_language(language: ApiLanguage) -> Vec<APIRule> {
1503    match language {
1504        ApiLanguage::Python => python_rules(),
1505        ApiLanguage::Rust => rust_rules(),
1506        _ => regex_rule_specs_for_language(language)
1507            .iter()
1508            .copied()
1509            .map(RegexRuleSpec::rule)
1510            .collect(),
1511    }
1512}
1513
1514// =============================================================================
1515// Analysis Engine
1516// =============================================================================
1517
1518/// Per-language needle set used by the file-level fast-path in
1519/// [`analyze_file`].
1520///
1521/// `analyze_file` previously walked every line of every collected file,
1522/// dispatching every rule per line. On large `.cpp`/`.h` files in mixed-
1523/// language repos (e.g. `luau-luau`, where the API-check command sees
1524/// 800+ files including 200 KB+ per-file C++ source) this was O(files ·
1525/// lines · rules). For `tldr api-check /tmp/repos/luau-luau` the BEFORE
1526/// run was ~186 s; almost all of that was scanning files that contained
1527/// none of the rule keywords for their language.
1528///
1529/// fastpath-extend-non-vuln-v1 (extends the M-B1 substring prefilter
1530/// proven in `crates/tldr-core/src/security/vuln.rs::scan_file_vulns`).
1531/// If a file's content contains NONE of the language's rule needles,
1532/// every per-line check is guaranteed to return `None`, so we can skip
1533/// the per-line loop entirely. The needle set is a SUPERSET of the
1534/// per-rule matchers — a file passing the prefilter is still subject to
1535/// the existing per-line precision logic (docstring filtering,
1536/// `find_standalone_call`, etc.), so the fast-path cannot introduce new
1537/// false negatives.
1538///
1539/// The needle list is derived per call from the language's rule
1540/// specs (extracting the substring before the first regex metachar
1541/// from each `pattern` so we use the longest *plain* prefix as the
1542/// needle). For Python and Rust — whose rules use bespoke matchers
1543/// rather than the regex spec table — we hard-code the list.
1544fn language_fastpath_needles(language: ApiLanguage) -> Vec<String> {
1545    match language {
1546        // Built-in Python rules: PY001 requests.*, PY002 except:, PY003 md5,
1547        // PY004 sha1, PY005 open(, PY006 random.*. Use short prefixes so we
1548        // don't tie this list to the precise per-rule call shapes.
1549        ApiLanguage::Python => ["requests.", "except:", "md5", "sha1", "open(", "random."]
1550            .iter()
1551            .map(|s| (*s).to_string())
1552            .collect(),
1553        // Built-in Rust rules: RS001 Mutex, RS002 File::open, RS003
1554        // with_capacity, RS004 tokio::spawn, RS005 HashMap, RS006 clone(
1555        ApiLanguage::Rust => [
1556            "Mutex",
1557            "File::open",
1558            "with_capacity",
1559            "tokio::spawn",
1560            "HashMap",
1561            ".clone(",
1562        ]
1563        .iter()
1564        .map(|s| (*s).to_string())
1565        .collect(),
1566        // Regex-based languages: derive the needles automatically from
1567        // the static rule table by scanning each spec's `pattern` for
1568        // its longest plain-literal run. See `extract_literal_from_regex`
1569        // for the correctness contract (the returned literal is a
1570        // substring of every line that matches the pattern).
1571        _ => regex_rule_specs_for_language(language)
1572            .iter()
1573            .map(|spec| extract_literal_from_regex(spec.pattern))
1574            .collect(),
1575    }
1576}
1577
/// Extract a literal substring from a regex pattern that is guaranteed
/// to appear (verbatim) in any text that matches the regex.
///
/// The pattern is walked once at its top level and the longest run of
/// literal characters is returned (the first such run on ties).
///
/// **Correctness contract**: the returned string is a substring of every
/// input that matches `pattern`. The empty string is always a valid
/// answer and is what the caller interprets as "always admit", i.e. no
/// fast-path for that rule. Whenever the walk meets a construct it cannot
/// reason about soundly it gives up and returns the empty string, trading
/// speed for correctness.
///
/// How each construct is treated:
///
/// * Top-level alternation `|` → empty. A literal would have to occur in
///   every branch, and branch intersection is not implemented.
/// * Groups `( … )`, `(?: … )`, `(?P<n> … )`, `(?<n> … )` → opaque. The
///   current run ends and the whole group is skipped. Literals inside a
///   group are never claimed, because the group may contain alternation
///   (`(strcpy|strcat)`) or be made optional by a following quantifier
///   (`(abc)?`).
/// * Any other `(?…` opener, notably inline flags such as `(?i)` or
///   `(?x)` → empty, since flags can change how the literals that follow
///   must be matched.
/// * Character classes `[ … ]`, including nested classes → skipped; the
///   run ends.
/// * Anchors (`^`, `$`, `\b`, `\B`, `\A`, `\z`, `\Z`), class shorthands
///   (`\s`, `\d`, `\w` and their negations), control-character escapes
///   (`\n`, `\t`, `\r`, `\f`, `\v`, `\a`) and bare `.` → end the run.
/// * Escaped ASCII punctuation (`\.`, `\(`, `\\`, …) → that character is
///   appended to the run.
/// * Any other escape (`\x41`, `\u{…}`, `\p{…}`, …) or a dangling
///   backslash → empty.
/// * Quantifiers (`*`, `+`, `?`, `{n,m}`) → the quantified character is
///   dropped from the run (`foo*` may match just `fo`) and the run ends.
///
/// Runs shorter than two bytes are discarded: a one-character needle
/// occurs in nearly every file and filters nothing.
///
/// Returns a `String` because escaped literals (`\.` → `.`) require
/// building a buffer.
fn extract_literal_from_regex(pattern: &str) -> String {
    // Index just past the `]` that closes the class opening at `start`.
    // Honours a leading `^`, a literal `]` in first position, backslash
    // escapes and nested classes. Returns `chars.len()` when unbalanced.
    fn skip_class(chars: &[char], start: usize) -> usize {
        let n = chars.len();
        let mut i = start + 1;
        if i < n && chars[i] == '^' {
            i += 1;
        }
        if i < n && chars[i] == ']' {
            i += 1;
        }
        while i < n {
            match chars[i] {
                '\\' => i += 2,
                '[' => i = skip_class(chars, i),
                ']' => return i + 1,
                _ => i += 1,
            }
        }
        n
    }

    // Index just past the `)` that closes the group opening at `start`,
    // stepping over escapes, classes and nested groups. Returns
    // `chars.len()` when unbalanced.
    fn skip_group(chars: &[char], start: usize) -> usize {
        let n = chars.len();
        let mut depth = 0usize;
        let mut i = start;
        while i < n {
            match chars[i] {
                '\\' => i += 2,
                '[' => i = skip_class(chars, i),
                '(' => {
                    depth += 1;
                    i += 1;
                }
                ')' => {
                    depth -= 1;
                    i += 1;
                    if depth == 0 {
                        return i;
                    }
                }
                _ => i += 1,
            }
        }
        n
    }

    // Finish the current run, keeping it when strictly longer than the
    // best run seen so far.
    fn close_run(run: &mut String, best: &mut String) {
        if run.len() > best.len() {
            std::mem::swap(run, best);
        }
        run.clear();
    }

    // Walk `char`s, not bytes, so non-ASCII literals survive intact.
    let chars: Vec<char> = pattern.chars().collect();
    let n = chars.len();

    let mut best = String::new();
    let mut run = String::new();
    let mut i = 0usize;

    while i < n {
        let c = chars[i];
        match c {
            // Groups and classes are skipped wholesale, so any `|` seen
            // here is a top-level alternation.
            '|' => return String::new(),
            // Anchors, "any char", and stray closers end the run.
            '^' | '$' | '.' | ')' | ']' => {
                close_run(&mut run, &mut best);
                i += 1;
            }
            '\\' => {
                let Some(&esc) = chars.get(i + 1) else {
                    // Dangling backslash: not a valid pattern.
                    return String::new();
                };
                match esc {
                    // Zero-width assertions, class shorthands and
                    // control-character escapes: none contributes a
                    // literal we want to search for.
                    'b' | 'B' | 'A' | 'z' | 'Z' | 's' | 'S' | 'd' | 'D' | 'w' | 'W' | 'n'
                    | 't' | 'r' | 'f' | 'v' | 'a' => close_run(&mut run, &mut best),
                    // Escaped punctuation is the punctuation itself.
                    _ if esc.is_ascii() && !esc.is_ascii_alphanumeric() => run.push(esc),
                    // `\x..`, `\u..`, `\p..` and anything unknown: the
                    // payload that follows is not literal text.
                    _ => return String::new(),
                }
                i += 2;
            }
            // The quantified atom may repeat zero times, so drop it.
            '*' | '+' | '?' | '{' => {
                run.pop();
                close_run(&mut run, &mut best);
                if c == '{' {
                    // Step to the closing `}` of `{n,m}`.
                    while i < n && chars[i] != '}' {
                        i += 1;
                    }
                }
                i += 1;
            }
            '(' => {
                close_run(&mut run, &mut best);
                if chars.get(i + 1) == Some(&'?') {
                    let plain_group = match chars.get(i + 2) {
                        Some(':') | Some('<') => true,
                        Some('P') => chars.get(i + 3) == Some(&'<'),
                        _ => false,
                    };
                    if !plain_group {
                        // Inline flags (or an unsupported opener) may
                        // alter matching outside the group.
                        return String::new();
                    }
                }
                i = skip_group(&chars, i);
            }
            '[' => {
                close_run(&mut run, &mut best);
                i = skip_class(&chars, i);
            }
            // Plain literal char extends the run.
            _ => {
                run.push(c);
                i += 1;
            }
        }
    }
    close_run(&mut run, &mut best);

    if best.len() < 2 {
        return String::new();
    }
    best
}
1759
1760/// Analyze a single file for API misuse
1761pub(crate) fn analyze_file(
1762    path: &Path,
1763    rules: &[APIRule],
1764    language: ApiLanguage,
1765) -> Result<Vec<MisuseFinding>> {
1766    // fastpath-extend-non-vuln-v1: defer to the central oversize policy
1767    // before reading the file. `analyze_file` reads the full content into
1768    // memory and per-line scans it; without a cap, a 2 MB+ generated
1769    // header (`*.d.ts`, `dom.generated.h`, …) can dominate the run. The
1770    // central policy lives in `tldr_core::fs::oversize::check_size` and
1771    // is shared with `parse_file_with_lang`, `walker::walk_project`'s
1772    // size-aware callers, and `quality::debt`.
1773    if let tldr_core::fs::oversize::SizeCheck::Oversize { .. } =
1774        tldr_core::fs::oversize::check_size(path)
1775    {
1776        return Ok(Vec::new());
1777    }
1778
1779    let content = fs::read_to_string(path)?;
1780
1781    // fastpath-extend-non-vuln-v1: per-file substring fast-path (ports
1782    // M-B1's `function_body_has_taint_pattern` shape from
1783    // `crates/tldr-core/src/security/vuln.rs`). If the file body
1784    // contains NONE of the language's rule needles, no per-line check
1785    // could fire — skip the loop entirely. Correctness contract: the
1786    // needle set is a SUPERSET of every per-rule matcher (see
1787    // `language_fastpath_needles` doc), so a clean prefilter miss is a
1788    // true negative. Documents-only / pure-comment files still benefit
1789    // because their content rarely contains the security-shape needles.
1790    //
1791    // An empty needle in the list means "always admit" (the
1792    // corresponding rule has no useful literal prefix — see
1793    // `extract_literal_needle`); we treat any empty needle as
1794    // unconditional admission to preserve correctness.
1795    let needles = language_fastpath_needles(language);
1796    let any_needle_admits_universally = needles.iter().any(|n| n.is_empty());
1797    let any_needle_hit = needles
1798        .iter()
1799        .any(|n| !n.is_empty() && content.contains(n.as_str()));
1800    if !needles.is_empty() && !any_needle_admits_universally && !any_needle_hit {
1801        return Ok(Vec::new());
1802    }
1803
1804    let file_str = path.display().to_string();
1805    let mut findings = Vec::new();
1806    let mut prev_trimmed = String::new();
1807    let file_has_hashmap = matches!(language, ApiLanguage::Rust) && content.contains("HashMap");
1808
1809    // fastpath-extend-non-vuln-v1: pre-compile regex rules ONCE per file
1810    // (NOT once per (line, rule) pair). Pre-fix, `check_regex_rule`
1811    // called `Regex::new(spec.pattern)` on every (line, rule) match
1812    // inside the per-line loop — for an 800-file mixed-language repo
1813    // (luau-luau: 200KB+ `.cpp` files × ~30 rules each) the regex
1814    // compiler dominated the wall clock (~186 s). Compiling once per
1815    // file collapses this to N_rules per file. We then drive the
1816    // per-line check with the cached `Regex` instead of re-compiling.
1817    let regex_specs: Vec<(&'static RegexRuleSpec, Regex)> =
1818        regex_rule_specs_for_language(language)
1819            .iter()
1820            .filter_map(|spec| Regex::new(spec.pattern).ok().map(|re| (spec, re)))
1821            .collect();
1822
1823    // analysis-precision-v1, BUG-07: for Python, mark lines that are
1824    // function/class signatures or live inside a triple-quoted docstring
1825    // so per-line identifier matchers (PY003 / PY004 / PY006 / ...) skip
1826    // them. Pre-fix `check_sha1_usage` matched the substring `sha1(` on
1827    // `def _lazy_sha1(...)` (a function *signature* mentioning the name)
1828    // and matched `hashlib.sha1` inside a docstring (`"""... ``hashlib.sha1``
1829    // at runtime ..."""`), inflating PY004 from 1 real call site to 3.
1830    let py_line_ctx: Vec<PyLineContext> = if matches!(language, ApiLanguage::Python) {
1831        compute_python_line_contexts(&content)
1832    } else {
1833        Vec::new()
1834    };
1835
1836    // api-check-and-patterns-accuracy-v1 (P11.BUG-AGG-10): for C-family
1837    // languages, mark lines that live inside a `/* ... */` block comment
1838    // so per-line identifier matchers (e.g. `C003 sprintf-call`) skip
1839    // them. Pre-fix the `\bsprintf\s*\(` pattern matched the *literal*
1840    // text `sprintf()` inside a doc-comment block (e.g.
1841    // `/* ... not rely on sprintf() family ... */` in
1842    // `/tmp/repos/c-sds/sds.c:601`), reporting it as a real call site.
1843    // The line-level `is_comment_line` skip only handles `//` line
1844    // comments; block comments need state tracking across lines.
1845    let block_comment_ctx: Vec<bool> = if language_uses_c_block_comments(language) {
1846        compute_c_block_comment_lines(&content)
1847    } else {
1848        Vec::new()
1849    };
1850
1851    for (line_num, line) in content.lines().enumerate() {
1852        let line_number = (line_num + 1) as u32;
1853        let trimmed = line.trim();
1854        // Skip lines that live inside a `/* ... */` block (BUG-AGG-10).
1855        // Indices align with `content.lines()` ordering.
1856        if block_comment_ctx
1857            .get(line_num)
1858            .copied()
1859            .unwrap_or(false)
1860        {
1861            // Still update prev_trimmed so the Rust `previous_is_loop`
1862            // context isn't disrupted by the comment skip.
1863            prev_trimmed = trimmed.to_string();
1864            continue;
1865        }
1866        let rust_ctx = RustLineContext {
1867            file_has_hashmap,
1868            previous_line: prev_trimmed.as_str(),
1869            previous_is_loop: prev_trimmed.starts_with("for ")
1870                || prev_trimmed.starts_with("while "),
1871        };
1872        let py_ctx = py_line_ctx
1873            .get(line_num)
1874            .copied()
1875            .unwrap_or_default();
1876
1877        // Check each rule
1878        for rule in rules {
1879            if let Some(finding) = check_rule(
1880                rule,
1881                &file_str,
1882                line_number,
1883                line,
1884                language,
1885                &rust_ctx,
1886                py_ctx,
1887                &regex_specs,
1888            ) {
1889                findings.push(finding);
1890            }
1891        }
1892        prev_trimmed = trimmed.to_string();
1893    }
1894
1895    Ok(findings)
1896}
1897
/// Facts about one line of a Python file, computed once per file by
/// [`compute_python_line_contexts`] (analysis-precision-v1, BUG-07).
///
/// Both flags mark lines on which an identifier-style API misuse matcher
/// would be looking at a mention rather than a call site. The default
/// value has both flags cleared.
#[derive(Debug, Clone, Copy, Default)]
pub(crate) struct PyLineContext {
    /// The line lies inside a triple-quoted (`"""` or `'''`) string, where
    /// a mention such as ``hashlib.sha1`` is documentation, not a call.
    pub in_docstring: bool,
    /// The line itself opens a `def `, `async def ` or `class ` signature,
    /// so a name like `_lazy_sha1(` on it is being declared, not called.
    pub is_def_or_class_signature: bool,
}
1914
1915/// Whether `language` uses C-style `/* ... */` block comments. Used by
1916/// the api-check scanner to decide whether to compute per-line block-
1917/// comment context (api-check-and-patterns-accuracy-v1, BUG-AGG-10).
1918///
1919/// All listed languages share the C lexical tradition (block comments
1920/// open with `/*` and close with `*/`). Languages that use a different
1921/// block-comment shape (Python triple-quoted docstrings, Lua `--[[ ]]`,
1922/// OCaml `(* *)`, Elixir doc attribute blocks) are handled separately or
1923/// have their own line-level matcher in `is_comment_line`.
1924fn language_uses_c_block_comments(language: ApiLanguage) -> bool {
1925    matches!(
1926        language,
1927        ApiLanguage::Rust
1928            | ApiLanguage::Go
1929            | ApiLanguage::Java
1930            | ApiLanguage::JavaScript
1931            | ApiLanguage::TypeScript
1932            | ApiLanguage::C
1933            | ApiLanguage::Cpp
1934            | ApiLanguage::Kotlin
1935            | ApiLanguage::Swift
1936            | ApiLanguage::CSharp
1937            | ApiLanguage::Scala
1938            | ApiLanguage::Php
1939    )
1940}
1941
/// For each line in `content`, return whether ANY part of the line lives
/// inside a C-style `/* ... */` block comment.
///
/// The result has one entry per line, in `content.lines()` order. A line
/// is flagged when it starts inside a block or opens one anywhere on the
/// line, including a block that opens and closes on the same line. Code
/// sharing a line with a block comment is therefore suppressed too; the
/// whole-line skip is the deliberate choice for avoiding false positives
/// on comment text.
///
/// Block-comment state carries across lines. Quote state does not: it is
/// reset at the start of every line.
///
/// String-literal awareness: double-quoted (`"..."`) and single-quoted
/// (`'..'`) literals are tracked so `/*` or `//` inside a string is
/// ignored. Inside a literal a backslash escapes the next character, so
/// `"say \"/* hi"` stays one string instead of opening a phantom block
/// that would swallow every following line up to the next `*/`.
///
/// Still not handled: raw/verbatim strings, template literals, nested
/// block comments, and literals spanning several lines.
///
/// (api-check-and-patterns-accuracy-v1, P11.BUG-AGG-10)
pub(crate) fn compute_c_block_comment_lines(content: &str) -> Vec<bool> {
    let mut out = Vec::new();
    let mut in_block = false;
    for line in content.lines() {
        // A line that begins inside a block is comment text from its
        // first byte.
        let mut any_in_block = in_block;
        let bytes = line.as_bytes();
        let mut in_dq = false;
        let mut in_sq = false;
        let mut i = 0usize;
        while i < bytes.len() {
            let b = bytes[i];
            let next = bytes.get(i + 1).copied();

            if in_block {
                // Only the closing `*/` matters inside a comment.
                if b == b'*' && next == Some(b'/') {
                    in_block = false;
                    i += 2;
                } else {
                    i += 1;
                }
                continue;
            }

            if in_dq || in_sq {
                if b == b'\\' {
                    // Escaped character (e.g. `\"`): it can neither close
                    // the literal nor start a comment.
                    i += 2;
                    continue;
                }
                if (in_dq && b == b'"') || (in_sq && b == b'\'') {
                    in_dq = false;
                    in_sq = false;
                }
                i += 1;
                continue;
            }

            match (b, next) {
                (b'"', _) => {
                    in_dq = true;
                    i += 1;
                }
                (b'\'', _) => {
                    in_sq = true;
                    i += 1;
                }
                // `//` line comment: the rest of the line cannot change
                // block state.
                (b'/', Some(b'/')) => break,
                // `/*` opens a block.
                (b'/', Some(b'*')) => {
                    in_block = true;
                    any_in_block = true;
                    i += 2;
                }
                _ => i += 1,
            }
        }
        out.push(any_in_block);
    }
    out
}
2024
2025/// Pre-pass: compute [`PyLineContext`] for every line of a Python file.
2026///
2027/// Tracks triple-quote state across lines (handles both `"""` and `'''`,
2028/// including the case where the closing triple lives on the *same* line
2029/// that opens it — that line is treated as fully inside a docstring for
2030/// suppression purposes). The detector is conservative: when in doubt
2031/// (e.g. nested string-literal edge cases the simple scanner cannot
2032/// disambiguate without a real parser), it suppresses the line, since
2033/// suppressing a docstring is cheaper than emitting a false positive.
2034///
2035/// This is **not** a full Python parser — it intentionally does NOT
2036/// understand escapes, raw strings, or f-strings. It handles the
2037/// docstring shape well enough to fix the BUG-07 reproducer (and the
2038/// vast majority of real-world docstrings) without pulling in a
2039/// tree-sitter pass for every line of every Python file.
2040pub(crate) fn compute_python_line_contexts(content: &str) -> Vec<PyLineContext> {
2041    let mut out = Vec::new();
2042    // 0 = not in docstring; 1 = in `"""`; 2 = in `'''`.
2043    let mut state: u8 = 0;
2044    for line in content.lines() {
2045        let stripped = strip_line_comment(line);
2046        let line_starts_in_docstring = state != 0;
2047
2048        // Walk the line looking for triple-quote toggles.
2049        let bytes = stripped.as_bytes();
2050        let mut i = 0;
2051        while i + 2 < bytes.len() {
2052            let triple_dq = bytes[i] == b'"' && bytes[i + 1] == b'"' && bytes[i + 2] == b'"';
2053            let triple_sq = bytes[i] == b'\'' && bytes[i + 1] == b'\'' && bytes[i + 2] == b'\'';
2054            match state {
2055                0 if triple_dq => {
2056                    state = 1;
2057                    i += 3;
2058                    continue;
2059                }
2060                0 if triple_sq => {
2061                    state = 2;
2062                    i += 3;
2063                    continue;
2064                }
2065                1 if triple_dq => {
2066                    state = 0;
2067                    i += 3;
2068                    continue;
2069                }
2070                2 if triple_sq => {
2071                    state = 0;
2072                    i += 3;
2073                    continue;
2074                }
2075                _ => {}
2076            }
2077            i += 1;
2078        }
2079        // also handle bytes 0..2 for the trailing window
2080        let line_ends_in_docstring = state != 0;
2081
2082        // A line is "in_docstring" if it starts inside one OR ends inside one
2083        // (i.e. the line opens/lives inside a triple-quoted block). A line
2084        // that *only contains* the opening triple-quote and content (without
2085        // closing) starts at state=0, ends at state=1 → marked as docstring.
2086        let in_docstring = line_starts_in_docstring || line_ends_in_docstring;
2087
2088        let trimmed = line.trim_start();
2089        let is_def_or_class_signature = trimmed.starts_with("def ")
2090            || trimmed.starts_with("async def ")
2091            || trimmed.starts_with("class ");
2092
2093        out.push(PyLineContext {
2094            in_docstring,
2095            is_def_or_class_signature,
2096        });
2097    }
2098    out
2099}
2100
/// Return `line` with a trailing `#` comment removed (best-effort).
///
/// The line is scanned left to right while tracking whether the position
/// is inside a `'...'` or `"..."` literal; each quote character toggles
/// its own state unless the other kind is open. The first `#` found
/// outside both kinds starts the comment, and everything before it is
/// returned unchanged (trailing whitespace included). Escapes are not
/// interpreted. Used by [`compute_python_line_contexts`] so triple quotes
/// appearing inside line comments are not scanned.
fn strip_line_comment(line: &str) -> String {
    let mut single_open = false;
    let mut double_open = false;
    let mut cut = line.len();

    for (idx, ch) in line.char_indices() {
        match ch {
            '\'' if !double_open => single_open = !single_open,
            '"' if !single_open => double_open = !double_open,
            '#' if !single_open && !double_open => {
                cut = idx;
                break;
            }
            _ => {}
        }
    }

    line[..cut].to_string()
}
2122
/// Per-line context consumed by the Rust-specific checkers.
struct RustLineContext<'src> {
    /// Gate for RS005 (`check_hashmap_order_dependence`): that checker never
    /// reports when this is `false`. Presumably set when the file mentions
    /// `HashMap` — confirm at the construction site.
    file_has_hashmap: bool,
    /// Text of the preceding line; RS005 tests it with `starts_with("for ")`.
    previous_line: &'src str,
    /// When `true`, RS006 (`check_clone_in_hot_loop`) treats the current
    /// line as being in loop context.
    previous_is_loop: bool,
}
2128
2129/// Check a single rule against a line of code
2130fn check_rule(
2131    rule: &APIRule,
2132    file: &str,
2133    line: u32,
2134    line_text: &str,
2135    language: ApiLanguage,
2136    rust_ctx: &RustLineContext<'_>,
2137    py_ctx: PyLineContext,
2138    regex_specs: &[(&'static RegexRuleSpec, Regex)],
2139) -> Option<MisuseFinding> {
2140    let trimmed = line_text.trim();
2141
2142    // api-check-and-patterns-accuracy-v1 (P11.BUG-AGG-6): defense-in-depth
2143    // gate. The primary dispatch (`ApiCheckArgs::run`) already restricts
2144    // each file to its detected language's rule set, but this explicit
2145    // gate ensures that even if a rule list were ever cross-wired (or if
2146    // a future code path bypasses `rules_for_language`), a JS rule like
2147    // `JS003 JSON.parse` cannot fire against a `.cpp` file.
2148    if !rule_applies_to_language(rule.id.as_str(), language) {
2149        return None;
2150    }
2151
2152    // Skip comments
2153    if is_comment_line(trimmed, language) {
2154        return None;
2155    }
2156
2157    // analysis-precision-v1, BUG-07: Python identifier-style rules must
2158    // not match docstring lines or `def`/`class` signature lines (only
2159    // real call sites). Apply the suppression centrally so individual
2160    // checkers don't have to re-implement it.
2161    if matches!(language, ApiLanguage::Python)
2162        && py_rule_skips_docstring_and_signatures(rule.id.as_str())
2163        && (py_ctx.in_docstring || py_ctx.is_def_or_class_signature)
2164    {
2165        return None;
2166    }
2167
2168    match rule.id.as_str() {
2169        "PY001" => check_missing_timeout(rule, file, line, trimmed),
2170        "PY002" => check_bare_except(rule, file, line, trimmed),
2171        "PY003" => check_md5_usage(rule, file, line, trimmed),
2172        "PY004" => check_sha1_usage(rule, file, line, trimmed),
2173        "PY005" => check_unclosed_file(rule, file, line, trimmed),
2174        "PY006" => check_insecure_random(rule, file, line, trimmed),
2175        "RS001" => check_mutex_lock_unwrap(rule, file, line, trimmed),
2176        "RS002" => check_file_open_without_context(rule, file, line, trimmed),
2177        "RS003" => check_unbounded_with_capacity(rule, file, line, trimmed),
2178        "RS004" => check_detached_tokio_spawn(rule, file, line, trimmed),
2179        "RS005" => check_hashmap_order_dependence(rule, file, line, trimmed, rust_ctx),
2180        "RS006" => check_clone_in_hot_loop(rule, file, line, trimmed, rust_ctx),
2181        _ => check_regex_rule(rule, file, line, trimmed, regex_specs),
2182    }
2183}
2184
/// Find an occurrence of `name(` in `line_text` that is *not* immediately
/// preceded by an identifier character (any alphanumeric character or
/// `_`). Returns the byte offset of `name(` if such an occurrence exists.
/// This rules out substring matches against bigger identifiers (e.g.
/// `_lazy_sha1(` for `name = "sha1"`).
///
/// The preceding character is inspected as a full `char`, so a non-ASCII
/// letter in front of the name also counts as part of an identifier, and
/// the scan always advances by whole characters so a non-ASCII `name`
/// cannot cause a slice at a non-boundary offset.
///
/// (analysis-precision-v1, BUG-07)
fn find_standalone_call(line_text: &str, name: &str) -> Option<usize> {
    let needle = format!("{}(", name);
    // After a rejected match, resume one whole character later. The needle
    // always ends in `(`, so it is never empty.
    let step = needle.chars().next().map_or(1, char::len_utf8);

    let mut search_from = 0usize;
    while let Some(rel) = line_text[search_from..].find(&needle) {
        let at = search_from + rel;
        let glued_to_identifier = line_text[..at]
            .chars()
            .next_back()
            .map_or(false, |prev| prev.is_alphanumeric() || prev == '_');
        if !glued_to_identifier {
            return Some(at);
        }
        search_from = at + step;
    }
    None
}
2210
/// Tells whether a Python rule must be suppressed on docstring lines and
/// on `def`/`class` signature lines.
///
/// `true` for the identifier-style rules (their matcher looks for a
/// substring of an API name); `false` for everything else, including
/// rules such as `PY002` bare-except that already need statement syntax
/// (`except:`) to match.
///
/// (analysis-precision-v1, BUG-07)
fn py_rule_skips_docstring_and_signatures(rule_id: &str) -> bool {
    const IDENTIFIER_STYLE_RULES: [&str; 4] = ["PY003", "PY004", "PY005", "PY006"];
    IDENTIFIER_STYLE_RULES.iter().any(|id| *id == rule_id)
}
2221
2222fn is_comment_line(trimmed: &str, language: ApiLanguage) -> bool {
2223    match language {
2224        ApiLanguage::Python | ApiLanguage::Ruby | ApiLanguage::Elixir => trimmed.starts_with('#'),
2225        ApiLanguage::Rust
2226        | ApiLanguage::Go
2227        | ApiLanguage::Java
2228        | ApiLanguage::JavaScript
2229        | ApiLanguage::TypeScript
2230        | ApiLanguage::C
2231        | ApiLanguage::Cpp
2232        | ApiLanguage::Kotlin
2233        | ApiLanguage::Swift
2234        | ApiLanguage::CSharp
2235        | ApiLanguage::Scala => trimmed.starts_with("//"),
2236        ApiLanguage::Php => trimmed.starts_with("//") || trimmed.starts_with('#'),
2237        ApiLanguage::Lua | ApiLanguage::Luau => trimmed.starts_with("--"),
2238        ApiLanguage::Ocaml => trimmed.starts_with("(*"),
2239    }
2240}
2241
2242fn check_regex_rule(
2243    rule: &APIRule,
2244    file: &str,
2245    line: u32,
2246    line_text: &str,
2247    regex_specs: &[(&'static RegexRuleSpec, Regex)],
2248) -> Option<MisuseFinding> {
2249    // fastpath-extend-non-vuln-v1: lookup the pre-compiled regex by rule id
2250    // (compiled ONCE per file in `analyze_file`, not once per line).
2251    let (spec, regex) = regex_specs.iter().find(|(spec, _)| spec.id == rule.id)?;
2252    if !regex.is_match(line_text) {
2253        return None;
2254    }
2255
2256    // language-specific-bugs-v1 (P14.AGG14-15): JV001
2257    // (`string-comparison-with-double-equals`) flags `x == y` as a
2258    // suspected reference-equality bug, which is correct for two String
2259    // operands but a false positive for the canonical Java null check
2260    // `if (x == null) { ... }`. The regex
2261    // `(?:".*"|\b\w+\b)\s*==\s*(?:".*"|\b\w+\b)` matches `null` (a
2262    // bareword) on either side because there is no syntactic null
2263    // literal exclusion. Skip the finding when one side of the `==` /
2264    // `!=` is the bare `null` keyword. Same idiom for C# (CS rules) is
2265    // not currently affected — the C# rule list does not include a
2266    // double-equals-string rule, so this guard is JV001-specific.
2267    if rule.id == "JV001" {
2268        // Conservative substring check: any line whose `==` / `!=` has
2269        // `null` immediately on either side is a null-comparison
2270        // idiom, not a string equality bug.
2271        if line_has_null_comparison(line_text) {
2272            return None;
2273        }
2274    }
2275
2276    let column = regex.find(line_text).map(|m| m.start()).unwrap_or(0) as u32;
2277    Some(MisuseFinding {
2278        file: file.to_string(),
2279        line,
2280        column,
2281        rule: (*rule).clone(),
2282        api_call: spec.api_call.to_string(),
2283        message: spec.message.to_string(),
2284        fix_suggestion: spec.fix_suggestion.to_string(),
2285        code_context: line_text.to_string(),
2286    })
2287}
2288
/// language-specific-bugs-v1 (P14.AGG14-15): true when `line_text` contains
/// a `==` or `!=` operator with the bareword `null` within 16 bytes on at
/// least one side. Used to suppress JV001 false positives on canonical
/// Java null checks.
///
/// The scan works on raw bytes, so a window edge that falls inside a
/// multi-byte character cannot discard the window. Word boundaries are
/// checked against the whole line, so an identifier that merely straddles
/// the window edge (`xnull`) is not mistaken for `null`.
fn line_has_null_comparison(line_text: &str) -> bool {
    /// How many bytes on each side of the operator are inspected.
    const WINDOW: usize = 16;

    let bytes = line_text.as_bytes();
    let mut i = 0;
    while i + 1 < bytes.len() {
        let is_eq = bytes[i] == b'=' && bytes[i + 1] == b'=';
        let is_neq = bytes[i] == b'!' && bytes[i + 1] == b'=';
        if !is_eq && !is_neq {
            i += 1;
            continue;
        }
        // Skip `===` chains (defense in depth — should not appear in Java).
        if is_eq && bytes.get(i + 2) == Some(&b'=') {
            i += 3;
            continue;
        }
        let lo = i.saturating_sub(WINDOW);
        let hi = (i + 2 + WINDOW).min(bytes.len());
        if word_null_in_range(bytes, lo, i) || word_null_in_range(bytes, i + 2, hi) {
            return true;
        }
        i += 2;
    }
    false
}

/// True when `s` contains the bareword `null` with word boundaries
/// (i.e. not preceded or followed by an ASCII alphanumeric / underscore).
fn has_word_null(s: &str) -> bool {
    let bytes = s.as_bytes();
    word_null_in_range(bytes, 0, bytes.len())
}

/// True when a bareword `null` lies entirely inside `bytes[lo..hi]`.
///
/// The characters just before and just after the candidate are read from
/// the full `bytes` slice, not from the sub-range, so the range edges are
/// never treated as word boundaries on their own.
fn word_null_in_range(bytes: &[u8], lo: usize, hi: usize) -> bool {
    let is_word_byte = |b: u8| b.is_ascii_alphanumeric() || b == b'_';
    let hi = hi.min(bytes.len());
    // Candidate starts `s` must satisfy `s + 4 <= hi`.
    (lo..hi.saturating_sub(3)).any(|s| {
        bytes[s..hi].starts_with(b"null")
            && (s == 0 || !is_word_byte(bytes[s - 1]))
            && bytes.get(s + 4).map_or(true, |&next| !is_word_byte(next))
    })
}
2345
2346/// Check for requests without timeout
2347fn check_missing_timeout(
2348    rule: &APIRule,
2349    file: &str,
2350    line: u32,
2351    line_text: &str,
2352) -> Option<MisuseFinding> {
2353    // Look for requests.get/post/put/delete/patch without timeout
2354    let request_patterns = [
2355        "requests.get(",
2356        "requests.post(",
2357        "requests.put(",
2358        "requests.delete(",
2359        "requests.patch(",
2360        "requests.head(",
2361        "requests.options(",
2362    ];
2363
2364    for pattern in &request_patterns {
2365        if line_text.contains(pattern) && !line_text.contains("timeout") {
2366            let column = line_text.find(pattern).unwrap_or(0) as u32;
2367            return Some(MisuseFinding {
2368                file: file.to_string(),
2369                line,
2370                column,
2371                rule: rule.clone(),
2372                api_call: pattern.trim_end_matches('(').to_string(),
2373                message: format!(
2374                    "{} called without timeout parameter",
2375                    pattern.trim_end_matches('(')
2376                ),
2377                fix_suggestion: format!("Add timeout parameter: {}url, timeout=30)", pattern),
2378                code_context: line_text.to_string(),
2379            });
2380        }
2381    }
2382
2383    None
2384}
2385
2386/// Check for bare except clause
2387fn check_bare_except(
2388    rule: &APIRule,
2389    file: &str,
2390    line: u32,
2391    line_text: &str,
2392) -> Option<MisuseFinding> {
2393    // Look for "except:" without an exception type
2394    // Match "except:" but not "except SomeException:" or "except Exception as e:"
2395    if line_text.starts_with("except:") || line_text.contains(" except:") {
2396        let column = line_text.find("except:").unwrap_or(0) as u32;
2397        return Some(MisuseFinding {
2398            file: file.to_string(),
2399            line,
2400            column,
2401            rule: rule.clone(),
2402            api_call: "except".to_string(),
2403            message: "Bare except clause catches all exceptions including KeyboardInterrupt and SystemExit".to_string(),
2404            fix_suggestion: "Use 'except Exception as e:' to catch only program exceptions".to_string(),
2405            code_context: line_text.to_string(),
2406        });
2407    }
2408
2409    None
2410}
2411
2412/// Check for MD5 usage
2413fn check_md5_usage(
2414    rule: &APIRule,
2415    file: &str,
2416    line: u32,
2417    line_text: &str,
2418) -> Option<MisuseFinding> {
2419    // analysis-precision-v1, BUG-07: require either the `hashlib.md5`
2420    // qualified form (with the leading dot, so `hashlib.md5(...)` matches
2421    // but `_my_hashlib.md5_helper` does not) OR a *standalone* `md5(`
2422    // call — i.e. `md5(` not preceded by an identifier character. This
2423    // blocks substring matches against function names that *contain*
2424    // `md5` (e.g. `def compute_md5(...)`).
2425    let has_qualified = line_text.contains("hashlib.md5");
2426    let has_standalone_call = find_standalone_call(line_text, "md5").is_some();
2427    if has_qualified || has_standalone_call {
2428        let column = line_text
2429            .find("hashlib.md5")
2430            .or_else(|| find_standalone_call(line_text, "md5"))
2431            .unwrap_or(0) as u32;
2432        return Some(MisuseFinding {
2433            file: file.to_string(),
2434            line,
2435            column,
2436            rule: rule.clone(),
2437            api_call: "hashlib.md5".to_string(),
2438            message: "MD5 is cryptographically broken and should not be used for security purposes"
2439                .to_string(),
2440            fix_suggestion: "Use hashlib.sha256() or stronger. For passwords, use bcrypt or argon2"
2441                .to_string(),
2442            code_context: line_text.to_string(),
2443        });
2444    }
2445
2446    None
2447}
2448
2449/// Check for SHA1 usage
2450fn check_sha1_usage(
2451    rule: &APIRule,
2452    file: &str,
2453    line: u32,
2454    line_text: &str,
2455) -> Option<MisuseFinding> {
2456    // analysis-precision-v1, BUG-07: require either the `hashlib.sha1`
2457    // qualified form (with the leading dot) OR a *standalone* `sha1(`
2458    // call — i.e. `sha1(` not preceded by an identifier character. This
2459    // blocks substring matches against function names that *contain*
2460    // `sha1` (e.g. `def _lazy_sha1(string)` from flask's
2461    // `src/flask/sessions.py:276`, which was the original BUG-07 FP).
2462    let has_qualified = line_text.contains("hashlib.sha1");
2463    let has_standalone_call = find_standalone_call(line_text, "sha1").is_some();
2464    if has_qualified || has_standalone_call {
2465        let column = line_text
2466            .find("hashlib.sha1")
2467            .or_else(|| find_standalone_call(line_text, "sha1"))
2468            .unwrap_or(0) as u32;
2469        return Some(MisuseFinding {
2470            file: file.to_string(),
2471            line,
2472            column,
2473            rule: rule.clone(),
2474            api_call: "hashlib.sha1".to_string(),
2475            message: "SHA1 is cryptographically weak and should not be used for security purposes"
2476                .to_string(),
2477            fix_suggestion: "Use hashlib.sha256() or stronger".to_string(),
2478            code_context: line_text.to_string(),
2479        });
2480    }
2481
2482    None
2483}
2484
2485/// Check for unclosed file
2486fn check_unclosed_file(
2487    rule: &APIRule,
2488    file: &str,
2489    line: u32,
2490    line_text: &str,
2491) -> Option<MisuseFinding> {
2492    // Look for "open(" that's not after "with "
2493    // This is a simplified check - a proper implementation would use AST
2494    if line_text.contains("open(")
2495        && !line_text.contains("with ")
2496        && !line_text.starts_with("with ")
2497    {
2498        // Check if it's an assignment (f = open(...))
2499        if line_text.contains("= open(") || line_text.contains("=open(") {
2500            let column = line_text.find("open(").unwrap_or(0) as u32;
2501            return Some(MisuseFinding {
2502                file: file.to_string(),
2503                line,
2504                column,
2505                rule: rule.clone(),
2506                api_call: "open".to_string(),
2507                message: "File opened without context manager may not be properly closed"
2508                    .to_string(),
2509                fix_suggestion: "Use 'with open(path) as f:' to ensure file is closed".to_string(),
2510                code_context: line_text.to_string(),
2511            });
2512        }
2513    }
2514
2515    None
2516}
2517
2518/// Check for insecure random usage
2519fn check_insecure_random(
2520    rule: &APIRule,
2521    file: &str,
2522    line: u32,
2523    line_text: &str,
2524) -> Option<MisuseFinding> {
2525    // Look for random.* usage that might be for security
2526    let insecure_patterns = [
2527        "random.randint(",
2528        "random.random(",
2529        "random.choice(",
2530        "random.randrange(",
2531    ];
2532
2533    // Only flag if it looks like it's being used for security
2534    // (contains words like token, secret, password, key)
2535    let security_indicators = ["token", "secret", "password", "key", "auth", "session"];
2536
2537    for pattern in &insecure_patterns {
2538        if line_text.contains(pattern) {
2539            // Check if the line or nearby context suggests security use
2540            let line_lower = line_text.to_lowercase();
2541            for indicator in &security_indicators {
2542                if line_lower.contains(indicator) {
2543                    let column = line_text.find(pattern).unwrap_or(0) as u32;
2544                    return Some(MisuseFinding {
2545                        file: file.to_string(),
2546                        line,
2547                        column,
2548                        rule: rule.clone(),
2549                        api_call: pattern.trim_end_matches('(').to_string(),
2550                        message: format!(
2551                            "{} is not cryptographically secure, don't use for security purposes",
2552                            pattern.trim_end_matches('(')
2553                        ),
2554                        fix_suggestion:
2555                            "Use secrets.token_bytes() or secrets.token_hex() for security"
2556                                .to_string(),
2557                        code_context: line_text.to_string(),
2558                    });
2559                }
2560            }
2561        }
2562    }
2563
2564    None
2565}
2566
2567/// Check for poisoned mutex lock unwrap.
2568fn check_mutex_lock_unwrap(
2569    rule: &APIRule,
2570    file: &str,
2571    line: u32,
2572    line_text: &str,
2573) -> Option<MisuseFinding> {
2574    if line_text.contains(".lock().unwrap()") {
2575        let column = line_text.find(".lock().unwrap()").unwrap_or(0) as u32;
2576        return Some(MisuseFinding {
2577            file: file.to_string(),
2578            line,
2579            column,
2580            rule: rule.clone(),
2581            api_call: "Mutex::lock".to_string(),
2582            message:
2583                "Mutex::lock().unwrap() can panic on poisoned locks and hide deadlock behavior"
2584                    .to_string(),
2585            fix_suggestion:
2586                "Handle lock errors explicitly (match/if let), or use try_lock with backoff"
2587                    .to_string(),
2588            code_context: line_text.to_string(),
2589        });
2590    }
2591    None
2592}
2593
2594/// Check for File::open without context propagation.
2595fn check_file_open_without_context(
2596    rule: &APIRule,
2597    file: &str,
2598    line: u32,
2599    line_text: &str,
2600) -> Option<MisuseFinding> {
2601    if line_text.contains("File::open(")
2602        && !line_text.contains(".context(")
2603        && !line_text.contains(".with_context(")
2604        && !line_text.contains("map_err(")
2605    {
2606        let column = line_text.find("File::open(").unwrap_or(0) as u32;
2607        return Some(MisuseFinding {
2608            file: file.to_string(),
2609            line,
2610            column,
2611            rule: rule.clone(),
2612            api_call: "File::open".to_string(),
2613            message: "File::open used without contextual error mapping".to_string(),
2614            fix_suggestion:
2615                "Wrap errors with context (with_context/context/map_err) before propagating"
2616                    .to_string(),
2617            code_context: line_text.to_string(),
2618        });
2619    }
2620    None
2621}
2622
2623/// Check for capacity allocations sourced from unbounded input.
2624fn check_unbounded_with_capacity(
2625    rule: &APIRule,
2626    file: &str,
2627    line: u32,
2628    line_text: &str,
2629) -> Option<MisuseFinding> {
2630    if line_text.contains("Vec::with_capacity(") {
2631        let line_lower = line_text.to_lowercase();
2632        let user_input_markers = ["input", "args", "user", "request", "len", "size"];
2633        if user_input_markers.iter().any(|m| line_lower.contains(m)) {
2634            let column = line_text.find("Vec::with_capacity(").unwrap_or(0) as u32;
2635            return Some(MisuseFinding {
2636                file: file.to_string(),
2637                line,
2638                column,
2639                rule: rule.clone(),
2640                api_call: "Vec::with_capacity".to_string(),
2641                message: "Vec::with_capacity appears to use unbounded external input".to_string(),
2642                fix_suggestion:
2643                    "Clamp requested capacity with a hard upper bound before allocation".to_string(),
2644                code_context: line_text.to_string(),
2645            });
2646        }
2647    }
2648    None
2649}
2650
2651/// Check for detached tokio tasks.
2652fn check_detached_tokio_spawn(
2653    rule: &APIRule,
2654    file: &str,
2655    line: u32,
2656    line_text: &str,
2657) -> Option<MisuseFinding> {
2658    if line_text.contains("tokio::spawn(")
2659        && !line_text.contains('=')
2660        && !line_text.contains("handles.push")
2661    {
2662        let column = line_text.find("tokio::spawn(").unwrap_or(0) as u32;
2663        return Some(MisuseFinding {
2664            file: file.to_string(),
2665            line,
2666            column,
2667            rule: rule.clone(),
2668            api_call: "tokio::spawn".to_string(),
2669            message: "tokio::spawn used without keeping JoinHandle".to_string(),
2670            fix_suggestion: "Store JoinHandle values and await them to surface task errors"
2671                .to_string(),
2672            code_context: line_text.to_string(),
2673        });
2674    }
2675    None
2676}
2677
2678/// Check for map iteration order assumptions.
2679fn check_hashmap_order_dependence(
2680    rule: &APIRule,
2681    file: &str,
2682    line: u32,
2683    line_text: &str,
2684    rust_ctx: &RustLineContext<'_>,
2685) -> Option<MisuseFinding> {
2686    let looks_like_hashmap_iteration = line_text.contains(".iter()")
2687        && (line_text.contains("for ") || rust_ctx.previous_line.starts_with("for "))
2688        && rust_ctx.file_has_hashmap;
2689    if looks_like_hashmap_iteration {
2690        let column = line_text.find(".iter()").unwrap_or(0) as u32;
2691        return Some(MisuseFinding {
2692            file: file.to_string(),
2693            line,
2694            column,
2695            rule: rule.clone(),
2696            api_call: "HashMap::iter".to_string(),
2697            message: "Potential logic dependence on HashMap iteration order".to_string(),
2698            fix_suggestion: "Use BTreeMap/IndexMap or sort keys before ordered operations"
2699                .to_string(),
2700            code_context: line_text.to_string(),
2701        });
2702    }
2703    None
2704}
2705
2706/// Check for clone usage in loop bodies.
2707fn check_clone_in_hot_loop(
2708    rule: &APIRule,
2709    file: &str,
2710    line: u32,
2711    line_text: &str,
2712    rust_ctx: &RustLineContext<'_>,
2713) -> Option<MisuseFinding> {
2714    if line_text.contains(".clone()")
2715        && (line_text.contains("for ") || line_text.contains("while ") || rust_ctx.previous_is_loop)
2716    {
2717        let column = line_text.find(".clone()").unwrap_or(0) as u32;
2718        return Some(MisuseFinding {
2719            file: file.to_string(),
2720            line,
2721            column,
2722            rule: rule.clone(),
2723            api_call: "clone".to_string(),
2724            message: "clone() in loop context may create avoidable allocation overhead".to_string(),
2725            fix_suggestion: "Prefer borrowing/references or move semantics inside hot loops"
2726                .to_string(),
2727            code_context: line_text.to_string(),
2728        });
2729    }
2730    None
2731}
2732
2733// =============================================================================
2734// Filtering
2735// =============================================================================
2736
2737/// Filter findings by category and severity
2738fn filter_findings(
2739    findings: Vec<MisuseFinding>,
2740    categories: Option<&[MisuseCategory]>,
2741    severities: Option<&[MisuseSeverity]>,
2742) -> Vec<MisuseFinding> {
2743    findings
2744        .into_iter()
2745        .filter(|f| {
2746            // Category filter
2747            if let Some(cats) = categories {
2748                if !cats.contains(&f.rule.category) {
2749                    return false;
2750                }
2751            }
2752
2753            // Severity filter
2754            if let Some(sevs) = severities {
2755                if !sevs.contains(&f.rule.severity) {
2756                    return false;
2757                }
2758            }
2759
2760            true
2761        })
2762        .collect()
2763}
2764
2765// =============================================================================
2766// Summary Building
2767// =============================================================================
2768
2769/// Render a `MisuseCategory` using the same snake_case form as serde
2770/// serialization (schema-naming-and-units-v1). Keeping summary keys in sync
2771/// with `findings[].rule.category` lets consumers join the two without
2772/// ad-hoc normalization.
2773fn serialize_misuse_category(cat: &MisuseCategory) -> String {
2774    match cat {
2775        MisuseCategory::CallOrder => "call_order".to_string(),
2776        MisuseCategory::ErrorHandling => "error_handling".to_string(),
2777        MisuseCategory::Parameters => "parameters".to_string(),
2778        MisuseCategory::Resources => "resources".to_string(),
2779        MisuseCategory::Crypto => "crypto".to_string(),
2780        MisuseCategory::Concurrency => "concurrency".to_string(),
2781        MisuseCategory::Security => "security".to_string(),
2782    }
2783}
2784
2785/// Render a `MisuseSeverity` using the same snake_case form as serde
2786/// serialization (schema-naming-and-units-v1).
2787fn serialize_misuse_severity(sev: &MisuseSeverity) -> String {
2788    match sev {
2789        MisuseSeverity::Info => "info".to_string(),
2790        MisuseSeverity::Low => "low".to_string(),
2791        MisuseSeverity::Medium => "medium".to_string(),
2792        MisuseSeverity::High => "high".to_string(),
2793    }
2794}
2795
2796/// Build summary from findings
2797fn build_summary(findings: &[MisuseFinding], files_scanned: u32) -> APICheckSummary {
2798    let mut by_category: HashMap<String, u32> = HashMap::new();
2799    let mut by_severity: HashMap<String, u32> = HashMap::new();
2800    let mut apis_checked: Vec<String> = Vec::new();
2801
2802    for finding in findings {
2803        // Count by category — use snake_case serde representation so the
2804        // summary key matches what is emitted on `findings[].rule.category`
2805        // (schema-naming-and-units-v1). Previously `format!("{:?}", ...).to_lowercase()`
2806        // produced collapsed-case keys like `errorhandling` while the per-finding
2807        // detail used `error_handling`, forcing consumers to normalize.
2808        let cat_str = serialize_misuse_category(&finding.rule.category);
2809        *by_category.entry(cat_str).or_insert(0) += 1;
2810
2811        // Count by severity — use snake_case serde representation for the same reason.
2812        let sev_str = serialize_misuse_severity(&finding.rule.severity);
2813        *by_severity.entry(sev_str).or_insert(0) += 1;
2814
2815        // Track APIs
2816        if !apis_checked.contains(&finding.api_call) {
2817            apis_checked.push(finding.api_call.clone());
2818        }
2819    }
2820
2821    APICheckSummary {
2822        total_findings: findings.len() as u32,
2823        by_category,
2824        by_severity,
2825        apis_checked,
2826        files_scanned,
2827    }
2828}
2829
2830// =============================================================================
2831// Output Formatting
2832// =============================================================================
2833
2834/// Format report as human-readable text
2835fn format_api_check_text(report: &APICheckReport) -> String {
2836    let mut output = String::new();
2837
2838    output.push_str("=== API Check Report ===\n\n");
2839
2840    // Summary
2841    output.push_str(&format!(
2842        "Files scanned: {}\n",
2843        report.summary.files_scanned
2844    ));
2845    output.push_str(&format!("Rules applied: {}\n", report.rules_applied));
2846    output.push_str(&format!(
2847        "Total findings: {}\n\n",
2848        report.summary.total_findings
2849    ));
2850
2851    // By severity
2852    if !report.summary.by_severity.is_empty() {
2853        output.push_str("By Severity:\n");
2854        for (severity, count) in &report.summary.by_severity {
2855            output.push_str(&format!("  {}: {}\n", severity, count));
2856        }
2857        output.push('\n');
2858    }
2859
2860    // By category
2861    if !report.summary.by_category.is_empty() {
2862        output.push_str("By Category:\n");
2863        for (category, count) in &report.summary.by_category {
2864            output.push_str(&format!("  {}: {}\n", category, count));
2865        }
2866        output.push('\n');
2867    }
2868
2869    // Findings
2870    if !report.findings.is_empty() {
2871        output.push_str("Findings:\n");
2872        output.push_str(&"-".repeat(60));
2873        output.push('\n');
2874
2875        for finding in &report.findings {
2876            output.push_str(&format!(
2877                "[{:?}] {} ({})\n",
2878                finding.rule.severity, finding.rule.name, finding.rule.id
2879            ));
2880            output.push_str(&format!(
2881                "  Location: {}:{}:{}\n",
2882                finding.file, finding.line, finding.column
2883            ));
2884            output.push_str(&format!("  API: {}\n", finding.api_call));
2885            output.push_str(&format!("  Message: {}\n", finding.message));
2886            output.push_str(&format!("  Fix: {}\n", finding.fix_suggestion));
2887            if !finding.code_context.is_empty() {
2888                output.push_str(&format!("  Context: {}\n", finding.code_context.trim()));
2889            }
2890            output.push('\n');
2891        }
2892    } else {
2893        output.push_str("No API misuse patterns detected.\n");
2894    }
2895
2896    output
2897}
2898
2899// =============================================================================
2900// Tests
2901// =============================================================================
2902
2903#[cfg(test)]
2904mod tests {
2905    use super::*;
2906    use tempfile::TempDir;
2907
2908    #[test]
2909    fn test_python_rules_defined() {
2910        let rules = python_rules();
2911        assert!(!rules.is_empty());
2912        assert!(rules.iter().any(|r| r.id == "PY001")); // missing-timeout
2913        assert!(rules.iter().any(|r| r.id == "PY002")); // bare-except
2914        assert!(rules.iter().any(|r| r.id == "PY003")); // weak-hash-md5
2915        assert!(rules.iter().any(|r| r.id == "PY005")); // unclosed-file
2916    }
2917
2918    #[test]
2919    fn test_rust_rules_defined() {
2920        let rules = rust_rules();
2921        assert!(!rules.is_empty());
2922        assert!(rules.iter().any(|r| r.id == "RS001"));
2923        assert!(rules.iter().any(|r| r.id == "RS002"));
2924        assert!(rules.iter().any(|r| r.id == "RS003"));
2925        assert!(rules.iter().any(|r| r.id == "RS004"));
2926        assert!(rules.iter().any(|r| r.id == "RS005"));
2927        assert!(rules.iter().any(|r| r.id == "RS006"));
2928    }
2929
2930    #[test]
2931    fn test_all_supported_languages_have_rules() {
2932        for language in all_api_languages() {
2933            let rules = rules_for_language(*language);
2934            assert!(
2935                !rules.is_empty(),
2936                "expected at least one api-check rule for {:?}",
2937                language
2938            );
2939        }
2940    }
2941
2942    #[test]
2943    fn test_detect_language_extended_extensions() {
2944        let cases = [
2945            ("main.go", ApiLanguage::Go),
2946            ("Main.java", ApiLanguage::Java),
2947            ("app.js", ApiLanguage::JavaScript),
2948            ("component.tsx", ApiLanguage::TypeScript),
2949            ("main.c", ApiLanguage::C),
2950            ("main.cpp", ApiLanguage::Cpp),
2951            ("app.rb", ApiLanguage::Ruby),
2952            ("index.php", ApiLanguage::Php),
2953            ("Main.kt", ApiLanguage::Kotlin),
2954            ("main.swift", ApiLanguage::Swift),
2955            ("Program.cs", ApiLanguage::CSharp),
2956            ("Main.scala", ApiLanguage::Scala),
2957            ("app.ex", ApiLanguage::Elixir),
2958            ("main.lua", ApiLanguage::Lua),
2959            ("game.luau", ApiLanguage::Luau),
2960            ("main.ml", ApiLanguage::Ocaml),
2961        ];
2962
2963        for (path, expected) in cases {
2964            assert_eq!(detect_language(Path::new(path)), Some(expected), "{path}");
2965        }
2966    }
2967
2968    #[test]
2969    fn test_check_missing_timeout() {
2970        let rule = &python_rules()[0]; // PY001
2971
2972        // Should detect
2973        let finding = check_missing_timeout(rule, "test.py", 1, "response = requests.get(url)");
2974        assert!(finding.is_some());
2975
2976        // Should not detect (has timeout)
2977        let finding = check_missing_timeout(
2978            rule,
2979            "test.py",
2980            1,
2981            "response = requests.get(url, timeout=30)",
2982        );
2983        assert!(finding.is_none());
2984    }
2985
2986    #[test]
2987    fn test_check_bare_except() {
2988        let rule = &python_rules()[1]; // PY002
2989
2990        // Should detect
2991        let finding = check_bare_except(rule, "test.py", 1, "except:");
2992        assert!(finding.is_some());
2993
2994        // Should not detect (has exception type)
2995        let finding = check_bare_except(rule, "test.py", 1, "except Exception:");
2996        assert!(finding.is_none());
2997    }
2998
2999    #[test]
3000    fn test_check_md5_usage() {
3001        let rule = &python_rules()[2]; // PY003
3002
3003        // Should detect
3004        let finding = check_md5_usage(rule, "test.py", 1, "hash = hashlib.md5(data)");
3005        assert!(finding.is_some());
3006
3007        // Should not detect
3008        let finding = check_md5_usage(rule, "test.py", 1, "hash = hashlib.sha256(data)");
3009        assert!(finding.is_none());
3010    }
3011
3012    #[test]
3013    fn test_check_unclosed_file() {
3014        let rule = &python_rules()[4]; // PY005
3015
3016        // Should detect
3017        let finding = check_unclosed_file(rule, "test.py", 1, "f = open('data.txt')");
3018        assert!(finding.is_some());
3019
3020        // Should not detect (using context manager)
3021        let finding = check_unclosed_file(rule, "test.py", 1, "with open('data.txt') as f:");
3022        assert!(finding.is_none());
3023    }
3024
3025    #[test]
3026    fn test_filter_by_category() {
3027        let findings = vec![
3028            MisuseFinding {
3029                file: "test.py".to_string(),
3030                line: 1,
3031                column: 0,
3032                rule: APIRule {
3033                    id: "PY001".to_string(),
3034                    name: "test".to_string(),
3035                    category: MisuseCategory::Parameters,
3036                    severity: MisuseSeverity::High,
3037                    description: "test".to_string(),
3038                    correct_usage: "test".to_string(),
3039                },
3040                api_call: "test".to_string(),
3041                message: "test".to_string(),
3042                fix_suggestion: "test".to_string(),
3043                code_context: "test".to_string(),
3044            },
3045            MisuseFinding {
3046                file: "test.py".to_string(),
3047                line: 2,
3048                column: 0,
3049                rule: APIRule {
3050                    id: "PY003".to_string(),
3051                    name: "test".to_string(),
3052                    category: MisuseCategory::Crypto,
3053                    severity: MisuseSeverity::High,
3054                    description: "test".to_string(),
3055                    correct_usage: "test".to_string(),
3056                },
3057                api_call: "test".to_string(),
3058                message: "test".to_string(),
3059                fix_suggestion: "test".to_string(),
3060                code_context: "test".to_string(),
3061            },
3062        ];
3063
3064        let filtered = filter_findings(findings, Some(&[MisuseCategory::Crypto]), None);
3065        assert_eq!(filtered.len(), 1);
3066        assert_eq!(filtered[0].rule.category, MisuseCategory::Crypto);
3067    }
3068
3069    #[test]
3070    fn test_build_summary() {
3071        let findings = vec![MisuseFinding {
3072            file: "test.py".to_string(),
3073            line: 1,
3074            column: 0,
3075            rule: APIRule {
3076                id: "PY001".to_string(),
3077                name: "test".to_string(),
3078                category: MisuseCategory::Parameters,
3079                severity: MisuseSeverity::High,
3080                description: "test".to_string(),
3081                correct_usage: "test".to_string(),
3082            },
3083            api_call: "requests.get".to_string(),
3084            message: "test".to_string(),
3085            fix_suggestion: "test".to_string(),
3086            code_context: "test".to_string(),
3087        }];
3088
3089        let summary = build_summary(&findings, 5);
3090        assert_eq!(summary.total_findings, 1);
3091        assert_eq!(summary.files_scanned, 5);
3092        assert!(summary.apis_checked.contains(&"requests.get".to_string()));
3093    }
3094
3095    #[test]
3096    fn test_collect_files_includes_rust() {
3097        let temp = TempDir::new().unwrap();
3098        let py = temp.path().join("a.py");
3099        let rs = temp.path().join("b.rs");
3100        let go = temp.path().join("c.go");
3101        let txt = temp.path().join("c.txt");
3102        fs::write(&py, "print('ok')").unwrap();
3103        fs::write(&rs, "fn main() {}").unwrap();
3104        fs::write(&go, "package main").unwrap();
3105        fs::write(&txt, "ignore").unwrap();
3106
3107        let files = collect_files(temp.path()).unwrap();
3108        assert!(files.iter().any(|f| f.ends_with("a.py")));
3109        assert!(files.iter().any(|f| f.ends_with("b.rs")));
3110        assert!(files.iter().any(|f| f.ends_with("c.go")));
3111        assert!(!files.iter().any(|f| f.ends_with("c.txt")));
3112    }
3113
3114    #[test]
3115    fn test_check_mutex_lock_unwrap() {
3116        let rule = &rust_rules()[0];
3117        let finding =
3118            check_mutex_lock_unwrap(rule, "lib.rs", 10, "let guard = shared.lock().unwrap();");
3119        assert!(finding.is_some());
3120    }
3121
3122    #[test]
3123    fn test_check_file_open_without_context() {
3124        let rule = &rust_rules()[1];
3125        let finding = check_file_open_without_context(rule, "lib.rs", 8, "let f = File::open(p)?;");
3126        assert!(finding.is_some());
3127
3128        let contextual = check_file_open_without_context(
3129            rule,
3130            "lib.rs",
3131            9,
3132            "let f = File::open(p).with_context(|| \"open\".to_string())?;",
3133        );
3134        assert!(contextual.is_none());
3135    }
3136
3137    #[test]
3138    fn test_check_unbounded_with_capacity() {
3139        let rule = &rust_rules()[2];
3140        let finding =
3141            check_unbounded_with_capacity(rule, "lib.rs", 12, "let v = Vec::with_capacity(len);");
3142        assert!(finding.is_some());
3143
3144        let bounded =
3145            check_unbounded_with_capacity(rule, "lib.rs", 13, "let v = Vec::with_capacity(256);");
3146        assert!(bounded.is_none());
3147    }
3148
3149    #[test]
3150    fn test_check_tokio_spawn_detached() {
3151        let rule = &rust_rules()[3];
3152        let detached = check_detached_tokio_spawn(
3153            rule,
3154            "lib.rs",
3155            3,
3156            "tokio::spawn(async move { work().await; });",
3157        );
3158        let tracked = check_detached_tokio_spawn(
3159            rule,
3160            "lib.rs",
3161            4,
3162            "let handle = tokio::spawn(async move { work().await; });",
3163        );
3164        assert!(detached.is_some());
3165        assert!(tracked.is_none());
3166    }
3167
3168    #[test]
3169    fn test_check_hashmap_order_dependence() {
3170        let rule = &rust_rules()[4];
3171        let ctx = RustLineContext {
3172            file_has_hashmap: true,
3173            previous_line: "for (k, v) in map",
3174            previous_is_loop: true,
3175        };
3176        let finding = check_hashmap_order_dependence(rule, "lib.rs", 12, "    .iter()", &ctx);
3177        assert!(finding.is_some());
3178    }
3179
3180    #[test]
3181    fn test_check_clone_in_hot_loop() {
3182        let rule = &rust_rules()[5];
3183        let ctx = RustLineContext {
3184            file_has_hashmap: false,
3185            previous_line: "for item in items {",
3186            previous_is_loop: true,
3187        };
3188        let finding = check_clone_in_hot_loop(rule, "lib.rs", 20, "value.clone()", &ctx);
3189        assert!(finding.is_some());
3190    }
3191
3192    fn assert_language_findings(
3193        filename: &str,
3194        language: ApiLanguage,
3195        source: &str,
3196        expected_rule_id: &str,
3197    ) {
3198        let temp = TempDir::new().unwrap();
3199        let path = temp.path().join(filename);
3200        fs::write(&path, source).unwrap();
3201        let rules = rules_for_language(language);
3202        let findings = analyze_file(&path, &rules, language).unwrap();
3203        assert!(
3204            findings
3205                .iter()
3206                .any(|finding| finding.rule.id == expected_rule_id),
3207            "expected {expected_rule_id} for {filename}, got {:?}",
3208            findings
3209                .iter()
3210                .map(|f| f.rule.id.clone())
3211                .collect::<Vec<_>>()
3212        );
3213    }
3214
3215    #[test]
3216    fn test_extended_language_rule_detection() {
3217        let cases = [
3218            (
3219                "main.go",
3220                ApiLanguage::Go,
3221                "data, _ := ioutil.ReadFile(path)",
3222                "GO001",
3223            ),
3224            (
3225                "Main.java",
3226                ApiLanguage::Java,
3227                "if (name == otherName) { }",
3228                "JV001",
3229            ),
3230            ("app.js", ApiLanguage::JavaScript, "if (a == b) {}", "JS001"),
3231            ("app.ts", ApiLanguage::TypeScript, "if (a == b) {}", "TS001"),
3232            ("main.c", ApiLanguage::C, "gets(buffer);", "C001"),
3233            (
3234                "main.cpp",
3235                ApiLanguage::Cpp,
3236                "std::auto_ptr<Foo> p;",
3237                "CPP003",
3238            ),
3239            ("app.rb", ApiLanguage::Ruby, "eval(params[:code])", "RB001"),
3240            (
3241                "index.php",
3242                ApiLanguage::Php,
3243                "unserialize($payload);",
3244                "PH005",
3245            ),
3246            ("Main.kt", ApiLanguage::Kotlin, "val name = user!!", "KT001"),
3247            (
3248                "main.swift",
3249                ApiLanguage::Swift,
3250                "let name = value!",
3251                "SW003",
3252            ),
3253            (
3254                "Program.cs",
3255                ApiLanguage::CSharp,
3256                "var x = task.Result;",
3257                "CS003",
3258            ),
3259            (
3260                "Main.scala",
3261                ApiLanguage::Scala,
3262                "val casted = value.asInstanceOf[String]",
3263                "SC002",
3264            ),
3265            (
3266                "app.ex",
3267                ApiLanguage::Elixir,
3268                "String.to_atom(param)",
3269                "EX001",
3270            ),
3271            ("main.lua", ApiLanguage::Lua, "value = 1", "LU001"),
3272            ("game.luau", ApiLanguage::Luau, "os.execute(cmd)", "LU003"),
3273            ("main.ml", ApiLanguage::Ocaml, "Obj.magic value", "OC004"),
3274        ];
3275
3276        for (filename, language, source, expected_rule_id) in cases {
3277            assert_language_findings(filename, language, source, expected_rule_id);
3278        }
3279    }
3280
3281    // fastpath-extend-non-vuln-v1 — verify the file-level fast-path
3282    // does not strip findings from a normal-input fixture.
3283    #[test]
3284    fn test_fastpath_extension_no_perf_regression_on_normal_input() {
3285        use std::time::Instant;
3286
3287        let temp = TempDir::new().unwrap();
3288        let root = temp.path();
3289
3290        // Mixed-language fixture covering each rule needle path:
3291        // - Python `requests.` (PY001) and `hashlib.md5` (PY003)
3292        // - Rust `Mutex` (RS001) and `with_capacity` (RS003)
3293        // - Go `ioutil.ReadFile` (GO001)
3294        // - JavaScript `eval` (JS005)
3295        // - Files with NO needle hits — must be cleanly skipped.
3296        fs::write(
3297            root.join("py_hits.py"),
3298            "import requests\nrequests.get('http://x')\nimport hashlib\nh = hashlib.md5(b'x').hexdigest()\n",
3299        )
3300        .unwrap();
3301        fs::write(
3302            root.join("rs_hits.rs"),
3303            "use std::sync::Mutex;\nlet lock = Mutex::new(0);\nlet v: Vec<u8> = Vec::with_capacity(input);\n",
3304        )
3305        .unwrap();
3306        fs::write(
3307            root.join("go_hits.go"),
3308            "package main\nimport \"io/ioutil\"\nfunc f() { _, _ = ioutil.ReadFile(\"/etc/passwd\") }\n",
3309        )
3310        .unwrap();
3311        fs::write(
3312            root.join("js_hits.js"),
3313            "function f(s) { eval(s); }\n",
3314        )
3315        .unwrap();
3316        // File with no rule needles — cleanly skipped by the fast-path.
3317        fs::write(
3318            root.join("py_no_hits.py"),
3319            "def add(a, b):\n    return a + b\n\nif __name__ == '__main__':\n    print(add(1, 2))\n",
3320        )
3321        .unwrap();
3322
3323        let files = [
3324            (root.join("py_hits.py"), ApiLanguage::Python, true),
3325            (root.join("rs_hits.rs"), ApiLanguage::Rust, true),
3326            (root.join("go_hits.go"), ApiLanguage::Go, true),
3327            (root.join("js_hits.js"), ApiLanguage::JavaScript, true),
3328            (root.join("py_no_hits.py"), ApiLanguage::Python, false),
3329        ];
3330
3331        let start = Instant::now();
3332        for (path, lang, expect_findings) in files {
3333            let rules = rules_for_language(lang);
3334            let findings = analyze_file(&path, &rules, lang).unwrap();
3335            if expect_findings {
3336                assert!(
3337                    !findings.is_empty(),
3338                    "expected findings for {:?} (rule keyword present in source)",
3339                    path.file_name()
3340                );
3341            } else {
3342                // No needle in the source: fast-path returns empty.
3343                // Some rules (e.g. PY002 bare-except) might still match
3344                // unrelated lines, but in this fixture none do.
3345                assert!(
3346                    findings.is_empty(),
3347                    "expected no findings for {:?}, got {:?}",
3348                    path.file_name(),
3349                    findings.iter().map(|f| f.rule.id.clone()).collect::<Vec<_>>()
3350                );
3351            }
3352        }
3353        let elapsed = start.elapsed();
3354        // 5-file run including I/O and per-file regex compile must
3355        // complete well under 2 s — pre-fix this could time out on
3356        // slow CI; post-fix it should be milliseconds.
3357        assert!(
3358            elapsed.as_secs() < 2,
3359            "fastpath-extend-non-vuln-v1: 5-file fixture took {:?}, expected <2s",
3360            elapsed
3361        );
3362    }
3363
3364    // fastpath-extend-non-vuln-v1 — pin the correctness contract for
3365    // `extract_literal_from_regex`: the literal returned for every
3366    // built-in regex rule must be a substring of the rule's
3367    // `api_call`-equivalent positive sample.
3368    #[test]
3369    fn test_extract_literal_from_regex_recovers_useful_needles() {
3370        // Cases: (regex_pattern, expected_literal_substring_or_empty,
3371        //         positive_sample_that_must_contain_the_literal)
3372        let cases: &[(&str, &str, &str)] = &[
3373            (r"\bioutil\.ReadFile\s*\(", "ioutil.ReadFile", "x := ioutil.ReadFile(p)"),
3374            (r"\bunserialize\s*\(", "unserialize", "unserialize($x);"),
3375            (r"\beval\s*\(", "eval", "eval(s)"),
3376            (
3377                r"\bRuntime\.getRuntime\(\)\.exec\s*\(",
3378                "Runtime.getRuntime().exec",
3379                "Runtime.getRuntime().exec(c)",
3380            ),
3381            // Pure-symbol patterns: empty literal → "always admit".
3382            (r"\s==\s|\s!=\s", "", "if (a == b)"),
3383            // Pure char-class pattern: empty literal.
3384            (r"\b[A-Za-z_][A-Za-z0-9_]*!", "", "value!"),
3385        ];
3386        for (pattern, expected, sample) in cases {
3387            let literal = extract_literal_from_regex(pattern);
3388            assert_eq!(
3389                literal.as_str(),
3390                *expected,
3391                "pattern {:?} should yield literal {:?}",
3392                pattern,
3393                expected
3394            );
3395            if !literal.is_empty() {
3396                assert!(
3397                    sample.contains(literal.as_str()),
3398                    "literal {:?} from pattern {:?} must be a substring of positive sample {:?}",
3399                    literal,
3400                    pattern,
3401                    sample
3402                );
3403            }
3404        }
3405    }
3406
3407    // fastpath-extend-non-vuln-v1 — verify the language-fastpath needle
3408    // list is non-empty for every supported language (or contains an
3409    // empty string for the always-admit fallback).
3410    #[test]
3411    fn test_language_fastpath_needles_cover_all_languages() {
3412        for &lang in all_api_languages() {
3413            let needles = language_fastpath_needles(lang);
3414            assert!(
3415                !needles.is_empty(),
3416                "language {:?} has no fastpath needles",
3417                lang
3418            );
3419        }
3420    }
3421}