Skip to main content

sloc_languages/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3
4pub mod style;
5pub use style::{IndentStyle, StyleAnalysis, StyleGuideScore, StyleSignal};
6
7use std::collections::{BTreeMap, BTreeSet, HashSet};
8use std::path::Path;
9
10use serde::{Deserialize, Serialize};
11
/// A source language recognised by this crate.
///
/// Variants are serialized by serde using `snake_case` names. The derived `PartialOrd`/`Ord`
/// follow declaration order, so reordering variants changes the ordering observed in ordered
/// collections such as the `BTreeSet` returned by `supported_languages`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Language {
    C,
    Cpp,
    CSharp,
    Go,
    Java,
    JavaScript,
    Python,
    Rust,
    Shell,
    PowerShell,
    TypeScript,
    // --- Extended language support ---
    Assembly,
    Clojure,
    Css,
    Dart,
    Dockerfile,
    Elixir,
    Erlang,
    FSharp,
    Groovy,
    Haskell,
    Html,
    Julia,
    Kotlin,
    Lua,
    Makefile,
    Nim,
    ObjectiveC,
    Ocaml,
    Perl,
    Php,
    R,
    Ruby,
    Scala,
    Scss,
    Sql,
    Svelte,
    Swift,
    Vue,
    Xml,
    Zig,
    // --- Pass 1: modern declarative / smart-contract languages ---
    Solidity,
    Protobuf,
    Hcl,
    GraphQl,
    // --- Pass 2: legacy + embedded / hardware-description languages ---
    Ada,
    Vhdl,
    Verilog,
    Tcl,
    Pascal,
    VisualBasic,
    Lisp,
    // --- Pass 3: scientific / infra / systems / graphics ---
    Fortran,
    Nix,
    Crystal,
    D,
    Glsl,
    Cmake,
    Elm,
    Awk,
}
80
81impl Language {
82    #[must_use]
83    pub const fn display_name(&self) -> &'static str {
84        match self {
85            Self::C => "C",
86            Self::Cpp => "C++",
87            Self::CSharp => "C#",
88            Self::Go => "Go",
89            Self::Java => "Java",
90            Self::JavaScript => "JavaScript",
91            Self::Python => "Python",
92            Self::Rust => "Rust",
93            Self::Shell => "Shell",
94            Self::PowerShell => "PowerShell",
95            Self::TypeScript => "TypeScript",
96            Self::Assembly => "Assembly",
97            Self::Clojure => "Clojure",
98            Self::Css => "CSS",
99            Self::Dart => "Dart",
100            Self::Dockerfile => "Dockerfile",
101            Self::Elixir => "Elixir",
102            Self::Erlang => "Erlang",
103            Self::FSharp => "F#",
104            Self::Groovy => "Groovy",
105            Self::Haskell => "Haskell",
106            Self::Html => "HTML",
107            Self::Julia => "Julia",
108            Self::Kotlin => "Kotlin",
109            Self::Lua => "Lua",
110            Self::Makefile => "Makefile",
111            Self::Nim => "Nim",
112            Self::ObjectiveC => "Objective-C",
113            Self::Ocaml => "OCaml",
114            Self::Perl => "Perl",
115            Self::Php => "PHP",
116            Self::R => "R",
117            Self::Ruby => "Ruby",
118            Self::Scala => "Scala",
119            Self::Scss => "SCSS",
120            Self::Sql => "SQL",
121            Self::Svelte => "Svelte",
122            Self::Swift => "Swift",
123            Self::Vue => "Vue",
124            Self::Xml => "XML",
125            Self::Zig => "Zig",
126            Self::Solidity => "Solidity",
127            Self::Protobuf => "Protocol Buffers",
128            Self::Hcl => "HCL/Terraform",
129            Self::GraphQl => "GraphQL",
130            Self::Ada => "Ada",
131            Self::Vhdl => "VHDL",
132            Self::Verilog => "Verilog/SystemVerilog",
133            Self::Tcl => "Tcl",
134            Self::Pascal => "Pascal/Delphi",
135            Self::VisualBasic => "Visual Basic",
136            Self::Lisp => "Lisp/Scheme",
137            Self::Fortran => "Fortran",
138            Self::Nix => "Nix",
139            Self::Crystal => "Crystal",
140            Self::D => "D",
141            Self::Glsl => "GLSL/HLSL",
142            Self::Cmake => "CMake",
143            Self::Elm => "Elm",
144            Self::Awk => "Awk",
145        }
146    }
147
148    #[must_use]
149    pub const fn as_slug(&self) -> &'static str {
150        match self {
151            Self::C => "c",
152            Self::Cpp => "cpp",
153            Self::CSharp => "csharp",
154            Self::Go => "go",
155            Self::Java => "java",
156            Self::JavaScript => "javascript",
157            Self::Python => "python",
158            Self::Rust => "rust",
159            Self::Shell => "shell",
160            Self::PowerShell => "powershell",
161            Self::TypeScript => "typescript",
162            Self::Assembly => "assembly",
163            Self::Clojure => "clojure",
164            Self::Css => "css",
165            Self::Dart => "dart",
166            Self::Dockerfile => "dockerfile",
167            Self::Elixir => "elixir",
168            Self::Erlang => "erlang",
169            Self::FSharp => "fsharp",
170            Self::Groovy => "groovy",
171            Self::Haskell => "haskell",
172            Self::Html => "html",
173            Self::Julia => "julia",
174            Self::Kotlin => "kotlin",
175            Self::Lua => "lua",
176            Self::Makefile => "makefile",
177            Self::Nim => "nim",
178            Self::ObjectiveC => "objectivec",
179            Self::Ocaml => "ocaml",
180            Self::Perl => "perl",
181            Self::Php => "php",
182            Self::R => "r",
183            Self::Ruby => "ruby",
184            Self::Scala => "scala",
185            Self::Scss => "scss",
186            Self::Sql => "sql",
187            Self::Svelte => "svelte",
188            Self::Swift => "swift",
189            Self::Vue => "vue",
190            Self::Xml => "xml",
191            Self::Zig => "zig",
192            Self::Solidity => "solidity",
193            Self::Protobuf => "protobuf",
194            Self::Hcl => "hcl",
195            Self::GraphQl => "graphql",
196            Self::Ada => "ada",
197            Self::Vhdl => "vhdl",
198            Self::Verilog => "verilog",
199            Self::Tcl => "tcl",
200            Self::Pascal => "pascal",
201            Self::VisualBasic => "visualbasic",
202            Self::Lisp => "lisp",
203            Self::Fortran => "fortran",
204            Self::Nix => "nix",
205            Self::Crystal => "crystal",
206            Self::D => "d",
207            Self::Glsl => "glsl",
208            Self::Cmake => "cmake",
209            Self::Elm => "elm",
210            Self::Awk => "awk",
211        }
212    }
213
214    #[must_use]
215    pub fn from_name(name: &str) -> Option<Self> {
216        match name.trim().to_ascii_lowercase().as_str() {
217            "c" => Some(Self::C),
218            "cpp" | "c++" | "cplusplus" => Some(Self::Cpp),
219            "csharp" | "c#" | "cs" => Some(Self::CSharp),
220            "go" | "golang" => Some(Self::Go),
221            "java" => Some(Self::Java),
222            "javascript" | "js" => Some(Self::JavaScript),
223            "python" | "py" => Some(Self::Python),
224            "rust" | "rs" => Some(Self::Rust),
225            "shell" | "sh" | "bash" => Some(Self::Shell),
226            "powershell" | "pwsh" | "ps" => Some(Self::PowerShell),
227            "typescript" | "ts" => Some(Self::TypeScript),
228            "assembly" | "asm" => Some(Self::Assembly),
229            "clojure" | "clj" => Some(Self::Clojure),
230            "css" => Some(Self::Css),
231            "dart" => Some(Self::Dart),
232            "dockerfile" | "docker" => Some(Self::Dockerfile),
233            "elixir" | "ex" => Some(Self::Elixir),
234            "erlang" | "erl" => Some(Self::Erlang),
235            "fsharp" | "f#" | "fs" => Some(Self::FSharp),
236            "groovy" => Some(Self::Groovy),
237            "haskell" | "hs" => Some(Self::Haskell),
238            "html" | "htm" => Some(Self::Html),
239            "julia" | "jl" => Some(Self::Julia),
240            "kotlin" | "kt" => Some(Self::Kotlin),
241            "lua" => Some(Self::Lua),
242            "makefile" | "make" | "mk" => Some(Self::Makefile),
243            "nim" => Some(Self::Nim),
244            "objectivec" | "objc" | "objective-c" => Some(Self::ObjectiveC),
245            "ocaml" | "ml" => Some(Self::Ocaml),
246            "perl" | "pl" => Some(Self::Perl),
247            "php" => Some(Self::Php),
248            "r" => Some(Self::R),
249            "ruby" | "rb" => Some(Self::Ruby),
250            "scala" => Some(Self::Scala),
251            "scss" | "sass" => Some(Self::Scss),
252            "sql" => Some(Self::Sql),
253            "svelte" => Some(Self::Svelte),
254            "swift" => Some(Self::Swift),
255            "vue" => Some(Self::Vue),
256            "xml" => Some(Self::Xml),
257            "zig" => Some(Self::Zig),
258            "solidity" | "sol" => Some(Self::Solidity),
259            "protobuf" | "proto" | "protocolbuffers" => Some(Self::Protobuf),
260            "hcl" | "terraform" | "tf" => Some(Self::Hcl),
261            "graphql" | "gql" => Some(Self::GraphQl),
262            "ada" => Some(Self::Ada),
263            "vhdl" => Some(Self::Vhdl),
264            "verilog" | "systemverilog" | "sv" => Some(Self::Verilog),
265            "tcl" => Some(Self::Tcl),
266            "pascal" | "delphi" | "pas" => Some(Self::Pascal),
267            "visualbasic" | "vb" | "vbnet" | "vb.net" => Some(Self::VisualBasic),
268            "lisp" | "scheme" | "racket" | "clisp" | "elisp" => Some(Self::Lisp),
269            "fortran" | "f90" | "f95" => Some(Self::Fortran),
270            "nix" => Some(Self::Nix),
271            "crystal" | "cr" => Some(Self::Crystal),
272            "d" | "dlang" => Some(Self::D),
273            "glsl" | "hlsl" | "shader" | "wgsl" => Some(Self::Glsl),
274            "cmake" => Some(Self::Cmake),
275            "elm" => Some(Self::Elm),
276            "awk" => Some(Self::Awk),
277            _ => None,
278        }
279    }
280}
281
/// Line-classification counters and derived metrics for one analysed text.
///
/// Stored as the `raw` field of [`RawFileAnalysis`]. Fields marked `#[serde(default)]`
/// fall back to their type's default value (zero, `None`) when absent from the input
/// being deserialized.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct RawLineCounts {
    // Physical-line buckets. Lines are assigned to a bucket by the scanners, which live
    // outside this definition; the field names are the authoritative description here.
    pub total_physical_lines: u64,
    pub blank_only_lines: u64,
    pub code_only_lines: u64,
    pub single_comment_only_lines: u64,
    pub multi_comment_only_lines: u64,
    pub mixed_code_single_comment_lines: u64,
    pub mixed_code_multi_comment_lines: u64,
    pub docstring_comment_lines: u64,
    pub skipped_unknown_lines: u64,
    /// Best-effort count of function/method definition lines detected lexically.
    #[serde(default)]
    pub functions: u64,
    /// Best-effort count of class/struct/trait/type definition lines detected lexically.
    #[serde(default)]
    pub classes: u64,
    /// Best-effort count of variable declaration lines detected lexically.
    #[serde(default)]
    pub variables: u64,
    /// Best-effort count of import/use/include statement lines detected lexically.
    #[serde(default)]
    pub imports: u64,
    /// Lines consisting solely of preprocessor/compiler directives (e.g. `#include`, `#define`
    /// in C/C++/Objective-C). Always a subset of `code_only_lines`. Controlled by
    /// `AnalysisConfig::count_compiler_directives`. IEEE 1045-1992 §4.2.
    #[serde(default)]
    pub compiler_directive_lines: u64,
    /// Best-effort count of test case / test function definition lines detected lexically
    /// (`GTest`, Catch2, `PyTest`, `JUnit`, etc.). Always a subset of `code_only_lines`.
    #[serde(default)]
    pub test_count: u64,
    /// Best-effort count of test assertion call lines detected lexically
    /// (`ASSERT_EQ`, `EXPECT_TRUE`, `assertEquals`, `Assert.AreEqual`, `assert_eq!`, etc.).
    #[serde(default)]
    pub test_assertion_count: u64,
    /// Best-effort count of test suite / fixture / group declaration lines detected lexically
    /// (`TEST_GROUP`, `BOOST_AUTO_TEST_SUITE`, `[TestClass]`, `[TestFixture]`, etc.).
    #[serde(default)]
    pub test_suite_count: u64,
    /// Cyclomatic complexity approximation: total count of branch decision keywords found on
    /// code lines (e.g. `if`, `for`, `while`, `||`, `&&`). Starts at 0; +1 per keyword hit.
    #[serde(default)]
    pub cyclomatic_complexity: u32,
    /// Logical SLOC estimate: executable statement count using a language-specific strategy.
    /// `None` when the language does not support lexical LSLOC estimation.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub lsloc: Option<u32>,
    /// Per-code-line content hashes (trimmed) for ULOC aggregation. Never serialized — only
    /// populated during an in-process scan and consumed by `sloc-core` during aggregation.
    #[serde(skip)]
    pub code_line_hashes: Vec<u64>,
}
335
/// Identifies which analysis path produced a [`RawFileAnalysis`] (see its `parse_mode` field).
///
/// Serialized by serde using `snake_case` variant names.
// NOTE(review): the exact condition under which each variant is chosen is decided by the
// scanners (`analyze_generic`, the `ts` module), which are not part of this definition.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ParseMode {
    Lexical,
    LexicalBestEffort,
    TreeSitter,
}
343
/// Result of analysing a single text, as returned by `analyze_text`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawFileAnalysis {
    /// Line-classification counters and derived metrics for the text.
    pub raw: RawLineCounts,
    /// Which analysis path produced this result.
    pub parse_mode: ParseMode,
    /// Free-form warning messages attached by the analysis path that produced the result.
    pub warnings: Vec<String>,
    /// Lexical style-guide analysis for supported languages; `None` when no heuristics apply.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub style_analysis: Option<StyleAnalysis>,
}
353
/// IEEE 1045-1992 counting options passed from `sloc-core` (built from `AnalysisConfig`).
///
/// `analyze_text` accepts this struct so that the caller can control behaviour that the
/// standard defines as configurable parameters rather than fixed conventions.
///
/// The `Default` implementation yields the values marked "default" below.
#[derive(Debug, Clone, Copy)]
pub struct AnalysisOptions {
    /// When `true` (IEEE 1045-1992 default), blank lines inside block comments count as
    /// comment lines rather than blank lines.
    pub blank_in_block_comment_as_comment: bool,
    /// When `true`, backslash-continued physical lines are collapsed into a single logical
    /// line for SLOC counting purposes (IEEE logical SLOC mode). Defaults to `false`.
    pub collapse_continuation_lines: bool,
    /// When `true` (default), run lexical style-guide heuristics and populate
    /// `RawFileAnalysis::style_analysis`. Set to `false` to skip style scoring entirely.
    pub enable_style: bool,
    /// Restricts style analysis to a language family. `StyleLangScope::All` (default) leaves
    /// every language eligible; with `StyleLangScope::CFamilyOnly`, only C / C++ /
    /// Objective-C files are style-analysed.
    pub style_lang_scope: StyleLangScope,
}
373
/// Which language families receive style-guide heuristic analysis.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StyleLangScope {
    /// Every language is eligible for style analysis.
    All,
    /// Only `Language::C`, `Language::Cpp` and `Language::ObjectiveC` are eligible.
    CFamilyOnly,
}
380
/// Strategy for computing Logical SLOC (LSLOC) from a physical-line scan.
///
/// The strategy for a language is obtained from `language_complexity_config` and carried in
/// the `lsloc_strategy` field of `ScanConfig`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LslocStrategy {
    /// Count semicolons on code lines (C, C++, Java, C#, Go, Rust, JS/TS, Kotlin, SQL, …).
    Semicolons,
    /// Count non-blank code lines whose trimmed content does not end with a continuation
    /// character (`\`, `,`, `(`, `[`, `{`). Suitable for Python, Ruby, Shell, Elixir, Nim.
    NonContinuationNewlines,
    /// Language does not have a well-defined statement boundary detectable by simple
    /// lexical heuristics; `lsloc` will be `None` for files of this type.
    Unsupported,
}
393
394impl Default for AnalysisOptions {
395    fn default() -> Self {
396        Self {
397            blank_in_block_comment_as_comment: true,
398            collapse_continuation_lines: false,
399            enable_style: true,
400            style_lang_scope: StyleLangScope::All,
401        }
402    }
403}
404
405#[must_use]
406pub fn supported_languages() -> BTreeSet<Language> {
407    [
408        Language::Assembly,
409        Language::C,
410        Language::Clojure,
411        Language::Cpp,
412        Language::CSharp,
413        Language::Css,
414        Language::Dart,
415        Language::Dockerfile,
416        Language::Elixir,
417        Language::Erlang,
418        Language::FSharp,
419        Language::Go,
420        Language::Groovy,
421        Language::Haskell,
422        Language::Html,
423        Language::Java,
424        Language::JavaScript,
425        Language::Julia,
426        Language::Kotlin,
427        Language::Lua,
428        Language::Makefile,
429        Language::Nim,
430        Language::ObjectiveC,
431        Language::Ocaml,
432        Language::Perl,
433        Language::Php,
434        Language::PowerShell,
435        Language::Python,
436        Language::R,
437        Language::Ruby,
438        Language::Rust,
439        Language::Scala,
440        Language::Scss,
441        Language::Shell,
442        Language::Sql,
443        Language::Svelte,
444        Language::Swift,
445        Language::TypeScript,
446        Language::Vue,
447        Language::Xml,
448        Language::Zig,
449        Language::Solidity,
450        Language::Protobuf,
451        Language::Hcl,
452        Language::GraphQl,
453        Language::Ada,
454        Language::Vhdl,
455        Language::Verilog,
456        Language::Tcl,
457        Language::Pascal,
458        Language::VisualBasic,
459        Language::Lisp,
460        Language::Fortran,
461        Language::Nix,
462        Language::Crystal,
463        Language::D,
464        Language::Glsl,
465        Language::Cmake,
466        Language::Elm,
467        Language::Awk,
468    ]
469    .into_iter()
470    .collect()
471}
472
473/// Detect language from a shebang line (e.g. `#!/usr/bin/env python3`).
474fn detect_by_shebang(line: &str) -> Option<Language> {
475    let lower = line.to_ascii_lowercase();
476    if !lower.starts_with("#!") {
477        return None;
478    }
479    if lower.contains("python") {
480        return Some(Language::Python);
481    }
482    if lower.contains("pwsh") || lower.contains("powershell") {
483        return Some(Language::PowerShell);
484    }
485    if lower.contains("bash")
486        || lower.contains("/sh")
487        || lower.contains("zsh")
488        || lower.contains("ksh")
489    {
490        return Some(Language::Shell);
491    }
492    if lower.contains("ruby") {
493        return Some(Language::Ruby);
494    }
495    if lower.contains("perl") {
496        return Some(Language::Perl);
497    }
498    if lower.contains("php") {
499        return Some(Language::Php);
500    }
501    if lower.contains("node") || lower.contains("nodejs") {
502        return Some(Language::JavaScript);
503    }
504    None
505}
506
507/// Detect language purely from a (lowercased) file extension.
508#[allow(clippy::too_many_lines)]
509fn detect_by_extension(ext: &str) -> Option<Language> {
510    // Static table avoids a large match statement; each extension maps 1-to-1 to a language.
511    static EXT_MAP: &[(&str, Language)] = &[
512        ("c", Language::C),
513        ("h", Language::C),
514        ("cc", Language::Cpp),
515        ("cp", Language::Cpp),
516        ("cpp", Language::Cpp),
517        ("cxx", Language::Cpp),
518        ("hh", Language::Cpp),
519        ("hpp", Language::Cpp),
520        ("hxx", Language::Cpp),
521        ("cs", Language::CSharp),
522        ("go", Language::Go),
523        ("java", Language::Java),
524        ("js", Language::JavaScript),
525        ("mjs", Language::JavaScript),
526        ("cjs", Language::JavaScript),
527        ("py", Language::Python),
528        ("rs", Language::Rust),
529        ("sh", Language::Shell),
530        ("bash", Language::Shell),
531        ("zsh", Language::Shell),
532        ("ksh", Language::Shell),
533        ("ps1", Language::PowerShell),
534        ("psm1", Language::PowerShell),
535        ("psd1", Language::PowerShell),
536        ("ts", Language::TypeScript),
537        ("mts", Language::TypeScript),
538        ("cts", Language::TypeScript),
539        ("tsx", Language::TypeScript),
540        ("jsx", Language::JavaScript),
541        ("asm", Language::Assembly),
542        ("s", Language::Assembly),
543        ("clj", Language::Clojure),
544        ("cljs", Language::Clojure),
545        ("cljc", Language::Clojure),
546        ("edn", Language::Clojure),
547        ("css", Language::Css),
548        ("dart", Language::Dart),
549        ("ex", Language::Elixir),
550        ("exs", Language::Elixir),
551        ("erl", Language::Erlang),
552        ("hrl", Language::Erlang),
553        ("fs", Language::FSharp),
554        ("fsi", Language::FSharp),
555        ("fsx", Language::FSharp),
556        ("groovy", Language::Groovy),
557        ("gradle", Language::Groovy),
558        ("hs", Language::Haskell),
559        ("lhs", Language::Haskell),
560        ("html", Language::Html),
561        ("htm", Language::Html),
562        ("xhtml", Language::Html),
563        ("jl", Language::Julia),
564        ("kt", Language::Kotlin),
565        ("kts", Language::Kotlin),
566        ("lua", Language::Lua),
567        ("mk", Language::Makefile),
568        ("nim", Language::Nim),
569        ("nims", Language::Nim),
570        ("m", Language::ObjectiveC),
571        ("mm", Language::ObjectiveC),
572        ("ml", Language::Ocaml),
573        ("mli", Language::Ocaml),
574        ("pl", Language::Perl),
575        ("pm", Language::Perl),
576        ("t", Language::Perl),
577        ("php", Language::Php),
578        ("php3", Language::Php),
579        ("php4", Language::Php),
580        ("php5", Language::Php),
581        ("php7", Language::Php),
582        ("phtml", Language::Php),
583        ("r", Language::R),
584        ("rb", Language::Ruby),
585        ("rake", Language::Ruby),
586        ("scala", Language::Scala),
587        ("sc", Language::Scala),
588        ("scss", Language::Scss),
589        ("sass", Language::Scss),
590        ("sql", Language::Sql),
591        ("svelte", Language::Svelte),
592        ("swift", Language::Swift),
593        ("vue", Language::Vue),
594        ("xml", Language::Xml),
595        ("xsd", Language::Xml),
596        ("xsl", Language::Xml),
597        ("xslt", Language::Xml),
598        ("svg", Language::Xml),
599        ("zig", Language::Zig),
600        ("sol", Language::Solidity),
601        ("proto", Language::Protobuf),
602        ("tf", Language::Hcl),
603        ("tfvars", Language::Hcl),
604        ("hcl", Language::Hcl),
605        ("graphql", Language::GraphQl),
606        ("gql", Language::GraphQl),
607        ("adb", Language::Ada),
608        ("ads", Language::Ada),
609        ("ada", Language::Ada),
610        ("vhd", Language::Vhdl),
611        ("vhdl", Language::Vhdl),
612        ("v", Language::Verilog),
613        ("sv", Language::Verilog),
614        ("svh", Language::Verilog),
615        ("vh", Language::Verilog),
616        ("tcl", Language::Tcl),
617        ("pas", Language::Pascal),
618        ("dpr", Language::Pascal),
619        ("vb", Language::VisualBasic),
620        ("bas", Language::VisualBasic),
621        ("lisp", Language::Lisp),
622        ("lsp", Language::Lisp),
623        ("el", Language::Lisp),
624        ("scm", Language::Lisp),
625        ("ss", Language::Lisp),
626        ("rkt", Language::Lisp),
627        ("f90", Language::Fortran),
628        ("f95", Language::Fortran),
629        ("f03", Language::Fortran),
630        ("f08", Language::Fortran),
631        ("f", Language::Fortran),
632        ("for", Language::Fortran),
633        ("nix", Language::Nix),
634        ("cr", Language::Crystal),
635        ("d", Language::D),
636        ("glsl", Language::Glsl),
637        ("vert", Language::Glsl),
638        ("frag", Language::Glsl),
639        ("comp", Language::Glsl),
640        ("geom", Language::Glsl),
641        ("tesc", Language::Glsl),
642        ("tese", Language::Glsl),
643        ("hlsl", Language::Glsl),
644        ("wgsl", Language::Glsl),
645        ("cmake", Language::Cmake),
646        ("elm", Language::Elm),
647        ("awk", Language::Awk),
648    ];
649    EXT_MAP.iter().find_map(|&(e, l)| (e == ext).then_some(l))
650}
651
652/// Detect language from an exact filename (no extension) or well-known filename patterns.
653fn detect_by_filename(filename: &str, filename_lower: &str) -> Option<Language> {
654    // Dockerfile: exact name or Dockerfile.* variant
655    if filename == "Dockerfile"
656        || filename.starts_with("Dockerfile.")
657        || filename_lower == "dockerfile"
658    {
659        return Some(Language::Dockerfile);
660    }
661    // Makefile variants
662    if matches!(
663        filename,
664        "Makefile" | "GNUmakefile" | "makefile" | "BSDmakefile"
665    ) {
666        return Some(Language::Makefile);
667    }
668    // Ruby ecosystem files that have no extension
669    if matches!(
670        filename,
671        "Rakefile" | "Gemfile" | "Guardfile" | "Vagrantfile" | "Fastfile" | "Podfile"
672    ) {
673        return Some(Language::Ruby);
674    }
675    // CMake build scripts: `CMakeLists.txt` has a `.txt` extension, so it must be
676    // matched by exact name before extension-based detection.
677    if filename == "CMakeLists.txt" || filename_lower == "cmakelists.txt" {
678        return Some(Language::Cmake);
679    }
680    None
681}
682
683#[must_use]
684#[allow(clippy::too_many_lines)]
685pub fn detect_language(
686    path: &Path,
687    first_line: Option<&str>,
688    extension_overrides: &BTreeMap<String, String>,
689    shebang_detection: bool,
690) -> Option<Language> {
691    let extension = path
692        .extension()
693        .and_then(|ext| ext.to_str())
694        .map(str::to_ascii_lowercase);
695
696    // Extension override check (user-configured mappings win over everything)
697    if let Some(ext) = extension.as_ref() {
698        if let Some(override_name) = extension_overrides.get(ext.as_str()) {
699            if let Some(lang) = Language::from_name(override_name) {
700                return Some(lang);
701            }
702        }
703    }
704
705    // Filename-based detection for files that have no extension or use exact names
706    let filename = path.file_name().and_then(|s| s.to_str()).unwrap_or("");
707    let filename_lower = filename.to_ascii_lowercase();
708
709    if let Some(lang) = detect_by_filename(filename, &filename_lower) {
710        return Some(lang);
711    }
712
713    // Extension-based detection
714    if let Some(lang) = extension.as_deref().and_then(detect_by_extension) {
715        return Some(lang);
716    }
717
718    // Shebang detection (last resort — only for extensionless scripts)
719    if shebang_detection {
720        if let Some(line) = first_line {
721            if let Some(lang) = detect_by_shebang(line) {
722                return Some(lang);
723            }
724        }
725    }
726
727    None
728}
729
730#[must_use]
731pub fn analyze_text(language: Language, text: &str, options: AnalysisOptions) -> RawFileAnalysis {
732    // tree-sitter fast-paths (compiled out when feature is disabled)
733    #[cfg(feature = "tree-sitter")]
734    {
735        match language {
736            Language::C | Language::Cpp => {
737                if let Some(mut result) = ts::analyze_c(text) {
738                    if options.enable_style
739                        && should_style_analyse(language, options.style_lang_scope)
740                    {
741                        result.style_analysis = style::analyze_style(language, text);
742                    }
743                    return result;
744                }
745            }
746            Language::Python => {
747                if let Some(result) = ts::analyze_python(text) {
748                    return result;
749                }
750            }
751            _ => {}
752        }
753    }
754
755    let (mut config, has_preprocessor) = language_scan_config(language);
756
757    // Python docstring lines are computed from the text and cannot be a static constant.
758    if language == Language::Python {
759        config.skip_lines = detect_python_docstring_lines(text);
760    }
761
762    // C, C++, and Objective-C have a preprocessor whose directive lines are tracked separately
763    // per IEEE 1045-1992 §4.2; every other language uses base flags.
764    let flags = IeeeFlags {
765        has_preprocessor_directives: has_preprocessor,
766        blank_in_block_comment_as_comment: options.blank_in_block_comment_as_comment,
767        collapse_continuation_lines: options.collapse_continuation_lines,
768    };
769    let mut result = analyze_generic(text, config, flags);
770    if options.enable_style && should_style_analyse(language, options.style_lang_scope) {
771        result.style_analysis = style::analyze_style(language, text);
772    }
773    result
774}
775
776/// Returns `true` when `language` should be style-analysed under `scope`.
777const fn should_style_analyse(language: Language, scope: StyleLangScope) -> bool {
778    match scope {
779        StyleLangScope::CFamilyOnly => {
780            matches!(language, Language::C | Language::Cpp | Language::ObjectiveC)
781        }
782        StyleLangScope::All => true,
783    }
784}
785
786/// Returns the lexical scan configuration for `language` and whether it uses a C preprocessor.
787/// All fields are static constants except `skip_lines`, which is always empty here; callers that
788/// need non-empty skip sets (currently only Python) must populate the field after this call.
789///
790/// The implementation delegates to `LANG_SCAN_TABLE` (a static `&[(Language, StaticLangConfig)]`)
791/// defined below the `SP_*` symbol-pattern constants.  Each language appears exactly once in the
792/// table, so the linear scan is O(|languages|) but avoids a 41-arm `match` statement.
793fn language_scan_config(language: Language) -> (ScanConfig, bool) {
794    let cfg = LANG_SCAN_TABLE
795        .iter()
796        .find_map(|&(l, c)| (l == language).then_some(c))
797        .unwrap_or_else(|| panic!("language_scan_config: no entry for {language:?}"));
798    let (branch_keywords, lsloc_strategy) = language_complexity_config(language);
799    (
800        ScanConfig {
801            line_comments: cfg.line_comments,
802            block_comment: cfg.block_comment,
803            allow_single_quote_strings: cfg.allow_single_quote_strings,
804            allow_double_quote_strings: cfg.allow_double_quote_strings,
805            allow_triple_quote_strings: cfg.allow_triple_quote_strings,
806            allow_csharp_verbatim_strings: cfg.allow_csharp_verbatim_strings,
807            skip_lines: HashSet::new(),
808            symbol_patterns: cfg.symbol_patterns,
809            branch_keywords,
810            lsloc_strategy,
811        },
812        cfg.has_preprocessor,
813    )
814}
815
816// ── Cyclomatic complexity branch-keyword lists ────────────────────────────────
817// Alphabetic tokens are matched word-bounded; operator tokens (||, &&, ?) are
818// matched as raw substrings.  Each list covers one language family.
819
// C-style list without the `?` token. Selected by `language_complexity_config` for
// `Language::Java`, `Language::Kotlin`, `Language::Scala`, `Language::D` and `Language::Glsl`.
const BRANCH_C_FAMILY: &[&str] = &[
    "if", "else", "for", "while", "switch", "case", "catch", "||", "&&",
];
// Same tokens as `BRANCH_C_FAMILY` plus `?`. Selected for `Language::C`, `Cpp`,
// `ObjectiveC`, `CSharp`, `JavaScript`, `TypeScript`, `Svelte`, `Vue`, `Dart`, `Groovy`,
// `Swift` and `Solidity`.
// NOTE(review): `?` also appears in non-ternary syntax in several of these languages
// (optional chaining, nullable types) — confirm how the matcher treats it.
const BRANCH_C_TERNARY: &[&str] = &[
    "if", "else", "for", "while", "switch", "case", "catch", "||", "&&", "?",
];
// Selected for `Language::Go`.
const BRANCH_GO: &[&str] = &["if", "else", "for", "switch", "case", "select", "||", "&&"];
// Selected for `Language::Rust`.
const BRANCH_RUST: &[&str] = &["if", "else", "for", "while", "match", "||", "&&"];
// Selected for `Language::Zig`.
const BRANCH_ZIG: &[&str] = &["if", "else", "for", "while", "switch", "catch", "||", "&&"];
// Selected for `Language::FSharp`.
// NOTE(review): both `if` and `then` are listed, so one `if … then` presumably yields two
// hits — confirm against the matcher whether that is intended.
const BRANCH_FSHARP: &[&str] = &["if", "then", "else", "elif", "match", "when", "||", "&&"];
// Selected for `Language::Lua`.
const BRANCH_LUA: &[&str] = &[
    "if", "elseif", "else", "for", "while", "repeat", "and", "or",
];
// Selected for `Language::Haskell`.
const BRANCH_HASKELL: &[&str] = &["if", "then", "else", "case", "otherwise"];
// Selected for `Language::Sql`. Every keyword is listed in both upper and lower case.
// NOTE(review): presumably because token matching is case-sensitive — confirm; mixed-case
// spellings such as `Case` are not covered by this list.
const BRANCH_SQL: &[&str] = &["CASE", "WHEN", "IF", "ELSE", "case", "when", "if", "else"];
// Selected for `Language::Ocaml`.
const BRANCH_OCAML: &[&str] = &["if", "then", "else", "match", "when", "||", "&&"];
// Selected for `Language::Clojure`.
const BRANCH_CLOJURE: &[&str] = &["if", "when", "cond", "case", "and", "or"];
// Selected for `Language::Php`; the only non-C-ternary list that also carries `?`.
const BRANCH_PHP: &[&str] = &[
    "if", "elseif", "else", "for", "while", "switch", "case", "catch", "match", "||", "&&", "?",
];
// Selected for `Language::Julia`.
const BRANCH_JULIA: &[&str] = &["if", "elseif", "else", "for", "while", "catch", "||", "&&"];
// Selected for `Language::Python`.
const BRANCH_PYTHON: &[&str] = &["if", "elif", "else", "for", "while", "except", "or", "and"];
// Selected for `Language::Ruby` and `Language::Crystal`.
const BRANCH_RUBY: &[&str] = &[
    "if", "elsif", "else", "unless", "until", "while", "case", "when", "rescue", "||", "&&",
];
// Selected for `Language::Shell`. `for` is listed next to `while`/`until` so that every
// shell loop form contributes a decision point, matching the other imperative lists.
const BRANCH_SHELL: &[&str] = &[
    "if", "elif", "else", "for", "while", "until", "case", "||", "&&",
];
// Selected for `Language::Elixir`.
const BRANCH_ELIXIR: &[&str] = &[
    "if", "else", "cond", "case", "when", "rescue", "||", "&&", "and", "or",
];
// Selected for `Language::PowerShell`.
const BRANCH_POWERSHELL: &[&str] = &[
    "if", "elseif", "else", "for", "while", "switch", "foreach", "||", "&&",
];
// Selected for `Language::Nim`.
const BRANCH_NIM: &[&str] = &[
    "if", "elif", "else", "for", "while", "case", "of", "except", "and", "or",
];
// Selected for `Language::Perl`.
const BRANCH_PERL: &[&str] = &[
    "if", "elsif", "else", "unless", "until", "for", "while", "foreach", "||", "&&",
];
// Selected for `Language::R`.
const BRANCH_R: &[&str] = &["if", "else", "for", "while", "repeat", "||", "&&"];
// Pass 2 branch-keyword lists (legacy + embedded / HDL).
// Selected for `Language::Ada`.
const BRANCH_ADA: &[&str] = &[
    "if", "elsif", "else", "case", "when", "loop", "while", "for", "and", "or",
];
// Selected for `Language::Vhdl`; extends the Ada-style list with `nand`, `nor`, `xor`.
const BRANCH_VHDL: &[&str] = &[
    "if", "elsif", "else", "case", "when", "loop", "while", "for", "and", "or", "nand", "nor",
    "xor",
];
// Selected for `Language::Verilog`.
const BRANCH_VERILOG: &[&str] = &[
    "if", "else", "case", "casex", "casez", "for", "while", "&&", "||",
];
// Selected for `Language::Tcl`.
const BRANCH_TCL: &[&str] = &["if", "elseif", "else", "switch", "while", "for", "foreach"];
// Selected for `Language::Pascal`.
const BRANCH_PASCAL: &[&str] = &[
    "if", "then", "else", "case", "while", "for", "repeat", "until", "and", "or",
];
// Selected for `Language::VisualBasic`. Only the capitalised spellings are listed.
// NOTE(review): if matching is case-sensitive, other casings are not counted — confirm.
const BRANCH_VB: &[&str] = &[
    "If", "Then", "ElseIf", "Else", "Select", "Case", "While", "For", "Do", "And", "Or",
];
// Selected for `Language::Lisp`.
const BRANCH_LISP: &[&str] = &["if", "when", "unless", "cond", "case", "and", "or"];
// Pass 3 branch-keyword lists (scientific / infra / systems / graphics).
// Selected for `Language::Fortran`. Only lower-case spellings are listed.
const BRANCH_FORTRAN: &[&str] = &[
    "if", "then", "else", "elseif", "case", "do", "while", "where",
];
// Selected for `Language::Nix`.
const BRANCH_NIX: &[&str] = &["if", "then", "else"];
// Selected for `Language::Cmake`. Unlike the other lists, every token ends in `(`.
// NOTE(review): how the matcher treats tokens mixing letters and punctuation is not visible
// here; if they are raw substrings, `endif(` would also hit `if(` and `elseif(` would hit
// both `if(` and `elseif(` — confirm.
const BRANCH_CMAKE: &[&str] = &["if(", "elseif(", "else(", "while(", "foreach("];
// Selected for `Language::Elm`.
const BRANCH_ELM: &[&str] = &["if", "then", "else", "case", "of"];
// Selected for `Language::Awk`.
const BRANCH_AWK: &[&str] = &["if", "else", "while", "for", "do"];
886
887/// Returns (`branch_keywords`, `lsloc_strategy`) for the given language.
888/// Kept separate from `LANG_SCAN_TABLE` to avoid touching that large table.
889const fn language_complexity_config(
890    language: Language,
891) -> (&'static [&'static str], LslocStrategy) {
892    match language {
893        // ── C-ternary family (ternary operator counted as branch) ─────────────
894        Language::C
895        | Language::Cpp
896        | Language::ObjectiveC
897        | Language::CSharp
898        | Language::JavaScript
899        | Language::TypeScript
900        | Language::Svelte
901        | Language::Vue
902        | Language::Dart
903        | Language::Groovy
904        | Language::Swift
905        | Language::Solidity => (BRANCH_C_TERNARY, LslocStrategy::Semicolons),
906        // ── C-family (no ternary keyword) ────────────────────────────────────
907        Language::Java | Language::Kotlin | Language::Scala | Language::D | Language::Glsl => {
908            (BRANCH_C_FAMILY, LslocStrategy::Semicolons)
909        }
910        Language::Go => (BRANCH_GO, LslocStrategy::Semicolons),
911        Language::Rust => (BRANCH_RUST, LslocStrategy::Semicolons),
912        Language::Zig => (BRANCH_ZIG, LslocStrategy::Semicolons),
913        Language::FSharp => (BRANCH_FSHARP, LslocStrategy::Unsupported),
914        // ── Hash-comment family ───────────────────────────────────────────────
915        Language::Shell => (BRANCH_SHELL, LslocStrategy::NonContinuationNewlines),
916        Language::Elixir => (BRANCH_ELIXIR, LslocStrategy::NonContinuationNewlines),
917        Language::Perl => (BRANCH_PERL, LslocStrategy::Semicolons),
918        Language::R => (BRANCH_R, LslocStrategy::NonContinuationNewlines),
919        Language::Ruby | Language::Crystal => (BRANCH_RUBY, LslocStrategy::NonContinuationNewlines),
920        Language::Python => (BRANCH_PYTHON, LslocStrategy::NonContinuationNewlines),
921        Language::PowerShell => (BRANCH_POWERSHELL, LslocStrategy::Unsupported),
922        Language::Nim => (BRANCH_NIM, LslocStrategy::NonContinuationNewlines),
923        // ── Unique comment styles ─────────────────────────────────────────────
924        Language::Lua => (BRANCH_LUA, LslocStrategy::Unsupported),
925        Language::Haskell => (BRANCH_HASKELL, LslocStrategy::Unsupported),
926        Language::Sql => (BRANCH_SQL, LslocStrategy::Semicolons),
927        Language::Ocaml => (BRANCH_OCAML, LslocStrategy::Semicolons),
928        Language::Clojure => (BRANCH_CLOJURE, LslocStrategy::Unsupported),
929        Language::Php => (BRANCH_PHP, LslocStrategy::Semicolons),
930        Language::Julia => (BRANCH_JULIA, LslocStrategy::NonContinuationNewlines),
931        Language::Protobuf => (&[], LslocStrategy::Semicolons),
932        Language::Hcl => (&[], LslocStrategy::NonContinuationNewlines),
933        // ── Legacy / embedded / HDL ───────────────────────────────────────────
934        Language::Ada => (BRANCH_ADA, LslocStrategy::Semicolons),
935        Language::Vhdl => (BRANCH_VHDL, LslocStrategy::Semicolons),
936        Language::Verilog => (BRANCH_VERILOG, LslocStrategy::Semicolons),
937        Language::Tcl => (BRANCH_TCL, LslocStrategy::NonContinuationNewlines),
938        Language::Pascal => (BRANCH_PASCAL, LslocStrategy::Semicolons),
939        Language::VisualBasic => (BRANCH_VB, LslocStrategy::NonContinuationNewlines),
940        Language::Lisp => (BRANCH_LISP, LslocStrategy::Unsupported),
941        // ── Scientific / infra / systems / graphics ───────────────────────────
942        Language::Fortran => (BRANCH_FORTRAN, LslocStrategy::NonContinuationNewlines),
943        Language::Nix => (BRANCH_NIX, LslocStrategy::Unsupported),
944        Language::Cmake => (BRANCH_CMAKE, LslocStrategy::Unsupported),
945        Language::Elm => (BRANCH_ELM, LslocStrategy::Unsupported),
946        Language::Awk => (BRANCH_AWK, LslocStrategy::NonContinuationNewlines),
947        // ── No branch detection / syntax unsupported ──────────────────────────
948        Language::Makefile
949        | Language::Dockerfile
950        | Language::Css
951        | Language::Html
952        | Language::Xml
953        | Language::Assembly
954        | Language::Erlang
955        | Language::GraphQl
956        | Language::Scss => (&[], LslocStrategy::Unsupported),
957    }
958}
959
/// Keyword-prefix table driving best-effort structural symbol detection for one language.
///
/// Each field holds line prefixes that are compared against a line after its leading
/// whitespace has been stripped; a match marks the line as a definition of that category.
/// An empty slice disables detection for the category.
#[derive(Debug, Clone, Copy)]
struct SymbolPatterns {
    functions: &'static [&'static str],
    /// Prefixes that count as a function only when the line ALSO contains `(` AND no `=`
    /// sits between the prefix and the first `(`.  Needed for C/C++, where a definition is
    /// introduced by its return type (`void`, `int`, `bool`, …) rather than a keyword; the
    /// paren guard tells `void f(x)` apart from `void* p = malloc(n)`.
    functions_prefix_paren: &'static [&'static str],
    classes: &'static [&'static str],
    variables: &'static [&'static str],
    imports: &'static [&'static str],
    /// Prefixes (after stripping leading whitespace) marking a test case or test function
    /// definition. Matched against code lines only, like the other symbol categories.
    tests: &'static [&'static str],
    /// Prefixes marking a test assertion call (`ASSERT_EQ`, `assertEquals`, `assert_eq!`,
    /// `Assert.AreEqual`, etc.). Matched against code lines only.
    assertions: &'static [&'static str],
    /// Prefixes marking a test suite / fixture / group declaration
    /// (`TEST_GROUP`, `BOOST_AUTO_TEST_SUITE`, `[TestClass]`, `[TestFixture]`, etc.).
    test_suites: &'static [&'static str],
    /// Type-keyword prefixes (e.g. `"int "`, `"const "`) that mark a variable declaration
    /// when the line satisfies the complement of the `functions_prefix_paren` condition:
    /// no `(` at all, or a `=` before the first `(`.  Needed for C/C++, where functions and
    /// variables start with the same type keywords and only the paren guard separates them.
    variables_prefix_no_paren: &'static [&'static str],
}

impl SymbolPatterns {
    /// Builds a table in which every category is the empty slice, i.e. nothing is detected.
    const fn none() -> Self {
        const OFF: &[&str] = &[];
        Self {
            functions: OFF,
            functions_prefix_paren: OFF,
            classes: OFF,
            variables: OFF,
            imports: OFF,
            tests: OFF,
            assertions: OFF,
            test_suites: OFF,
            variables_prefix_no_paren: OFF,
        }
    }
}

/// Shared "detect nothing" table: every field is `&[]`.
const SP_NONE: SymbolPatterns = SymbolPatterns::none();
1009
1010// Solidity: `function`/`modifier`/`constructor` definitions; `contract`/`interface`/
1011// `library` are the structural units (mapped to classes alongside struct/enum).
1012const SP_SOLIDITY: SymbolPatterns = SymbolPatterns {
1013    functions: &[
1014        "function ",
1015        "modifier ",
1016        "constructor",
1017        "receive ",
1018        "fallback ",
1019    ],
1020    functions_prefix_paren: &[],
1021    classes: &["contract ", "interface ", "library ", "struct ", "enum "],
1022    variables: &[],
1023    imports: &["import "],
1024    tests: &[],
1025    assertions: &[],
1026    test_suites: &[],
1027    variables_prefix_no_paren: &[],
1028};
1029
1030// Protocol Buffers: `message`/`service`/`enum` declarations are the structural units;
1031// `rpc` entries are the closest thing to functions.
1032const SP_PROTOBUF: SymbolPatterns = SymbolPatterns {
1033    functions: &["rpc "],
1034    functions_prefix_paren: &[],
1035    classes: &["message ", "service ", "enum "],
1036    variables: &[],
1037    imports: &["import "],
1038    tests: &[],
1039    assertions: &[],
1040    test_suites: &[],
1041    variables_prefix_no_paren: &[],
1042};
1043
1044// ── Pass 2 symbol patterns (legacy + embedded / HDL) ──────────────────────────
1045const SP_ADA: SymbolPatterns = SymbolPatterns {
1046    functions: &["procedure ", "function "],
1047    functions_prefix_paren: &[],
1048    classes: &["package ", "type ", "task ", "protected "],
1049    variables: &[],
1050    imports: &["with ", "use "],
1051    tests: &[],
1052    assertions: &[],
1053    test_suites: &[],
1054    variables_prefix_no_paren: &[],
1055};
1056
1057const SP_VHDL: SymbolPatterns = SymbolPatterns {
1058    functions: &["function ", "procedure ", "process "],
1059    functions_prefix_paren: &[],
1060    classes: &["entity ", "architecture ", "package ", "component "],
1061    variables: &[],
1062    imports: &["library ", "use "],
1063    tests: &[],
1064    assertions: &[],
1065    test_suites: &[],
1066    variables_prefix_no_paren: &[],
1067};
1068
1069const SP_VERILOG: SymbolPatterns = SymbolPatterns {
1070    functions: &["function ", "task "],
1071    functions_prefix_paren: &[],
1072    classes: &["module ", "interface ", "class ", "package "],
1073    variables: &[],
1074    imports: &["import ", "`include"],
1075    tests: &[],
1076    assertions: &[],
1077    test_suites: &[],
1078    variables_prefix_no_paren: &[],
1079};
1080
1081const SP_TCL: SymbolPatterns = SymbolPatterns {
1082    functions: &["proc "],
1083    functions_prefix_paren: &[],
1084    classes: &[],
1085    variables: &[],
1086    imports: &["source ", "package require "],
1087    tests: &[],
1088    assertions: &[],
1089    test_suites: &[],
1090    variables_prefix_no_paren: &[],
1091};
1092
1093const SP_PASCAL: SymbolPatterns = SymbolPatterns {
1094    functions: &["procedure ", "function "],
1095    functions_prefix_paren: &[],
1096    classes: &["type ", "class ", "record "],
1097    variables: &[],
1098    imports: &["uses "],
1099    tests: &[],
1100    assertions: &[],
1101    test_suites: &[],
1102    variables_prefix_no_paren: &[],
1103};
1104
1105const SP_VB: SymbolPatterns = SymbolPatterns {
1106    functions: &[
1107        "Sub ",
1108        "Function ",
1109        "Private Sub ",
1110        "Public Sub ",
1111        "Private Function ",
1112        "Public Function ",
1113    ],
1114    functions_prefix_paren: &[],
1115    classes: &["Class ", "Module ", "Structure "],
1116    variables: &[],
1117    imports: &["Imports "],
1118    tests: &[],
1119    assertions: &[],
1120    test_suites: &[],
1121    variables_prefix_no_paren: &[],
1122};
1123
1124const SP_LISP: SymbolPatterns = SymbolPatterns {
1125    functions: &["(defun ", "(defmacro ", "(define ", "(defmethod ", "(defn "],
1126    functions_prefix_paren: &[],
1127    classes: &["(defclass ", "(defstruct "],
1128    variables: &[],
1129    imports: &["(require ", "(import ", "(use-package "],
1130    tests: &[],
1131    assertions: &[],
1132    test_suites: &[],
1133    variables_prefix_no_paren: &[],
1134};
1135
1136// ── Pass 3 symbol patterns (scientific / infra / systems / graphics) ──────────
1137const SP_FORTRAN: SymbolPatterns = SymbolPatterns {
1138    functions: &["subroutine ", "function "],
1139    functions_prefix_paren: &[],
1140    classes: &["module ", "program ", "type "],
1141    variables: &[],
1142    imports: &["use ", "include "],
1143    tests: &[],
1144    assertions: &[],
1145    test_suites: &[],
1146    variables_prefix_no_paren: &[],
1147};
1148
1149const SP_CRYSTAL: SymbolPatterns = SymbolPatterns {
1150    functions: &["def "],
1151    functions_prefix_paren: &[],
1152    classes: &["class ", "module ", "struct ", "enum "],
1153    variables: &[],
1154    imports: &["require "],
1155    tests: &[],
1156    assertions: &[],
1157    test_suites: &[],
1158    variables_prefix_no_paren: &[],
1159};
1160
1161const SP_D: SymbolPatterns = SymbolPatterns {
1162    functions: &[],
1163    functions_prefix_paren: &[],
1164    classes: &["class ", "struct ", "interface ", "enum ", "template "],
1165    variables: &[],
1166    imports: &["import "],
1167    tests: &[],
1168    assertions: &[],
1169    test_suites: &[],
1170    variables_prefix_no_paren: &[],
1171};
1172
1173const SP_CMAKE: SymbolPatterns = SymbolPatterns {
1174    functions: &["function(", "macro("],
1175    functions_prefix_paren: &[],
1176    classes: &[],
1177    variables: &[],
1178    imports: &["include(", "add_subdirectory("],
1179    tests: &[],
1180    assertions: &[],
1181    test_suites: &[],
1182    variables_prefix_no_paren: &[],
1183};
1184
1185const SP_ELM: SymbolPatterns = SymbolPatterns {
1186    functions: &[],
1187    functions_prefix_paren: &[],
1188    classes: &["type "],
1189    variables: &[],
1190    imports: &["import "],
1191    tests: &[],
1192    assertions: &[],
1193    test_suites: &[],
1194    variables_prefix_no_paren: &[],
1195};
1196
1197const SP_AWK: SymbolPatterns = SymbolPatterns {
1198    functions: &["function "],
1199    functions_prefix_paren: &[],
1200    classes: &[],
1201    variables: &[],
1202    imports: &[],
1203    tests: &[],
1204    assertions: &[],
1205    test_suites: &[],
1206    variables_prefix_no_paren: &[],
1207};
1208
1209const SP_RUST: SymbolPatterns = SymbolPatterns {
1210    functions: &[
1211        "fn ",
1212        "pub fn ",
1213        "pub(crate) fn ",
1214        "pub(super) fn ",
1215        "async fn ",
1216        "pub async fn ",
1217        "pub(crate) async fn ",
1218        "unsafe fn ",
1219        "pub unsafe fn ",
1220        "pub(crate) unsafe fn ",
1221        "const fn ",
1222        "pub const fn ",
1223        "pub(crate) const fn ",
1224        "extern fn ",
1225        "pub extern fn ",
1226    ],
1227    functions_prefix_paren: &[],
1228    classes: &[
1229        "struct ",
1230        "pub struct ",
1231        "pub(crate) struct ",
1232        "enum ",
1233        "pub enum ",
1234        "pub(crate) enum ",
1235        "trait ",
1236        "pub trait ",
1237        "pub(crate) trait ",
1238        "impl ",
1239        "impl<",
1240        "type ",
1241        "pub type ",
1242        "pub(crate) type ",
1243    ],
1244    variables: &["let ", "let mut "],
1245    imports: &["use ", "pub use ", "pub(crate) use ", "extern crate "],
1246    // Built-in #[test], tokio/actix async test attributes, rstest
1247    tests: &[
1248        "#[test]",
1249        "#[tokio::test]",
1250        "#[actix_web::test]",
1251        "#[rstest]",
1252        "#[test_case",
1253    ],
1254    assertions: &[
1255        "assert_eq!(",
1256        "assert_ne!(",
1257        "assert!(",
1258        "assert_matches!(",
1259        "assert_err!(",
1260        "assert_ok!(",
1261    ],
1262    test_suites: &[],
1263    variables_prefix_no_paren: &[],
1264};
1265
1266const SP_PYTHON: SymbolPatterns = SymbolPatterns {
1267    functions: &["def ", "async def "],
1268    functions_prefix_paren: &[],
1269    classes: &["class "],
1270    variables: &[],
1271    imports: &["import ", "from "],
1272    // pytest: test_ prefix functions and Test* classes; unittest: test_ methods
1273    tests: &["def test_", "async def test_", "class Test"],
1274    assertions: &[
1275        "self.assertEqual(",
1276        "self.assertNotEqual(",
1277        "self.assertTrue(",
1278        "self.assertFalse(",
1279        "self.assertIsNone(",
1280        "self.assertIsNotNone(",
1281        "self.assertIn(",
1282        "self.assertNotIn(",
1283        "self.assertRaises(",
1284        "self.assertAlmostEqual(",
1285    ],
1286    test_suites: &[],
1287    variables_prefix_no_paren: &[],
1288};
1289
1290const SP_JS: SymbolPatterns = SymbolPatterns {
1291    functions: &[
1292        "function ",
1293        "async function ",
1294        "export function ",
1295        "export async function ",
1296        "export default function ",
1297    ],
1298    functions_prefix_paren: &[],
1299    classes: &["class ", "export class ", "export default class "],
1300    variables: &[
1301        "var ",
1302        "let ",
1303        "const ",
1304        "export var ",
1305        "export let ",
1306        "export const ",
1307    ],
1308    imports: &["import "],
1309    // Jest/Mocha/Jasmine: describe/it/test block openers
1310    tests: &[
1311        "describe(",
1312        "it(",
1313        "test(",
1314        "it.each(",
1315        "test.each(",
1316        "describe.each(",
1317    ],
1318    assertions: &["expect("],
1319    test_suites: &[],
1320    variables_prefix_no_paren: &[],
1321};
1322
1323const SP_TS: SymbolPatterns = SymbolPatterns {
1324    functions: &[
1325        "function ",
1326        "async function ",
1327        "export function ",
1328        "export async function ",
1329        "export default function ",
1330    ],
1331    functions_prefix_paren: &[],
1332    classes: &[
1333        "class ",
1334        "export class ",
1335        "export default class ",
1336        "abstract class ",
1337        "export abstract class ",
1338        "interface ",
1339        "export interface ",
1340        "declare class ",
1341        "declare interface ",
1342    ],
1343    variables: &[
1344        "var ",
1345        "let ",
1346        "const ",
1347        "export var ",
1348        "export let ",
1349        "export const ",
1350    ],
1351    imports: &["import "],
1352    // Jest/Mocha/Jasmine/Vitest: describe/it/test block openers
1353    tests: &[
1354        "describe(",
1355        "it(",
1356        "test(",
1357        "it.each(",
1358        "test.each(",
1359        "describe.each(",
1360    ],
1361    assertions: &["expect("],
1362    test_suites: &[],
1363    variables_prefix_no_paren: &[],
1364};
1365
1366const SP_GO: SymbolPatterns = SymbolPatterns {
1367    functions: &["func "],
1368    functions_prefix_paren: &[],
1369    classes: &["type "],
1370    variables: &["var "],
1371    imports: &["import "],
1372    // Go standard testing: Test* functions (convention is practically exclusive to _test.go files)
1373    tests: &["func Test", "func Benchmark", "func Fuzz"],
1374    assertions: &[],
1375    test_suites: &[],
1376    variables_prefix_no_paren: &[],
1377};
1378
1379const SP_JAVA: SymbolPatterns = SymbolPatterns {
1380    functions: &[],
1381    functions_prefix_paren: &[],
1382    classes: &[
1383        "class ",
1384        "public class ",
1385        "private class ",
1386        "protected class ",
1387        "abstract class ",
1388        "final class ",
1389        "public abstract class ",
1390        "public final class ",
1391        "interface ",
1392        "public interface ",
1393        "enum ",
1394        "public enum ",
1395        "record ",
1396        "public record ",
1397        "@interface ",
1398    ],
1399    variables: &[],
1400    imports: &["import "],
1401    // JUnit 4 & 5, TestNG — annotations appear on their own line before the method
1402    tests: &[
1403        "@Test",
1404        "@ParameterizedTest",
1405        "@RepeatedTest",
1406        "@TestFactory",
1407        "@TestTemplate",
1408    ],
1409    assertions: &[
1410        "assertEquals(",
1411        "assertNotEquals(",
1412        "assertTrue(",
1413        "assertFalse(",
1414        "assertNull(",
1415        "assertNotNull(",
1416        "assertThat(",
1417        "assertThrows(",
1418        "assertAll(",
1419        "assertArrayEquals(",
1420        "assertIterableEquals(",
1421        "assertLinesMatch(",
1422    ],
1423    test_suites: &[],
1424    variables_prefix_no_paren: &[],
1425};
1426
1427const SP_CSHARP: SymbolPatterns = SymbolPatterns {
1428    functions: &[],
1429    functions_prefix_paren: &[],
1430    classes: &[
1431        "class ",
1432        "public class ",
1433        "private class ",
1434        "protected class ",
1435        "internal class ",
1436        "abstract class ",
1437        "sealed class ",
1438        "static class ",
1439        "partial class ",
1440        "public abstract class ",
1441        "public sealed class ",
1442        "public static class ",
1443        "interface ",
1444        "public interface ",
1445        "internal interface ",
1446        "enum ",
1447        "public enum ",
1448        "struct ",
1449        "public struct ",
1450        "record ",
1451        "public record ",
1452    ],
1453    variables: &["var "],
1454    imports: &["using "],
1455    // MSTest, NUnit, xUnit — attributes on their own line before the method
1456    tests: &[
1457        "[TestMethod]",
1458        "[Test]",
1459        "[Fact]",
1460        "[Theory]",
1461        "[TestCase(",
1462        "[DataRow(",
1463        "[InlineData(",
1464        "[MemberData(",
1465    ],
1466    assertions: &[
1467        "Assert.AreEqual(",
1468        "Assert.AreNotEqual(",
1469        "Assert.IsTrue(",
1470        "Assert.IsFalse(",
1471        "Assert.IsNull(",
1472        "Assert.IsNotNull(",
1473        "Assert.Equal(",
1474        "Assert.NotEqual(",
1475        "Assert.True(",
1476        "Assert.False(",
1477        "Assert.That(",
1478        "Assert.Contains(",
1479        "Assert.Throws(",
1480        "Assert.ThrowsAsync(",
1481        "Assert.IsInstanceOfType(",
1482    ],
1483    test_suites: &["[TestClass]", "[TestFixture]", "[SetUpFixture]"],
1484    variables_prefix_no_paren: &[],
1485};
1486
// GTest, Catch2/doctest, Boost.Test, CppUnit, Unity, Check, CMocka, CppUTest patterns for
// C and C++. Used as the `tests` list of both `SP_C` and `SP_CPP`.
// NOTE(review): `TEST_GROUP(`, `TEST_GROUP_BASE(`, `BOOST_AUTO_TEST_SUITE(` and
// `CPPUNIT_TEST_SUITE(` are also listed in `SUITE_PATTERNS_C_CPP`; which category wins for
// such a line depends on the matcher, which is not visible here — confirm.
const TEST_PATTERNS_C_CPP: &[&str] = &[
    // Google Test
    "TEST(",
    "TEST_F(",
    "TEST_P(",
    "TYPED_TEST(",
    "TYPED_TEST_P(",
    "INSTANTIATE_TEST_SUITE_P(",
    "INSTANTIATE_TYPED_TEST_SUITE_P(",
    // Catch2 / doctest
    "TEST_CASE(",
    "SECTION(",
    "SCENARIO(",
    "SCENARIO_METHOD(",
    "TEST_CASE_METHOD(",
    // Boost.Test
    "BOOST_AUTO_TEST_CASE(",
    "BOOST_FIXTURE_TEST_CASE(",
    "BOOST_AUTO_TEST_SUITE(",
    "BOOST_PARAM_TEST_CASE(",
    // CppUnit
    "CPPUNIT_TEST(",
    "CPPUNIT_TEST_SUITE(",
    // Unity (embedded C)
    "RUN_TEST(",
    "TEST_IGNORE(",
    "TEST_FAIL(",
    // Check (libcheck — embedded C)
    "START_TEST(",
    "tcase_add_test(",
    "suite_create(",
    // CMocka (embedded C)
    "cmocka_unit_test(",
    "cmocka_run_group_tests(",
    // CppUTest
    "IGNORE_TEST(",
    "TEST_GROUP(",
    "TEST_GROUP_BASE(",
];
1527
// Test assertion patterns shared by C and C++; used as the `assertions` list of both
// `SP_C` and `SP_CPP`. Every entry ends in `(`, so only call-like uses at the start of a
// line match. Grouped by framework: Google Test, Catch2/doctest, Unity, CMocka.
const ASSERT_PATTERNS_C_CPP: &[&str] = &[
    // Google Test ASSERT_* (test-stopping failures)
    "ASSERT_EQ(",
    "ASSERT_NE(",
    "ASSERT_LT(",
    "ASSERT_LE(",
    "ASSERT_GT(",
    "ASSERT_GE(",
    "ASSERT_TRUE(",
    "ASSERT_FALSE(",
    "ASSERT_STREQ(",
    "ASSERT_STRNE(",
    "ASSERT_FLOAT_EQ(",
    "ASSERT_DOUBLE_EQ(",
    "ASSERT_NEAR(",
    "ASSERT_THROW(",
    "ASSERT_NO_THROW(",
    "ASSERT_ANY_THROW(",
    // Google Test EXPECT_* (non-stopping failures)
    "EXPECT_EQ(",
    "EXPECT_NE(",
    "EXPECT_LT(",
    "EXPECT_LE(",
    "EXPECT_GT(",
    "EXPECT_GE(",
    "EXPECT_TRUE(",
    "EXPECT_FALSE(",
    "EXPECT_STREQ(",
    "EXPECT_STRNE(",
    "EXPECT_FLOAT_EQ(",
    "EXPECT_DOUBLE_EQ(",
    "EXPECT_NEAR(",
    "EXPECT_THROW(",
    "EXPECT_NO_THROW(",
    "EXPECT_ANY_THROW(",
    // Catch2 / doctest assertions
    "REQUIRE(",
    "CHECK(",
    "REQUIRE_FALSE(",
    "CHECK_FALSE(",
    "REQUIRE_NOTHROW(",
    "CHECK_NOTHROW(",
    "REQUIRE_THROWS(",
    "CHECK_THROWS(",
    "REQUIRE_THAT(",
    "CHECK_THAT(",
    // Unity assertions (embedded C)
    "TEST_ASSERT_EQUAL(",
    "TEST_ASSERT_EQUAL_INT(",
    "TEST_ASSERT_EQUAL_STRING(",
    "TEST_ASSERT_EQUAL_FLOAT(",
    "TEST_ASSERT_EQUAL_DOUBLE(",
    "TEST_ASSERT_EQUAL_PTR(",
    "TEST_ASSERT_TRUE(",
    "TEST_ASSERT_FALSE(",
    "TEST_ASSERT_NULL(",
    "TEST_ASSERT_NOT_NULL(",
    "TEST_ASSERT_BITS_HIGH(",
    "TEST_ASSERT_BITS_LOW(",
    // CMocka assertions (embedded C)
    "assert_int_equal(",
    "assert_int_not_equal(",
    "assert_string_equal(",
    "assert_string_not_equal(",
    "assert_true(",
    "assert_false(",
    "assert_null(",
    "assert_non_null(",
    "assert_ptr_equal(",
    "assert_memory_equal(",
    "assert_return_code(",
];
1601
// Test suite / fixture / group declaration patterns for C and C++ (CppUTest, Boost.Test,
// CppUnit); used as the `test_suites` list of both `SP_C` and `SP_CPP`.
//
// Only macros that OPEN a suite are listed. `CPPUNIT_TEST_SUITE_END(` is deliberately
// absent: it closes the block started by `CPPUNIT_TEST_SUITE(`, so matching it as well
// would count every CppUnit fixture twice (the Boost `…_SUITE_END` macro is likewise not
// listed).
const SUITE_PATTERNS_C_CPP: &[&str] = &[
    "TEST_GROUP(",
    "TEST_GROUP_BASE(",
    "BOOST_AUTO_TEST_SUITE(",
    "CPPUNIT_TEST_SUITE(",
];
1610
1611const SP_C: SymbolPatterns = SymbolPatterns {
1612    // C has no function keyword; detect by common return types that precede `(` with no `=`.
1613    functions: &[],
1614    functions_prefix_paren: &[
1615        "void ",
1616        "int ",
1617        "char ",
1618        "float ",
1619        "double ",
1620        "long ",
1621        "unsigned ",
1622        "size_t ",
1623        "static ",
1624        "inline ",
1625        "const ",
1626        "extern ",
1627    ],
1628    classes: &[
1629        "struct ",
1630        "typedef struct ",
1631        "union ",
1632        "typedef union ",
1633        "typedef enum ",
1634    ],
1635    variables: &[],
1636    imports: &["#include "],
1637    tests: TEST_PATTERNS_C_CPP,
1638    assertions: ASSERT_PATTERNS_C_CPP,
1639    test_suites: SUITE_PATTERNS_C_CPP,
1640    // Same type keywords as functions_prefix_paren; the complement paren guard (no unguarded `(`
1641    // in the line) distinguishes `int x;` / `int x = 5;` (variable) from `int foo()` (function).
1642    variables_prefix_no_paren: &[
1643        "void ",
1644        "int ",
1645        "char ",
1646        "float ",
1647        "double ",
1648        "long ",
1649        "unsigned ",
1650        "size_t ",
1651        "static ",
1652        "inline ",
1653        "const ",
1654        "extern ",
1655    ],
1656};
1657
1658const SP_CPP: SymbolPatterns = SymbolPatterns {
1659    // C++ specific function keyword-prefixes; return-type-led patterns use functions_prefix_paren.
1660    functions: &[
1661        "virtual ",  // virtual method declaration/definition
1662        "explicit ", // explicit constructor modifier
1663        "~",         // destructor (e.g. ~MyClass())
1664        "operator",  // operator overload (operator==, operator+, …)
1665    ],
1666    functions_prefix_paren: &[
1667        "void ",
1668        "bool ",
1669        "int ",
1670        "char ",
1671        "float ",
1672        "double ",
1673        "long ",
1674        "unsigned ",
1675        "size_t ",
1676        "auto ",
1677        "static ",
1678        "inline ",
1679        "constexpr ",
1680        "const ",
1681        "extern ",
1682    ],
1683    // `template<` (no space) is the dominant modern style alongside `template ` (with space).
1684    classes: &["class ", "struct ", "namespace ", "template ", "template<"],
1685    variables: &[],
1686    imports: &["#include "],
1687    tests: TEST_PATTERNS_C_CPP,
1688    assertions: ASSERT_PATTERNS_C_CPP,
1689    test_suites: SUITE_PATTERNS_C_CPP,
1690    // Mirror of functions_prefix_paren; complement paren guard splits variables from functions.
1691    variables_prefix_no_paren: &[
1692        "void ",
1693        "bool ",
1694        "int ",
1695        "char ",
1696        "float ",
1697        "double ",
1698        "long ",
1699        "unsigned ",
1700        "size_t ",
1701        "auto ",
1702        "static ",
1703        "inline ",
1704        "constexpr ",
1705        "const ",
1706        "extern ",
1707    ],
1708};
1709
1710const SP_SHELL: SymbolPatterns = SymbolPatterns {
1711    functions: &["function "],
1712    functions_prefix_paren: &[],
1713    classes: &[],
1714    variables: &["declare ", "local ", "export "],
1715    imports: &["source ", ". "],
1716    tests: &[],
1717    assertions: &[],
1718    test_suites: &[],
1719    variables_prefix_no_paren: &[],
1720};
1721
1722const SP_POWERSHELL: SymbolPatterns = SymbolPatterns {
1723    functions: &["function ", "Function "],
1724    functions_prefix_paren: &[],
1725    classes: &["class "],
1726    variables: &[],
1727    imports: &["Import-Module ", "using "],
1728    // Pester test framework
1729    tests: &["Describe ", "It ", "Context "],
1730    assertions: &[],
1731    test_suites: &[],
1732    variables_prefix_no_paren: &[],
1733};
1734
1735const SP_KOTLIN: SymbolPatterns = SymbolPatterns {
1736    functions: &[
1737        "fun ",
1738        "private fun ",
1739        "public fun ",
1740        "protected fun ",
1741        "internal fun ",
1742        "override fun ",
1743        "suspend fun ",
1744        "abstract fun ",
1745        "open fun ",
1746        "private suspend fun ",
1747        "public suspend fun ",
1748    ],
1749    functions_prefix_paren: &[],
1750    classes: &[
1751        "class ",
1752        "data class ",
1753        "sealed class ",
1754        "abstract class ",
1755        "open class ",
1756        "object ",
1757        "companion object",
1758        "interface ",
1759        "enum class ",
1760        "annotation class ",
1761    ],
1762    variables: &["val ", "var ", "private val ", "private var ", "const val "],
1763    imports: &["import "],
1764    // JUnit 4/5, KotlinTest, Kotest
1765    tests: &[
1766        "@Test",
1767        "@ParameterizedTest",
1768        "@RepeatedTest",
1769        "\"should ",
1770        "\"it ",
1771    ],
1772    assertions: &[
1773        "assertEquals(",
1774        "assertNotEquals(",
1775        "assertTrue(",
1776        "assertFalse(",
1777        "assertNull(",
1778        "assertNotNull(",
1779        "assertThat(",
1780        "assertThrows(",
1781        "shouldBe(",
1782        "shouldNotBe(",
1783        "shouldThrow(",
1784    ],
1785    test_suites: &[],
1786    variables_prefix_no_paren: &[],
1787};
1788
1789const SP_SWIFT: SymbolPatterns = SymbolPatterns {
1790    functions: &[
1791        "func ",
1792        "private func ",
1793        "public func ",
1794        "internal func ",
1795        "override func ",
1796        "open func ",
1797        "static func ",
1798        "class func ",
1799        "mutating func ",
1800        "private static func ",
1801        "public static func ",
1802    ],
1803    functions_prefix_paren: &[],
1804    classes: &[
1805        "class ",
1806        "struct ",
1807        "protocol ",
1808        "enum ",
1809        "extension ",
1810        "actor ",
1811        "public class ",
1812        "private class ",
1813        "open class ",
1814        "final class ",
1815        "public struct ",
1816        "private struct ",
1817        "public protocol ",
1818    ],
1819    variables: &[
1820        "var ",
1821        "let ",
1822        "private var ",
1823        "private let ",
1824        "static var ",
1825        "static let ",
1826    ],
1827    imports: &["import "],
1828    // XCTest: test functions are named test* by convention; Swift Testing: @Test attribute
1829    tests: &["func test", "func Test", "@Test"],
1830    assertions: &[
1831        "XCTAssertEqual(",
1832        "XCTAssertNotEqual(",
1833        "XCTAssertTrue(",
1834        "XCTAssertFalse(",
1835        "XCTAssertNil(",
1836        "XCTAssertNotNil(",
1837        "XCTAssertGreaterThan(",
1838        "XCTAssertLessThan(",
1839        "XCTAssertThrowsError(",
1840        "XCTAssertNoThrow(",
1841        "#expect(",
1842    ],
1843    test_suites: &[],
1844    variables_prefix_no_paren: &[],
1845};
1846
1847const SP_RUBY: SymbolPatterns = SymbolPatterns {
1848    functions: &["def ", "private def ", "protected def "],
1849    functions_prefix_paren: &[],
1850    classes: &["class ", "module "],
1851    variables: &[],
1852    imports: &["require ", "require_relative "],
1853    // RSpec / minitest
1854    tests: &["it ", "it(", "describe ", "context ", "test "],
1855    assertions: &[],
1856    test_suites: &[],
1857    variables_prefix_no_paren: &[],
1858};
1859
1860const SP_SCALA: SymbolPatterns = SymbolPatterns {
1861    functions: &["def ", "private def ", "protected def ", "override def "],
1862    functions_prefix_paren: &[],
1863    classes: &[
1864        "class ",
1865        "case class ",
1866        "abstract class ",
1867        "sealed class ",
1868        "object ",
1869        "trait ",
1870    ],
1871    variables: &["val ", "var ", "lazy val "],
1872    imports: &["import "],
1873    // ScalaTest / MUnit: FunSuite test("..."), FlatSpec it("..."), AnyWordSpec "..." should
1874    tests: &["test(", "it(", "describe("],
1875    assertions: &[],
1876    test_suites: &[],
1877    variables_prefix_no_paren: &[],
1878};
1879
1880const SP_PHP: SymbolPatterns = SymbolPatterns {
1881    functions: &[
1882        "function ",
1883        "public function ",
1884        "private function ",
1885        "protected function ",
1886        "static function ",
1887        "abstract function ",
1888        "final function ",
1889        "public static function ",
1890        "private static function ",
1891        "protected static function ",
1892    ],
1893    functions_prefix_paren: &[],
1894    classes: &[
1895        "class ",
1896        "abstract class ",
1897        "final class ",
1898        "interface ",
1899        "trait ",
1900        "enum ",
1901    ],
1902    variables: &[],
1903    imports: &[
1904        "use ",
1905        "require ",
1906        "require_once ",
1907        "include ",
1908        "include_once ",
1909    ],
1910    // PHPUnit: test methods start with test, or use @test annotation
1911    tests: &[
1912        "public function test",
1913        "function test",
1914        "#[Test]",
1915        "#[DataProvider(",
1916    ],
1917    assertions: &[],
1918    test_suites: &[],
1919    variables_prefix_no_paren: &[],
1920};
1921
1922const SP_ELIXIR: SymbolPatterns = SymbolPatterns {
1923    functions: &[
1924        "def ",
1925        "defp ",
1926        "defmacro ",
1927        "defmacrop ",
1928        "defguard ",
1929        "defguardp ",
1930    ],
1931    functions_prefix_paren: &[],
1932    classes: &["defmodule ", "defprotocol ", "defimpl "],
1933    variables: &[],
1934    imports: &["import ", "alias ", "use ", "require "],
1935    // ExUnit
1936    tests: &["test ", "describe "],
1937    assertions: &[],
1938    test_suites: &[],
1939    variables_prefix_no_paren: &[],
1940};
1941
1942const SP_ERLANG: SymbolPatterns = SymbolPatterns {
1943    functions: &[],
1944    functions_prefix_paren: &[],
1945    classes: &["-module("],
1946    variables: &[],
1947    imports: &["-import(", "-include(", "-include_lib("],
1948    tests: &[],
1949    assertions: &[],
1950    test_suites: &[],
1951    variables_prefix_no_paren: &[],
1952};
1953
1954const SP_FSHARP: SymbolPatterns = SymbolPatterns {
1955    functions: &[
1956        "let ",
1957        "let rec ",
1958        "member ",
1959        "override ",
1960        "abstract member ",
1961    ],
1962    functions_prefix_paren: &[],
1963    classes: &["type "],
1964    variables: &["let mutable "],
1965    imports: &["open "],
1966    // NUnit / xUnit attributes on their own line; FsUnit uses [<Test>] / [<Fact>]
1967    tests: &["[<Test>]", "[<Fact>]", "[<Theory>]", "[<TestCase("],
1968    assertions: &[],
1969    test_suites: &[],
1970    variables_prefix_no_paren: &[],
1971};
1972
1973const SP_GROOVY: SymbolPatterns = SymbolPatterns {
1974    functions: &["def ", "private def ", "public def ", "protected def "],
1975    functions_prefix_paren: &[],
1976    classes: &["class ", "abstract class ", "interface ", "enum ", "trait "],
1977    variables: &[],
1978    imports: &["import "],
1979    // Spock framework: feature methods; JUnit annotations
1980    tests: &["def \"", "@Test", "given:", "when:", "then:", "expect:"],
1981    assertions: &[],
1982    test_suites: &[],
1983    variables_prefix_no_paren: &[],
1984};
1985
1986const SP_HASKELL: SymbolPatterns = SymbolPatterns {
1987    functions: &[],
1988    functions_prefix_paren: &[],
1989    classes: &["class ", "data ", "newtype ", "type "],
1990    variables: &[],
1991    imports: &["import "],
1992    tests: &[],
1993    assertions: &[],
1994    test_suites: &[],
1995    variables_prefix_no_paren: &[],
1996};
1997
1998const SP_LUA: SymbolPatterns = SymbolPatterns {
1999    functions: &["function ", "local function "],
2000    functions_prefix_paren: &[],
2001    classes: &[],
2002    variables: &["local "],
2003    imports: &[],
2004    // busted test framework
2005    tests: &["it(", "describe(", "pending("],
2006    assertions: &[],
2007    test_suites: &[],
2008    variables_prefix_no_paren: &[],
2009};
2010
2011const SP_NIM: SymbolPatterns = SymbolPatterns {
2012    functions: &[
2013        "proc ",
2014        "func ",
2015        "method ",
2016        "iterator ",
2017        "converter ",
2018        "template ",
2019        "macro ",
2020    ],
2021    functions_prefix_paren: &[],
2022    classes: &["type "],
2023    variables: &["var ", "let ", "const "],
2024    imports: &["import ", "from "],
2025    // unittest module
2026    tests: &["test "],
2027    assertions: &[],
2028    test_suites: &[],
2029    variables_prefix_no_paren: &[],
2030};
2031
2032const SP_OBJECTIVEC: SymbolPatterns = SymbolPatterns {
2033    functions: &["- (", "+ ("],
2034    functions_prefix_paren: &[],
2035    classes: &["@interface ", "@implementation ", "@protocol "],
2036    variables: &[],
2037    imports: &["#import ", "#include "],
2038    // XCTest: test methods start with - (void)test
2039    tests: &["- (void)test"],
2040    assertions: &[
2041        "XCTAssertEqual(",
2042        "XCTAssertNotEqual(",
2043        "XCTAssertTrue(",
2044        "XCTAssertFalse(",
2045        "XCTAssertNil(",
2046        "XCTAssertNotNil(",
2047        "XCTAssertGreaterThan(",
2048        "XCTAssertLessThan(",
2049        "XCTAssertThrowsError(",
2050        "XCTAssertNoThrow(",
2051    ],
2052    test_suites: &[],
2053    variables_prefix_no_paren: &[],
2054};
2055
2056const SP_OCAML: SymbolPatterns = SymbolPatterns {
2057    functions: &["let ", "let rec "],
2058    functions_prefix_paren: &[],
2059    classes: &["type ", "module ", "class "],
2060    variables: &[],
2061    imports: &["open "],
2062    tests: &[],
2063    assertions: &[],
2064    test_suites: &[],
2065    variables_prefix_no_paren: &[],
2066};
2067
2068const SP_PERL: SymbolPatterns = SymbolPatterns {
2069    functions: &["sub "],
2070    functions_prefix_paren: &[],
2071    classes: &["package "],
2072    variables: &["my ", "our ", "local "],
2073    imports: &["use ", "require "],
2074    tests: &[],
2075    assertions: &[],
2076    test_suites: &[],
2077    variables_prefix_no_paren: &[],
2078};
2079
2080const SP_CLOJURE: SymbolPatterns = SymbolPatterns {
2081    functions: &["(defn ", "(defn- ", "(defmacro ", "(defmulti "],
2082    functions_prefix_paren: &[],
2083    classes: &[
2084        "(defrecord ",
2085        "(defprotocol ",
2086        "(deftype ",
2087        "(definterface ",
2088    ],
2089    variables: &["(def ", "(defonce "],
2090    imports: &["(ns ", "(require "],
2091    // clojure.test
2092    tests: &["(deftest ", "(testing "],
2093    assertions: &[],
2094    test_suites: &[],
2095    variables_prefix_no_paren: &[],
2096};
2097
2098const SP_JULIA: SymbolPatterns = SymbolPatterns {
2099    functions: &["function ", "macro "],
2100    functions_prefix_paren: &[],
2101    classes: &[
2102        "struct ",
2103        "mutable struct ",
2104        "abstract type ",
2105        "primitive type ",
2106    ],
2107    variables: &["const "],
2108    imports: &["import ", "using "],
2109    // Test.jl standard library
2110    tests: &["@test ", "@testset "],
2111    assertions: &[],
2112    test_suites: &[],
2113    variables_prefix_no_paren: &[],
2114};
2115
2116const SP_DART: SymbolPatterns = SymbolPatterns {
2117    functions: &[],
2118    functions_prefix_paren: &[],
2119    classes: &["class ", "abstract class ", "mixin ", "extension ", "enum "],
2120    variables: &["var ", "final ", "const ", "late "],
2121    imports: &["import "],
2122    // flutter_test / test package
2123    tests: &["test(", "testWidgets(", "group("],
2124    assertions: &[],
2125    test_suites: &[],
2126    variables_prefix_no_paren: &[],
2127};
2128
2129const SP_R: SymbolPatterns = SymbolPatterns {
2130    functions: &[],
2131    functions_prefix_paren: &[],
2132    classes: &[],
2133    variables: &[],
2134    imports: &["library(", "source("],
2135    // testthat
2136    tests: &["test_that(", "it(", "describe(", "expect_"],
2137    assertions: &[],
2138    test_suites: &[],
2139    variables_prefix_no_paren: &[],
2140};
2141
2142const SP_SQL: SymbolPatterns = SymbolPatterns {
2143    functions: &[
2144        "create function ",
2145        "create or replace function ",
2146        "create procedure ",
2147        "create or replace procedure ",
2148        "CREATE FUNCTION ",
2149        "CREATE OR REPLACE FUNCTION ",
2150        "CREATE PROCEDURE ",
2151        "CREATE OR REPLACE PROCEDURE ",
2152    ],
2153    functions_prefix_paren: &[],
2154    classes: &[
2155        "create table ",
2156        "create view ",
2157        "create schema ",
2158        "CREATE TABLE ",
2159        "CREATE VIEW ",
2160        "CREATE SCHEMA ",
2161    ],
2162    variables: &["declare ", "DECLARE "],
2163    imports: &[],
2164    tests: &[],
2165    assertions: &[],
2166    test_suites: &[],
2167    variables_prefix_no_paren: &[],
2168};
2169
2170const SP_ASSEMBLY: SymbolPatterns = SymbolPatterns {
2171    functions: &["proc ", "PROC "],
2172    functions_prefix_paren: &[],
2173    classes: &[],
2174    variables: &[],
2175    imports: &["include ", "INCLUDE ", "%include "],
2176    tests: &[],
2177    assertions: &[],
2178    test_suites: &[],
2179    variables_prefix_no_paren: &[],
2180};
2181
2182const SP_ZIG: SymbolPatterns = SymbolPatterns {
2183    functions: &[
2184        "fn ",
2185        "pub fn ",
2186        "export fn ",
2187        "inline fn ",
2188        "pub inline fn ",
2189    ],
2190    functions_prefix_paren: &[],
2191    classes: &[],
2192    variables: &["var ", "pub var "],
2193    imports: &[],
2194    // Zig built-in test blocks
2195    tests: &["test \"", "test{"],
2196    assertions: &[],
2197    test_suites: &[],
2198    variables_prefix_no_paren: &[],
2199};
2200
2201/// Static (non-heap) language scanning parameters.  All fields are `'static` so this struct
2202/// can be stored in a `static` array.  The dynamic `skip_lines` set (used only for Python
2203/// docstring detection) is kept in `ScanConfig` and populated by the caller after lookup.
2204#[allow(clippy::struct_excessive_bools)]
2205#[derive(Clone, Copy)]
2206struct StaticLangConfig {
2207    line_comments: &'static [&'static str],
2208    block_comment: Option<(&'static str, &'static str)>,
2209    allow_single_quote_strings: bool,
2210    allow_double_quote_strings: bool,
2211    allow_triple_quote_strings: bool,
2212    allow_csharp_verbatim_strings: bool,
2213    symbol_patterns: SymbolPatterns,
2214    /// `true` for C, C++, and Objective-C (languages that have a C preprocessor).
2215    has_preprocessor: bool,
2216}
2217
/// Scanning configuration that carries the comment, string and symbol-pattern
/// fields of `StaticLangConfig` (everything except `has_preprocessor`) together
/// with the dynamic `skip_lines` set, the branch keywords and the LSLOC strategy.
// Four separate boolean flags trip `struct_excessive_bools`; they mirror the
// string-literal flags of `StaticLangConfig` one-to-one.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
struct ScanConfig {
    /// Tokens that introduce a line comment.
    line_comments: &'static [&'static str],
    /// `(open, close)` delimiters of the block comment, if the language has one.
    block_comment: Option<(&'static str, &'static str)>,
    /// Whether single-quoted string literals are enabled.
    allow_single_quote_strings: bool,
    /// Whether double-quoted string literals are enabled.
    allow_double_quote_strings: bool,
    /// Whether triple-quoted string literals are enabled.
    allow_triple_quote_strings: bool,
    /// Whether C#-style verbatim string literals are enabled.
    allow_csharp_verbatim_strings: bool,
    /// Dynamic set used for Python docstring detection; presumably holds the
    /// indices of lines to skip — TODO confirm against the scanner.
    skip_lines: HashSet<usize>,
    /// Patterns for detecting symbols such as functions, classes, imports and tests.
    symbol_patterns: SymbolPatterns,
    /// Branch keywords used to approximate cyclomatic complexity.
    branch_keywords: &'static [&'static str],
    /// Strategy for computing Logical SLOC.
    lsloc_strategy: LslocStrategy,
}
2234
2235// ── Per-family base configurations ───────────────────────────────────────────
2236//
2237// Most languages share one of two comment styles.  Define a base `const` for
2238// each family; table entries override only the fields that differ (symbol
2239// patterns, preprocessor flag, verbatim-string flag, etc.).
2240//
2241// C-slash family: `//` line, `/* */` block, single + double quotes.
2242// Covers C, C++, Obj-C, C#, Go, Java, JS/TS/Svelte/Vue, Dart, Groovy, Kotlin,
2243// Scala, SCSS, Swift, Rust, and Zig (Zig has no block comment → overridden).
2244const C_SLASH_BASE: StaticLangConfig = StaticLangConfig {
2245    line_comments: &["//"],
2246    block_comment: Some(("/*", "*/")),
2247    allow_single_quote_strings: true,
2248    allow_double_quote_strings: true,
2249    allow_triple_quote_strings: false,
2250    allow_csharp_verbatim_strings: false,
2251    symbol_patterns: SP_NONE,
2252    has_preprocessor: false,
2253};
2254
2255// Hash-comment family: `#` line comment, no block comment, single + double
2256// quotes.  Covers Shell, Ruby, R, Perl, Elixir (each overrides only SP_*);
2257// Python overrides triple-quote; PowerShell and Nim override block_comment.
2258const HASH_BASE: StaticLangConfig = StaticLangConfig {
2259    line_comments: &["#"],
2260    block_comment: None,
2261    allow_single_quote_strings: true,
2262    allow_double_quote_strings: true,
2263    allow_triple_quote_strings: false,
2264    allow_csharp_verbatim_strings: false,
2265    symbol_patterns: SP_NONE,
2266    has_preprocessor: false,
2267};
2268
/// Static language-scan configuration table — one entry per supported language.
/// Used by `language_scan_config` to avoid a `match` with one arm per `Language`
/// variant.  All `SP_*` constants referenced here are defined above in the same module.
2272static LANG_SCAN_TABLE: &[(Language, StaticLangConfig)] = &[
2273    // ── C preprocessor family ─────────────────────────────────────────────────
2274    (
2275        Language::C,
2276        StaticLangConfig {
2277            symbol_patterns: SP_C,
2278            has_preprocessor: true,
2279            ..C_SLASH_BASE
2280        },
2281    ),
2282    (
2283        Language::Cpp,
2284        StaticLangConfig {
2285            symbol_patterns: SP_CPP,
2286            has_preprocessor: true,
2287            ..C_SLASH_BASE
2288        },
2289    ),
2290    (
2291        Language::ObjectiveC,
2292        StaticLangConfig {
2293            symbol_patterns: SP_OBJECTIVEC,
2294            has_preprocessor: true,
2295            ..C_SLASH_BASE
2296        },
2297    ),
2298    // ── C-slash family ────────────────────────────────────────────────────────
2299    (
2300        Language::CSharp,
2301        StaticLangConfig {
2302            symbol_patterns: SP_CSHARP,
2303            allow_csharp_verbatim_strings: true,
2304            ..C_SLASH_BASE
2305        },
2306    ),
2307    (
2308        Language::Go,
2309        StaticLangConfig {
2310            symbol_patterns: SP_GO,
2311            ..C_SLASH_BASE
2312        },
2313    ),
2314    (
2315        Language::Java,
2316        StaticLangConfig {
2317            symbol_patterns: SP_JAVA,
2318            ..C_SLASH_BASE
2319        },
2320    ),
2321    (
2322        Language::JavaScript,
2323        StaticLangConfig {
2324            symbol_patterns: SP_JS,
2325            ..C_SLASH_BASE
2326        },
2327    ),
2328    (
2329        Language::TypeScript,
2330        StaticLangConfig {
2331            symbol_patterns: SP_TS,
2332            ..C_SLASH_BASE
2333        },
2334    ),
2335    (
2336        Language::Svelte,
2337        StaticLangConfig {
2338            symbol_patterns: SP_JS,
2339            ..C_SLASH_BASE
2340        },
2341    ),
2342    (
2343        Language::Vue,
2344        StaticLangConfig {
2345            symbol_patterns: SP_JS,
2346            ..C_SLASH_BASE
2347        },
2348    ),
2349    (
2350        Language::Dart,
2351        StaticLangConfig {
2352            symbol_patterns: SP_DART,
2353            ..C_SLASH_BASE
2354        },
2355    ),
2356    (
2357        Language::Groovy,
2358        StaticLangConfig {
2359            symbol_patterns: SP_GROOVY,
2360            ..C_SLASH_BASE
2361        },
2362    ),
2363    (
2364        Language::Kotlin,
2365        StaticLangConfig {
2366            symbol_patterns: SP_KOTLIN,
2367            ..C_SLASH_BASE
2368        },
2369    ),
2370    (
2371        Language::Scala,
2372        StaticLangConfig {
2373            symbol_patterns: SP_SCALA,
2374            ..C_SLASH_BASE
2375        },
2376    ),
2377    (
2378        Language::Scss,
2379        StaticLangConfig {
2380            symbol_patterns: SP_NONE,
2381            ..C_SLASH_BASE
2382        },
2383    ),
2384    // Rust: no single-quote char literals (they're lifetime annotations)
2385    (
2386        Language::Rust,
2387        StaticLangConfig {
2388            symbol_patterns: SP_RUST,
2389            allow_single_quote_strings: false,
2390            ..C_SLASH_BASE
2391        },
2392    ),
2393    // Swift: no single-quote strings
2394    (
2395        Language::Swift,
2396        StaticLangConfig {
2397            symbol_patterns: SP_SWIFT,
2398            allow_single_quote_strings: false,
2399            ..C_SLASH_BASE
2400        },
2401    ),
2402    // Zig: no block comment
2403    (
2404        Language::Zig,
2405        StaticLangConfig {
2406            symbol_patterns: SP_ZIG,
2407            block_comment: None,
2408            ..C_SLASH_BASE
2409        },
2410    ),
2411    // F#: `(*` … `*)` block comment, no single-quote strings
2412    (
2413        Language::FSharp,
2414        StaticLangConfig {
2415            line_comments: &["//"],
2416            block_comment: Some(("(*", "*)")),
2417            allow_single_quote_strings: false,
2418            allow_double_quote_strings: true,
2419            symbol_patterns: SP_FSHARP,
2420            ..C_SLASH_BASE
2421        },
2422    ),
2423    // ── Hash-comment family ───────────────────────────────────────────────────
2424    (
2425        Language::Shell,
2426        StaticLangConfig {
2427            symbol_patterns: SP_SHELL,
2428            ..HASH_BASE
2429        },
2430    ),
2431    (
2432        Language::Elixir,
2433        StaticLangConfig {
2434            symbol_patterns: SP_ELIXIR,
2435            ..HASH_BASE
2436        },
2437    ),
2438    (
2439        Language::Perl,
2440        StaticLangConfig {
2441            symbol_patterns: SP_PERL,
2442            ..HASH_BASE
2443        },
2444    ),
2445    (
2446        Language::R,
2447        StaticLangConfig {
2448            symbol_patterns: SP_R,
2449            ..HASH_BASE
2450        },
2451    ),
2452    (
2453        Language::Ruby,
2454        StaticLangConfig {
2455            symbol_patterns: SP_RUBY,
2456            ..HASH_BASE
2457        },
2458    ),
2459    // Python: triple-quote string literals
2460    (
2461        Language::Python,
2462        StaticLangConfig {
2463            symbol_patterns: SP_PYTHON,
2464            allow_triple_quote_strings: true,
2465            ..HASH_BASE
2466        },
2467    ),
2468    // PowerShell: `<# … #>` block comment
2469    (
2470        Language::PowerShell,
2471        StaticLangConfig {
2472            symbol_patterns: SP_POWERSHELL,
2473            block_comment: Some(("<#", "#>")),
2474            ..HASH_BASE
2475        },
2476    ),
2477    // Nim: `#[` … `]#` block comment
2478    (
2479        Language::Nim,
2480        StaticLangConfig {
2481            symbol_patterns: SP_NIM,
2482            block_comment: Some(("#[", "]#")),
2483            ..HASH_BASE
2484        },
2485    ),
2486    // Makefile / Dockerfile: `#` only, no string literals
2487    (
2488        Language::Makefile,
2489        StaticLangConfig {
2490            symbol_patterns: SP_NONE,
2491            allow_single_quote_strings: false,
2492            allow_double_quote_strings: false,
2493            ..HASH_BASE
2494        },
2495    ),
2496    (
2497        Language::Dockerfile,
2498        StaticLangConfig {
2499            symbol_patterns: SP_NONE,
2500            allow_single_quote_strings: false,
2501            allow_double_quote_strings: false,
2502            ..HASH_BASE
2503        },
2504    ),
2505    // ── Other unique comment styles ───────────────────────────────────────────
2506    // CSS / SCSS: only `/* */` block, no line comment
2507    (
2508        Language::Css,
2509        StaticLangConfig {
2510            line_comments: &[],
2511            block_comment: Some(("/*", "*/")),
2512            symbol_patterns: SP_NONE,
2513            ..C_SLASH_BASE
2514        },
2515    ),
2516    // HTML / XML: `<!-- -->` block, no line comment, no string literals
2517    (
2518        Language::Html,
2519        StaticLangConfig {
2520            line_comments: &[],
2521            block_comment: Some(("<!--", "-->")),
2522            allow_single_quote_strings: false,
2523            allow_double_quote_strings: false,
2524            symbol_patterns: SP_NONE,
2525            ..C_SLASH_BASE
2526        },
2527    ),
2528    (
2529        Language::Xml,
2530        StaticLangConfig {
2531            line_comments: &[],
2532            block_comment: Some(("<!--", "-->")),
2533            allow_single_quote_strings: false,
2534            allow_double_quote_strings: false,
2535            symbol_patterns: SP_NONE,
2536            ..C_SLASH_BASE
2537        },
2538    ),
2539    // Lua: `--` line, `--[[ ]]` block
2540    (
2541        Language::Lua,
2542        StaticLangConfig {
2543            line_comments: &["--"],
2544            block_comment: Some(("--[[", "]]")),
2545            symbol_patterns: SP_LUA,
2546            ..C_SLASH_BASE
2547        },
2548    ),
2549    // Haskell: `--` line, `{- -}` block
2550    (
2551        Language::Haskell,
2552        StaticLangConfig {
2553            line_comments: &["--"],
2554            block_comment: Some(("{-", "-}")),
2555            symbol_patterns: SP_HASKELL,
2556            ..C_SLASH_BASE
2557        },
2558    ),
2559    // SQL: `--` line, `/* */` block, single quote only
2560    (
2561        Language::Sql,
2562        StaticLangConfig {
2563            line_comments: &["--"],
2564            block_comment: Some(("/*", "*/")),
2565            allow_single_quote_strings: true,
2566            allow_double_quote_strings: false,
2567            symbol_patterns: SP_SQL,
2568            ..C_SLASH_BASE
2569        },
2570    ),
2571    // OCaml: `(*` … `*)` only, no line comment, no single-quote strings
2572    (
2573        Language::Ocaml,
2574        StaticLangConfig {
2575            line_comments: &[],
2576            block_comment: Some(("(*", "*)")),
2577            allow_single_quote_strings: false,
2578            symbol_patterns: SP_OCAML,
2579            ..C_SLASH_BASE
2580        },
2581    ),
2582    // Assembly: `;` line comment (NASM/MASM) + `/* */` block (GAS), double-quote
2583    // strings for `.ascii`/`.string` directives. `#` (GAS x86) and `@` (ARM) line
2584    // comments are intentionally NOT added: `#` is an immediate prefix in ARM
2585    // (`mov r0, #5`) and `@` appears in x86 symbol versioning (`memcpy@plt`), so a
2586    // universal superset would mis-count one dialect or the other.
2587    (
2588        Language::Assembly,
2589        StaticLangConfig {
2590            line_comments: &[";"],
2591            block_comment: Some(("/*", "*/")),
2592            allow_single_quote_strings: false,
2593            allow_double_quote_strings: true,
2594            symbol_patterns: SP_ASSEMBLY,
2595            ..C_SLASH_BASE
2596        },
2597    ),
2598    (
2599        Language::Clojure,
2600        StaticLangConfig {
2601            line_comments: &[";"],
2602            block_comment: None,
2603            allow_single_quote_strings: false,
2604            symbol_patterns: SP_CLOJURE,
2605            ..C_SLASH_BASE
2606        },
2607    ),
2608    // Erlang: `%` line comment, no block, no single-quote strings
2609    (
2610        Language::Erlang,
2611        StaticLangConfig {
2612            line_comments: &["%"],
2613            block_comment: None,
2614            allow_single_quote_strings: false,
2615            symbol_patterns: SP_ERLANG,
2616            ..C_SLASH_BASE
2617        },
2618    ),
2619    // PHP: `//` or `#` line, `/* */` block
2620    (
2621        Language::Php,
2622        StaticLangConfig {
2623            line_comments: &["//", "#"],
2624            block_comment: Some(("/*", "*/")),
2625            symbol_patterns: SP_PHP,
2626            ..C_SLASH_BASE
2627        },
2628    ),
2629    // Julia: `#` line, `#= =#` block, double + triple quotes, no single
2630    (
2631        Language::Julia,
2632        StaticLangConfig {
2633            line_comments: &["#"],
2634            block_comment: Some(("#=", "=#")),
2635            allow_single_quote_strings: false,
2636            allow_triple_quote_strings: true,
2637            symbol_patterns: SP_JULIA,
2638            ..C_SLASH_BASE
2639        },
2640    ),
2641    // ── Pass 1 additions ──────────────────────────────────────────────────────
2642    // Solidity: C-slash family (`//`, `/* */`, single + double quotes).
2643    (
2644        Language::Solidity,
2645        StaticLangConfig {
2646            symbol_patterns: SP_SOLIDITY,
2647            ..C_SLASH_BASE
2648        },
2649    ),
2650    // Protocol Buffers: C-slash family, statements terminated by `;`.
2651    (
2652        Language::Protobuf,
2653        StaticLangConfig {
2654            symbol_patterns: SP_PROTOBUF,
2655            ..C_SLASH_BASE
2656        },
2657    ),
2658    // HCL / Terraform: `#` or `//` line, `/* */` block, double-quote strings only.
2659    (
2660        Language::Hcl,
2661        StaticLangConfig {
2662            line_comments: &["#", "//"],
2663            allow_single_quote_strings: false,
2664            symbol_patterns: SP_NONE,
2665            ..C_SLASH_BASE
2666        },
2667    ),
2668    // GraphQL: `#` line comment, no block; `"""` block-string descriptions, no single quotes.
2669    (
2670        Language::GraphQl,
2671        StaticLangConfig {
2672            allow_single_quote_strings: false,
2673            allow_triple_quote_strings: true,
2674            symbol_patterns: SP_NONE,
2675            ..HASH_BASE
2676        },
2677    ),
2678    // ── Pass 2 additions (legacy + embedded / HDL) ────────────────────────────
2679    // Ada: `--` line comment, no block; `'` is a char/attribute tick, not a string.
2680    (
2681        Language::Ada,
2682        StaticLangConfig {
2683            line_comments: &["--"],
2684            block_comment: None,
2685            allow_single_quote_strings: false,
2686            symbol_patterns: SP_ADA,
2687            ..C_SLASH_BASE
2688        },
2689    ),
2690    // VHDL: `--` line comment, no block; `'` is a bit/char literal, not a string.
2691    (
2692        Language::Vhdl,
2693        StaticLangConfig {
2694            line_comments: &["--"],
2695            block_comment: None,
2696            allow_single_quote_strings: false,
2697            symbol_patterns: SP_VHDL,
2698            ..C_SLASH_BASE
2699        },
2700    ),
2701    // Verilog / SystemVerilog: C-slash family; `'` is a sized-literal base, not a string.
2702    (
2703        Language::Verilog,
2704        StaticLangConfig {
2705            allow_single_quote_strings: false,
2706            symbol_patterns: SP_VERILOG,
2707            ..C_SLASH_BASE
2708        },
2709    ),
2710    // Tcl: `#` line comment, no block; `"` strings only.
2711    (
2712        Language::Tcl,
2713        StaticLangConfig {
2714            allow_single_quote_strings: false,
2715            symbol_patterns: SP_TCL,
2716            ..HASH_BASE
2717        },
2718    ),
2719    // Pascal / Delphi: `//` line, `{ }` block; strings are single-quoted.
2720    (
2721        Language::Pascal,
2722        StaticLangConfig {
2723            line_comments: &["//"],
2724            block_comment: Some(("{", "}")),
2725            allow_single_quote_strings: true,
2726            allow_double_quote_strings: false,
2727            symbol_patterns: SP_PASCAL,
2728            ..C_SLASH_BASE
2729        },
2730    ),
2731    // Visual Basic: `'` line comment, no block; `"` strings only.
2732    (
2733        Language::VisualBasic,
2734        StaticLangConfig {
2735            line_comments: &["'"],
2736            block_comment: None,
2737            allow_single_quote_strings: false,
2738            allow_double_quote_strings: true,
2739            symbol_patterns: SP_VB,
2740            ..C_SLASH_BASE
2741        },
2742    ),
2743    // Lisp / Scheme: `;` line comment, `#| |#` block; `"` strings, `'` is the quote operator.
2744    (
2745        Language::Lisp,
2746        StaticLangConfig {
2747            line_comments: &[";"],
2748            block_comment: Some(("#|", "|#")),
2749            allow_single_quote_strings: false,
2750            symbol_patterns: SP_LISP,
2751            ..C_SLASH_BASE
2752        },
2753    ),
2754    // ── Pass 3 additions (scientific / infra / systems / graphics) ────────────
2755    // Fortran: `!` line comment (free-form), no block; single + double strings.
2756    (
2757        Language::Fortran,
2758        StaticLangConfig {
2759            line_comments: &["!"],
2760            block_comment: None,
2761            symbol_patterns: SP_FORTRAN,
2762            ..C_SLASH_BASE
2763        },
2764    ),
2765    // Nix: `#` line, `/* */` block; double-quote strings (and `''` multi-line).
2766    (
2767        Language::Nix,
2768        StaticLangConfig {
2769            block_comment: Some(("/*", "*/")),
2770            allow_single_quote_strings: false,
2771            symbol_patterns: SP_NONE,
2772            ..HASH_BASE
2773        },
2774    ),
2775    // Crystal: `#` line comment, no block; Ruby-like single + double strings.
2776    (
2777        Language::Crystal,
2778        StaticLangConfig {
2779            symbol_patterns: SP_CRYSTAL,
2780            ..HASH_BASE
2781        },
2782    ),
2783    // D: C-slash family (`//`, `/* */`); single-quote char literals + double strings.
2784    (
2785        Language::D,
2786        StaticLangConfig {
2787            symbol_patterns: SP_D,
2788            ..C_SLASH_BASE
2789        },
2790    ),
2791    // GLSL / HLSL / WGSL shaders: C-slash family; no char literals.
2792    (
2793        Language::Glsl,
2794        StaticLangConfig {
2795            allow_single_quote_strings: false,
2796            symbol_patterns: SP_NONE,
2797            ..C_SLASH_BASE
2798        },
2799    ),
2800    // CMake: `#` line, `#[[ ]]` block; double-quote strings only.
2801    (
2802        Language::Cmake,
2803        StaticLangConfig {
2804            block_comment: Some(("#[[", "]]")),
2805            allow_single_quote_strings: false,
2806            symbol_patterns: SP_CMAKE,
2807            ..HASH_BASE
2808        },
2809    ),
2810    // Elm: `--` line, `{- -}` block; double-quote strings only.
2811    (
2812        Language::Elm,
2813        StaticLangConfig {
2814            line_comments: &["--"],
2815            block_comment: Some(("{-", "-}")),
2816            allow_single_quote_strings: false,
2817            symbol_patterns: SP_ELM,
2818            ..C_SLASH_BASE
2819        },
2820    ),
2821    // Awk: `#` line comment, no block; double-quote strings only.
2822    (
2823        Language::Awk,
2824        StaticLangConfig {
2825            allow_single_quote_strings: false,
2826            symbol_patterns: SP_AWK,
2827            ..HASH_BASE
2828        },
2829    ),
2830];
2831
/// IEEE 1045-1992 counting switches that apply to a single analysis call.
///
/// Combines the caller's `AnalysisOptions` with properties of the language being scanned.
/// Crate-private; built inside `analyze_text`.
#[derive(Debug, Clone, Copy)]
struct IeeeFlags {
    /// Set for languages that have a C preprocessor (C, C++, Objective-C).
    has_preprocessor_directives: bool,
    /// Copy of `AnalysisOptions::blank_in_block_comment_as_comment`.
    blank_in_block_comment_as_comment: bool,
    /// Copy of `AnalysisOptions::collapse_continuation_lines`.
    collapse_continuation_lines: bool,
}
2843
/// Kind of string literal the lexer is currently inside.
#[derive(Debug, Clone, Copy)]
enum StringState {
    /// Ordinary quoted string; holds the quote character that closes it.
    Single(char),
    /// Triple-quoted string; holds its closing delimiter (`"""` or `'''`).
    Triple(&'static str),
    /// `@"..."` verbatim string, in which `""` stands for one literal quote.
    VerbatimDouble,
}
2850
/// What the scanner observed on one physical line (or on a merged continuation sequence).
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Default)]
struct LineFacts {
    /// A code character, or any part of a string literal, was seen.
    has_code: bool,
    /// A line-comment prefix was seen.
    has_single_comment: bool,
    /// Part of a block comment was seen.
    has_multi_comment: bool,
    /// The line is a docstring line; `classify_line` gives this flag top priority.
    has_docstring: bool,
}
2859
2860/// Process one character while the lexer is inside a string literal.
2861///
2862/// Returns `(new_string_state, advance)` where `advance` is the number of chars to skip.
2863fn process_string_char(
2864    state: StringState,
2865    chars: &[char],
2866    i: usize,
2867) -> (Option<StringState>, usize) {
2868    match state {
2869        StringState::Single(delim) => {
2870            if chars[i] == '\\' {
2871                return (Some(state), 2); // skip escaped character
2872            }
2873            if chars[i] == delim {
2874                (None, 1)
2875            } else {
2876                (Some(state), 1)
2877            }
2878        }
2879        StringState::Triple(delim) => {
2880            if starts_with(chars, i, delim) {
2881                (None, delim.len())
2882            } else {
2883                (Some(state), 1)
2884            }
2885        }
2886        StringState::VerbatimDouble => {
2887            if starts_with(chars, i, "\"\"") {
2888                return (Some(state), 2); // escaped quote-quote inside verbatim string
2889            }
2890            if chars[i] == '"' {
2891                (None, 1)
2892            } else {
2893                (Some(state), 1)
2894            }
2895        }
2896    }
2897}
2898
2899/// Process one character while the lexer is inside a block comment.
2900///
2901/// Returns `(still_in_block_comment, advance)`.
2902fn process_block_comment_char(chars: &[char], i: usize, close: &str) -> (bool, usize) {
2903    if starts_with(chars, i, close) {
2904        (false, close.len())
2905    } else {
2906        (true, 1)
2907    }
2908}
2909
2910/// Attempt to begin a new string literal at position `i`.
2911///
2912/// Returns `Some((new_state, advance))` when a string opener is detected, else `None`.
2913fn try_open_string(chars: &[char], i: usize, config: &ScanConfig) -> Option<(StringState, usize)> {
2914    if config.allow_csharp_verbatim_strings && starts_with(chars, i, "@\"") {
2915        return Some((StringState::VerbatimDouble, 2));
2916    }
2917    if config.allow_triple_quote_strings {
2918        if starts_with(chars, i, "\"\"\"") {
2919            return Some((StringState::Triple("\"\"\""), 3));
2920        }
2921        if starts_with(chars, i, "'''") {
2922            return Some((StringState::Triple("'''"), 3));
2923        }
2924    }
2925    if config.allow_single_quote_strings && chars[i] == '\'' {
2926        return Some((StringState::Single('\''), 1));
2927    }
2928    if config.allow_double_quote_strings && chars[i] == '"' {
2929        return Some((StringState::Single('"'), 1));
2930    }
2931    None
2932}
2933
2934/// Advance past one character position while inside a block comment.
2935///
2936/// Updates `in_block_comment` if the closing delimiter is found and returns the
2937/// number of characters consumed. Returns 0 when no block-comment config is set
2938/// (preserving the caller's `continue`-without-advance behaviour for that impossible state).
2939fn step_through_block_comment(
2940    chars: &[char],
2941    i: usize,
2942    block_comment: Option<(&'static str, &'static str)>,
2943    in_block_comment: &mut bool,
2944) -> usize {
2945    if let Some((_, close)) = block_comment {
2946        let (still_in, advance) = process_block_comment_char(chars, i, close);
2947        *in_block_comment = still_in;
2948        return advance;
2949    }
2950    0
2951}
2952
2953/// If the character at `i` starts a block comment, return the length of the opening
2954/// delimiter so the caller can advance past it. Returns `None` if no match.
2955fn try_open_block_comment(
2956    chars: &[char],
2957    i: usize,
2958    block_comment: Option<(&'static str, &'static str)>,
2959) -> Option<usize> {
2960    let (open, _) = block_comment?;
2961    starts_with(chars, i, open).then_some(open.len())
2962}
2963
2964/// Scan a single physical line and update `facts`, `in_block_comment`, and `string_state`.
2965///
2966/// Returns `true` when the caller should break out of the per-line loop early (line comment hit).
2967fn scan_line(
2968    chars: &[char],
2969    config: &ScanConfig,
2970    facts: &mut LineFacts,
2971    in_block_comment: &mut bool,
2972    string_state: &mut Option<StringState>,
2973) {
2974    let mut i = 0usize;
2975    while i < chars.len() {
2976        // Inside a string literal — advance until the closing delimiter.
2977        if let Some(state) = *string_state {
2978            facts.has_code = true;
2979            let (new_state, advance) = process_string_char(state, chars, i);
2980            *string_state = new_state;
2981            i += advance;
2982            continue;
2983        }
2984
2985        // Inside a block comment — advance until the closing delimiter.
2986        if *in_block_comment {
2987            facts.has_multi_comment = true;
2988            i += step_through_block_comment(chars, i, config.block_comment, in_block_comment);
2989            continue;
2990        }
2991
2992        // Whitespace outside any string/comment — skip.
2993        if chars[i].is_whitespace() {
2994            i += 1;
2995            continue;
2996        }
2997
2998        // Attempt to open a string literal.
2999        if let Some((new_state, advance)) = try_open_string(chars, i, config) {
3000            facts.has_code = true;
3001            *string_state = Some(new_state);
3002            i += advance;
3003            continue;
3004        }
3005
3006        // Attempt to open a block comment.
3007        if let Some(advance) = try_open_block_comment(chars, i, config.block_comment) {
3008            facts.has_multi_comment = true;
3009            *in_block_comment = true;
3010            i += advance;
3011            continue;
3012        }
3013
3014        // Line comment — rest of the line is a comment; stop scanning.
3015        if config
3016            .line_comments
3017            .iter()
3018            .any(|prefix| starts_with(chars, i, prefix))
3019        {
3020            facts.has_single_comment = true;
3021            break;
3022        }
3023
3024        // Plain code character.
3025        facts.has_code = true;
3026        i += 1;
3027    }
3028}
3029
3030/// Apply IEEE 1045-1992 §4.2 preprocessor-directive tracking and continuation-line merging,
3031/// then emit the finalized `LineFacts` for this physical line.
3032///
3033/// Returns `None` when the line is part of a continuation sequence and should be deferred.
3034fn finalize_line_facts(
3035    facts: LineFacts,
3036    trimmed: &str,
3037    raw: &mut RawLineCounts,
3038    ieee: IeeeFlags,
3039    in_block_comment: bool,
3040    string_state: Option<StringState>,
3041    pending_continuation: &mut Option<LineFacts>,
3042) -> Option<LineFacts> {
3043    // IEEE 1045-1992 §4.2: track preprocessor/compiler directive lines (C/C++/ObjC).
3044    // A directive line is a pure code line (no comment on the same physical line) whose
3045    // trimmed content starts with '#'.
3046    if ieee.has_preprocessor_directives
3047        && facts.has_code
3048        && !facts.has_single_comment
3049        && !facts.has_multi_comment
3050        && trimmed.starts_with('#')
3051    {
3052        raw.compiler_directive_lines += 1;
3053    }
3054
3055    // IEEE 1045-1992 continuation-line handling.
3056    // A line is a continuation starter when it ends with '\' outside any comment or string.
3057    let is_continuation = ieee.collapse_continuation_lines
3058        && !in_block_comment
3059        && string_state.is_none()
3060        && trimmed.ends_with('\\');
3061
3062    if is_continuation {
3063        let pending = pending_continuation.get_or_insert_with(LineFacts::default);
3064        pending.has_code |= facts.has_code;
3065        pending.has_single_comment |= facts.has_single_comment;
3066        pending.has_multi_comment |= facts.has_multi_comment;
3067        pending.has_docstring |= facts.has_docstring;
3068        return None; // defer classification until the sequence ends
3069    }
3070
3071    // Merge any accumulated continuation facts into the final line.
3072    let emit = if let Some(pending) = pending_continuation.take() {
3073        LineFacts {
3074            has_code: pending.has_code | facts.has_code,
3075            has_single_comment: pending.has_single_comment | facts.has_single_comment,
3076            has_multi_comment: pending.has_multi_comment | facts.has_multi_comment,
3077            has_docstring: pending.has_docstring | facts.has_docstring,
3078        }
3079    } else {
3080        facts
3081    };
3082    Some(emit)
3083}
3084
3085/// Scan and classify one physical line, updating all running state in place.
3086///
3087/// Pre-classified lines (present in `config.skip_lines`) are counted as docstring-comment
3088/// lines and returned early without further analysis.
3089#[allow(clippy::needless_pass_by_value)]
3090#[allow(clippy::too_many_arguments)]
3091#[allow(clippy::many_single_char_names)] // destructuring return from count_symbols; names match field roles
3092fn process_physical_line(
3093    line: &str,
3094    line_idx: usize,
3095    config: &ScanConfig,
3096    raw: &mut RawLineCounts,
3097    in_block_comment: &mut bool,
3098    string_state: &mut Option<StringState>,
3099    pending_continuation: &mut Option<LineFacts>,
3100    ieee: IeeeFlags,
3101) {
3102    raw.total_physical_lines += 1;
3103
3104    if config.skip_lines.contains(&line_idx) {
3105        raw.docstring_comment_lines += 1;
3106        return;
3107    }
3108
3109    let trimmed = line.trim();
3110    let mut facts = LineFacts::default();
3111
3112    // IEEE 1045-1992: blank lines inside block comments are comment lines by default.
3113    // When blank_in_block_comment_as_comment is false, blank lines keep their blank
3114    // classification even while inside a block comment.
3115    if *in_block_comment && (ieee.blank_in_block_comment_as_comment || !trimmed.is_empty()) {
3116        facts.has_multi_comment = true;
3117    }
3118
3119    let chars: Vec<char> = line.chars().collect();
3120    scan_line(&chars, config, &mut facts, in_block_comment, string_state);
3121
3122    let Some(emit) = finalize_line_facts(
3123        facts,
3124        trimmed,
3125        raw,
3126        ieee,
3127        *in_block_comment,
3128        *string_state,
3129        pending_continuation,
3130    ) else {
3131        return;
3132    };
3133
3134    classify_line(raw, &emit, trimmed);
3135
3136    if emit.has_code {
3137        use std::hash::{DefaultHasher, Hash, Hasher};
3138        let (f, c, v, i, t, a, s) = count_symbols(&config.symbol_patterns, trimmed);
3139        raw.functions += f;
3140        raw.classes += c;
3141        raw.variables += v;
3142        raw.imports += i;
3143        raw.test_count += t;
3144        raw.test_assertion_count += a;
3145        raw.test_suite_count += s;
3146
3147        // Cyclomatic complexity: count branch decision keywords on code lines.
3148        raw.cyclomatic_complexity +=
3149            count_branch_in_line(trimmed.as_bytes(), config.branch_keywords);
3150
3151        // Logical SLOC (language-specific strategy).
3152        match config.lsloc_strategy {
3153            LslocStrategy::Semicolons => {
3154                let semi = u32::try_from(trimmed.bytes().filter(|&b| b == b';').count())
3155                    .unwrap_or(u32::MAX);
3156                *raw.lsloc.get_or_insert(0) += semi;
3157            }
3158            LslocStrategy::NonContinuationNewlines => {
3159                let cont = trimmed.ends_with('\\')
3160                    || trimmed.ends_with(',')
3161                    || trimmed.ends_with('(')
3162                    || trimmed.ends_with('[')
3163                    || trimmed.ends_with('{');
3164                if !cont {
3165                    *raw.lsloc.get_or_insert(0) += 1;
3166                }
3167            }
3168            LslocStrategy::Unsupported => {}
3169        }
3170
3171        // ULOC: hash each trimmed code line for cross-file unique-line counting.
3172        let mut h = DefaultHasher::new();
3173        trimmed.hash(&mut h);
3174        raw.code_line_hashes.push(h.finish());
3175    }
3176}
3177
3178#[allow(clippy::needless_pass_by_value)]
3179fn analyze_generic(text: &str, config: ScanConfig, ieee: IeeeFlags) -> RawFileAnalysis {
3180    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
3181    let lines: Vec<&str> = normalized.split_terminator('\n').collect();
3182
3183    let mut raw = RawLineCounts::default();
3184    let mut warnings = Vec::new();
3185
3186    let mut in_block_comment = false;
3187    let mut string_state: Option<StringState> = None;
3188    // IEEE continuation-line state: accumulates facts across a backslash-continued sequence.
3189    let mut pending_continuation: Option<LineFacts> = None;
3190
3191    for (line_idx, line) in lines.iter().enumerate() {
3192        process_physical_line(
3193            line,
3194            line_idx,
3195            &config,
3196            &mut raw,
3197            &mut in_block_comment,
3198            &mut string_state,
3199            &mut pending_continuation,
3200            ieee,
3201        );
3202    }
3203
3204    // Flush any pending continuation that reaches end-of-file without a closing line.
3205    if let Some(pending) = pending_continuation.take() {
3206        classify_line(&mut raw, &pending, "");
3207    }
3208
3209    if in_block_comment {
3210        warnings.push("unclosed block comment detected; result is best effort".into());
3211    }
3212    if string_state.is_some() {
3213        warnings.push("unclosed string literal detected; result is best effort".into());
3214    }
3215
3216    RawFileAnalysis {
3217        raw,
3218        parse_mode: if warnings.is_empty() {
3219            ParseMode::Lexical
3220        } else {
3221            ParseMode::LexicalBestEffort
3222        },
3223        warnings,
3224        style_analysis: None,
3225    }
3226}
3227
3228const fn classify_line(raw: &mut RawLineCounts, facts: &LineFacts, trimmed: &str) {
3229    if facts.has_docstring {
3230        raw.docstring_comment_lines += 1;
3231    } else if !facts.has_code
3232        && !facts.has_single_comment
3233        && !facts.has_multi_comment
3234        && trimmed.is_empty()
3235    {
3236        raw.blank_only_lines += 1;
3237    } else if facts.has_code && facts.has_single_comment {
3238        raw.mixed_code_single_comment_lines += 1;
3239    } else if facts.has_code && facts.has_multi_comment {
3240        raw.mixed_code_multi_comment_lines += 1;
3241    } else if facts.has_code {
3242        raw.code_only_lines += 1;
3243    } else if facts.has_single_comment {
3244        raw.single_comment_only_lines += 1;
3245    } else if facts.has_multi_comment {
3246        raw.multi_comment_only_lines += 1;
3247    } else if trimmed.is_empty() {
3248        raw.blank_only_lines += 1;
3249    } else {
3250        raw.skipped_unknown_lines += 1;
3251    }
3252}
3253
3254fn count_symbols(patterns: &SymbolPatterns, trimmed: &str) -> (u64, u64, u64, u64, u64, u64, u64) {
3255    let hit = |pats: &[&str]| u64::from(pats.iter().any(|p| trimmed.starts_with(p)));
3256    // For return-type-led languages (C/C++): match prefix AND `(` present AND no `=` sits
3257    // between the prefix start and the first `(` (guards against `void* p = malloc(n)`).
3258    let fn_pp = if patterns.functions_prefix_paren.is_empty() {
3259        0
3260    } else if let Some(paren_pos) = trimmed.find('(') {
3261        if trimmed[..paren_pos].contains('=') {
3262            0
3263        } else {
3264            hit(patterns.functions_prefix_paren)
3265        }
3266    } else {
3267        0
3268    };
3269    let test_hit = hit(patterns.tests);
3270    // Lines matching a test pattern count as tests, not as plain functions or classes.
3271    // This prevents double-counting in Python (`def test_` / `class Test`) and Go
3272    // (`func Test` / `func Benchmark` / `func Fuzz`) where the same line satisfies both
3273    // a function/class prefix and a test pattern. Rust is unaffected: `#[test]` is a
3274    // standalone attribute line; the `fn` declaration on the next line does not match any
3275    // test pattern and still increments functions correctly.
3276    let fn_hit = if test_hit == 0 {
3277        hit(patterns.functions) | fn_pp
3278    } else {
3279        0
3280    };
3281    let class_hit = if test_hit == 0 {
3282        hit(patterns.classes)
3283    } else {
3284        0
3285    };
3286    // Complement of `functions_prefix_paren`: same type keywords, but triggered when
3287    // there is no unguarded `(` on the line (i.e. not a function definition).
3288    let var_pnp: u64 = if !patterns.variables_prefix_no_paren.is_empty()
3289        && hit(patterns.variables_prefix_no_paren) != 0
3290    {
3291        if let Some(pp) = trimmed.find('(') {
3292            if trimmed[..pp].contains('=') {
3293                1
3294            } else {
3295                0
3296            }
3297        } else {
3298            1
3299        }
3300    } else {
3301        0
3302    };
3303    (
3304        fn_hit,
3305        class_hit,
3306        hit(patterns.variables) | var_pnp,
3307        hit(patterns.imports),
3308        test_hit,
3309        hit(patterns.assertions),
3310        hit(patterns.test_suites),
3311    )
3312}
3313
/// Report whether `line[start..end]` stands alone as a word.
///
/// The byte just before `start` and the byte at `end` must each be either absent (edge of
/// the line) or something other than an ASCII letter, digit, or underscore.
fn is_word_boundary(line: &[u8], start: usize, end: usize) -> bool {
    let is_ident_byte = |b: u8| b.is_ascii_alphanumeric() || b == b'_';
    let left_clear = match start.checked_sub(1) {
        None => true,
        Some(prev) => !is_ident_byte(line[prev]),
    };
    let right_clear = end >= line.len() || !is_ident_byte(line[end]);
    left_clear && right_clear
}
3321
3322/// True when `kw_bytes` appears at `line[i..]`, respecting word boundaries when `word_kw` is set.
3323fn keyword_matches_at(line: &[u8], i: usize, kw_bytes: &[u8], word_kw: bool) -> bool {
3324    if &line[i..i + kw_bytes.len()] != kw_bytes {
3325        return false;
3326    }
3327    !word_kw || is_word_boundary(line, i, i + kw_bytes.len())
3328}
3329
3330/// Count branch keyword occurrences in `line` (ASCII bytes of a trimmed code line).
3331///
3332/// Alphabetic keywords are matched word-bounded (not as substrings of longer identifiers).
3333/// Operator tokens (`||`, `&&`, `?`) are matched as raw substrings.
3334fn count_branch_in_line(line: &[u8], keywords: &[&str]) -> u32 {
3335    if keywords.is_empty() || line.is_empty() {
3336        return 0;
3337    }
3338    let mut total = 0u32;
3339    for &kw in keywords {
3340        let kw_bytes = kw.as_bytes();
3341        let word_kw = kw.bytes().all(|b| b.is_ascii_alphabetic() || b == b'_');
3342        let mut i = 0usize;
3343        while i + kw_bytes.len() <= line.len() {
3344            if keyword_matches_at(line, i, kw_bytes, word_kw) {
3345                total += 1;
3346                i += kw_bytes.len();
3347            } else {
3348                i += 1;
3349            }
3350        }
3351    }
3352    total
3353}
3354
/// Report whether the characters of `needle` appear in `chars` starting at `index`.
///
/// Returns `false` when `index` is past the end of `chars` or when `needle` would run beyond
/// it. An empty `needle` matches at every position up to and including `chars.len()`.
/// The comparison allocates nothing; this function runs for every scanned character.
fn starts_with(chars: &[char], index: usize, needle: &str) -> bool {
    let Some(tail) = chars.get(index..) else {
        return false;
    };
    let mut haystack = tail.iter();
    needle
        .chars()
        .all(|expected| haystack.next() == Some(&expected))
}
3359
/// One entry of the indentation stack used while looking for Python docstrings.
#[derive(Debug, Clone)]
struct PyContext {
    /// Indentation of the scope, measured in leading whitespace characters.
    indent: usize,
    /// True until the first significant line of the scope has been checked for a docstring.
    expect_docstring: bool,
}
3365
3366/// Update `contexts` to pop any scopes that the current `indent` has outdented past.
3367fn py_pop_outdented_contexts(contexts: &mut Vec<PyContext>, indent: usize) {
3368    while contexts.len() > 1 && indent < contexts.last().map_or(0, |c| c.indent) {
3369        contexts.pop();
3370    }
3371}
3372
3373/// Handle `pending_block_indent` transition: push a new docstring-expecting context when we
3374/// detect the first indented line of a new block, or cancel the pending state otherwise.
3375fn py_handle_pending_indent(
3376    pending_block_indent: &mut Option<usize>,
3377    contexts: &mut Vec<PyContext>,
3378    indent: usize,
3379    trimmed: &str,
3380) {
3381    let Some(base_indent) = *pending_block_indent else {
3382        return;
3383    };
3384    if indent > base_indent {
3385        contexts.push(PyContext {
3386            indent,
3387            expect_docstring: true,
3388        });
3389        *pending_block_indent = None;
3390    } else if !trimmed.starts_with('@') {
3391        *pending_block_indent = None;
3392    }
3393}
3394
3395/// Check whether the current line is a docstring opener in the current context.
3396///
3397/// If it is, records the line, adjusts `ctx.expect_docstring`, and optionally sets
3398/// `active_docstring` for multi-line docstrings. Returns `true` when the caller should
3399/// `continue` to the next line.
3400fn py_try_record_docstring(
3401    ctx: &mut PyContext,
3402    trimmed: &str,
3403    idx: usize,
3404    docstring_lines: &mut HashSet<usize>,
3405    active_docstring: &mut Option<(&'static str, usize)>,
3406) -> bool {
3407    if !ctx.expect_docstring {
3408        return false;
3409    }
3410    if let Some(delim) = docstring_delimiter(trimmed) {
3411        docstring_lines.insert(idx);
3412        ctx.expect_docstring = false;
3413        if !closes_triple_docstring(trimmed, delim, true) {
3414            *active_docstring = Some((delim, idx));
3415        }
3416        return true;
3417    }
3418    ctx.expect_docstring = false;
3419    false
3420}
3421
3422/// Advance through an active multi-line docstring: marks the current line and clears
3423/// `active_docstring` when the closing delimiter is found. Returns `true` when the caller
3424/// should `continue` to the next line (i.e. we were inside a docstring).
3425fn track_active_docstring(
3426    active_docstring: &mut Option<(&'static str, usize)>,
3427    docstring_lines: &mut HashSet<usize>,
3428    idx: usize,
3429    trimmed: &str,
3430) -> bool {
3431    let Some((delim, start_line)) = *active_docstring else {
3432        return false;
3433    };
3434    docstring_lines.insert(idx);
3435    if closes_triple_docstring(trimmed, delim, idx == start_line) {
3436        *active_docstring = None;
3437    }
3438    true
3439}
3440
3441/// Attempt to record a docstring opener using the top of the context stack.
3442/// Returns `true` when the caller should `continue` to the next line.
3443fn try_record_docstring_if_context(
3444    contexts: &mut [PyContext],
3445    trimmed: &str,
3446    idx: usize,
3447    docstring_lines: &mut HashSet<usize>,
3448    active_docstring: &mut Option<(&'static str, usize)>,
3449) -> bool {
3450    let Some(ctx) = contexts.last_mut() else {
3451        return false;
3452    };
3453    py_try_record_docstring(ctx, trimmed, idx, docstring_lines, active_docstring)
3454}
3455
/// Record every line from the opening line of a still-open docstring to the end of the file.
///
/// Does nothing when `active_docstring` is `None`.
fn mark_unclosed_docstring_lines(
    active_docstring: Option<&(&'static str, usize)>,
    docstring_lines: &mut HashSet<usize>,
    num_lines: usize,
) {
    let Some(&(_, start_line)) = active_docstring else {
        return;
    };
    docstring_lines.extend(start_line..num_lines);
}
3468
3469fn detect_python_docstring_lines(text: &str) -> HashSet<usize> {
3470    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
3471    let lines: Vec<&str> = normalized.split_terminator('\n').collect();
3472
3473    let mut docstring_lines = HashSet::new();
3474    let mut contexts = vec![PyContext {
3475        indent: 0,
3476        expect_docstring: true,
3477    }];
3478    let mut pending_block_indent: Option<usize> = None;
3479    let mut active_docstring: Option<(&'static str, usize)> = None;
3480
3481    for (idx, line) in lines.iter().enumerate() {
3482        let trimmed = line.trim();
3483        let indent = leading_indent(line);
3484
3485        if track_active_docstring(&mut active_docstring, &mut docstring_lines, idx, trimmed) {
3486            continue;
3487        }
3488
3489        // Blank lines and comment lines don't affect docstring detection.
3490        if trimmed.is_empty() || trimmed.starts_with('#') {
3491            continue;
3492        }
3493
3494        py_pop_outdented_contexts(&mut contexts, indent);
3495        py_handle_pending_indent(&mut pending_block_indent, &mut contexts, indent, trimmed);
3496
3497        if try_record_docstring_if_context(
3498            &mut contexts,
3499            trimmed,
3500            idx,
3501            &mut docstring_lines,
3502            &mut active_docstring,
3503        ) {
3504            continue;
3505        }
3506
3507        if is_python_block_header(trimmed) {
3508            pending_block_indent = Some(indent);
3509        }
3510    }
3511
3512    mark_unclosed_docstring_lines(active_docstring.as_ref(), &mut docstring_lines, lines.len());
3513
3514    docstring_lines
3515}
3516
/// Count the whitespace characters at the start of `line`.
///
/// Every whitespace character counts as one, tabs included.
fn leading_indent(line: &str) -> usize {
    let mut width = 0usize;
    for ch in line.chars() {
        if !ch.is_whitespace() {
            break;
        }
        width += 1;
    }
    width
}
3520
/// Report whether a trimmed line is a single-line `def`, `async def`, or `class` header,
/// that is, it starts with one of those keywords followed by a space and ends with `:`.
fn is_python_block_header(trimmed: &str) -> bool {
    const HEADER_KEYWORDS: [&str; 3] = ["def ", "async def ", "class "];
    trimmed.ends_with(':') && HEADER_KEYWORDS.iter().any(|kw| trimmed.starts_with(*kw))
}
3527
/// Identify the triple-quote delimiter that opens `trimmed`, if any.
///
/// Any run of string-prefix letters (`r`, `u`, `b`, `f`, in either case) at the start is
/// skipped first. Returns `"""` or `'''` when the remainder begins with that delimiter,
/// otherwise `None`.
fn docstring_delimiter(trimmed: &str) -> Option<&'static str> {
    const DELIMITERS: [&str; 2] = ["\"\"\"", "'''"];
    let after_prefix = trimmed
        .trim_start_matches(|c: char| matches!(c, 'r' | 'R' | 'u' | 'U' | 'b' | 'B' | 'f' | 'F'));
    DELIMITERS
        .iter()
        .copied()
        .find(|delim| after_prefix.starts_with(*delim))
}
3549
/// Report whether `trimmed` contains the delimiter that closes a triple-quoted docstring.
///
/// Non-overlapping occurrences of `delim` are counted. On the line that also opens the
/// docstring (`same_line_as_start`) two occurrences are needed, the opener and the closer;
/// on any later line one is enough. An empty `delim` never closes anything.
fn closes_triple_docstring(trimmed: &str, delim: &str, same_line_as_start: bool) -> bool {
    // Without this guard an empty pattern would match at every position.
    if delim.is_empty() {
        return false;
    }
    let required = if same_line_as_start { 2 } else { 1 };
    // Stop counting as soon as enough occurrences have been seen.
    trimmed.matches(delim).take(required).count() == required
}
3564
3565/// Tree-sitter-backed adapters (compiled only when the `tree-sitter` feature is enabled).
3566///
3567/// When parsing succeeds the result is used directly; on any failure the caller falls back
3568/// to the lexical state machine.
3569#[cfg(feature = "tree-sitter")]
3570pub mod ts {
3571    use tree_sitter::Node;
3572
3573    use super::{ParseMode, RawFileAnalysis, RawLineCounts};
3574
3575    /// Configuration for which AST node kinds map to symbols in this grammar.
3576    struct SymbolKinds {
3577        /// Node kind name for function definitions (e.g. `"function_definition"`).
3578        function_def: &'static str,
3579        /// Node kind name for class definitions (e.g. `"class_definition"`).
3580        class_def: &'static str,
3581        /// Name field of a function node that, when it starts with this prefix, marks a test.
3582        /// Empty string disables test-prefix detection.
3583        test_fn_prefix: &'static str,
3584        /// Name field of a class node that, when it starts with this prefix, marks a test.
3585        /// Empty string disables test-prefix detection.
3586        test_class_prefix: &'static str,
3587        /// When non-empty, `call` nodes whose `function` is an `attribute` access and whose
3588        /// attribute identifier starts with this prefix are counted as test assertions.
3589        /// Used for Python `self.assertXxx(...)` detection.
3590        assertion_attr_prefix: &'static str,
3591    }
3592
3593    impl SymbolKinds {
3594        const fn none() -> Self {
3595            Self {
3596                function_def: "",
3597                class_def: "",
3598                test_fn_prefix: "",
3599                test_class_prefix: "",
3600                assertion_attr_prefix: "",
3601            }
3602        }
3603    }
3604
3605    /// Classify every line of `text` using a tree-sitter grammar.
3606    ///
3607    /// `comment_node_kinds` — node type names that represent comments in this grammar
3608    /// `docstring_stmt_kind` — optional parent node type whose direct `string` child is a docstring
3609    /// `symbols` — AST node kinds used to populate symbol counters
3610    fn analyze_lines(
3611        text: &str,
3612        ts_language: &tree_sitter::Language,
3613        comment_node_kinds: &[&str],
3614        docstring_stmt_kind: Option<&str>,
3615        symbols: &SymbolKinds,
3616    ) -> Option<RawFileAnalysis> {
3617        let mut parser = tree_sitter::Parser::new();
3618        parser.set_language(ts_language).ok()?;
3619        let tree = parser.parse(text, None)?;
3620
3621        let lines: Vec<&str> = text.split_terminator('\n').collect();
3622        let n = lines.len();
3623
3624        let mut has_code = vec![false; n];
3625        let mut has_comment = vec![false; n];
3626        let mut comment_is_block = vec![false; n];
3627        let mut has_docstring = vec![false; n];
3628
3629        // Walk every node in the tree and mark line arrays.
3630        let mut ctx = VisitCtx {
3631            source: text.as_bytes(),
3632            comment_kinds: comment_node_kinds,
3633            docstring_stmt_kind,
3634            has_code: &mut has_code,
3635            has_comment: &mut has_comment,
3636            comment_is_block: &mut comment_is_block,
3637            has_docstring: &mut has_docstring,
3638        };
3639        visit(tree.root_node(), &mut ctx);
3640
3641        let mut raw = RawLineCounts::default();
3642        classify_ts_lines(
3643            &lines,
3644            &has_code,
3645            &has_comment,
3646            &comment_is_block,
3647            &has_docstring,
3648            &mut raw,
3649        );
3650
3651        // Symbol counting: walk the AST a second time to collect function/class/test counts.
3652        if !symbols.function_def.is_empty() || !symbols.class_def.is_empty() {
3653            count_symbols(tree.root_node(), text.as_bytes(), symbols, &mut raw);
3654        }
3655
3656        Some(RawFileAnalysis {
3657            raw,
3658            parse_mode: ParseMode::TreeSitter,
3659            warnings: Vec::new(),
3660            style_analysis: None,
3661        })
3662    }
3663
3664    /// Recurse into every direct child of `node`.
3665    fn recurse_children(node: Node, source: &[u8], kinds: &SymbolKinds, raw: &mut RawLineCounts) {
3666        for i in 0..node.child_count() {
3667            #[allow(clippy::cast_possible_truncation)]
3668            if let Some(child) = node.child(i as u32) {
3669                count_symbols(child, source, kinds, raw);
3670            }
3671        }
3672    }
3673
3674    /// Handle a function-definition node. Returns `true` if the node matched.
3675    fn try_count_function(
3676        node: Node,
3677        source: &[u8],
3678        kinds: &SymbolKinds,
3679        raw: &mut RawLineCounts,
3680    ) -> bool {
3681        if kinds.function_def.is_empty() || node.kind() != kinds.function_def {
3682            return false;
3683        }
3684        let name = node
3685            .child_by_field_name("name")
3686            .and_then(|n| n.utf8_text(source).ok())
3687            .unwrap_or("");
3688        if !kinds.test_fn_prefix.is_empty() && name.starts_with(kinds.test_fn_prefix) {
3689            raw.test_count += 1;
3690        } else {
3691            raw.functions += 1;
3692        }
3693        recurse_children(node, source, kinds, raw);
3694        true
3695    }
3696
3697    /// Handle a class-definition node. Returns `true` if the node matched.
3698    fn try_count_class(
3699        node: Node,
3700        source: &[u8],
3701        kinds: &SymbolKinds,
3702        raw: &mut RawLineCounts,
3703    ) -> bool {
3704        if kinds.class_def.is_empty() || node.kind() != kinds.class_def {
3705            return false;
3706        }
3707        let name = node
3708            .child_by_field_name("name")
3709            .and_then(|n| n.utf8_text(source).ok())
3710            .unwrap_or("");
3711        if !kinds.test_class_prefix.is_empty() && name.starts_with(kinds.test_class_prefix) {
3712            raw.test_count += 1;
3713        } else {
3714            raw.classes += 1;
3715        }
3716        recurse_children(node, source, kinds, raw);
3717        true
3718    }
3719
3720    /// Handle an assertion call node. Returns `true` if the node matched (skips recursion
3721    /// into arguments, preserving "don't double-count test bodies" semantics).
3722    fn try_count_assertion(
3723        node: Node,
3724        source: &[u8],
3725        kinds: &SymbolKinds,
3726        raw: &mut RawLineCounts,
3727    ) -> bool {
3728        if kinds.assertion_attr_prefix.is_empty() || node.kind() != "call" {
3729            return false;
3730        }
3731        let Some(func) = node.child_by_field_name("function") else {
3732            return false;
3733        };
3734        if func.kind() != "attribute" {
3735            return false;
3736        }
3737        let attr_text = func
3738            .child_by_field_name("attribute")
3739            .and_then(|n| n.utf8_text(source).ok())
3740            .unwrap_or("");
3741        if !attr_text.starts_with(kinds.assertion_attr_prefix) {
3742            return false;
3743        }
3744        raw.test_assertion_count += 1;
3745        true
3746    }
3747
3748    /// Walk the AST and populate `raw.functions`, `raw.classes`, `raw.test_count`,
3749    /// and `raw.test_assertion_count`.
3750    fn count_symbols(node: Node, source: &[u8], kinds: &SymbolKinds, raw: &mut RawLineCounts) {
3751        if try_count_function(node, source, kinds, raw) {
3752            return;
3753        }
3754        if try_count_class(node, source, kinds, raw) {
3755            return;
3756        }
3757        if try_count_assertion(node, source, kinds, raw) {
3758            return;
3759        }
3760        recurse_children(node, source, kinds, raw);
3761    }
3762
/// Per-line summary of what the AST walk found on one physical line.
// The four properties are independent of each other, so four plain bools model them
// more directly than an enum or bitset would.
#[allow(clippy::struct_excessive_bools)]
#[derive(Clone, Copy)]
struct TsLineFlags {
    /// At least one code node touches the line.
    has_code: bool,
    /// At least one comment node touches the line.
    has_comment: bool,
    /// A comment on the line was recognised as a block comment.
    comment_is_block: bool,
    /// A docstring literal touches the line.
    has_docstring: bool,
}
3773
3774    /// Classify a single tree-sitter-annotated line and accumulate into `raw`.
3775    const fn classify_ts_line(trimmed: &str, flags: TsLineFlags, raw: &mut RawLineCounts) {
3776        if trimmed.is_empty() {
3777            raw.blank_only_lines += 1;
3778        } else if flags.has_docstring && !flags.has_code {
3779            raw.docstring_comment_lines += 1;
3780        } else if flags.has_code && flags.has_comment {
3781            // Classify the mixed line as single or multi based on what kind of comment is on it.
3782            if flags.comment_is_block {
3783                raw.mixed_code_multi_comment_lines += 1;
3784            } else {
3785                raw.mixed_code_single_comment_lines += 1;
3786            }
3787        } else if flags.has_comment {
3788            if flags.comment_is_block {
3789                raw.multi_comment_only_lines += 1;
3790            } else {
3791                raw.single_comment_only_lines += 1;
3792            }
3793        } else {
3794            raw.code_only_lines += 1;
3795        }
3796    }
3797
3798    /// Classify each tree-sitter-annotated line and accumulate counts into `raw`.
3799    fn classify_ts_lines(
3800        lines: &[&str],
3801        has_code: &[bool],
3802        has_comment: &[bool],
3803        comment_is_block: &[bool],
3804        has_docstring: &[bool],
3805        raw: &mut RawLineCounts,
3806    ) {
3807        for i in 0..lines.len() {
3808            raw.total_physical_lines += 1;
3809            classify_ts_line(
3810                lines[i].trim(),
3811                TsLineFlags {
3812                    has_code: has_code[i],
3813                    has_comment: has_comment[i],
3814                    comment_is_block: comment_is_block[i],
3815                    has_docstring: has_docstring[i],
3816                },
3817                raw,
3818            );
3819        }
3820    }
3821
/// State shared by the line-marking AST walk.
///
/// The four flag buffers hold one entry per physical line. They are borrowed as
/// mutable slices rather than `&mut Vec<bool>` because the walk only reads their
/// length and writes individual entries; it never grows or shrinks them. A
/// `&mut Vec<bool>` still coerces to the slice type when the struct is built.
struct VisitCtx<'a> {
    /// Raw bytes of the file being analysed; used to read node text.
    source: &'a [u8],
    /// Node kind names treated as comments.
    comment_kinds: &'a [&'a str],
    /// Statement kind whose sole named `string` child counts as a docstring, if any.
    docstring_stmt_kind: Option<&'a str>,
    /// Set for rows touched by a code node.
    has_code: &'a mut [bool],
    /// Set for rows touched by a comment node.
    has_comment: &'a mut [bool],
    /// Set for rows touched by a block comment.
    comment_is_block: &'a mut [bool],
    /// Set for rows touched by a docstring literal.
    has_docstring: &'a mut [bool],
}
3831
3832    /// Mark all rows of a comment node and detect whether it is a block comment.
3833    fn visit_comment_node(node: Node, ctx: &mut VisitCtx<'_>) {
3834        let start_row = node.start_position().row;
3835        let end_row = node.end_position().row;
3836        let first_two = node
3837            .utf8_text(ctx.source)
3838            .unwrap_or("")
3839            .get(..2)
3840            .unwrap_or("");
3841        let is_block = first_two == "/*" || first_two == "<#";
3842        for row in start_row..=end_row {
3843            if row < ctx.has_comment.len() {
3844                ctx.has_comment[row] = true;
3845                if is_block {
3846                    ctx.comment_is_block[row] = true;
3847                }
3848            }
3849        }
3850    }
3851
3852    /// If `node` is an `expression_statement` whose sole named child is a string literal,
3853    /// mark those rows as docstring and return `true`.
3854    fn visit_maybe_docstring(node: Node, kind: &str, ctx: &mut VisitCtx<'_>) -> bool {
3855        let Some(stmt_kind) = ctx.docstring_stmt_kind else {
3856            return false;
3857        };
3858        if kind != stmt_kind || node.named_child_count() != 1 {
3859            return false;
3860        }
3861        let Some(child) = node.named_child(0) else {
3862            return false;
3863        };
3864        if child.kind() != "string" {
3865            return false;
3866        }
3867        let child_start = child.start_position().row;
3868        let child_end = child.end_position().row;
3869        for row in child_start..=child_end {
3870            if row < ctx.has_docstring.len() {
3871                ctx.has_docstring[row] = true;
3872            }
3873        }
3874        true
3875    }
3876
3877    /// Mark all rows of a leaf (non-comment, non-extra) node as code.
3878    fn visit_leaf_code(node: Node, ctx: &mut VisitCtx<'_>) {
3879        let start_row = node.start_position().row;
3880        let end_row = node.end_position().row;
3881        for row in start_row..=end_row {
3882            if row < ctx.has_code.len() {
3883                ctx.has_code[row] = true;
3884            }
3885        }
3886    }
3887
3888    #[allow(clippy::too_many_lines)]
3889    fn visit(node: Node, ctx: &mut VisitCtx<'_>) {
3890        let kind = node.kind();
3891
3892        // Comment node — mark rows as comment, detect block vs. line comment.
3893        if ctx.comment_kinds.contains(&kind) {
3894            visit_comment_node(node, ctx);
3895            return;
3896        }
3897
3898        // Python docstring: expression_statement whose only named child is a string literal.
3899        if visit_maybe_docstring(node, kind, ctx) {
3900            return;
3901        }
3902
3903        // Leaf non-comment node: mark as code.
3904        if node.child_count() == 0 && !node.is_extra() {
3905            visit_leaf_code(node, ctx);
3906            return;
3907        }
3908
3909        for i in 0..node.child_count() {
3910            #[allow(clippy::cast_possible_truncation)]
3911            // child_count bounded by tree-sitter u32 capacity
3912            if let Some(child) = node.child(i as u32) {
3913                visit(child, ctx);
3914            }
3915        }
3916    }
3917
3918    const C_SYMBOLS: SymbolKinds = SymbolKinds::none();
3919
3920    const PYTHON_SYMBOLS: SymbolKinds = SymbolKinds {
3921        function_def: "function_definition",
3922        class_def: "class_definition",
3923        test_fn_prefix: "test_",
3924        test_class_prefix: "Test",
3925        assertion_attr_prefix: "assert",
3926    };
3927
3928    /// Parse C or C++ source with tree-sitter-c.
3929    #[must_use]
3930    pub fn analyze_c(text: &str) -> Option<RawFileAnalysis> {
3931        let lang: tree_sitter::Language = tree_sitter_c::LANGUAGE.into();
3932        analyze_lines(text, &lang, &["comment"], None, &C_SYMBOLS)
3933    }
3934
3935    /// Parse Python source with tree-sitter-python.
3936    #[must_use]
3937    pub fn analyze_python(text: &str) -> Option<RawFileAnalysis> {
3938        let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
3939        analyze_lines(
3940            text,
3941            &lang,
3942            &["comment"],
3943            Some("expression_statement"),
3944            &PYTHON_SYMBOLS,
3945        )
3946    }
3947}
3948
#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;
    use std::path::Path;

    use super::*;

    // ── Shared helpers ───────────────────────────────────────────────────────

    /// Detects the language of `file_name` from its name alone: no first line,
    /// no extension overrides, shebang detection off.
    fn detect_by_path(file_name: &str) -> Option<Language> {
        detect_language(Path::new(file_name), None, &BTreeMap::new(), false)
    }

    /// Detects the language of an extension-less file called `script` whose first
    /// line is `first_line`, with shebang detection switched on or off.
    fn detect_by_shebang(first_line: &str, shebang_detection: bool) -> Option<Language> {
        detect_language(
            Path::new("script"),
            Some(first_line),
            &BTreeMap::new(),
            shebang_detection,
        )
    }

    /// Analyses a single source line with default options and returns, in order:
    /// functions, classes, variables, imports, test count, test assertion count,
    /// test suite count.
    fn sym(lang: Language, line: &str) -> (u64, u64, u64, u64, u64, u64, u64) {
        let source = format!("{line}\n");
        let result = analyze_text(lang, &source, AnalysisOptions::default());
        let counts = &result.raw;
        (
            counts.functions,
            counts.classes,
            counts.variables,
            counts.imports,
            counts.test_count,
            counts.test_assertion_count,
            counts.test_suite_count,
        )
    }

    // ── Line classification ──────────────────────────────────────────────────

    #[test]
    fn python_docstrings_are_separated() {
        let input = r#""""module docs"""


def fn_a():
    """function docs"""
    value = 1  # trailing comment
    return value
"#;

        let result = analyze_text(Language::Python, input, AnalysisOptions::default());
        assert_eq!(result.raw.docstring_comment_lines, 2);
        assert_eq!(result.raw.mixed_code_single_comment_lines, 1);
        assert_eq!(result.raw.code_only_lines, 2);
    }

    #[test]
    fn c_style_mixed_lines_are_captured() {
        let input = "int x = 1; // note\n/* block */\n";
        let result = analyze_text(Language::C, input, AnalysisOptions::default());
        assert_eq!(result.raw.mixed_code_single_comment_lines, 1);
        assert_eq!(result.raw.multi_comment_only_lines, 1);
    }

    #[test]
    fn detect_language_by_shebang() {
        let language = detect_by_shebang("#!/usr/bin/env bash", true);
        assert_eq!(language, Some(Language::Shell));
    }

    // ── count_symbols: no double-counting of test functions ──────────────────

    #[test]
    fn python_test_fn_not_double_counted() {
        // def test_ lines count as tests only, NOT as functions
        let (f, c, _, _, t, _, _) = sym(Language::Python, "def test_foo():");
        assert_eq!(f, 0, "test fn must not also increment functions");
        assert_eq!(t, 1, "must be counted as a test");
        assert_eq!(c, 0);
    }

    #[test]
    fn python_test_class_not_double_counted() {
        // class Test* lines count as tests only, NOT as classes
        let (f, c, _, _, t, _, _) = sym(Language::Python, "class TestFoo:");
        assert_eq!(c, 0, "test class must not also increment classes");
        assert_eq!(t, 1, "must be counted as a test");
        assert_eq!(f, 0);
    }

    #[test]
    fn python_regular_fn_counts_as_function() {
        let (f, c, _, _, t, _, _) = sym(Language::Python, "def regular():");
        assert_eq!(f, 1, "regular function must be counted");
        assert_eq!(t, 0);
        assert_eq!(c, 0);
    }

    #[test]
    fn python_regular_class_counts_as_class() {
        let (f, c, _, _, t, _, _) = sym(Language::Python, "class Regular:");
        assert_eq!(c, 1, "regular class must be counted");
        assert_eq!(t, 0);
        assert_eq!(f, 0);
    }

    #[test]
    fn go_test_fn_not_double_counted() {
        let (f, _, _, _, t, _, _) = sym(Language::Go, "func TestFoo(t *testing.T) {");
        assert_eq!(f, 0, "Go test func must not also increment functions");
        assert_eq!(t, 1, "must be counted as a test");
    }

    #[test]
    fn go_benchmark_fn_not_double_counted() {
        let (f, _, _, _, t, _, _) = sym(Language::Go, "func BenchmarkBar(b *testing.B) {");
        assert_eq!(f, 0, "Go benchmark func must not also increment functions");
        assert_eq!(t, 1, "must be counted as a test");
    }

    #[test]
    fn go_regular_fn_counts_as_function() {
        let (f, _, _, _, t, _, _) = sym(Language::Go, "func doSomething() {");
        assert_eq!(f, 1, "regular Go func must be counted");
        assert_eq!(t, 0);
    }

    #[test]
    fn rust_test_attr_counts_as_test_not_function() {
        // #[test] is a standalone attribute line — counted as a test, never as a function
        let (f, _, _, _, t, _, _) = sym(Language::Rust, "#[test]");
        assert_eq!(t, 1, "#[test] must be counted as a test");
        assert_eq!(f, 0, "#[test] attribute must not be counted as a function");
    }

    #[test]
    fn rust_fn_line_counts_as_function_not_test() {
        // The fn declaration after #[test] does NOT match any test pattern
        let (f, _, _, _, t, _, _) = sym(Language::Rust, "fn test_something() {");
        assert_eq!(f, 1, "fn declaration must count as a function");
        assert_eq!(
            t, 0,
            "fn declaration line must not be double-counted as a test"
        );
    }

    #[test]
    fn js_describe_counts_as_test_not_function() {
        let (f, _, _, _, t, _, _) = sym(Language::JavaScript, "describe('suite', () => {");
        assert_eq!(t, 1, "describe must be counted as a test");
        assert_eq!(f, 0, "describe must not be counted as a function");
    }

    #[test]
    fn js_regular_fn_counts_as_function() {
        let (f, _, _, _, t, _, _) = sym(Language::JavaScript, "function doWork() {");
        assert_eq!(f, 1, "JS function declaration must be counted");
        assert_eq!(t, 0);
    }

    // ── Language detection tests ─────────────────────────────────────────────

    #[test]
    fn detect_language_rs_extension() {
        assert_eq!(detect_by_path("foo.rs"), Some(Language::Rust));
    }

    #[test]
    fn detect_language_py_extension() {
        assert_eq!(detect_by_path("foo.py"), Some(Language::Python));
    }

    #[test]
    fn detect_language_ts_extension() {
        assert_eq!(detect_by_path("app.ts"), Some(Language::TypeScript));
    }

    #[test]
    fn detect_language_js_extension() {
        assert_eq!(detect_by_path("app.js"), Some(Language::JavaScript));
    }

    #[test]
    fn detect_language_go_extension() {
        assert_eq!(detect_by_path("main.go"), Some(Language::Go));
    }

    #[test]
    fn detect_language_c_extension() {
        assert_eq!(detect_by_path("main.c"), Some(Language::C));
    }

    #[test]
    fn detect_language_cpp_extension() {
        assert_eq!(detect_by_path("main.cpp"), Some(Language::Cpp));
    }

    #[test]
    fn detect_language_java_extension() {
        assert_eq!(detect_by_path("Main.java"), Some(Language::Java));
    }

    #[test]
    fn detect_language_makefile_exact_name() {
        assert_eq!(detect_by_path("Makefile"), Some(Language::Makefile));
    }

    #[test]
    fn detect_language_dockerfile_exact_name() {
        assert_eq!(detect_by_path("Dockerfile"), Some(Language::Dockerfile));
    }

    #[test]
    fn detect_language_rakefile() {
        assert_eq!(detect_by_path("Rakefile"), Some(Language::Ruby));
    }

    #[test]
    fn detect_language_gemfile() {
        assert_eq!(detect_by_path("Gemfile"), Some(Language::Ruby));
    }

    #[test]
    fn detect_language_unknown_extension_returns_none() {
        assert_eq!(detect_by_path("foo.xyz123"), None);
    }

    #[test]
    fn detect_language_extension_override() {
        let mut overrides = BTreeMap::new();
        overrides.insert("h".into(), "cpp".into());
        let lang = detect_language(Path::new("header.h"), None, &overrides, false);
        assert_eq!(lang, Some(Language::Cpp));
    }

    #[test]
    fn detect_language_shebang_python() {
        let lang = detect_by_shebang("#!/usr/bin/env python3", true);
        assert_eq!(lang, Some(Language::Python));
    }

    #[test]
    fn detect_language_shebang_bash() {
        let lang = detect_by_shebang("#!/bin/bash", true);
        assert_eq!(lang, Some(Language::Shell));
    }

    #[test]
    fn detect_language_shebang_ruby() {
        let lang = detect_by_shebang("#!/usr/bin/env ruby", true);
        assert_eq!(lang, Some(Language::Ruby));
    }

    #[test]
    fn detect_language_shebang_disabled() {
        // When shebang_detection=false, shebang is ignored
        let lang = detect_by_shebang("#!/usr/bin/env python3", false);
        assert_eq!(lang, None);
    }

    // ── Language::from_name ──────────────────────────────────────────────────

    #[test]
    fn from_name_rust() {
        assert_eq!(Language::from_name("rust"), Some(Language::Rust));
    }

    #[test]
    fn from_name_python() {
        assert_eq!(Language::from_name("python"), Some(Language::Python));
    }

    #[test]
    fn from_name_unknown() {
        assert_eq!(Language::from_name("brainfuck"), None);
    }

    #[test]
    fn from_name_roundtrip_all() {
        // Each listed language's slug must round-trip through from_name.
        // NOTE(review): despite the test name, this list stops at `Zig`; the later
        // variants (Solidity, Protobuf, Hcl, GraphQl, Ada, Vhdl, Verilog, Tcl, Pascal,
        // VisualBasic, Lisp, Fortran, Nix, Crystal, D, Glsl, Cmake, Elm, Awk) are not
        // exercised. Extend the list once `from_name` is confirmed to accept them.
        let covered = [
            Language::C,
            Language::Cpp,
            Language::CSharp,
            Language::Go,
            Language::Java,
            Language::JavaScript,
            Language::Python,
            Language::Rust,
            Language::Shell,
            Language::PowerShell,
            Language::TypeScript,
            Language::Assembly,
            Language::Clojure,
            Language::Css,
            Language::Dart,
            Language::Dockerfile,
            Language::Elixir,
            Language::Erlang,
            Language::FSharp,
            Language::Groovy,
            Language::Haskell,
            Language::Html,
            Language::Julia,
            Language::Kotlin,
            Language::Lua,
            Language::Makefile,
            Language::Nim,
            Language::ObjectiveC,
            Language::Ocaml,
            Language::Perl,
            Language::Php,
            Language::R,
            Language::Ruby,
            Language::Scala,
            Language::Scss,
            Language::Sql,
            Language::Svelte,
            Language::Swift,
            Language::Vue,
            Language::Xml,
            Language::Zig,
        ];
        for lang in covered {
            let slug = lang.as_slug();
            let roundtripped = Language::from_name(slug);
            assert_eq!(
                roundtripped,
                Some(lang),
                "from_name({slug:?}) should return {lang:?}"
            );
        }
    }

    // ── blank_in_block_comment_policy behavioral tests ───────────────────────

    #[test]
    fn blank_in_block_comment_defaults_to_comment() {
        // Default: blank lines inside /* */ count as multi-comment lines (IEEE-aligned).
        let input = "/*\n\n*/";
        let opts = AnalysisOptions {
            blank_in_block_comment_as_comment: true,
            ..Default::default()
        };
        let result = analyze_text(Language::C, input, opts);
        assert_eq!(
            result.raw.multi_comment_only_lines, 3,
            "all 3 block-comment lines must count as multi-comment with CountAsComment policy"
        );
        assert_eq!(
            result.raw.blank_only_lines, 0,
            "no blank lines expected with CountAsComment policy"
        );
    }

    #[test]
    fn blank_in_block_comment_counted_as_blank_when_policy_false() {
        // CountAsBlank: blank lines inside /* */ count as blank, not comment.
        let input = "/*\n\n*/";
        let opts = AnalysisOptions {
            blank_in_block_comment_as_comment: false,
            ..Default::default()
        };
        let result = analyze_text(Language::C, input, opts);
        assert_eq!(
            result.raw.multi_comment_only_lines, 2,
            "opener and closer must count as multi-comment with CountAsBlank policy"
        );
        assert_eq!(
            result.raw.blank_only_lines, 1,
            "the blank line inside the block comment must count as blank with CountAsBlank policy"
        );
    }

    // ── continuation_line_policy behavioral tests ────────────────────────────

    #[test]
    fn continuation_lines_each_physical_default() {
        // Default (EachPhysicalLine): every physical line counted separately.
        let input = "#define FOO \\\n  1 \\\n  + 2\n";
        let opts = AnalysisOptions {
            collapse_continuation_lines: false,
            ..Default::default()
        };
        let result = analyze_text(Language::C, input, opts);
        assert_eq!(
            result.raw.total_physical_lines, 3,
            "3 physical lines expected"
        );
        assert_eq!(
            result.raw.code_only_lines, 3,
            "each physical line must count as code with EachPhysicalLine policy"
        );
    }

    #[test]
    fn continuation_lines_collapse_to_logical() {
        // CollapseToLogical: 3 backslash-continued lines collapse to 1 logical code line.
        let input = "#define FOO \\\n  1 \\\n  + 2\n";
        let opts = AnalysisOptions {
            collapse_continuation_lines: true,
            ..Default::default()
        };
        let result = analyze_text(Language::C, input, opts);
        assert_eq!(
            result.raw.total_physical_lines, 3,
            "physical line count is always 3 regardless of policy"
        );
        assert_eq!(
            result.raw.code_only_lines, 1,
            "3 continuation lines must collapse to 1 logical code line"
        );
    }
}