Skip to main content

sloc_languages/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3
4pub mod style;
5pub use style::{IndentStyle, StyleAnalysis, StyleGuideScore, StyleSignal};
6
7use std::collections::{BTreeMap, BTreeSet, HashSet};
8use std::path::Path;
9
10use serde::{Deserialize, Serialize};
11
12#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
13#[serde(rename_all = "snake_case")]
14pub enum Language {
15    C,
16    Cpp,
17    CSharp,
18    Go,
19    Java,
20    JavaScript,
21    Python,
22    Rust,
23    Shell,
24    PowerShell,
25    TypeScript,
26    // --- Extended language support ---
27    Assembly,
28    Clojure,
29    Css,
30    Dart,
31    Dockerfile,
32    Elixir,
33    Erlang,
34    FSharp,
35    Groovy,
36    Haskell,
37    Html,
38    Julia,
39    Kotlin,
40    Lua,
41    Makefile,
42    Nim,
43    ObjectiveC,
44    Ocaml,
45    Perl,
46    Php,
47    R,
48    Ruby,
49    Scala,
50    Scss,
51    Sql,
52    Svelte,
53    Swift,
54    Vue,
55    Xml,
56    Zig,
57    // --- Pass 1: modern declarative / smart-contract languages ---
58    Solidity,
59    Protobuf,
60    Hcl,
61    GraphQl,
62    // --- Pass 2: legacy + embedded / hardware-description languages ---
63    Ada,
64    Vhdl,
65    Verilog,
66    Tcl,
67    Pascal,
68    VisualBasic,
69    Lisp,
70    // --- Pass 3: scientific / infra / systems / graphics ---
71    Fortran,
72    Nix,
73    Crystal,
74    D,
75    Glsl,
76    Cmake,
77    Elm,
78    Awk,
79}
80
81impl Language {
82    #[must_use]
83    pub const fn display_name(&self) -> &'static str {
84        match self {
85            Self::C => "C",
86            Self::Cpp => "C++",
87            Self::CSharp => "C#",
88            Self::Go => "Go",
89            Self::Java => "Java",
90            Self::JavaScript => "JavaScript",
91            Self::Python => "Python",
92            Self::Rust => "Rust",
93            Self::Shell => "Shell",
94            Self::PowerShell => "PowerShell",
95            Self::TypeScript => "TypeScript",
96            Self::Assembly => "Assembly",
97            Self::Clojure => "Clojure",
98            Self::Css => "CSS",
99            Self::Dart => "Dart",
100            Self::Dockerfile => "Dockerfile",
101            Self::Elixir => "Elixir",
102            Self::Erlang => "Erlang",
103            Self::FSharp => "F#",
104            Self::Groovy => "Groovy",
105            Self::Haskell => "Haskell",
106            Self::Html => "HTML",
107            Self::Julia => "Julia",
108            Self::Kotlin => "Kotlin",
109            Self::Lua => "Lua",
110            Self::Makefile => "Makefile",
111            Self::Nim => "Nim",
112            Self::ObjectiveC => "Objective-C",
113            Self::Ocaml => "OCaml",
114            Self::Perl => "Perl",
115            Self::Php => "PHP",
116            Self::R => "R",
117            Self::Ruby => "Ruby",
118            Self::Scala => "Scala",
119            Self::Scss => "SCSS",
120            Self::Sql => "SQL",
121            Self::Svelte => "Svelte",
122            Self::Swift => "Swift",
123            Self::Vue => "Vue",
124            Self::Xml => "XML",
125            Self::Zig => "Zig",
126            Self::Solidity => "Solidity",
127            Self::Protobuf => "Protocol Buffers",
128            Self::Hcl => "HCL/Terraform",
129            Self::GraphQl => "GraphQL",
130            Self::Ada => "Ada",
131            Self::Vhdl => "VHDL",
132            Self::Verilog => "Verilog/SystemVerilog",
133            Self::Tcl => "Tcl",
134            Self::Pascal => "Pascal/Delphi",
135            Self::VisualBasic => "Visual Basic",
136            Self::Lisp => "Lisp/Scheme",
137            Self::Fortran => "Fortran",
138            Self::Nix => "Nix",
139            Self::Crystal => "Crystal",
140            Self::D => "D",
141            Self::Glsl => "GLSL/HLSL",
142            Self::Cmake => "CMake",
143            Self::Elm => "Elm",
144            Self::Awk => "Awk",
145        }
146    }
147
148    #[must_use]
149    pub const fn as_slug(&self) -> &'static str {
150        match self {
151            Self::C => "c",
152            Self::Cpp => "cpp",
153            Self::CSharp => "csharp",
154            Self::Go => "go",
155            Self::Java => "java",
156            Self::JavaScript => "javascript",
157            Self::Python => "python",
158            Self::Rust => "rust",
159            Self::Shell => "shell",
160            Self::PowerShell => "powershell",
161            Self::TypeScript => "typescript",
162            Self::Assembly => "assembly",
163            Self::Clojure => "clojure",
164            Self::Css => "css",
165            Self::Dart => "dart",
166            Self::Dockerfile => "dockerfile",
167            Self::Elixir => "elixir",
168            Self::Erlang => "erlang",
169            Self::FSharp => "fsharp",
170            Self::Groovy => "groovy",
171            Self::Haskell => "haskell",
172            Self::Html => "html",
173            Self::Julia => "julia",
174            Self::Kotlin => "kotlin",
175            Self::Lua => "lua",
176            Self::Makefile => "makefile",
177            Self::Nim => "nim",
178            Self::ObjectiveC => "objectivec",
179            Self::Ocaml => "ocaml",
180            Self::Perl => "perl",
181            Self::Php => "php",
182            Self::R => "r",
183            Self::Ruby => "ruby",
184            Self::Scala => "scala",
185            Self::Scss => "scss",
186            Self::Sql => "sql",
187            Self::Svelte => "svelte",
188            Self::Swift => "swift",
189            Self::Vue => "vue",
190            Self::Xml => "xml",
191            Self::Zig => "zig",
192            Self::Solidity => "solidity",
193            Self::Protobuf => "protobuf",
194            Self::Hcl => "hcl",
195            Self::GraphQl => "graphql",
196            Self::Ada => "ada",
197            Self::Vhdl => "vhdl",
198            Self::Verilog => "verilog",
199            Self::Tcl => "tcl",
200            Self::Pascal => "pascal",
201            Self::VisualBasic => "visualbasic",
202            Self::Lisp => "lisp",
203            Self::Fortran => "fortran",
204            Self::Nix => "nix",
205            Self::Crystal => "crystal",
206            Self::D => "d",
207            Self::Glsl => "glsl",
208            Self::Cmake => "cmake",
209            Self::Elm => "elm",
210            Self::Awk => "awk",
211        }
212    }
213
214    #[must_use]
215    pub fn from_name(name: &str) -> Option<Self> {
216        match name.trim().to_ascii_lowercase().as_str() {
217            "c" => Some(Self::C),
218            "cpp" | "c++" | "cplusplus" => Some(Self::Cpp),
219            "csharp" | "c#" | "cs" => Some(Self::CSharp),
220            "go" | "golang" => Some(Self::Go),
221            "java" => Some(Self::Java),
222            "javascript" | "js" => Some(Self::JavaScript),
223            "python" | "py" => Some(Self::Python),
224            "rust" | "rs" => Some(Self::Rust),
225            "shell" | "sh" | "bash" => Some(Self::Shell),
226            "powershell" | "pwsh" | "ps" => Some(Self::PowerShell),
227            "typescript" | "ts" => Some(Self::TypeScript),
228            "assembly" | "asm" => Some(Self::Assembly),
229            "clojure" | "clj" => Some(Self::Clojure),
230            "css" => Some(Self::Css),
231            "dart" => Some(Self::Dart),
232            "dockerfile" | "docker" => Some(Self::Dockerfile),
233            "elixir" | "ex" => Some(Self::Elixir),
234            "erlang" | "erl" => Some(Self::Erlang),
235            "fsharp" | "f#" | "fs" => Some(Self::FSharp),
236            "groovy" => Some(Self::Groovy),
237            "haskell" | "hs" => Some(Self::Haskell),
238            "html" | "htm" => Some(Self::Html),
239            "julia" | "jl" => Some(Self::Julia),
240            "kotlin" | "kt" => Some(Self::Kotlin),
241            "lua" => Some(Self::Lua),
242            "makefile" | "make" | "mk" => Some(Self::Makefile),
243            "nim" => Some(Self::Nim),
244            "objectivec" | "objc" | "objective-c" => Some(Self::ObjectiveC),
245            "ocaml" | "ml" => Some(Self::Ocaml),
246            "perl" | "pl" => Some(Self::Perl),
247            "php" => Some(Self::Php),
248            "r" => Some(Self::R),
249            "ruby" | "rb" => Some(Self::Ruby),
250            "scala" => Some(Self::Scala),
251            "scss" | "sass" => Some(Self::Scss),
252            "sql" => Some(Self::Sql),
253            "svelte" => Some(Self::Svelte),
254            "swift" => Some(Self::Swift),
255            "vue" => Some(Self::Vue),
256            "xml" => Some(Self::Xml),
257            "zig" => Some(Self::Zig),
258            "solidity" | "sol" => Some(Self::Solidity),
259            "protobuf" | "proto" | "protocolbuffers" => Some(Self::Protobuf),
260            "hcl" | "terraform" | "tf" => Some(Self::Hcl),
261            "graphql" | "gql" => Some(Self::GraphQl),
262            "ada" => Some(Self::Ada),
263            "vhdl" => Some(Self::Vhdl),
264            "verilog" | "systemverilog" | "sv" => Some(Self::Verilog),
265            "tcl" => Some(Self::Tcl),
266            "pascal" | "delphi" | "pas" => Some(Self::Pascal),
267            "visualbasic" | "vb" | "vbnet" | "vb.net" => Some(Self::VisualBasic),
268            "lisp" | "scheme" | "racket" | "clisp" | "elisp" => Some(Self::Lisp),
269            "fortran" | "f90" | "f95" => Some(Self::Fortran),
270            "nix" => Some(Self::Nix),
271            "crystal" | "cr" => Some(Self::Crystal),
272            "d" | "dlang" => Some(Self::D),
273            "glsl" | "hlsl" | "shader" | "wgsl" => Some(Self::Glsl),
274            "cmake" => Some(Self::Cmake),
275            "elm" => Some(Self::Elm),
276            "awk" => Some(Self::Awk),
277            _ => None,
278        }
279    }
280}
281
282#[derive(Debug, Clone, Serialize, Deserialize, Default)]
283pub struct RawLineCounts {
284    pub total_physical_lines: u64,
285    pub blank_only_lines: u64,
286    pub code_only_lines: u64,
287    pub single_comment_only_lines: u64,
288    pub multi_comment_only_lines: u64,
289    pub mixed_code_single_comment_lines: u64,
290    pub mixed_code_multi_comment_lines: u64,
291    pub docstring_comment_lines: u64,
292    pub skipped_unknown_lines: u64,
293    /// Best-effort count of function/method definition lines detected lexically.
294    #[serde(default)]
295    pub functions: u64,
296    /// Best-effort count of class/struct/trait/type definition lines detected lexically.
297    #[serde(default)]
298    pub classes: u64,
299    /// Best-effort count of variable declaration lines detected lexically.
300    #[serde(default)]
301    pub variables: u64,
302    /// Best-effort count of import/use/include statement lines detected lexically.
303    #[serde(default)]
304    pub imports: u64,
305    /// Lines consisting solely of preprocessor/compiler directives (e.g. `#include`, `#define`
306    /// in C/C++/Objective-C). Always a subset of `code_only_lines`. Controlled by
307    /// `AnalysisConfig::count_compiler_directives`. IEEE 1045-1992 §4.2.
308    #[serde(default)]
309    pub compiler_directive_lines: u64,
310    /// Best-effort count of test case / test function definition lines detected lexically
311    /// (`GTest`, Catch2, `PyTest`, `JUnit`, etc.). Always a subset of `code_only_lines`.
312    #[serde(default)]
313    pub test_count: u64,
314    /// Best-effort count of test assertion call lines detected lexically
315    /// (`ASSERT_EQ`, `EXPECT_TRUE`, assertEquals, Assert.AreEqual, `assert_eq`!, etc.).
316    #[serde(default)]
317    pub test_assertion_count: u64,
318    /// Best-effort count of test suite / fixture / group declaration lines detected lexically
319    /// (`TEST_GROUP`, `BOOST_AUTO_TEST_SUITE`, [`TestClass`], [`TestFixture`], etc.).
320    #[serde(default)]
321    pub test_suite_count: u64,
322    /// Cyclomatic complexity approximation: total count of branch decision keywords found on
323    /// code lines (e.g. `if`, `for`, `while`, `||`, `&&`). Starts at 0; +1 per keyword hit.
324    #[serde(default)]
325    pub cyclomatic_complexity: u32,
326    /// Logical SLOC estimate: executable statement count using a language-specific strategy.
327    /// `None` when the language does not support lexical LSLOC estimation.
328    #[serde(default, skip_serializing_if = "Option::is_none")]
329    pub lsloc: Option<u32>,
330    /// Per-code-line content hashes (trimmed) for ULOC aggregation. Never serialized — only
331    /// populated during an in-process scan and consumed by `sloc-core` during aggregation.
332    #[serde(skip)]
333    pub code_line_hashes: Vec<u64>,
334}
335
336#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
337#[serde(rename_all = "snake_case")]
338pub enum ParseMode {
339    Lexical,
340    LexicalBestEffort,
341    TreeSitter,
342}
343
344#[derive(Debug, Clone, Serialize, Deserialize)]
345pub struct RawFileAnalysis {
346    pub raw: RawLineCounts,
347    pub parse_mode: ParseMode,
348    pub warnings: Vec<String>,
349    /// Lexical style-guide analysis for supported languages; `None` when no heuristics apply.
350    #[serde(default, skip_serializing_if = "Option::is_none")]
351    pub style_analysis: Option<StyleAnalysis>,
352}
353
354/// IEEE 1045-1992 counting options passed from `sloc-core` (built from `AnalysisConfig`).
355///
356/// `analyze_text` accepts this struct so that the caller can control behaviour that the
357/// standard defines as configurable parameters rather than fixed conventions.
358#[derive(Debug, Clone, Copy)]
359pub struct AnalysisOptions {
360    /// When `true` (IEEE 1045-1992 default), blank lines inside block comments count as
361    /// comment lines rather than blank lines.
362    pub blank_in_block_comment_as_comment: bool,
363    /// When `true`, backslash-continued physical lines are collapsed into a single logical
364    /// line for SLOC counting purposes (IEEE logical SLOC mode).
365    pub collapse_continuation_lines: bool,
366    /// When `true` (default), run lexical style-guide heuristics and populate
367    /// `RawFileAnalysis::style_analysis`. Set to `false` to skip style scoring entirely.
368    pub enable_style: bool,
369    /// Restrict style analysis to a specific language family slug (`"all"` or `"c_family"`).
370    /// When `"c_family"`, only C / C++ / Objective-C files are style-analysed.
371    pub style_lang_scope: StyleLangScope,
372}
373
/// Selects the language families that receive style-guide heuristic analysis.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum StyleLangScope {
    /// Every language is eligible.
    All,
    /// Only C, C++ and Objective-C are eligible.
    CFamilyOnly,
}
380
/// How Logical SLOC (LSLOC) is derived from a physical-line scan.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum LslocStrategy {
    /// Semicolons on code lines are counted (C, C++, Java, C#, Go, Rust, JS/TS, Kotlin,
    /// SQL, …).
    Semicolons,
    /// Non-blank code lines are counted unless their trimmed content ends with a
    /// continuation character (`\`, `,`, `(`, `[`, `{`). Suitable for Python, Ruby, Shell,
    /// Elixir, Nim.
    NonContinuationNewlines,
    /// The language has no statement boundary that simple lexical heuristics can detect;
    /// `lsloc` is `None` for such files.
    Unsupported,
}
393
394impl Default for AnalysisOptions {
395    fn default() -> Self {
396        Self {
397            blank_in_block_comment_as_comment: true,
398            collapse_continuation_lines: false,
399            enable_style: true,
400            style_lang_scope: StyleLangScope::All,
401        }
402    }
403}
404
405#[must_use]
406pub fn supported_languages() -> BTreeSet<Language> {
407    [
408        Language::Assembly,
409        Language::C,
410        Language::Clojure,
411        Language::Cpp,
412        Language::CSharp,
413        Language::Css,
414        Language::Dart,
415        Language::Dockerfile,
416        Language::Elixir,
417        Language::Erlang,
418        Language::FSharp,
419        Language::Go,
420        Language::Groovy,
421        Language::Haskell,
422        Language::Html,
423        Language::Java,
424        Language::JavaScript,
425        Language::Julia,
426        Language::Kotlin,
427        Language::Lua,
428        Language::Makefile,
429        Language::Nim,
430        Language::ObjectiveC,
431        Language::Ocaml,
432        Language::Perl,
433        Language::Php,
434        Language::PowerShell,
435        Language::Python,
436        Language::R,
437        Language::Ruby,
438        Language::Rust,
439        Language::Scala,
440        Language::Scss,
441        Language::Shell,
442        Language::Sql,
443        Language::Svelte,
444        Language::Swift,
445        Language::TypeScript,
446        Language::Vue,
447        Language::Xml,
448        Language::Zig,
449        Language::Solidity,
450        Language::Protobuf,
451        Language::Hcl,
452        Language::GraphQl,
453        Language::Ada,
454        Language::Vhdl,
455        Language::Verilog,
456        Language::Tcl,
457        Language::Pascal,
458        Language::VisualBasic,
459        Language::Lisp,
460        Language::Fortran,
461        Language::Nix,
462        Language::Crystal,
463        Language::D,
464        Language::Glsl,
465        Language::Cmake,
466        Language::Elm,
467        Language::Awk,
468    ]
469    .into_iter()
470    .collect()
471}
472
473/// Detect language from a shebang line (e.g. `#!/usr/bin/env python3`).
474fn detect_by_shebang(line: &str) -> Option<Language> {
475    let lower = line.to_ascii_lowercase();
476    if !lower.starts_with("#!") {
477        return None;
478    }
479    if lower.contains("python") {
480        return Some(Language::Python);
481    }
482    if lower.contains("pwsh") || lower.contains("powershell") {
483        return Some(Language::PowerShell);
484    }
485    if lower.contains("bash")
486        || lower.contains("/sh")
487        || lower.contains("zsh")
488        || lower.contains("ksh")
489    {
490        return Some(Language::Shell);
491    }
492    if lower.contains("ruby") {
493        return Some(Language::Ruby);
494    }
495    if lower.contains("perl") {
496        return Some(Language::Perl);
497    }
498    if lower.contains("php") {
499        return Some(Language::Php);
500    }
501    if lower.contains("node") || lower.contains("nodejs") {
502        return Some(Language::JavaScript);
503    }
504    None
505}
506
507/// Detect language purely from a (lowercased) file extension.
508#[allow(clippy::too_many_lines)]
509fn detect_by_extension(ext: &str) -> Option<Language> {
510    // Static table avoids a large match statement; each extension maps 1-to-1 to a language.
511    static EXT_MAP: &[(&str, Language)] = &[
512        ("c", Language::C),
513        ("h", Language::C),
514        ("cc", Language::Cpp),
515        ("cp", Language::Cpp),
516        ("cpp", Language::Cpp),
517        ("cxx", Language::Cpp),
518        ("hh", Language::Cpp),
519        ("hpp", Language::Cpp),
520        ("hxx", Language::Cpp),
521        ("cs", Language::CSharp),
522        ("go", Language::Go),
523        ("java", Language::Java),
524        ("js", Language::JavaScript),
525        ("mjs", Language::JavaScript),
526        ("cjs", Language::JavaScript),
527        ("py", Language::Python),
528        ("rs", Language::Rust),
529        ("sh", Language::Shell),
530        ("bash", Language::Shell),
531        ("zsh", Language::Shell),
532        ("ksh", Language::Shell),
533        ("ps1", Language::PowerShell),
534        ("psm1", Language::PowerShell),
535        ("psd1", Language::PowerShell),
536        ("ts", Language::TypeScript),
537        ("mts", Language::TypeScript),
538        ("cts", Language::TypeScript),
539        ("tsx", Language::TypeScript),
540        ("jsx", Language::JavaScript),
541        ("asm", Language::Assembly),
542        ("s", Language::Assembly),
543        ("clj", Language::Clojure),
544        ("cljs", Language::Clojure),
545        ("cljc", Language::Clojure),
546        ("edn", Language::Clojure),
547        ("css", Language::Css),
548        ("dart", Language::Dart),
549        ("ex", Language::Elixir),
550        ("exs", Language::Elixir),
551        ("erl", Language::Erlang),
552        ("hrl", Language::Erlang),
553        ("fs", Language::FSharp),
554        ("fsi", Language::FSharp),
555        ("fsx", Language::FSharp),
556        ("groovy", Language::Groovy),
557        ("gradle", Language::Groovy),
558        ("hs", Language::Haskell),
559        ("lhs", Language::Haskell),
560        ("html", Language::Html),
561        ("htm", Language::Html),
562        ("xhtml", Language::Html),
563        ("jl", Language::Julia),
564        ("kt", Language::Kotlin),
565        ("kts", Language::Kotlin),
566        ("lua", Language::Lua),
567        ("mk", Language::Makefile),
568        ("nim", Language::Nim),
569        ("nims", Language::Nim),
570        ("m", Language::ObjectiveC),
571        ("mm", Language::ObjectiveC),
572        ("ml", Language::Ocaml),
573        ("mli", Language::Ocaml),
574        ("pl", Language::Perl),
575        ("pm", Language::Perl),
576        ("t", Language::Perl),
577        ("php", Language::Php),
578        ("php3", Language::Php),
579        ("php4", Language::Php),
580        ("php5", Language::Php),
581        ("php7", Language::Php),
582        ("phtml", Language::Php),
583        ("r", Language::R),
584        ("rb", Language::Ruby),
585        ("rake", Language::Ruby),
586        ("scala", Language::Scala),
587        ("sc", Language::Scala),
588        ("scss", Language::Scss),
589        ("sass", Language::Scss),
590        ("sql", Language::Sql),
591        ("svelte", Language::Svelte),
592        ("swift", Language::Swift),
593        ("vue", Language::Vue),
594        ("xml", Language::Xml),
595        ("xsd", Language::Xml),
596        ("xsl", Language::Xml),
597        ("xslt", Language::Xml),
598        ("svg", Language::Xml),
599        ("zig", Language::Zig),
600        ("sol", Language::Solidity),
601        ("proto", Language::Protobuf),
602        ("tf", Language::Hcl),
603        ("tfvars", Language::Hcl),
604        ("hcl", Language::Hcl),
605        ("graphql", Language::GraphQl),
606        ("gql", Language::GraphQl),
607        ("adb", Language::Ada),
608        ("ads", Language::Ada),
609        ("ada", Language::Ada),
610        ("vhd", Language::Vhdl),
611        ("vhdl", Language::Vhdl),
612        ("v", Language::Verilog),
613        ("sv", Language::Verilog),
614        ("svh", Language::Verilog),
615        ("vh", Language::Verilog),
616        ("tcl", Language::Tcl),
617        ("pas", Language::Pascal),
618        ("dpr", Language::Pascal),
619        ("vb", Language::VisualBasic),
620        ("bas", Language::VisualBasic),
621        ("lisp", Language::Lisp),
622        ("lsp", Language::Lisp),
623        ("el", Language::Lisp),
624        ("scm", Language::Lisp),
625        ("ss", Language::Lisp),
626        ("rkt", Language::Lisp),
627        ("f90", Language::Fortran),
628        ("f95", Language::Fortran),
629        ("f03", Language::Fortran),
630        ("f08", Language::Fortran),
631        ("f", Language::Fortran),
632        ("for", Language::Fortran),
633        ("nix", Language::Nix),
634        ("cr", Language::Crystal),
635        ("d", Language::D),
636        ("glsl", Language::Glsl),
637        ("vert", Language::Glsl),
638        ("frag", Language::Glsl),
639        ("comp", Language::Glsl),
640        ("geom", Language::Glsl),
641        ("tesc", Language::Glsl),
642        ("tese", Language::Glsl),
643        ("hlsl", Language::Glsl),
644        ("wgsl", Language::Glsl),
645        ("cmake", Language::Cmake),
646        ("elm", Language::Elm),
647        ("awk", Language::Awk),
648    ];
649    EXT_MAP.iter().find_map(|&(e, l)| (e == ext).then_some(l))
650}
651
652/// Detect language from an exact filename (no extension) or well-known filename patterns.
653fn detect_by_filename(filename: &str, filename_lower: &str) -> Option<Language> {
654    // Dockerfile: exact name or Dockerfile.* variant
655    if filename == "Dockerfile"
656        || filename.starts_with("Dockerfile.")
657        || filename_lower == "dockerfile"
658    {
659        return Some(Language::Dockerfile);
660    }
661    // Makefile variants
662    if matches!(
663        filename,
664        "Makefile" | "GNUmakefile" | "makefile" | "BSDmakefile"
665    ) {
666        return Some(Language::Makefile);
667    }
668    // Ruby ecosystem files that have no extension
669    if matches!(
670        filename,
671        "Rakefile" | "Gemfile" | "Guardfile" | "Vagrantfile" | "Fastfile" | "Podfile"
672    ) {
673        return Some(Language::Ruby);
674    }
675    // CMake build scripts: `CMakeLists.txt` has a `.txt` extension, so it must be
676    // matched by exact name before extension-based detection.
677    if filename == "CMakeLists.txt" || filename_lower == "cmakelists.txt" {
678        return Some(Language::Cmake);
679    }
680    None
681}
682
683#[must_use]
684#[allow(clippy::too_many_lines)]
685pub fn detect_language(
686    path: &Path,
687    first_line: Option<&str>,
688    extension_overrides: &BTreeMap<String, String>,
689    shebang_detection: bool,
690) -> Option<Language> {
691    let extension = path
692        .extension()
693        .and_then(|ext| ext.to_str())
694        .map(str::to_ascii_lowercase);
695
696    // Extension override check (user-configured mappings win over everything)
697    if let Some(ext) = extension.as_ref() {
698        if let Some(override_name) = extension_overrides.get(ext.as_str()) {
699            if let Some(lang) = Language::from_name(override_name) {
700                return Some(lang);
701            }
702        }
703    }
704
705    // Filename-based detection for files that have no extension or use exact names
706    let filename = path.file_name().and_then(|s| s.to_str()).unwrap_or("");
707    let filename_lower = filename.to_ascii_lowercase();
708
709    if let Some(lang) = detect_by_filename(filename, &filename_lower) {
710        return Some(lang);
711    }
712
713    // Extension-based detection
714    if let Some(lang) = extension.as_deref().and_then(detect_by_extension) {
715        return Some(lang);
716    }
717
718    // Shebang detection (last resort — only for extensionless scripts)
719    if shebang_detection {
720        if let Some(line) = first_line {
721            if let Some(lang) = detect_by_shebang(line) {
722                return Some(lang);
723            }
724        }
725    }
726
727    None
728}
729
730#[must_use]
731pub fn analyze_text(language: Language, text: &str, options: AnalysisOptions) -> RawFileAnalysis {
732    // tree-sitter fast-paths (compiled out when feature is disabled)
733    #[cfg(feature = "tree-sitter")]
734    {
735        match language {
736            Language::C | Language::Cpp => {
737                if let Some(mut result) = ts::analyze_c(text) {
738                    if options.enable_style
739                        && should_style_analyse(language, options.style_lang_scope)
740                    {
741                        result.style_analysis = style::analyze_style(language, text);
742                    }
743                    return result;
744                }
745            }
746            Language::Python => {
747                if let Some(result) = ts::analyze_python(text) {
748                    return result;
749                }
750            }
751            _ => {}
752        }
753    }
754
755    let (mut config, has_preprocessor) = language_scan_config(language);
756
757    // Python docstring lines are computed from the text and cannot be a static constant.
758    if language == Language::Python {
759        config.skip_lines = detect_python_docstring_lines(text);
760    }
761
762    // C, C++, and Objective-C have a preprocessor whose directive lines are tracked separately
763    // per IEEE 1045-1992 §4.2; every other language uses base flags.
764    let flags = IeeeFlags {
765        has_preprocessor_directives: has_preprocessor,
766        blank_in_block_comment_as_comment: options.blank_in_block_comment_as_comment,
767        collapse_continuation_lines: options.collapse_continuation_lines,
768    };
769    let mut result = analyze_generic(text, config, flags);
770    if options.enable_style && should_style_analyse(language, options.style_lang_scope) {
771        result.style_analysis = style::analyze_style(language, text);
772    }
773    result
774}
775
776/// Returns `true` when `language` should be style-analysed under `scope`.
777const fn should_style_analyse(language: Language, scope: StyleLangScope) -> bool {
778    match scope {
779        StyleLangScope::CFamilyOnly => {
780            matches!(language, Language::C | Language::Cpp | Language::ObjectiveC)
781        }
782        StyleLangScope::All => true,
783    }
784}
785
786/// Returns the lexical scan configuration for `language` and whether it uses a C preprocessor.
787/// All fields are static constants except `skip_lines`, which is always empty here; callers that
788/// need non-empty skip sets (currently only Python) must populate the field after this call.
789///
790/// The implementation delegates to `LANG_SCAN_TABLE` (a static `&[(Language, StaticLangConfig)]`)
791/// defined below the `SP_*` symbol-pattern constants.  Each language appears exactly once in the
792/// table, so the linear scan is O(|languages|) but avoids a 41-arm `match` statement.
793fn language_scan_config(language: Language) -> (ScanConfig, bool) {
794    let cfg = LANG_SCAN_TABLE
795        .iter()
796        .find_map(|&(l, c)| (l == language).then_some(c))
797        .unwrap_or_else(|| panic!("language_scan_config: no entry for {language:?}"));
798    let (branch_keywords, lsloc_strategy) = language_complexity_config(language);
799    (
800        ScanConfig {
801            line_comments: cfg.line_comments,
802            block_comment: cfg.block_comment,
803            allow_single_quote_strings: cfg.allow_single_quote_strings,
804            allow_double_quote_strings: cfg.allow_double_quote_strings,
805            allow_triple_quote_strings: cfg.allow_triple_quote_strings,
806            allow_csharp_verbatim_strings: cfg.allow_csharp_verbatim_strings,
807            skip_lines: HashSet::new(),
808            symbol_patterns: cfg.symbol_patterns,
809            branch_keywords,
810            lsloc_strategy,
811        },
812        cfg.has_preprocessor,
813    )
814}
815
816// ── Cyclomatic complexity branch-keyword lists ────────────────────────────────
817// Alphabetic tokens are matched word-bounded; operator tokens (||, &&, ?) are
818// matched as raw substrings.  Each list covers one language family.
819
// C-style control flow and short-circuit operators, without the `?` token.
const BRANCH_C_FAMILY: &[&str] = &[
    "if", "else", "for", "while", "switch", "case", "catch", "||", "&&",
];
// Same tokens as `BRANCH_C_FAMILY` plus `?`, so a ternary expression counts as a branch.
const BRANCH_C_TERNARY: &[&str] = &[
    "if", "else", "for", "while", "switch", "case", "catch", "||", "&&", "?",
];
// Go: `select` is listed next to `switch`/`case`; there is no `while` or `catch` entry.
const BRANCH_GO: &[&str] = &["if", "else", "for", "switch", "case", "select", "||", "&&"];
// Rust: `match` takes the place of `switch`/`case`.
const BRANCH_RUST: &[&str] = &["if", "else", "for", "while", "match", "||", "&&"];
// Zig: `switch` and `catch` are listed, `case` is not.
const BRANCH_ZIG: &[&str] = &["if", "else", "for", "while", "switch", "catch", "||", "&&"];
// F#: `if`/`then`/`elif`/`else`, `match`, and `when` guards.
const BRANCH_FSHARP: &[&str] = &["if", "then", "else", "elif", "match", "when", "||", "&&"];
// Lua: the word operators `and`/`or` replace `&&`/`||`.
const BRANCH_LUA: &[&str] = &[
    "if", "elseif", "else", "for", "while", "repeat", "and", "or",
];
// Haskell: conditionals, `case`, and the `otherwise` guard; no operator tokens.
const BRANCH_HASKELL: &[&str] = &["if", "then", "else", "case", "otherwise"];
// SQL: each keyword appears in an all-upper and an all-lower spelling.
const BRANCH_SQL: &[&str] = &["CASE", "WHEN", "IF", "ELSE", "case", "when", "if", "else"];
// OCaml: `if`/`then`/`else`, `match`, and `when` guards.
const BRANCH_OCAML: &[&str] = &["if", "then", "else", "match", "when", "||", "&&"];
// Clojure: conditional forms plus the `and`/`or` forms.
const BRANCH_CLOJURE: &[&str] = &["if", "when", "cond", "case", "and", "or"];
// PHP: C-style tokens plus `elseif`, `match` and the ternary `?`.
// `foreach` is listed explicitly: alphabetic tokens are matched word-bounded, so the `for`
// entry does not cover it (the Perl, PowerShell and Tcl lists carry both for the same reason).
const BRANCH_PHP: &[&str] = &[
    "if", "elseif", "else", "for", "foreach", "while", "switch", "case", "catch", "match", "||",
    "&&", "?",
];
// Julia: `elseif` spelling, `catch`, and the symbolic short-circuit operators.
const BRANCH_JULIA: &[&str] = &["if", "elseif", "else", "for", "while", "catch", "||", "&&"];
// Python: `elif`, `except`, and the word operators `or`/`and`.
const BRANCH_PYTHON: &[&str] = &["if", "elif", "else", "for", "while", "except", "or", "and"];
// Ruby: includes the negated forms `unless`/`until`, plus `case`/`when` and `rescue`.
const BRANCH_RUBY: &[&str] = &[
    "if", "elsif", "else", "unless", "until", "while", "case", "when", "rescue", "||", "&&",
];
// Shell: `for` is counted alongside the other loop keywords (`while`, `until`); it was
// previously absent, so `for` loops did not contribute to the branch count.
const BRANCH_SHELL: &[&str] = &["if", "elif", "else", "for", "while", "until", "case", "||", "&&"];
// Elixir: `cond`/`case`/`when`, `rescue`, and both symbolic and word boolean operators.
const BRANCH_ELIXIR: &[&str] = &[
    "if", "else", "cond", "case", "when", "rescue", "||", "&&", "and", "or",
];
// PowerShell: `elseif` spelling plus `foreach` next to `for`.
const BRANCH_POWERSHELL: &[&str] = &[
    "if", "elseif", "else", "for", "while", "switch", "foreach", "||", "&&",
];
// Nim: `case`/`of`, `except`, and the word operators `and`/`or`.
const BRANCH_NIM: &[&str] = &[
    "if", "elif", "else", "for", "while", "case", "of", "except", "and", "or",
];
// Perl: includes the negated forms `unless`/`until` and `foreach` next to `for`.
const BRANCH_PERL: &[&str] = &[
    "if", "elsif", "else", "unless", "until", "for", "while", "foreach", "||", "&&",
];
// R: basic conditionals and the three loop keywords.
const BRANCH_R: &[&str] = &["if", "else", "for", "while", "repeat", "||", "&&"];
// Pass 2 branch-keyword lists (legacy + embedded / HDL).
// Ada: `elsif` spelling, `case`/`when`, loop keywords, and the word operators `and`/`or`.
const BRANCH_ADA: &[&str] = &[
    "if", "elsif", "else", "case", "when", "loop", "while", "for", "and", "or",
];
// VHDL: the Ada-like set extended with the logic operators `nand`, `nor` and `xor`.
const BRANCH_VHDL: &[&str] = &[
    "if", "elsif", "else", "case", "when", "loop", "while", "for", "and", "or", "nand", "nor",
    "xor",
];
// Verilog: `casex`/`casez` are listed next to plain `case`.
const BRANCH_VERILOG: &[&str] = &[
    "if", "else", "case", "casex", "casez", "for", "while", "&&", "||",
];
// Tcl: command names only; no operator tokens.
const BRANCH_TCL: &[&str] = &["if", "elseif", "else", "switch", "while", "for", "foreach"];
// Pascal: `then` and `until` are listed in addition to `if` and `repeat`.
const BRANCH_PASCAL: &[&str] = &[
    "if", "then", "else", "case", "while", "for", "repeat", "until", "and", "or",
];
// Visual Basic: only the capitalised spellings are listed.
const BRANCH_VB: &[&str] = &[
    "If", "Then", "ElseIf", "Else", "Select", "Case", "While", "For", "Do", "And", "Or",
];
// Lisp: conditional forms plus the `and`/`or` forms.
const BRANCH_LISP: &[&str] = &["if", "when", "unless", "cond", "case", "and", "or"];
// Pass 3 branch-keyword lists (scientific / infra / systems / graphics).
// Fortran: lower-case spellings only; `where` is listed next to the loop keywords.
const BRANCH_FORTRAN: &[&str] = &[
    "if", "then", "else", "elseif", "case", "do", "while", "where",
];
// Nix: the `if`/`then`/`else` expression is the only branching construct listed.
const BRANCH_NIX: &[&str] = &["if", "then", "else"];
// CMake: command names carry their opening `(`.
// NOTE(review): these tokens are not purely alphabetic. If they are matched as raw substrings,
// `if(` would also hit `elseif(` / `endif(` (likewise `while(` / `foreach(` and their `end*`
// forms) — confirm against the matcher.
const BRANCH_CMAKE: &[&str] = &["if(", "elseif(", "else(", "while(", "foreach("];
// Elm: `if`/`then`/`else` and `case`/`of`.
const BRANCH_ELM: &[&str] = &["if", "then", "else", "case", "of"];
// AWK: conditionals and loop keywords; no operator tokens.
const BRANCH_AWK: &[&str] = &["if", "else", "while", "for", "do"];
886
887/// Returns (`branch_keywords`, `lsloc_strategy`) for the given language.
888/// Kept separate from `LANG_SCAN_TABLE` to avoid touching that large table.
889const fn language_complexity_config(
890    language: Language,
891) -> (&'static [&'static str], LslocStrategy) {
892    match language {
893        // ── C-ternary family (ternary operator counted as branch) ─────────────
894        Language::C
895        | Language::Cpp
896        | Language::ObjectiveC
897        | Language::CSharp
898        | Language::JavaScript
899        | Language::TypeScript
900        | Language::Svelte
901        | Language::Vue
902        | Language::Dart
903        | Language::Groovy
904        | Language::Swift
905        | Language::Solidity => (BRANCH_C_TERNARY, LslocStrategy::Semicolons),
906        // ── C-family (no ternary keyword) ────────────────────────────────────
907        Language::Java | Language::Kotlin | Language::Scala | Language::D | Language::Glsl => {
908            (BRANCH_C_FAMILY, LslocStrategy::Semicolons)
909        }
910        Language::Go => (BRANCH_GO, LslocStrategy::Semicolons),
911        Language::Rust => (BRANCH_RUST, LslocStrategy::Semicolons),
912        Language::Zig => (BRANCH_ZIG, LslocStrategy::Semicolons),
913        Language::FSharp => (BRANCH_FSHARP, LslocStrategy::Unsupported),
914        // ── Hash-comment family ───────────────────────────────────────────────
915        Language::Shell => (BRANCH_SHELL, LslocStrategy::NonContinuationNewlines),
916        Language::Elixir => (BRANCH_ELIXIR, LslocStrategy::NonContinuationNewlines),
917        Language::Perl => (BRANCH_PERL, LslocStrategy::Semicolons),
918        Language::R => (BRANCH_R, LslocStrategy::NonContinuationNewlines),
919        Language::Ruby | Language::Crystal => (BRANCH_RUBY, LslocStrategy::NonContinuationNewlines),
920        Language::Python => (BRANCH_PYTHON, LslocStrategy::NonContinuationNewlines),
921        Language::PowerShell => (BRANCH_POWERSHELL, LslocStrategy::Unsupported),
922        Language::Nim => (BRANCH_NIM, LslocStrategy::NonContinuationNewlines),
923        // ── Unique comment styles ─────────────────────────────────────────────
924        Language::Lua => (BRANCH_LUA, LslocStrategy::Unsupported),
925        Language::Haskell => (BRANCH_HASKELL, LslocStrategy::Unsupported),
926        Language::Sql => (BRANCH_SQL, LslocStrategy::Semicolons),
927        Language::Ocaml => (BRANCH_OCAML, LslocStrategy::Semicolons),
928        Language::Clojure => (BRANCH_CLOJURE, LslocStrategy::Unsupported),
929        Language::Php => (BRANCH_PHP, LslocStrategy::Semicolons),
930        Language::Julia => (BRANCH_JULIA, LslocStrategy::NonContinuationNewlines),
931        Language::Protobuf => (&[], LslocStrategy::Semicolons),
932        Language::Hcl => (&[], LslocStrategy::NonContinuationNewlines),
933        // ── Legacy / embedded / HDL ───────────────────────────────────────────
934        Language::Ada => (BRANCH_ADA, LslocStrategy::Semicolons),
935        Language::Vhdl => (BRANCH_VHDL, LslocStrategy::Semicolons),
936        Language::Verilog => (BRANCH_VERILOG, LslocStrategy::Semicolons),
937        Language::Tcl => (BRANCH_TCL, LslocStrategy::NonContinuationNewlines),
938        Language::Pascal => (BRANCH_PASCAL, LslocStrategy::Semicolons),
939        Language::VisualBasic => (BRANCH_VB, LslocStrategy::NonContinuationNewlines),
940        Language::Lisp => (BRANCH_LISP, LslocStrategy::Unsupported),
941        // ── Scientific / infra / systems / graphics ───────────────────────────
942        Language::Fortran => (BRANCH_FORTRAN, LslocStrategy::NonContinuationNewlines),
943        Language::Nix => (BRANCH_NIX, LslocStrategy::Unsupported),
944        Language::Cmake => (BRANCH_CMAKE, LslocStrategy::Unsupported),
945        Language::Elm => (BRANCH_ELM, LslocStrategy::Unsupported),
946        Language::Awk => (BRANCH_AWK, LslocStrategy::NonContinuationNewlines),
947        // ── No branch detection / syntax unsupported ──────────────────────────
948        Language::Makefile
949        | Language::Dockerfile
950        | Language::Css
951        | Language::Html
952        | Language::Xml
953        | Language::Assembly
954        | Language::Erlang
955        | Language::GraphQl
956        | Language::Scss => (&[], LslocStrategy::Unsupported),
957    }
958}
959
/// Keyword prefixes used for best-effort detection of structural symbols in one language.
///
/// Every field is a list of line prefixes, compared against a line after its leading
/// whitespace has been stripped. An empty list disables detection for that category.
#[derive(Debug, Clone, Copy)]
struct SymbolPatterns {
    /// Prefixes that mark a function definition on their own.
    functions: &'static [&'static str],
    /// Prefixes that mark a function only when the line ALSO contains `(` and no `=` sits
    /// between the prefix and the first `(`. Intended for C/C++, where a definition starts
    /// with its return type (`void`, `int`, `bool`, …) rather than a keyword; the paren guard
    /// tells `void f(x)` apart from `void* p = malloc(n)`.
    functions_prefix_paren: &'static [&'static str],
    /// Prefixes that mark a class-like definition.
    classes: &'static [&'static str],
    /// Prefixes that mark a variable declaration.
    variables: &'static [&'static str],
    /// Prefixes that mark an import / include statement.
    imports: &'static [&'static str],
    /// Prefixes that mark a test case or test function definition. Matched against code
    /// lines only, like the other symbol categories.
    tests: &'static [&'static str],
    /// Prefixes that mark a test assertion call (`ASSERT_EQ`, `assertEquals`, `assert_eq!`,
    /// `Assert.AreEqual`, etc.). Matched against code lines only.
    assertions: &'static [&'static str],
    /// Prefixes that mark a test suite / fixture / group declaration
    /// (`TEST_GROUP`, `BOOST_AUTO_TEST_SUITE`, `[TestClass]`, `[TestFixture]`, etc.).
    test_suites: &'static [&'static str],
    /// Type-keyword prefixes (e.g. `"int "`, `"const "`) that mark a variable declaration
    /// when the line meets the complement of the `functions_prefix_paren` condition: it has
    /// no `(`, or a `=` appears before the first `(`. Intended for C/C++, where functions and
    /// variables open with the same type keywords and the paren guard separates them.
    variables_prefix_no_paren: &'static [&'static str],
}
991
992impl SymbolPatterns {
993    const fn none() -> Self {
994        Self {
995            functions: &[],
996            functions_prefix_paren: &[],
997            classes: &[],
998            variables: &[],
999            imports: &[],
1000            tests: &[],
1001            assertions: &[],
1002            test_suites: &[],
1003            variables_prefix_no_paren: &[],
1004        }
1005    }
1006}
1007
1008const SP_NONE: SymbolPatterns = SymbolPatterns::none(); // all fields are &[]
1009
1010// Solidity: `function`/`modifier`/`constructor` definitions; `contract`/`interface`/
1011// `library` are the structural units (mapped to classes alongside struct/enum).
1012const SP_SOLIDITY: SymbolPatterns = SymbolPatterns {
1013    functions: &[
1014        "function ",
1015        "modifier ",
1016        "constructor",
1017        "receive ",
1018        "fallback ",
1019    ],
1020    functions_prefix_paren: &[],
1021    classes: &["contract ", "interface ", "library ", "struct ", "enum "],
1022    variables: &[],
1023    imports: &["import "],
1024    // Foundry / DSTest / Forge-std: test functions are `function test...`, fuzz
1025    // tests `function testFuzz...`, and assertions are the `assert*`/`expect*` cheats.
1026    tests: &["function test", "function testFuzz", "function invariant"],
1027    assertions: &[
1028        "assertEq(",
1029        "assertEq0(",
1030        "assertTrue(",
1031        "assertFalse(",
1032        "assertGt(",
1033        "assertLt(",
1034        "assertGe(",
1035        "assertLe(",
1036        "assertApproxEq",
1037        "vm.expectRevert(",
1038        "vm.expectEmit(",
1039    ],
1040    test_suites: &[],
1041    variables_prefix_no_paren: &[],
1042};
1043
1044// Protocol Buffers: `message`/`service`/`enum` declarations are the structural units;
1045// `rpc` entries are the closest thing to functions.
1046const SP_PROTOBUF: SymbolPatterns = SymbolPatterns {
1047    functions: &["rpc "],
1048    functions_prefix_paren: &[],
1049    classes: &["message ", "service ", "enum "],
1050    variables: &[],
1051    imports: &["import "],
1052    tests: &[],
1053    assertions: &[],
1054    test_suites: &[],
1055    variables_prefix_no_paren: &[],
1056};
1057
1058// ── Pass 2 symbol patterns (legacy + embedded / HDL) ──────────────────────────
1059const SP_ADA: SymbolPatterns = SymbolPatterns {
1060    functions: &["procedure ", "function "],
1061    functions_prefix_paren: &[],
1062    classes: &["package ", "type ", "task ", "protected "],
1063    variables: &[],
1064    imports: &["with ", "use "],
1065    tests: &[],
1066    assertions: &[],
1067    test_suites: &[],
1068    variables_prefix_no_paren: &[],
1069};
1070
1071const SP_VHDL: SymbolPatterns = SymbolPatterns {
1072    functions: &["function ", "procedure ", "process "],
1073    functions_prefix_paren: &[],
1074    classes: &["entity ", "architecture ", "package ", "component "],
1075    variables: &[],
1076    imports: &["library ", "use "],
1077    tests: &[],
1078    assertions: &[],
1079    test_suites: &[],
1080    variables_prefix_no_paren: &[],
1081};
1082
1083const SP_VERILOG: SymbolPatterns = SymbolPatterns {
1084    functions: &["function ", "task "],
1085    functions_prefix_paren: &[],
1086    classes: &["module ", "interface ", "class ", "package "],
1087    variables: &[],
1088    imports: &["import ", "`include"],
1089    tests: &[],
1090    assertions: &[],
1091    test_suites: &[],
1092    variables_prefix_no_paren: &[],
1093};
1094
1095const SP_TCL: SymbolPatterns = SymbolPatterns {
1096    functions: &["proc "],
1097    functions_prefix_paren: &[],
1098    classes: &[],
1099    variables: &[],
1100    imports: &["source ", "package require "],
1101    // tcltest: each case is introduced by the `test` command.
1102    tests: &["test "],
1103    assertions: &[],
1104    test_suites: &[],
1105    variables_prefix_no_paren: &[],
1106};
1107
1108const SP_PASCAL: SymbolPatterns = SymbolPatterns {
1109    functions: &["procedure ", "function "],
1110    functions_prefix_paren: &[],
1111    classes: &["type ", "class ", "record "],
1112    variables: &[],
1113    imports: &["uses "],
1114    // DUnit / FPCUnit: test methods are `procedure Test...`; checks are the assertions.
1115    tests: &["procedure Test"],
1116    assertions: &[
1117        "Check(",
1118        "CheckEquals(",
1119        "CheckTrue(",
1120        "CheckFalse(",
1121        "CheckNotNull(",
1122    ],
1123    test_suites: &[],
1124    variables_prefix_no_paren: &[],
1125};
1126
1127const SP_VB: SymbolPatterns = SymbolPatterns {
1128    functions: &[
1129        "Sub ",
1130        "Function ",
1131        "Private Sub ",
1132        "Public Sub ",
1133        "Private Function ",
1134        "Public Function ",
1135    ],
1136    functions_prefix_paren: &[],
1137    classes: &["Class ", "Module ", "Structure "],
1138    variables: &[],
1139    imports: &["Imports "],
1140    // MSTest attributes on their own line; Assert.* calls for assertions.
1141    tests: &["<TestMethod>", "<TestMethod("],
1142    assertions: &["Assert.", "CollectionAssert.", "StringAssert."],
1143    test_suites: &["<TestClass>", "<TestClass("],
1144    variables_prefix_no_paren: &[],
1145};
1146
1147const SP_LISP: SymbolPatterns = SymbolPatterns {
1148    functions: &["(defun ", "(defmacro ", "(define ", "(defmethod ", "(defn "],
1149    functions_prefix_paren: &[],
1150    classes: &["(defclass ", "(defstruct "],
1151    variables: &[],
1152    imports: &["(require ", "(import ", "(use-package "],
1153    // FiveAM (Common Lisp): `(test name ...)` cases with `(is ...)` checks.
1154    tests: &["(test ", "(deftest "],
1155    assertions: &["(is ", "(is-true ", "(is-false ", "(signals "],
1156    test_suites: &[],
1157    variables_prefix_no_paren: &[],
1158};
1159
1160// ── Pass 3 symbol patterns (scientific / infra / systems / graphics) ──────────
1161const SP_FORTRAN: SymbolPatterns = SymbolPatterns {
1162    functions: &["subroutine ", "function "],
1163    functions_prefix_paren: &[],
1164    classes: &["module ", "program ", "type "],
1165    variables: &[],
1166    imports: &["use ", "include "],
1167    tests: &[],
1168    assertions: &[],
1169    test_suites: &[],
1170    variables_prefix_no_paren: &[],
1171};
1172
1173const SP_CRYSTAL: SymbolPatterns = SymbolPatterns {
1174    functions: &["def "],
1175    functions_prefix_paren: &[],
1176    classes: &["class ", "module ", "struct ", "enum "],
1177    variables: &[],
1178    imports: &["require "],
1179    // Crystal Spec (RSpec-style): describe/it/context groups, pending stubs.
1180    tests: &["it ", "it(", "describe ", "context ", "pending "],
1181    assertions: &[],
1182    test_suites: &[],
1183    variables_prefix_no_paren: &[],
1184};
1185
1186const SP_D: SymbolPatterns = SymbolPatterns {
1187    functions: &[],
1188    functions_prefix_paren: &[],
1189    classes: &["class ", "struct ", "interface ", "enum ", "template "],
1190    variables: &[],
1191    imports: &["import "],
1192    // D built-in unittest blocks; `assert` is the in-language check.
1193    tests: &["unittest"],
1194    assertions: &["assert(", "assertThrown", "assertNotThrown"],
1195    test_suites: &[],
1196    variables_prefix_no_paren: &[],
1197};
1198
1199const SP_CMAKE: SymbolPatterns = SymbolPatterns {
1200    functions: &["function(", "macro("],
1201    functions_prefix_paren: &[],
1202    classes: &[],
1203    variables: &[],
1204    imports: &["include(", "add_subdirectory("],
1205    tests: &[],
1206    assertions: &[],
1207    test_suites: &[],
1208    variables_prefix_no_paren: &[],
1209};
1210
1211const SP_ELM: SymbolPatterns = SymbolPatterns {
1212    functions: &[],
1213    functions_prefix_paren: &[],
1214    classes: &["type "],
1215    variables: &[],
1216    imports: &["import "],
1217    // elm-test: test/describe/fuzz cases, with `Expect.*` checks.
1218    tests: &["test ", "describe ", "fuzz "],
1219    assertions: &["Expect."],
1220    test_suites: &[],
1221    variables_prefix_no_paren: &[],
1222};
1223
1224const SP_AWK: SymbolPatterns = SymbolPatterns {
1225    functions: &["function "],
1226    functions_prefix_paren: &[],
1227    classes: &[],
1228    variables: &[],
1229    imports: &[],
1230    tests: &[],
1231    assertions: &[],
1232    test_suites: &[],
1233    variables_prefix_no_paren: &[],
1234};
1235
1236const SP_RUST: SymbolPatterns = SymbolPatterns {
1237    functions: &[
1238        "fn ",
1239        "pub fn ",
1240        "pub(crate) fn ",
1241        "pub(super) fn ",
1242        "async fn ",
1243        "pub async fn ",
1244        "pub(crate) async fn ",
1245        "unsafe fn ",
1246        "pub unsafe fn ",
1247        "pub(crate) unsafe fn ",
1248        "const fn ",
1249        "pub const fn ",
1250        "pub(crate) const fn ",
1251        "extern fn ",
1252        "pub extern fn ",
1253    ],
1254    functions_prefix_paren: &[],
1255    classes: &[
1256        "struct ",
1257        "pub struct ",
1258        "pub(crate) struct ",
1259        "enum ",
1260        "pub enum ",
1261        "pub(crate) enum ",
1262        "trait ",
1263        "pub trait ",
1264        "pub(crate) trait ",
1265        "impl ",
1266        "impl<",
1267        "type ",
1268        "pub type ",
1269        "pub(crate) type ",
1270    ],
1271    variables: &["let ", "let mut "],
1272    imports: &["use ", "pub use ", "pub(crate) use ", "extern crate "],
1273    // Built-in #[test], tokio/actix async test attributes, rstest
1274    tests: &[
1275        "#[test]",
1276        "#[tokio::test]",
1277        "#[actix_web::test]",
1278        "#[rstest]",
1279        "#[test_case",
1280    ],
1281    assertions: &[
1282        "assert_eq!(",
1283        "assert_ne!(",
1284        "assert!(",
1285        "assert_matches!(",
1286        "assert_err!(",
1287        "assert_ok!(",
1288    ],
1289    test_suites: &[],
1290    variables_prefix_no_paren: &[],
1291};
1292
1293const SP_PYTHON: SymbolPatterns = SymbolPatterns {
1294    functions: &["def ", "async def "],
1295    functions_prefix_paren: &[],
1296    classes: &["class "],
1297    variables: &[],
1298    imports: &["import ", "from "],
1299    // pytest: test_ prefix functions and Test* classes; unittest: test_ methods
1300    tests: &["def test_", "async def test_", "class Test"],
1301    assertions: &[
1302        "self.assertEqual(",
1303        "self.assertNotEqual(",
1304        "self.assertTrue(",
1305        "self.assertFalse(",
1306        "self.assertIsNone(",
1307        "self.assertIsNotNone(",
1308        "self.assertIn(",
1309        "self.assertNotIn(",
1310        "self.assertRaises(",
1311        "self.assertAlmostEqual(",
1312    ],
1313    test_suites: &[],
1314    variables_prefix_no_paren: &[],
1315};
1316
1317const SP_JS: SymbolPatterns = SymbolPatterns {
1318    functions: &[
1319        "function ",
1320        "async function ",
1321        "export function ",
1322        "export async function ",
1323        "export default function ",
1324    ],
1325    functions_prefix_paren: &[],
1326    classes: &["class ", "export class ", "export default class "],
1327    variables: &[
1328        "var ",
1329        "let ",
1330        "const ",
1331        "export var ",
1332        "export let ",
1333        "export const ",
1334    ],
1335    imports: &["import "],
1336    // Jest/Mocha/Jasmine: describe/it/test block openers
1337    tests: &[
1338        "describe(",
1339        "it(",
1340        "test(",
1341        "it.each(",
1342        "test.each(",
1343        "describe.each(",
1344    ],
1345    assertions: &["expect("],
1346    test_suites: &[],
1347    variables_prefix_no_paren: &[],
1348};
1349
1350const SP_TS: SymbolPatterns = SymbolPatterns {
1351    functions: &[
1352        "function ",
1353        "async function ",
1354        "export function ",
1355        "export async function ",
1356        "export default function ",
1357    ],
1358    functions_prefix_paren: &[],
1359    classes: &[
1360        "class ",
1361        "export class ",
1362        "export default class ",
1363        "abstract class ",
1364        "export abstract class ",
1365        "interface ",
1366        "export interface ",
1367        "declare class ",
1368        "declare interface ",
1369    ],
1370    variables: &[
1371        "var ",
1372        "let ",
1373        "const ",
1374        "export var ",
1375        "export let ",
1376        "export const ",
1377    ],
1378    imports: &["import "],
1379    // Jest/Mocha/Jasmine/Vitest: describe/it/test block openers
1380    tests: &[
1381        "describe(",
1382        "it(",
1383        "test(",
1384        "it.each(",
1385        "test.each(",
1386        "describe.each(",
1387    ],
1388    assertions: &["expect("],
1389    test_suites: &[],
1390    variables_prefix_no_paren: &[],
1391};
1392
1393const SP_GO: SymbolPatterns = SymbolPatterns {
1394    functions: &["func "],
1395    functions_prefix_paren: &[],
1396    classes: &["type "],
1397    variables: &["var "],
1398    imports: &["import "],
1399    // Go standard testing: Test* functions (convention is practically exclusive to _test.go files)
1400    tests: &["func Test", "func Benchmark", "func Fuzz"],
1401    assertions: &[],
1402    test_suites: &[],
1403    variables_prefix_no_paren: &[],
1404};
1405
1406const SP_JAVA: SymbolPatterns = SymbolPatterns {
1407    functions: &[],
1408    functions_prefix_paren: &[],
1409    classes: &[
1410        "class ",
1411        "public class ",
1412        "private class ",
1413        "protected class ",
1414        "abstract class ",
1415        "final class ",
1416        "public abstract class ",
1417        "public final class ",
1418        "interface ",
1419        "public interface ",
1420        "enum ",
1421        "public enum ",
1422        "record ",
1423        "public record ",
1424        "@interface ",
1425    ],
1426    variables: &[],
1427    imports: &["import "],
1428    // JUnit 4 & 5, TestNG — annotations appear on their own line before the method
1429    tests: &[
1430        "@Test",
1431        "@ParameterizedTest",
1432        "@RepeatedTest",
1433        "@TestFactory",
1434        "@TestTemplate",
1435    ],
1436    assertions: &[
1437        "assertEquals(",
1438        "assertNotEquals(",
1439        "assertTrue(",
1440        "assertFalse(",
1441        "assertNull(",
1442        "assertNotNull(",
1443        "assertThat(",
1444        "assertThrows(",
1445        "assertAll(",
1446        "assertArrayEquals(",
1447        "assertIterableEquals(",
1448        "assertLinesMatch(",
1449    ],
1450    test_suites: &[],
1451    variables_prefix_no_paren: &[],
1452};
1453
1454const SP_CSHARP: SymbolPatterns = SymbolPatterns {
1455    functions: &[],
1456    functions_prefix_paren: &[],
1457    classes: &[
1458        "class ",
1459        "public class ",
1460        "private class ",
1461        "protected class ",
1462        "internal class ",
1463        "abstract class ",
1464        "sealed class ",
1465        "static class ",
1466        "partial class ",
1467        "public abstract class ",
1468        "public sealed class ",
1469        "public static class ",
1470        "interface ",
1471        "public interface ",
1472        "internal interface ",
1473        "enum ",
1474        "public enum ",
1475        "struct ",
1476        "public struct ",
1477        "record ",
1478        "public record ",
1479    ],
1480    variables: &["var "],
1481    imports: &["using "],
1482    // MSTest, NUnit, xUnit — attributes on their own line before the method
1483    tests: &[
1484        "[TestMethod]",
1485        "[Test]",
1486        "[Fact]",
1487        "[Theory]",
1488        "[TestCase(",
1489        "[DataRow(",
1490        "[InlineData(",
1491        "[MemberData(",
1492    ],
1493    assertions: &[
1494        "Assert.AreEqual(",
1495        "Assert.AreNotEqual(",
1496        "Assert.IsTrue(",
1497        "Assert.IsFalse(",
1498        "Assert.IsNull(",
1499        "Assert.IsNotNull(",
1500        "Assert.Equal(",
1501        "Assert.NotEqual(",
1502        "Assert.True(",
1503        "Assert.False(",
1504        "Assert.That(",
1505        "Assert.Contains(",
1506        "Assert.Throws(",
1507        "Assert.ThrowsAsync(",
1508        "Assert.IsInstanceOfType(",
1509    ],
1510    test_suites: &["[TestClass]", "[TestFixture]", "[SetUpFixture]"],
1511    variables_prefix_no_paren: &[],
1512};
1513
// Test-definition prefixes shared by C and C++, grouped by framework: Google Test,
// Catch2/doctest, Boost.Test, CppUnit, Unity, Check, CMocka and CppUTest.
// NOTE(review): several entries (`TEST_GROUP(`, `TEST_GROUP_BASE(`, `BOOST_AUTO_TEST_SUITE(`,
// `CPPUNIT_TEST_SUITE(`) also appear in `SUITE_PATTERNS_C_CPP` — confirm whether a line is
// meant to count in both categories.
const TEST_PATTERNS_C_CPP: &[&str] = &[
    // Google Test
    "TEST(",
    "TEST_F(",
    "TEST_P(",
    "TYPED_TEST(",
    "TYPED_TEST_P(",
    "INSTANTIATE_TEST_SUITE_P(",
    "INSTANTIATE_TYPED_TEST_SUITE_P(",
    // Catch2 / doctest
    "TEST_CASE(",
    "SECTION(",
    "SCENARIO(",
    "SCENARIO_METHOD(",
    "TEST_CASE_METHOD(",
    // Boost.Test
    "BOOST_AUTO_TEST_CASE(",
    "BOOST_FIXTURE_TEST_CASE(",
    "BOOST_AUTO_TEST_SUITE(",
    "BOOST_PARAM_TEST_CASE(",
    // CppUnit
    "CPPUNIT_TEST(",
    "CPPUNIT_TEST_SUITE(",
    // Unity (embedded C)
    "RUN_TEST(",
    "TEST_IGNORE(",
    "TEST_FAIL(",
    // Check (libcheck — embedded C)
    "START_TEST(",
    "tcase_add_test(",
    "suite_create(",
    // CMocka (embedded C)
    "cmocka_unit_test(",
    "cmocka_run_group_tests(",
    // CppUTest
    "IGNORE_TEST(",
    "TEST_GROUP(",
    "TEST_GROUP_BASE(",
];
1554
// Assertion-call prefixes shared by C and C++, grouped by framework.
const ASSERT_PATTERNS_C_CPP: &[&str] = &[
    // Google Test ASSERT_* (failure aborts the current test)
    "ASSERT_EQ(",
    "ASSERT_NE(",
    "ASSERT_LT(",
    "ASSERT_LE(",
    "ASSERT_GT(",
    "ASSERT_GE(",
    "ASSERT_TRUE(",
    "ASSERT_FALSE(",
    "ASSERT_STREQ(",
    "ASSERT_STRNE(",
    "ASSERT_FLOAT_EQ(",
    "ASSERT_DOUBLE_EQ(",
    "ASSERT_NEAR(",
    "ASSERT_THROW(",
    "ASSERT_NO_THROW(",
    "ASSERT_ANY_THROW(",
    // Google Test EXPECT_* (failure is recorded, the test continues)
    "EXPECT_EQ(",
    "EXPECT_NE(",
    "EXPECT_LT(",
    "EXPECT_LE(",
    "EXPECT_GT(",
    "EXPECT_GE(",
    "EXPECT_TRUE(",
    "EXPECT_FALSE(",
    "EXPECT_STREQ(",
    "EXPECT_STRNE(",
    "EXPECT_FLOAT_EQ(",
    "EXPECT_DOUBLE_EQ(",
    "EXPECT_NEAR(",
    "EXPECT_THROW(",
    "EXPECT_NO_THROW(",
    "EXPECT_ANY_THROW(",
    // Catch2 / doctest
    "REQUIRE(",
    "CHECK(",
    "REQUIRE_FALSE(",
    "CHECK_FALSE(",
    "REQUIRE_NOTHROW(",
    "CHECK_NOTHROW(",
    "REQUIRE_THROWS(",
    "CHECK_THROWS(",
    "REQUIRE_THAT(",
    "CHECK_THAT(",
    // Unity (embedded C)
    "TEST_ASSERT_EQUAL(",
    "TEST_ASSERT_EQUAL_INT(",
    "TEST_ASSERT_EQUAL_STRING(",
    "TEST_ASSERT_EQUAL_FLOAT(",
    "TEST_ASSERT_EQUAL_DOUBLE(",
    "TEST_ASSERT_EQUAL_PTR(",
    "TEST_ASSERT_TRUE(",
    "TEST_ASSERT_FALSE(",
    "TEST_ASSERT_NULL(",
    "TEST_ASSERT_NOT_NULL(",
    "TEST_ASSERT_BITS_HIGH(",
    "TEST_ASSERT_BITS_LOW(",
    // CMocka (embedded C)
    "assert_int_equal(",
    "assert_int_not_equal(",
    "assert_string_equal(",
    "assert_string_not_equal(",
    "assert_true(",
    "assert_false(",
    "assert_null(",
    "assert_non_null(",
    "assert_ptr_equal(",
    "assert_memory_equal(",
    "assert_return_code(",
];
1628
// Test suite/group declaration patterns for C and C++.
// Only opening declarations are listed. A closing marker such as `CPPUNIT_TEST_SUITE_END()`
// sits on its own line inside the same fixture, so listing it would count every CppUnit
// suite twice (the matching `BOOST_AUTO_TEST_SUITE_END()` is likewise not listed).
const SUITE_PATTERNS_C_CPP: &[&str] = &[
    "TEST_GROUP(",
    "TEST_GROUP_BASE(",
    "BOOST_AUTO_TEST_SUITE(",
    "CPPUNIT_TEST_SUITE(",
];
1637
1638const SP_C: SymbolPatterns = SymbolPatterns {
1639    // C has no function keyword; detect by common return types that precede `(` with no `=`.
1640    functions: &[],
1641    functions_prefix_paren: &[
1642        "void ",
1643        "int ",
1644        "char ",
1645        "float ",
1646        "double ",
1647        "long ",
1648        "unsigned ",
1649        "size_t ",
1650        "static ",
1651        "inline ",
1652        "const ",
1653        "extern ",
1654    ],
1655    classes: &[
1656        "struct ",
1657        "typedef struct ",
1658        "union ",
1659        "typedef union ",
1660        "typedef enum ",
1661    ],
1662    variables: &[],
1663    imports: &["#include "],
1664    tests: TEST_PATTERNS_C_CPP,
1665    assertions: ASSERT_PATTERNS_C_CPP,
1666    test_suites: SUITE_PATTERNS_C_CPP,
1667    // Same type keywords as functions_prefix_paren; the complement paren guard (no unguarded `(`
1668    // in the line) distinguishes `int x;` / `int x = 5;` (variable) from `int foo()` (function).
1669    variables_prefix_no_paren: &[
1670        "void ",
1671        "int ",
1672        "char ",
1673        "float ",
1674        "double ",
1675        "long ",
1676        "unsigned ",
1677        "size_t ",
1678        "static ",
1679        "inline ",
1680        "const ",
1681        "extern ",
1682    ],
1683};
1684
1685const SP_CPP: SymbolPatterns = SymbolPatterns {
1686    // C++ specific function keyword-prefixes; return-type-led patterns use functions_prefix_paren.
1687    functions: &[
1688        "virtual ",  // virtual method declaration/definition
1689        "explicit ", // explicit constructor modifier
1690        "~",         // destructor (e.g. ~MyClass())
1691        "operator",  // operator overload (operator==, operator+, …)
1692    ],
1693    functions_prefix_paren: &[
1694        "void ",
1695        "bool ",
1696        "int ",
1697        "char ",
1698        "float ",
1699        "double ",
1700        "long ",
1701        "unsigned ",
1702        "size_t ",
1703        "auto ",
1704        "static ",
1705        "inline ",
1706        "constexpr ",
1707        "const ",
1708        "extern ",
1709    ],
1710    // `template<` (no space) is the dominant modern style alongside `template ` (with space).
1711    classes: &["class ", "struct ", "namespace ", "template ", "template<"],
1712    variables: &[],
1713    imports: &["#include "],
1714    tests: TEST_PATTERNS_C_CPP,
1715    assertions: ASSERT_PATTERNS_C_CPP,
1716    test_suites: SUITE_PATTERNS_C_CPP,
1717    // Mirror of functions_prefix_paren; complement paren guard splits variables from functions.
1718    variables_prefix_no_paren: &[
1719        "void ",
1720        "bool ",
1721        "int ",
1722        "char ",
1723        "float ",
1724        "double ",
1725        "long ",
1726        "unsigned ",
1727        "size_t ",
1728        "auto ",
1729        "static ",
1730        "inline ",
1731        "constexpr ",
1732        "const ",
1733        "extern ",
1734    ],
1735};
1736
1737const SP_SHELL: SymbolPatterns = SymbolPatterns {
1738    functions: &["function "],
1739    functions_prefix_paren: &[],
1740    classes: &[],
1741    variables: &["declare ", "local ", "export "],
1742    imports: &["source ", ". "],
1743    // Bats (Bash Automated Testing System): each case is a `@test "name" {` block.
1744    tests: &["@test "],
1745    assertions: &[],
1746    test_suites: &[],
1747    variables_prefix_no_paren: &[],
1748};
1749
1750const SP_POWERSHELL: SymbolPatterns = SymbolPatterns {
1751    functions: &["function ", "Function "],
1752    functions_prefix_paren: &[],
1753    classes: &["class "],
1754    variables: &[],
1755    imports: &["Import-Module ", "using "],
1756    // Pester test framework
1757    tests: &["Describe ", "It ", "Context "],
1758    assertions: &[],
1759    test_suites: &[],
1760    variables_prefix_no_paren: &[],
1761};
1762
1763const SP_KOTLIN: SymbolPatterns = SymbolPatterns {
1764    functions: &[
1765        "fun ",
1766        "private fun ",
1767        "public fun ",
1768        "protected fun ",
1769        "internal fun ",
1770        "override fun ",
1771        "suspend fun ",
1772        "abstract fun ",
1773        "open fun ",
1774        "private suspend fun ",
1775        "public suspend fun ",
1776    ],
1777    functions_prefix_paren: &[],
1778    classes: &[
1779        "class ",
1780        "data class ",
1781        "sealed class ",
1782        "abstract class ",
1783        "open class ",
1784        "object ",
1785        "companion object",
1786        "interface ",
1787        "enum class ",
1788        "annotation class ",
1789    ],
1790    variables: &["val ", "var ", "private val ", "private var ", "const val "],
1791    imports: &["import "],
1792    // JUnit 4/5, KotlinTest, Kotest
1793    tests: &[
1794        "@Test",
1795        "@ParameterizedTest",
1796        "@RepeatedTest",
1797        "\"should ",
1798        "\"it ",
1799    ],
1800    assertions: &[
1801        "assertEquals(",
1802        "assertNotEquals(",
1803        "assertTrue(",
1804        "assertFalse(",
1805        "assertNull(",
1806        "assertNotNull(",
1807        "assertThat(",
1808        "assertThrows(",
1809        "shouldBe(",
1810        "shouldNotBe(",
1811        "shouldThrow(",
1812    ],
1813    test_suites: &[],
1814    variables_prefix_no_paren: &[],
1815};
1816
1817const SP_SWIFT: SymbolPatterns = SymbolPatterns {
1818    functions: &[
1819        "func ",
1820        "private func ",
1821        "public func ",
1822        "internal func ",
1823        "override func ",
1824        "open func ",
1825        "static func ",
1826        "class func ",
1827        "mutating func ",
1828        "private static func ",
1829        "public static func ",
1830    ],
1831    functions_prefix_paren: &[],
1832    classes: &[
1833        "class ",
1834        "struct ",
1835        "protocol ",
1836        "enum ",
1837        "extension ",
1838        "actor ",
1839        "public class ",
1840        "private class ",
1841        "open class ",
1842        "final class ",
1843        "public struct ",
1844        "private struct ",
1845        "public protocol ",
1846    ],
1847    variables: &[
1848        "var ",
1849        "let ",
1850        "private var ",
1851        "private let ",
1852        "static var ",
1853        "static let ",
1854    ],
1855    imports: &["import "],
1856    // XCTest: test functions are named test* by convention; Swift Testing: @Test attribute
1857    tests: &["func test", "func Test", "@Test"],
1858    assertions: &[
1859        "XCTAssertEqual(",
1860        "XCTAssertNotEqual(",
1861        "XCTAssertTrue(",
1862        "XCTAssertFalse(",
1863        "XCTAssertNil(",
1864        "XCTAssertNotNil(",
1865        "XCTAssertGreaterThan(",
1866        "XCTAssertLessThan(",
1867        "XCTAssertThrowsError(",
1868        "XCTAssertNoThrow(",
1869        "#expect(",
1870    ],
1871    test_suites: &[],
1872    variables_prefix_no_paren: &[],
1873};
1874
1875const SP_RUBY: SymbolPatterns = SymbolPatterns {
1876    functions: &["def ", "private def ", "protected def "],
1877    functions_prefix_paren: &[],
1878    classes: &["class ", "module "],
1879    variables: &[],
1880    imports: &["require ", "require_relative "],
1881    // RSpec / minitest
1882    tests: &["it ", "it(", "describe ", "context ", "test "],
1883    assertions: &[],
1884    test_suites: &[],
1885    variables_prefix_no_paren: &[],
1886};
1887
1888const SP_SCALA: SymbolPatterns = SymbolPatterns {
1889    functions: &["def ", "private def ", "protected def ", "override def "],
1890    functions_prefix_paren: &[],
1891    classes: &[
1892        "class ",
1893        "case class ",
1894        "abstract class ",
1895        "sealed class ",
1896        "object ",
1897        "trait ",
1898    ],
1899    variables: &["val ", "var ", "lazy val "],
1900    imports: &["import "],
1901    // ScalaTest / MUnit: FunSuite test("..."), FlatSpec it("..."), AnyWordSpec "..." should
1902    tests: &["test(", "it(", "describe("],
1903    assertions: &[],
1904    test_suites: &[],
1905    variables_prefix_no_paren: &[],
1906};
1907
1908const SP_PHP: SymbolPatterns = SymbolPatterns {
1909    functions: &[
1910        "function ",
1911        "public function ",
1912        "private function ",
1913        "protected function ",
1914        "static function ",
1915        "abstract function ",
1916        "final function ",
1917        "public static function ",
1918        "private static function ",
1919        "protected static function ",
1920    ],
1921    functions_prefix_paren: &[],
1922    classes: &[
1923        "class ",
1924        "abstract class ",
1925        "final class ",
1926        "interface ",
1927        "trait ",
1928        "enum ",
1929    ],
1930    variables: &[],
1931    imports: &[
1932        "use ",
1933        "require ",
1934        "require_once ",
1935        "include ",
1936        "include_once ",
1937    ],
1938    // PHPUnit: test methods start with test, or use @test annotation
1939    tests: &[
1940        "public function test",
1941        "function test",
1942        "#[Test]",
1943        "#[DataProvider(",
1944    ],
1945    assertions: &[],
1946    test_suites: &[],
1947    variables_prefix_no_paren: &[],
1948};
1949
1950const SP_ELIXIR: SymbolPatterns = SymbolPatterns {
1951    functions: &[
1952        "def ",
1953        "defp ",
1954        "defmacro ",
1955        "defmacrop ",
1956        "defguard ",
1957        "defguardp ",
1958    ],
1959    functions_prefix_paren: &[],
1960    classes: &["defmodule ", "defprotocol ", "defimpl "],
1961    variables: &[],
1962    imports: &["import ", "alias ", "use ", "require "],
1963    // ExUnit
1964    tests: &["test ", "describe "],
1965    assertions: &[],
1966    test_suites: &[],
1967    variables_prefix_no_paren: &[],
1968};
1969
1970const SP_ERLANG: SymbolPatterns = SymbolPatterns {
1971    functions: &[],
1972    functions_prefix_paren: &[],
1973    classes: &["-module("],
1974    variables: &[],
1975    imports: &["-import(", "-include(", "-include_lib("],
1976    // EUnit: test names end in `_test`/`_test_` (suffix — not prefix-matchable), so we
1977    // only count the `?assert*` macro family, which is line-prefixable.
1978    tests: &[],
1979    assertions: &[
1980        "?assert(",
1981        "?assertEqual(",
1982        "?assertNotEqual(",
1983        "?assertMatch(",
1984        "?assertError(",
1985        "?assertThrow(",
1986        "?assertException(",
1987    ],
1988    test_suites: &[],
1989    variables_prefix_no_paren: &[],
1990};
1991
1992const SP_FSHARP: SymbolPatterns = SymbolPatterns {
1993    functions: &[
1994        "let ",
1995        "let rec ",
1996        "member ",
1997        "override ",
1998        "abstract member ",
1999    ],
2000    functions_prefix_paren: &[],
2001    classes: &["type "],
2002    variables: &["let mutable "],
2003    imports: &["open "],
2004    // NUnit / xUnit attributes on their own line; FsUnit uses [<Test>] / [<Fact>]
2005    tests: &["[<Test>]", "[<Fact>]", "[<Theory>]", "[<TestCase("],
2006    assertions: &[],
2007    test_suites: &[],
2008    variables_prefix_no_paren: &[],
2009};
2010
2011const SP_GROOVY: SymbolPatterns = SymbolPatterns {
2012    functions: &["def ", "private def ", "public def ", "protected def "],
2013    functions_prefix_paren: &[],
2014    classes: &["class ", "abstract class ", "interface ", "enum ", "trait "],
2015    variables: &[],
2016    imports: &["import "],
2017    // Spock framework: feature methods; JUnit annotations
2018    tests: &["def \"", "@Test", "given:", "when:", "then:", "expect:"],
2019    assertions: &[],
2020    test_suites: &[],
2021    variables_prefix_no_paren: &[],
2022};
2023
2024const SP_HASKELL: SymbolPatterns = SymbolPatterns {
2025    functions: &[],
2026    functions_prefix_paren: &[],
2027    classes: &["class ", "data ", "newtype ", "type "],
2028    variables: &[],
2029    imports: &["import "],
2030    // Hspec (describe/it) and QuickCheck (prop_) conventions. Hspec expectations
2031    // (`x `shouldBe` y`) are infix/mid-line, so they are not prefix-countable here.
2032    tests: &["describe ", "it ", "prop_"],
2033    assertions: &[],
2034    test_suites: &[],
2035    variables_prefix_no_paren: &[],
2036};
2037
2038const SP_LUA: SymbolPatterns = SymbolPatterns {
2039    functions: &["function ", "local function "],
2040    functions_prefix_paren: &[],
2041    classes: &[],
2042    variables: &["local "],
2043    imports: &[],
2044    // busted test framework
2045    tests: &["it(", "describe(", "pending("],
2046    assertions: &[],
2047    test_suites: &[],
2048    variables_prefix_no_paren: &[],
2049};
2050
2051const SP_NIM: SymbolPatterns = SymbolPatterns {
2052    functions: &[
2053        "proc ",
2054        "func ",
2055        "method ",
2056        "iterator ",
2057        "converter ",
2058        "template ",
2059        "macro ",
2060    ],
2061    functions_prefix_paren: &[],
2062    classes: &["type "],
2063    variables: &["var ", "let ", "const "],
2064    imports: &["import ", "from "],
2065    // unittest module
2066    tests: &["test "],
2067    assertions: &[],
2068    test_suites: &[],
2069    variables_prefix_no_paren: &[],
2070};
2071
2072const SP_OBJECTIVEC: SymbolPatterns = SymbolPatterns {
2073    functions: &["- (", "+ ("],
2074    functions_prefix_paren: &[],
2075    classes: &["@interface ", "@implementation ", "@protocol "],
2076    variables: &[],
2077    imports: &["#import ", "#include "],
2078    // XCTest: test methods start with - (void)test
2079    tests: &["- (void)test"],
2080    assertions: &[
2081        "XCTAssertEqual(",
2082        "XCTAssertNotEqual(",
2083        "XCTAssertTrue(",
2084        "XCTAssertFalse(",
2085        "XCTAssertNil(",
2086        "XCTAssertNotNil(",
2087        "XCTAssertGreaterThan(",
2088        "XCTAssertLessThan(",
2089        "XCTAssertThrowsError(",
2090        "XCTAssertNoThrow(",
2091    ],
2092    test_suites: &[],
2093    variables_prefix_no_paren: &[],
2094};
2095
2096const SP_OCAML: SymbolPatterns = SymbolPatterns {
2097    functions: &["let ", "let rec "],
2098    functions_prefix_paren: &[],
2099    classes: &["type ", "module ", "class "],
2100    variables: &[],
2101    imports: &["open "],
2102    // OUnit (`let test_... >:: `, `assert_*`) and Alcotest (`test_case`) conventions.
2103    tests: &["let test_", "test_case "],
2104    assertions: &[
2105        "assert_equal ",
2106        "assert_bool ",
2107        "assert_raises ",
2108        "assert_failure ",
2109        "OUnit.assert",
2110    ],
2111    test_suites: &[],
2112    variables_prefix_no_paren: &[],
2113};
2114
2115const SP_PERL: SymbolPatterns = SymbolPatterns {
2116    functions: &["sub "],
2117    functions_prefix_paren: &[],
2118    classes: &["package "],
2119    variables: &["my ", "our ", "local "],
2120    imports: &["use ", "require "],
2121    // Test::More / Test2: subtests group cases; ok/is/like/etc. are the assertions.
2122    tests: &["subtest "],
2123    assertions: &[
2124        "ok(",
2125        "is(",
2126        "isnt(",
2127        "like(",
2128        "unlike(",
2129        "cmp_ok(",
2130        "is_deeply(",
2131        "isa_ok(",
2132        "can_ok(",
2133    ],
2134    test_suites: &[],
2135    variables_prefix_no_paren: &[],
2136};
2137
2138const SP_CLOJURE: SymbolPatterns = SymbolPatterns {
2139    functions: &["(defn ", "(defn- ", "(defmacro ", "(defmulti "],
2140    functions_prefix_paren: &[],
2141    classes: &[
2142        "(defrecord ",
2143        "(defprotocol ",
2144        "(deftype ",
2145        "(definterface ",
2146    ],
2147    variables: &["(def ", "(defonce "],
2148    imports: &["(ns ", "(require "],
2149    // clojure.test
2150    tests: &["(deftest ", "(testing "],
2151    assertions: &[],
2152    test_suites: &[],
2153    variables_prefix_no_paren: &[],
2154};
2155
2156const SP_JULIA: SymbolPatterns = SymbolPatterns {
2157    functions: &["function ", "macro "],
2158    functions_prefix_paren: &[],
2159    classes: &[
2160        "struct ",
2161        "mutable struct ",
2162        "abstract type ",
2163        "primitive type ",
2164    ],
2165    variables: &["const "],
2166    imports: &["import ", "using "],
2167    // Test.jl standard library
2168    tests: &["@test ", "@testset "],
2169    assertions: &[],
2170    test_suites: &[],
2171    variables_prefix_no_paren: &[],
2172};
2173
2174const SP_DART: SymbolPatterns = SymbolPatterns {
2175    functions: &[],
2176    functions_prefix_paren: &[],
2177    classes: &["class ", "abstract class ", "mixin ", "extension ", "enum "],
2178    variables: &["var ", "final ", "const ", "late "],
2179    imports: &["import "],
2180    // flutter_test / test package
2181    tests: &["test(", "testWidgets(", "group("],
2182    assertions: &[],
2183    test_suites: &[],
2184    variables_prefix_no_paren: &[],
2185};
2186
2187const SP_R: SymbolPatterns = SymbolPatterns {
2188    functions: &[],
2189    functions_prefix_paren: &[],
2190    classes: &[],
2191    variables: &[],
2192    imports: &["library(", "source("],
2193    // testthat
2194    tests: &["test_that(", "it(", "describe(", "expect_"],
2195    assertions: &[],
2196    test_suites: &[],
2197    variables_prefix_no_paren: &[],
2198};
2199
2200const SP_SQL: SymbolPatterns = SymbolPatterns {
2201    functions: &[
2202        "create function ",
2203        "create or replace function ",
2204        "create procedure ",
2205        "create or replace procedure ",
2206        "CREATE FUNCTION ",
2207        "CREATE OR REPLACE FUNCTION ",
2208        "CREATE PROCEDURE ",
2209        "CREATE OR REPLACE PROCEDURE ",
2210    ],
2211    functions_prefix_paren: &[],
2212    classes: &[
2213        "create table ",
2214        "create view ",
2215        "create schema ",
2216        "CREATE TABLE ",
2217        "CREATE VIEW ",
2218        "CREATE SCHEMA ",
2219    ],
2220    variables: &["declare ", "DECLARE "],
2221    imports: &[],
2222    tests: &[],
2223    assertions: &[],
2224    test_suites: &[],
2225    variables_prefix_no_paren: &[],
2226};
2227
2228const SP_ASSEMBLY: SymbolPatterns = SymbolPatterns {
2229    functions: &["proc ", "PROC "],
2230    functions_prefix_paren: &[],
2231    classes: &[],
2232    variables: &[],
2233    imports: &["include ", "INCLUDE ", "%include "],
2234    tests: &[],
2235    assertions: &[],
2236    test_suites: &[],
2237    variables_prefix_no_paren: &[],
2238};
2239
2240const SP_ZIG: SymbolPatterns = SymbolPatterns {
2241    functions: &[
2242        "fn ",
2243        "pub fn ",
2244        "export fn ",
2245        "inline fn ",
2246        "pub inline fn ",
2247    ],
2248    functions_prefix_paren: &[],
2249    classes: &[],
2250    variables: &["var ", "pub var "],
2251    imports: &[],
2252    // Zig built-in test blocks
2253    tests: &["test \"", "test{"],
2254    assertions: &[],
2255    test_suites: &[],
2256    variables_prefix_no_paren: &[],
2257};
2258
2259/// Static (non-heap) language scanning parameters.  All fields are `'static` so this struct
2260/// can be stored in a `static` array.  The dynamic `skip_lines` set (used only for Python
2261/// docstring detection) is kept in `ScanConfig` and populated by the caller after lookup.
2262#[allow(clippy::struct_excessive_bools)]
2263#[derive(Clone, Copy)]
2264struct StaticLangConfig {
2265    line_comments: &'static [&'static str],
2266    block_comment: Option<(&'static str, &'static str)>,
2267    allow_single_quote_strings: bool,
2268    allow_double_quote_strings: bool,
2269    allow_triple_quote_strings: bool,
2270    allow_csharp_verbatim_strings: bool,
2271    symbol_patterns: SymbolPatterns,
2272    /// `true` for C, C++, and Objective-C (languages that have a C preprocessor).
2273    has_preprocessor: bool,
2274}
2275
2276#[allow(clippy::struct_excessive_bools)]
2277#[derive(Debug, Clone)]
2278struct ScanConfig {
2279    line_comments: &'static [&'static str],
2280    block_comment: Option<(&'static str, &'static str)>,
2281    allow_single_quote_strings: bool,
2282    allow_double_quote_strings: bool,
2283    allow_triple_quote_strings: bool,
2284    allow_csharp_verbatim_strings: bool,
2285    skip_lines: HashSet<usize>,
2286    symbol_patterns: SymbolPatterns,
2287    /// Branch keywords used to approximate cyclomatic complexity.
2288    branch_keywords: &'static [&'static str],
2289    /// Strategy for computing Logical SLOC.
2290    lsloc_strategy: LslocStrategy,
2291}
2292
2293// ── Per-family base configurations ───────────────────────────────────────────
2294//
2295// Most languages share one of two comment styles.  Define a base `const` for
2296// each family; table entries override only the fields that differ (symbol
2297// patterns, preprocessor flag, verbatim-string flag, etc.).
2298//
2299// C-slash family: `//` line, `/* */` block, single + double quotes.
2300// Covers C, C++, Obj-C, C#, Go, Java, JS/TS/Svelte/Vue, Dart, Groovy, Kotlin,
2301// Scala, SCSS, Swift, Rust, and Zig (Zig has no block comment → overridden).
2302const C_SLASH_BASE: StaticLangConfig = StaticLangConfig {
2303    line_comments: &["//"],
2304    block_comment: Some(("/*", "*/")),
2305    allow_single_quote_strings: true,
2306    allow_double_quote_strings: true,
2307    allow_triple_quote_strings: false,
2308    allow_csharp_verbatim_strings: false,
2309    symbol_patterns: SP_NONE,
2310    has_preprocessor: false,
2311};
2312
2313// Hash-comment family: `#` line comment, no block comment, single + double
2314// quotes.  Covers Shell, Ruby, R, Perl, Elixir (each overrides only SP_*);
2315// Python overrides triple-quote; PowerShell and Nim override block_comment.
2316const HASH_BASE: StaticLangConfig = StaticLangConfig {
2317    line_comments: &["#"],
2318    block_comment: None,
2319    allow_single_quote_strings: true,
2320    allow_double_quote_strings: true,
2321    allow_triple_quote_strings: false,
2322    allow_csharp_verbatim_strings: false,
2323    symbol_patterns: SP_NONE,
2324    has_preprocessor: false,
2325};
2326
2327/// Static language-scan configuration table — one entry per supported language.
2328/// Used by `language_scan_config` to avoid a 41-arm match.  All `SP_*` constants
2329/// referenced here are defined above in the same module.
2330static LANG_SCAN_TABLE: &[(Language, StaticLangConfig)] = &[
2331    // ── C preprocessor family ─────────────────────────────────────────────────
2332    (
2333        Language::C,
2334        StaticLangConfig {
2335            symbol_patterns: SP_C,
2336            has_preprocessor: true,
2337            ..C_SLASH_BASE
2338        },
2339    ),
2340    (
2341        Language::Cpp,
2342        StaticLangConfig {
2343            symbol_patterns: SP_CPP,
2344            has_preprocessor: true,
2345            ..C_SLASH_BASE
2346        },
2347    ),
2348    (
2349        Language::ObjectiveC,
2350        StaticLangConfig {
2351            symbol_patterns: SP_OBJECTIVEC,
2352            has_preprocessor: true,
2353            ..C_SLASH_BASE
2354        },
2355    ),
2356    // ── C-slash family ────────────────────────────────────────────────────────
2357    (
2358        Language::CSharp,
2359        StaticLangConfig {
2360            symbol_patterns: SP_CSHARP,
2361            allow_csharp_verbatim_strings: true,
2362            ..C_SLASH_BASE
2363        },
2364    ),
2365    (
2366        Language::Go,
2367        StaticLangConfig {
2368            symbol_patterns: SP_GO,
2369            ..C_SLASH_BASE
2370        },
2371    ),
2372    (
2373        Language::Java,
2374        StaticLangConfig {
2375            symbol_patterns: SP_JAVA,
2376            ..C_SLASH_BASE
2377        },
2378    ),
2379    (
2380        Language::JavaScript,
2381        StaticLangConfig {
2382            symbol_patterns: SP_JS,
2383            ..C_SLASH_BASE
2384        },
2385    ),
2386    (
2387        Language::TypeScript,
2388        StaticLangConfig {
2389            symbol_patterns: SP_TS,
2390            ..C_SLASH_BASE
2391        },
2392    ),
2393    (
2394        Language::Svelte,
2395        StaticLangConfig {
2396            symbol_patterns: SP_JS,
2397            ..C_SLASH_BASE
2398        },
2399    ),
2400    (
2401        Language::Vue,
2402        StaticLangConfig {
2403            symbol_patterns: SP_JS,
2404            ..C_SLASH_BASE
2405        },
2406    ),
2407    (
2408        Language::Dart,
2409        StaticLangConfig {
2410            symbol_patterns: SP_DART,
2411            ..C_SLASH_BASE
2412        },
2413    ),
2414    (
2415        Language::Groovy,
2416        StaticLangConfig {
2417            symbol_patterns: SP_GROOVY,
2418            ..C_SLASH_BASE
2419        },
2420    ),
2421    (
2422        Language::Kotlin,
2423        StaticLangConfig {
2424            symbol_patterns: SP_KOTLIN,
2425            ..C_SLASH_BASE
2426        },
2427    ),
2428    (
2429        Language::Scala,
2430        StaticLangConfig {
2431            symbol_patterns: SP_SCALA,
2432            ..C_SLASH_BASE
2433        },
2434    ),
2435    (
2436        Language::Scss,
2437        StaticLangConfig {
2438            symbol_patterns: SP_NONE,
2439            ..C_SLASH_BASE
2440        },
2441    ),
2442    // Rust: no single-quote char literals (they're lifetime annotations)
2443    (
2444        Language::Rust,
2445        StaticLangConfig {
2446            symbol_patterns: SP_RUST,
2447            allow_single_quote_strings: false,
2448            ..C_SLASH_BASE
2449        },
2450    ),
2451    // Swift: no single-quote strings
2452    (
2453        Language::Swift,
2454        StaticLangConfig {
2455            symbol_patterns: SP_SWIFT,
2456            allow_single_quote_strings: false,
2457            ..C_SLASH_BASE
2458        },
2459    ),
2460    // Zig: no block comment
2461    (
2462        Language::Zig,
2463        StaticLangConfig {
2464            symbol_patterns: SP_ZIG,
2465            block_comment: None,
2466            ..C_SLASH_BASE
2467        },
2468    ),
2469    // F#: `(*` … `*)` block comment, no single-quote strings
2470    (
2471        Language::FSharp,
2472        StaticLangConfig {
2473            line_comments: &["//"],
2474            block_comment: Some(("(*", "*)")),
2475            allow_single_quote_strings: false,
2476            allow_double_quote_strings: true,
2477            symbol_patterns: SP_FSHARP,
2478            ..C_SLASH_BASE
2479        },
2480    ),
2481    // ── Hash-comment family ───────────────────────────────────────────────────
2482    (
2483        Language::Shell,
2484        StaticLangConfig {
2485            symbol_patterns: SP_SHELL,
2486            ..HASH_BASE
2487        },
2488    ),
2489    (
2490        Language::Elixir,
2491        StaticLangConfig {
2492            symbol_patterns: SP_ELIXIR,
2493            ..HASH_BASE
2494        },
2495    ),
2496    (
2497        Language::Perl,
2498        StaticLangConfig {
2499            symbol_patterns: SP_PERL,
2500            ..HASH_BASE
2501        },
2502    ),
2503    (
2504        Language::R,
2505        StaticLangConfig {
2506            symbol_patterns: SP_R,
2507            ..HASH_BASE
2508        },
2509    ),
2510    (
2511        Language::Ruby,
2512        StaticLangConfig {
2513            symbol_patterns: SP_RUBY,
2514            ..HASH_BASE
2515        },
2516    ),
2517    // Python: triple-quote string literals
2518    (
2519        Language::Python,
2520        StaticLangConfig {
2521            symbol_patterns: SP_PYTHON,
2522            allow_triple_quote_strings: true,
2523            ..HASH_BASE
2524        },
2525    ),
2526    // PowerShell: `<# … #>` block comment
2527    (
2528        Language::PowerShell,
2529        StaticLangConfig {
2530            symbol_patterns: SP_POWERSHELL,
2531            block_comment: Some(("<#", "#>")),
2532            ..HASH_BASE
2533        },
2534    ),
2535    // Nim: `#[` … `]#` block comment
2536    (
2537        Language::Nim,
2538        StaticLangConfig {
2539            symbol_patterns: SP_NIM,
2540            block_comment: Some(("#[", "]#")),
2541            ..HASH_BASE
2542        },
2543    ),
2544    // Makefile / Dockerfile: `#` only, no string literals
2545    (
2546        Language::Makefile,
2547        StaticLangConfig {
2548            symbol_patterns: SP_NONE,
2549            allow_single_quote_strings: false,
2550            allow_double_quote_strings: false,
2551            ..HASH_BASE
2552        },
2553    ),
2554    (
2555        Language::Dockerfile,
2556        StaticLangConfig {
2557            symbol_patterns: SP_NONE,
2558            allow_single_quote_strings: false,
2559            allow_double_quote_strings: false,
2560            ..HASH_BASE
2561        },
2562    ),
2563    // ── Other unique comment styles ───────────────────────────────────────────
2564    // CSS / SCSS: only `/* */` block, no line comment
2565    (
2566        Language::Css,
2567        StaticLangConfig {
2568            line_comments: &[],
2569            block_comment: Some(("/*", "*/")),
2570            symbol_patterns: SP_NONE,
2571            ..C_SLASH_BASE
2572        },
2573    ),
2574    // HTML / XML: `<!-- -->` block, no line comment, no string literals
2575    (
2576        Language::Html,
2577        StaticLangConfig {
2578            line_comments: &[],
2579            block_comment: Some(("<!--", "-->")),
2580            allow_single_quote_strings: false,
2581            allow_double_quote_strings: false,
2582            symbol_patterns: SP_NONE,
2583            ..C_SLASH_BASE
2584        },
2585    ),
2586    (
2587        Language::Xml,
2588        StaticLangConfig {
2589            line_comments: &[],
2590            block_comment: Some(("<!--", "-->")),
2591            allow_single_quote_strings: false,
2592            allow_double_quote_strings: false,
2593            symbol_patterns: SP_NONE,
2594            ..C_SLASH_BASE
2595        },
2596    ),
2597    // Lua: `--` line, `--[[ ]]` block
2598    (
2599        Language::Lua,
2600        StaticLangConfig {
2601            line_comments: &["--"],
2602            block_comment: Some(("--[[", "]]")),
2603            symbol_patterns: SP_LUA,
2604            ..C_SLASH_BASE
2605        },
2606    ),
2607    // Haskell: `--` line, `{- -}` block
2608    (
2609        Language::Haskell,
2610        StaticLangConfig {
2611            line_comments: &["--"],
2612            block_comment: Some(("{-", "-}")),
2613            symbol_patterns: SP_HASKELL,
2614            ..C_SLASH_BASE
2615        },
2616    ),
2617    // SQL: `--` line, `/* */` block, single quote only
2618    (
2619        Language::Sql,
2620        StaticLangConfig {
2621            line_comments: &["--"],
2622            block_comment: Some(("/*", "*/")),
2623            allow_single_quote_strings: true,
2624            allow_double_quote_strings: false,
2625            symbol_patterns: SP_SQL,
2626            ..C_SLASH_BASE
2627        },
2628    ),
2629    // OCaml: `(*` … `*)` only, no line comment, no single-quote strings
2630    (
2631        Language::Ocaml,
2632        StaticLangConfig {
2633            line_comments: &[],
2634            block_comment: Some(("(*", "*)")),
2635            allow_single_quote_strings: false,
2636            symbol_patterns: SP_OCAML,
2637            ..C_SLASH_BASE
2638        },
2639    ),
2640    // Assembly: `;` line comment (NASM/MASM) + `/* */` block (GAS), double-quote
2641    // strings for `.ascii`/`.string` directives. `#` (GAS x86) and `@` (ARM) line
2642    // comments are intentionally NOT added: `#` is an immediate prefix in ARM
2643    // (`mov r0, #5`) and `@` appears in x86 symbol versioning (`memcpy@plt`), so a
2644    // universal superset would mis-count one dialect or the other.
2645    (
2646        Language::Assembly,
2647        StaticLangConfig {
2648            line_comments: &[";"],
2649            block_comment: Some(("/*", "*/")),
2650            allow_single_quote_strings: false,
2651            allow_double_quote_strings: true,
2652            symbol_patterns: SP_ASSEMBLY,
2653            ..C_SLASH_BASE
2654        },
2655    ),
2656    (
2657        Language::Clojure,
2658        StaticLangConfig {
2659            line_comments: &[";"],
2660            block_comment: None,
2661            allow_single_quote_strings: false,
2662            symbol_patterns: SP_CLOJURE,
2663            ..C_SLASH_BASE
2664        },
2665    ),
2666    // Erlang: `%` line comment, no block, no single-quote strings
2667    (
2668        Language::Erlang,
2669        StaticLangConfig {
2670            line_comments: &["%"],
2671            block_comment: None,
2672            allow_single_quote_strings: false,
2673            symbol_patterns: SP_ERLANG,
2674            ..C_SLASH_BASE
2675        },
2676    ),
2677    // PHP: `//` or `#` line, `/* */` block
2678    (
2679        Language::Php,
2680        StaticLangConfig {
2681            line_comments: &["//", "#"],
2682            block_comment: Some(("/*", "*/")),
2683            symbol_patterns: SP_PHP,
2684            ..C_SLASH_BASE
2685        },
2686    ),
2687    // Julia: `#` line, `#= =#` block, double + triple quotes, no single
2688    (
2689        Language::Julia,
2690        StaticLangConfig {
2691            line_comments: &["#"],
2692            block_comment: Some(("#=", "=#")),
2693            allow_single_quote_strings: false,
2694            allow_triple_quote_strings: true,
2695            symbol_patterns: SP_JULIA,
2696            ..C_SLASH_BASE
2697        },
2698    ),
2699    // ── Pass 1 additions ──────────────────────────────────────────────────────
2700    // Solidity: C-slash family (`//`, `/* */`, single + double quotes).
2701    (
2702        Language::Solidity,
2703        StaticLangConfig {
2704            symbol_patterns: SP_SOLIDITY,
2705            ..C_SLASH_BASE
2706        },
2707    ),
2708    // Protocol Buffers: C-slash family, statements terminated by `;`.
2709    (
2710        Language::Protobuf,
2711        StaticLangConfig {
2712            symbol_patterns: SP_PROTOBUF,
2713            ..C_SLASH_BASE
2714        },
2715    ),
2716    // HCL / Terraform: `#` or `//` line, `/* */` block, double-quote strings only.
2717    (
2718        Language::Hcl,
2719        StaticLangConfig {
2720            line_comments: &["#", "//"],
2721            allow_single_quote_strings: false,
2722            symbol_patterns: SP_NONE,
2723            ..C_SLASH_BASE
2724        },
2725    ),
2726    // GraphQL: `#` line comment, no block; `"""` block-string descriptions, no single quotes.
2727    (
2728        Language::GraphQl,
2729        StaticLangConfig {
2730            allow_single_quote_strings: false,
2731            allow_triple_quote_strings: true,
2732            symbol_patterns: SP_NONE,
2733            ..HASH_BASE
2734        },
2735    ),
2736    // ── Pass 2 additions (legacy + embedded / HDL) ────────────────────────────
2737    // Ada: `--` line comment, no block; `'` is a char/attribute tick, not a string.
2738    (
2739        Language::Ada,
2740        StaticLangConfig {
2741            line_comments: &["--"],
2742            block_comment: None,
2743            allow_single_quote_strings: false,
2744            symbol_patterns: SP_ADA,
2745            ..C_SLASH_BASE
2746        },
2747    ),
2748    // VHDL: `--` line comment, no block; `'` is a bit/char literal, not a string.
2749    (
2750        Language::Vhdl,
2751        StaticLangConfig {
2752            line_comments: &["--"],
2753            block_comment: None,
2754            allow_single_quote_strings: false,
2755            symbol_patterns: SP_VHDL,
2756            ..C_SLASH_BASE
2757        },
2758    ),
2759    // Verilog / SystemVerilog: C-slash family; `'` is a sized-literal base, not a string.
2760    (
2761        Language::Verilog,
2762        StaticLangConfig {
2763            allow_single_quote_strings: false,
2764            symbol_patterns: SP_VERILOG,
2765            ..C_SLASH_BASE
2766        },
2767    ),
2768    // Tcl: `#` line comment, no block; `"` strings only.
2769    (
2770        Language::Tcl,
2771        StaticLangConfig {
2772            allow_single_quote_strings: false,
2773            symbol_patterns: SP_TCL,
2774            ..HASH_BASE
2775        },
2776    ),
2777    // Pascal / Delphi: `//` line, `{ }` block; strings are single-quoted.
2778    (
2779        Language::Pascal,
2780        StaticLangConfig {
2781            line_comments: &["//"],
2782            block_comment: Some(("{", "}")),
2783            allow_single_quote_strings: true,
2784            allow_double_quote_strings: false,
2785            symbol_patterns: SP_PASCAL,
2786            ..C_SLASH_BASE
2787        },
2788    ),
2789    // Visual Basic: `'` line comment, no block; `"` strings only.
2790    (
2791        Language::VisualBasic,
2792        StaticLangConfig {
2793            line_comments: &["'"],
2794            block_comment: None,
2795            allow_single_quote_strings: false,
2796            allow_double_quote_strings: true,
2797            symbol_patterns: SP_VB,
2798            ..C_SLASH_BASE
2799        },
2800    ),
2801    // Lisp / Scheme: `;` line comment, `#| |#` block; `"` strings, `'` is the quote operator.
2802    (
2803        Language::Lisp,
2804        StaticLangConfig {
2805            line_comments: &[";"],
2806            block_comment: Some(("#|", "|#")),
2807            allow_single_quote_strings: false,
2808            symbol_patterns: SP_LISP,
2809            ..C_SLASH_BASE
2810        },
2811    ),
2812    // ── Pass 3 additions (scientific / infra / systems / graphics) ────────────
2813    // Fortran: `!` line comment (free-form), no block; single + double strings.
2814    (
2815        Language::Fortran,
2816        StaticLangConfig {
2817            line_comments: &["!"],
2818            block_comment: None,
2819            symbol_patterns: SP_FORTRAN,
2820            ..C_SLASH_BASE
2821        },
2822    ),
2823    // Nix: `#` line, `/* */` block; double-quote strings (and `''` multi-line).
2824    (
2825        Language::Nix,
2826        StaticLangConfig {
2827            block_comment: Some(("/*", "*/")),
2828            allow_single_quote_strings: false,
2829            symbol_patterns: SP_NONE,
2830            ..HASH_BASE
2831        },
2832    ),
2833    // Crystal: `#` line comment, no block; Ruby-like single + double strings.
2834    (
2835        Language::Crystal,
2836        StaticLangConfig {
2837            symbol_patterns: SP_CRYSTAL,
2838            ..HASH_BASE
2839        },
2840    ),
2841    // D: C-slash family (`//`, `/* */`); single-quote char literals + double strings.
2842    (
2843        Language::D,
2844        StaticLangConfig {
2845            symbol_patterns: SP_D,
2846            ..C_SLASH_BASE
2847        },
2848    ),
2849    // GLSL / HLSL / WGSL shaders: C-slash family; no char literals.
2850    (
2851        Language::Glsl,
2852        StaticLangConfig {
2853            allow_single_quote_strings: false,
2854            symbol_patterns: SP_NONE,
2855            ..C_SLASH_BASE
2856        },
2857    ),
2858    // CMake: `#` line, `#[[ ]]` block; double-quote strings only.
2859    (
2860        Language::Cmake,
2861        StaticLangConfig {
2862            block_comment: Some(("#[[", "]]")),
2863            allow_single_quote_strings: false,
2864            symbol_patterns: SP_CMAKE,
2865            ..HASH_BASE
2866        },
2867    ),
2868    // Elm: `--` line, `{- -}` block; double-quote strings only.
2869    (
2870        Language::Elm,
2871        StaticLangConfig {
2872            line_comments: &["--"],
2873            block_comment: Some(("{-", "-}")),
2874            allow_single_quote_strings: false,
2875            symbol_patterns: SP_ELM,
2876            ..C_SLASH_BASE
2877        },
2878    ),
2879    // Awk: `#` line comment, no block; double-quote strings only.
2880    (
2881        Language::Awk,
2882        StaticLangConfig {
2883            allow_single_quote_strings: false,
2884            symbol_patterns: SP_AWK,
2885            ..HASH_BASE
2886        },
2887    ),
2888];
2889
/// Per-call IEEE 1045-1992 flags derived from `AnalysisOptions` plus per-language properties.
/// Private to this crate; constructed inside `analyze_text`.
///
/// The struct is `Copy` and is handed by value to the per-line helpers.
#[derive(Debug, Clone, Copy)]
struct IeeeFlags {
    /// True for C, C++, and Objective-C — languages with a C preprocessor.
    /// When set, a pure-code line whose trimmed text starts with `#` is counted as a
    /// compiler-directive line.
    has_preprocessor_directives: bool,
    /// Mirrors `AnalysisOptions::blank_in_block_comment_as_comment`.
    /// When true, a blank line that starts inside an open block comment is flagged as a
    /// block-comment line; when false it keeps its blank classification.
    blank_in_block_comment_as_comment: bool,
    /// Mirrors `AnalysisOptions::collapse_continuation_lines`.
    /// When true, a line ending in `\` (outside a block comment or string) is deferred and
    /// its facts are merged into the line that ends the continuation sequence.
    collapse_continuation_lines: bool,
}
2901
/// Which kind of string literal the lexer is currently inside.
#[derive(Debug, Clone, Copy)]
enum StringState {
    /// Ordinary quoted string, closed by the stored delimiter character.
    /// A backslash skips the character that follows it.
    Single(char),
    /// Triple-quoted string, closed by the stored delimiter (`"""` or `'''`).
    /// No escape handling is applied inside it.
    Triple(&'static str),
    /// Verbatim string opened with `@"`: `""` is an embedded quote, a lone `"` closes it.
    VerbatimDouble,
}
2908
/// What the lexer observed on one physical line (or on a merged continuation sequence).
///
/// Each flag is an independent observation; `classify_line` turns the combination into
/// exactly one line category.
// The four bools are set independently of one another, hence the lint exemption.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Default)]
struct LineFacts {
    /// The line contains code; any part of a string literal counts as code.
    has_code: bool,
    /// The line contains a line comment.
    has_single_comment: bool,
    /// The line contains block-comment text, or begins inside an open block comment.
    has_multi_comment: bool,
    /// The line belongs to a docstring; takes precedence over every other flag when the
    /// line is classified.
    // NOTE(review): the scanning code next to this struct never sets this flag — confirm
    // where it is populated.
    has_docstring: bool,
}
2917
2918/// Process one character while the lexer is inside a string literal.
2919///
2920/// Returns `(new_string_state, advance)` where `advance` is the number of chars to skip.
2921fn process_string_char(
2922    state: StringState,
2923    chars: &[char],
2924    i: usize,
2925) -> (Option<StringState>, usize) {
2926    match state {
2927        StringState::Single(delim) => {
2928            if chars[i] == '\\' {
2929                return (Some(state), 2); // skip escaped character
2930            }
2931            if chars[i] == delim {
2932                (None, 1)
2933            } else {
2934                (Some(state), 1)
2935            }
2936        }
2937        StringState::Triple(delim) => {
2938            if starts_with(chars, i, delim) {
2939                (None, delim.len())
2940            } else {
2941                (Some(state), 1)
2942            }
2943        }
2944        StringState::VerbatimDouble => {
2945            if starts_with(chars, i, "\"\"") {
2946                return (Some(state), 2); // escaped quote-quote inside verbatim string
2947            }
2948            if chars[i] == '"' {
2949                (None, 1)
2950            } else {
2951                (Some(state), 1)
2952            }
2953        }
2954    }
2955}
2956
2957/// Process one character while the lexer is inside a block comment.
2958///
2959/// Returns `(still_in_block_comment, advance)`.
2960fn process_block_comment_char(chars: &[char], i: usize, close: &str) -> (bool, usize) {
2961    if starts_with(chars, i, close) {
2962        (false, close.len())
2963    } else {
2964        (true, 1)
2965    }
2966}
2967
2968/// Attempt to begin a new string literal at position `i`.
2969///
2970/// Returns `Some((new_state, advance))` when a string opener is detected, else `None`.
2971fn try_open_string(chars: &[char], i: usize, config: &ScanConfig) -> Option<(StringState, usize)> {
2972    if config.allow_csharp_verbatim_strings && starts_with(chars, i, "@\"") {
2973        return Some((StringState::VerbatimDouble, 2));
2974    }
2975    if config.allow_triple_quote_strings {
2976        if starts_with(chars, i, "\"\"\"") {
2977            return Some((StringState::Triple("\"\"\""), 3));
2978        }
2979        if starts_with(chars, i, "'''") {
2980            return Some((StringState::Triple("'''"), 3));
2981        }
2982    }
2983    if config.allow_single_quote_strings && chars[i] == '\'' {
2984        return Some((StringState::Single('\''), 1));
2985    }
2986    if config.allow_double_quote_strings && chars[i] == '"' {
2987        return Some((StringState::Single('"'), 1));
2988    }
2989    None
2990}
2991
2992/// Advance past one character position while inside a block comment.
2993///
2994/// Updates `in_block_comment` if the closing delimiter is found and returns the
2995/// number of characters consumed. Returns 0 when no block-comment config is set
2996/// (preserving the caller's `continue`-without-advance behaviour for that impossible state).
2997fn step_through_block_comment(
2998    chars: &[char],
2999    i: usize,
3000    block_comment: Option<(&'static str, &'static str)>,
3001    in_block_comment: &mut bool,
3002) -> usize {
3003    if let Some((_, close)) = block_comment {
3004        let (still_in, advance) = process_block_comment_char(chars, i, close);
3005        *in_block_comment = still_in;
3006        return advance;
3007    }
3008    0
3009}
3010
3011/// If the character at `i` starts a block comment, return the length of the opening
3012/// delimiter so the caller can advance past it. Returns `None` if no match.
3013fn try_open_block_comment(
3014    chars: &[char],
3015    i: usize,
3016    block_comment: Option<(&'static str, &'static str)>,
3017) -> Option<usize> {
3018    let (open, _) = block_comment?;
3019    starts_with(chars, i, open).then_some(open.len())
3020}
3021
3022/// Scan a single physical line and update `facts`, `in_block_comment`, and `string_state`.
3023///
3024/// Returns `true` when the caller should break out of the per-line loop early (line comment hit).
3025fn scan_line(
3026    chars: &[char],
3027    config: &ScanConfig,
3028    facts: &mut LineFacts,
3029    in_block_comment: &mut bool,
3030    string_state: &mut Option<StringState>,
3031) {
3032    let mut i = 0usize;
3033    while i < chars.len() {
3034        // Inside a string literal — advance until the closing delimiter.
3035        if let Some(state) = *string_state {
3036            facts.has_code = true;
3037            let (new_state, advance) = process_string_char(state, chars, i);
3038            *string_state = new_state;
3039            i += advance;
3040            continue;
3041        }
3042
3043        // Inside a block comment — advance until the closing delimiter.
3044        if *in_block_comment {
3045            facts.has_multi_comment = true;
3046            i += step_through_block_comment(chars, i, config.block_comment, in_block_comment);
3047            continue;
3048        }
3049
3050        // Whitespace outside any string/comment — skip.
3051        if chars[i].is_whitespace() {
3052            i += 1;
3053            continue;
3054        }
3055
3056        // Attempt to open a string literal.
3057        if let Some((new_state, advance)) = try_open_string(chars, i, config) {
3058            facts.has_code = true;
3059            *string_state = Some(new_state);
3060            i += advance;
3061            continue;
3062        }
3063
3064        // Attempt to open a block comment.
3065        if let Some(advance) = try_open_block_comment(chars, i, config.block_comment) {
3066            facts.has_multi_comment = true;
3067            *in_block_comment = true;
3068            i += advance;
3069            continue;
3070        }
3071
3072        // Line comment — rest of the line is a comment; stop scanning.
3073        if config
3074            .line_comments
3075            .iter()
3076            .any(|prefix| starts_with(chars, i, prefix))
3077        {
3078            facts.has_single_comment = true;
3079            break;
3080        }
3081
3082        // Plain code character.
3083        facts.has_code = true;
3084        i += 1;
3085    }
3086}
3087
3088/// Apply IEEE 1045-1992 §4.2 preprocessor-directive tracking and continuation-line merging,
3089/// then emit the finalized `LineFacts` for this physical line.
3090///
3091/// Returns `None` when the line is part of a continuation sequence and should be deferred.
3092fn finalize_line_facts(
3093    facts: LineFacts,
3094    trimmed: &str,
3095    raw: &mut RawLineCounts,
3096    ieee: IeeeFlags,
3097    in_block_comment: bool,
3098    string_state: Option<StringState>,
3099    pending_continuation: &mut Option<LineFacts>,
3100) -> Option<LineFacts> {
3101    // IEEE 1045-1992 §4.2: track preprocessor/compiler directive lines (C/C++/ObjC).
3102    // A directive line is a pure code line (no comment on the same physical line) whose
3103    // trimmed content starts with '#'.
3104    if ieee.has_preprocessor_directives
3105        && facts.has_code
3106        && !facts.has_single_comment
3107        && !facts.has_multi_comment
3108        && trimmed.starts_with('#')
3109    {
3110        raw.compiler_directive_lines += 1;
3111    }
3112
3113    // IEEE 1045-1992 continuation-line handling.
3114    // A line is a continuation starter when it ends with '\' outside any comment or string.
3115    let is_continuation = ieee.collapse_continuation_lines
3116        && !in_block_comment
3117        && string_state.is_none()
3118        && trimmed.ends_with('\\');
3119
3120    if is_continuation {
3121        let pending = pending_continuation.get_or_insert_with(LineFacts::default);
3122        pending.has_code |= facts.has_code;
3123        pending.has_single_comment |= facts.has_single_comment;
3124        pending.has_multi_comment |= facts.has_multi_comment;
3125        pending.has_docstring |= facts.has_docstring;
3126        return None; // defer classification until the sequence ends
3127    }
3128
3129    // Merge any accumulated continuation facts into the final line.
3130    let emit = if let Some(pending) = pending_continuation.take() {
3131        LineFacts {
3132            has_code: pending.has_code | facts.has_code,
3133            has_single_comment: pending.has_single_comment | facts.has_single_comment,
3134            has_multi_comment: pending.has_multi_comment | facts.has_multi_comment,
3135            has_docstring: pending.has_docstring | facts.has_docstring,
3136        }
3137    } else {
3138        facts
3139    };
3140    Some(emit)
3141}
3142
3143/// Scan and classify one physical line, updating all running state in place.
3144///
3145/// Pre-classified lines (present in `config.skip_lines`) are counted as docstring-comment
3146/// lines and returned early without further analysis.
3147#[allow(clippy::needless_pass_by_value)]
3148#[allow(clippy::too_many_arguments)]
3149#[allow(clippy::many_single_char_names)] // destructuring return from count_symbols; names match field roles
3150fn process_physical_line(
3151    line: &str,
3152    line_idx: usize,
3153    config: &ScanConfig,
3154    raw: &mut RawLineCounts,
3155    in_block_comment: &mut bool,
3156    string_state: &mut Option<StringState>,
3157    pending_continuation: &mut Option<LineFacts>,
3158    ieee: IeeeFlags,
3159) {
3160    raw.total_physical_lines += 1;
3161
3162    if config.skip_lines.contains(&line_idx) {
3163        raw.docstring_comment_lines += 1;
3164        return;
3165    }
3166
3167    let trimmed = line.trim();
3168    let mut facts = LineFacts::default();
3169
3170    // IEEE 1045-1992: blank lines inside block comments are comment lines by default.
3171    // When blank_in_block_comment_as_comment is false, blank lines keep their blank
3172    // classification even while inside a block comment.
3173    if *in_block_comment && (ieee.blank_in_block_comment_as_comment || !trimmed.is_empty()) {
3174        facts.has_multi_comment = true;
3175    }
3176
3177    let chars: Vec<char> = line.chars().collect();
3178    scan_line(&chars, config, &mut facts, in_block_comment, string_state);
3179
3180    let Some(emit) = finalize_line_facts(
3181        facts,
3182        trimmed,
3183        raw,
3184        ieee,
3185        *in_block_comment,
3186        *string_state,
3187        pending_continuation,
3188    ) else {
3189        return;
3190    };
3191
3192    classify_line(raw, &emit, trimmed);
3193
3194    if emit.has_code {
3195        use std::hash::{DefaultHasher, Hash, Hasher};
3196        let (f, c, v, i, t, a, s) = count_symbols(&config.symbol_patterns, trimmed);
3197        raw.functions += f;
3198        raw.classes += c;
3199        raw.variables += v;
3200        raw.imports += i;
3201        raw.test_count += t;
3202        raw.test_assertion_count += a;
3203        raw.test_suite_count += s;
3204
3205        // Cyclomatic complexity: count branch decision keywords on code lines.
3206        raw.cyclomatic_complexity +=
3207            count_branch_in_line(trimmed.as_bytes(), config.branch_keywords);
3208
3209        // Logical SLOC (language-specific strategy).
3210        match config.lsloc_strategy {
3211            LslocStrategy::Semicolons => {
3212                let semi = u32::try_from(trimmed.bytes().filter(|&b| b == b';').count())
3213                    .unwrap_or(u32::MAX);
3214                *raw.lsloc.get_or_insert(0) += semi;
3215            }
3216            LslocStrategy::NonContinuationNewlines => {
3217                let cont = trimmed.ends_with('\\')
3218                    || trimmed.ends_with(',')
3219                    || trimmed.ends_with('(')
3220                    || trimmed.ends_with('[')
3221                    || trimmed.ends_with('{');
3222                if !cont {
3223                    *raw.lsloc.get_or_insert(0) += 1;
3224                }
3225            }
3226            LslocStrategy::Unsupported => {}
3227        }
3228
3229        // ULOC: hash each trimmed code line for cross-file unique-line counting.
3230        let mut h = DefaultHasher::new();
3231        trimmed.hash(&mut h);
3232        raw.code_line_hashes.push(h.finish());
3233    }
3234}
3235
3236#[allow(clippy::needless_pass_by_value)]
3237fn analyze_generic(text: &str, config: ScanConfig, ieee: IeeeFlags) -> RawFileAnalysis {
3238    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
3239    let lines: Vec<&str> = normalized.split_terminator('\n').collect();
3240
3241    let mut raw = RawLineCounts::default();
3242    let mut warnings = Vec::new();
3243
3244    let mut in_block_comment = false;
3245    let mut string_state: Option<StringState> = None;
3246    // IEEE continuation-line state: accumulates facts across a backslash-continued sequence.
3247    let mut pending_continuation: Option<LineFacts> = None;
3248
3249    for (line_idx, line) in lines.iter().enumerate() {
3250        process_physical_line(
3251            line,
3252            line_idx,
3253            &config,
3254            &mut raw,
3255            &mut in_block_comment,
3256            &mut string_state,
3257            &mut pending_continuation,
3258            ieee,
3259        );
3260    }
3261
3262    // Flush any pending continuation that reaches end-of-file without a closing line.
3263    if let Some(pending) = pending_continuation.take() {
3264        classify_line(&mut raw, &pending, "");
3265    }
3266
3267    if in_block_comment {
3268        warnings.push("unclosed block comment detected; result is best effort".into());
3269    }
3270    if string_state.is_some() {
3271        warnings.push("unclosed string literal detected; result is best effort".into());
3272    }
3273
3274    RawFileAnalysis {
3275        raw,
3276        parse_mode: if warnings.is_empty() {
3277            ParseMode::Lexical
3278        } else {
3279            ParseMode::LexicalBestEffort
3280        },
3281        warnings,
3282        style_analysis: None,
3283    }
3284}
3285
3286const fn classify_line(raw: &mut RawLineCounts, facts: &LineFacts, trimmed: &str) {
3287    if facts.has_docstring {
3288        raw.docstring_comment_lines += 1;
3289    } else if !facts.has_code
3290        && !facts.has_single_comment
3291        && !facts.has_multi_comment
3292        && trimmed.is_empty()
3293    {
3294        raw.blank_only_lines += 1;
3295    } else if facts.has_code && facts.has_single_comment {
3296        raw.mixed_code_single_comment_lines += 1;
3297    } else if facts.has_code && facts.has_multi_comment {
3298        raw.mixed_code_multi_comment_lines += 1;
3299    } else if facts.has_code {
3300        raw.code_only_lines += 1;
3301    } else if facts.has_single_comment {
3302        raw.single_comment_only_lines += 1;
3303    } else if facts.has_multi_comment {
3304        raw.multi_comment_only_lines += 1;
3305    } else if trimmed.is_empty() {
3306        raw.blank_only_lines += 1;
3307    } else {
3308        raw.skipped_unknown_lines += 1;
3309    }
3310}
3311
/// Report, as `1` or `0`, whether `trimmed` begins with at least one prefix from `pats`.
fn prefix_hit(pats: &[&str], trimmed: &str) -> u64 {
    let matched = pats
        .iter()
        .copied()
        .any(|prefix| trimmed.starts_with(prefix));
    u64::from(matched)
}
3316
3317/// Match a return-type-led function prefix (C/C++): prefix AND `(` present AND no `=` sits
3318/// between the prefix start and the first `(` (guards against `void* p = malloc(n)`).
3319fn fn_prefix_paren_hit(patterns: &SymbolPatterns, trimmed: &str) -> u64 {
3320    if patterns.functions_prefix_paren.is_empty() {
3321        return 0;
3322    }
3323    let Some(paren_pos) = trimmed.find('(') else {
3324        return 0;
3325    };
3326    if trimmed[..paren_pos].contains('=') {
3327        0
3328    } else {
3329        prefix_hit(patterns.functions_prefix_paren, trimmed)
3330    }
3331}
3332
3333/// Complement of `functions_prefix_paren`: same type keywords, but triggered when there is no
3334/// unguarded `(` on the line (i.e. not a function definition).
3335fn var_prefix_no_paren_hit(patterns: &SymbolPatterns, trimmed: &str) -> u64 {
3336    if patterns.variables_prefix_no_paren.is_empty()
3337        || prefix_hit(patterns.variables_prefix_no_paren, trimmed) == 0
3338    {
3339        return 0;
3340    }
3341    trimmed
3342        .find('(')
3343        .map_or(1, |pp| u64::from(trimmed[..pp].contains('=')))
3344}
3345
3346fn count_symbols(patterns: &SymbolPatterns, trimmed: &str) -> (u64, u64, u64, u64, u64, u64, u64) {
3347    let hit = |pats: &[&str]| prefix_hit(pats, trimmed);
3348    let fn_pp = fn_prefix_paren_hit(patterns, trimmed);
3349    let test_hit = hit(patterns.tests);
3350    // Lines matching a test pattern count as tests, not as plain functions or classes.
3351    // This prevents double-counting in Python (`def test_` / `class Test`) and Go
3352    // (`func Test` / `func Benchmark` / `func Fuzz`) where the same line satisfies both
3353    // a function/class prefix and a test pattern. Rust is unaffected: `#[test]` is a
3354    // standalone attribute line; the `fn` declaration on the next line does not match any
3355    // test pattern and still increments functions correctly.
3356    let fn_hit = if test_hit == 0 {
3357        hit(patterns.functions) | fn_pp
3358    } else {
3359        0
3360    };
3361    let class_hit = if test_hit == 0 {
3362        hit(patterns.classes)
3363    } else {
3364        0
3365    };
3366    let var_pnp = var_prefix_no_paren_hit(patterns, trimmed);
3367    (
3368        fn_hit,
3369        class_hit,
3370        hit(patterns.variables) | var_pnp,
3371        hit(patterns.imports),
3372        test_hit,
3373        hit(patterns.assertions),
3374        hit(patterns.test_suites),
3375    )
3376}
3377
/// Check that the span `line[start..end]` is not glued to an identifier character
/// (ASCII letter, digit, or `_`) on either side. The start and end of the line count as
/// boundaries.
fn is_word_boundary(line: &[u8], start: usize, end: usize) -> bool {
    let is_ident_byte = |b: u8| b.is_ascii_alphanumeric() || b == b'_';

    if start > 0 && is_ident_byte(line[start - 1]) {
        return false;
    }
    match line.get(end) {
        Some(&next) => !is_ident_byte(next),
        None => true,
    }
}
3385
3386/// True when `kw_bytes` appears at `line[i..]`, respecting word boundaries when `word_kw` is set.
3387fn keyword_matches_at(line: &[u8], i: usize, kw_bytes: &[u8], word_kw: bool) -> bool {
3388    if &line[i..i + kw_bytes.len()] != kw_bytes {
3389        return false;
3390    }
3391    !word_kw || is_word_boundary(line, i, i + kw_bytes.len())
3392}
3393
3394/// Count branch keyword occurrences in `line` (ASCII bytes of a trimmed code line).
3395///
3396/// Alphabetic keywords are matched word-bounded (not as substrings of longer identifiers).
3397/// Operator tokens (`||`, `&&`, `?`) are matched as raw substrings.
3398fn count_branch_in_line(line: &[u8], keywords: &[&str]) -> u32 {
3399    if keywords.is_empty() || line.is_empty() {
3400        return 0;
3401    }
3402    let mut total = 0u32;
3403    for &kw in keywords {
3404        let kw_bytes = kw.as_bytes();
3405        let word_kw = kw.bytes().all(|b| b.is_ascii_alphabetic() || b == b'_');
3406        let mut i = 0usize;
3407        while i + kw_bytes.len() <= line.len() {
3408            if keyword_matches_at(line, i, kw_bytes, word_kw) {
3409                total += 1;
3410                i += kw_bytes.len();
3411            } else {
3412                i += 1;
3413            }
3414        }
3415    }
3416    total
3417}
3418
/// True when the characters of `needle` appear in `chars` starting at position `index`.
///
/// Returns `false` when `index` lies past the end of `chars` or when fewer characters
/// remain than `needle` needs. An empty `needle` matches at every `index <= chars.len()`.
///
/// The comparison walks both sequences in lock-step and allocates nothing; this helper is
/// called for nearly every character the lexer visits.
fn starts_with(chars: &[char], index: usize, needle: &str) -> bool {
    let Some(rest) = chars.get(index..) else {
        return false;
    };
    let mut remaining = rest.iter();
    // Running out of input before the needle is exhausted yields `None`, which never matches.
    needle
        .chars()
        .all(|expected| remaining.next() == Some(&expected))
}
3423
/// One entry of the scope stack used while detecting Python docstrings.
#[derive(Debug, Clone)]
struct PyContext {
    /// Indentation of the scope's body, measured as a count of leading whitespace
    /// characters (0 for the module-level entry).
    indent: usize,
    /// True until the scope's first significant line has been examined; a triple-quoted
    /// string found while this is set is recorded as the scope's docstring.
    expect_docstring: bool,
}
3429
3430/// Update `contexts` to pop any scopes that the current `indent` has outdented past.
3431fn py_pop_outdented_contexts(contexts: &mut Vec<PyContext>, indent: usize) {
3432    while contexts.len() > 1 && indent < contexts.last().map_or(0, |c| c.indent) {
3433        contexts.pop();
3434    }
3435}
3436
3437/// Handle `pending_block_indent` transition: push a new docstring-expecting context when we
3438/// detect the first indented line of a new block, or cancel the pending state otherwise.
3439fn py_handle_pending_indent(
3440    pending_block_indent: &mut Option<usize>,
3441    contexts: &mut Vec<PyContext>,
3442    indent: usize,
3443    trimmed: &str,
3444) {
3445    let Some(base_indent) = *pending_block_indent else {
3446        return;
3447    };
3448    if indent > base_indent {
3449        contexts.push(PyContext {
3450            indent,
3451            expect_docstring: true,
3452        });
3453        *pending_block_indent = None;
3454    } else if !trimmed.starts_with('@') {
3455        *pending_block_indent = None;
3456    }
3457}
3458
3459/// Check whether the current line is a docstring opener in the current context.
3460///
3461/// If it is, records the line, adjusts `ctx.expect_docstring`, and optionally sets
3462/// `active_docstring` for multi-line docstrings. Returns `true` when the caller should
3463/// `continue` to the next line.
3464fn py_try_record_docstring(
3465    ctx: &mut PyContext,
3466    trimmed: &str,
3467    idx: usize,
3468    docstring_lines: &mut HashSet<usize>,
3469    active_docstring: &mut Option<(&'static str, usize)>,
3470) -> bool {
3471    if !ctx.expect_docstring {
3472        return false;
3473    }
3474    if let Some(delim) = docstring_delimiter(trimmed) {
3475        docstring_lines.insert(idx);
3476        ctx.expect_docstring = false;
3477        if !closes_triple_docstring(trimmed, delim, true) {
3478            *active_docstring = Some((delim, idx));
3479        }
3480        return true;
3481    }
3482    ctx.expect_docstring = false;
3483    false
3484}
3485
3486/// Advance through an active multi-line docstring: marks the current line and clears
3487/// `active_docstring` when the closing delimiter is found. Returns `true` when the caller
3488/// should `continue` to the next line (i.e. we were inside a docstring).
3489fn track_active_docstring(
3490    active_docstring: &mut Option<(&'static str, usize)>,
3491    docstring_lines: &mut HashSet<usize>,
3492    idx: usize,
3493    trimmed: &str,
3494) -> bool {
3495    let Some((delim, start_line)) = *active_docstring else {
3496        return false;
3497    };
3498    docstring_lines.insert(idx);
3499    if closes_triple_docstring(trimmed, delim, idx == start_line) {
3500        *active_docstring = None;
3501    }
3502    true
3503}
3504
3505/// Attempt to record a docstring opener using the top of the context stack.
3506/// Returns `true` when the caller should `continue` to the next line.
3507fn try_record_docstring_if_context(
3508    contexts: &mut [PyContext],
3509    trimmed: &str,
3510    idx: usize,
3511    docstring_lines: &mut HashSet<usize>,
3512    active_docstring: &mut Option<(&'static str, usize)>,
3513) -> bool {
3514    let Some(ctx) = contexts.last_mut() else {
3515        return false;
3516    };
3517    py_try_record_docstring(ctx, trimmed, idx, docstring_lines, active_docstring)
3518}
3519
/// When a docstring is still open at end-of-file, add every line index from its opening
/// line up to (but excluding) `num_lines` to `docstring_lines`. No-op otherwise.
fn mark_unclosed_docstring_lines(
    active_docstring: Option<&(&'static str, usize)>,
    docstring_lines: &mut HashSet<usize>,
    num_lines: usize,
) {
    let Some(&(_, start_line)) = active_docstring else {
        return;
    };
    docstring_lines.extend(start_line..num_lines);
}
3532
3533fn detect_python_docstring_lines(text: &str) -> HashSet<usize> {
3534    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
3535    let lines: Vec<&str> = normalized.split_terminator('\n').collect();
3536
3537    let mut docstring_lines = HashSet::new();
3538    let mut contexts = vec![PyContext {
3539        indent: 0,
3540        expect_docstring: true,
3541    }];
3542    let mut pending_block_indent: Option<usize> = None;
3543    let mut active_docstring: Option<(&'static str, usize)> = None;
3544
3545    for (idx, line) in lines.iter().enumerate() {
3546        let trimmed = line.trim();
3547        let indent = leading_indent(line);
3548
3549        if track_active_docstring(&mut active_docstring, &mut docstring_lines, idx, trimmed) {
3550            continue;
3551        }
3552
3553        // Blank lines and comment lines don't affect docstring detection.
3554        if trimmed.is_empty() || trimmed.starts_with('#') {
3555            continue;
3556        }
3557
3558        py_pop_outdented_contexts(&mut contexts, indent);
3559        py_handle_pending_indent(&mut pending_block_indent, &mut contexts, indent, trimmed);
3560
3561        if try_record_docstring_if_context(
3562            &mut contexts,
3563            trimmed,
3564            idx,
3565            &mut docstring_lines,
3566            &mut active_docstring,
3567        ) {
3568            continue;
3569        }
3570
3571        if is_python_block_header(trimmed) {
3572            pending_block_indent = Some(indent);
3573        }
3574    }
3575
3576    mark_unclosed_docstring_lines(active_docstring.as_ref(), &mut docstring_lines, lines.len());
3577
3578    docstring_lines
3579}
3580
/// Number of whitespace characters at the start of `line`. Every whitespace character,
/// tabs included, counts as one.
fn leading_indent(line: &str) -> usize {
    line.chars()
        .position(|c| !c.is_whitespace())
        .unwrap_or_else(|| line.chars().count())
}
3584
/// True when the trimmed line starts with `def `, `async def ` or `class ` and ends with
/// `:`.
fn is_python_block_header(trimmed: &str) -> bool {
    const HEADER_KEYWORDS: [&str; 3] = ["def ", "async def ", "class "];

    trimmed.ends_with(':')
        && HEADER_KEYWORDS
            .iter()
            .any(|keyword| trimmed.starts_with(keyword))
}
3591
/// Return the triple-quote delimiter that `trimmed` opens with, if any.
///
/// Any run of leading string-prefix letters (`r`, `u`, `b`, `f` in either case) is skipped
/// before looking for `"""` or `'''`. Returns `None` when neither delimiter follows.
fn docstring_delimiter(trimmed: &str) -> Option<&'static str> {
    let after_prefix = trimmed
        .trim_start_matches(|ch: char| matches!(ch.to_ascii_lowercase(), 'r' | 'u' | 'b' | 'f'));

    ["\"\"\"", "'''"]
        .iter()
        .copied()
        .find(|delim| after_prefix.starts_with(*delim))
}
3613
/// Decide whether a docstring delimited by `delim` is closed on the line `trimmed`.
///
/// Non-overlapping occurrences of `delim` are counted. On the line that opened the
/// docstring (`same_line_as_start`) two occurrences are required, opener plus closer;
/// on any later line one occurrence is enough.
///
/// An empty `delim` can never close anything and yields `false`.
fn closes_triple_docstring(trimmed: &str, delim: &str, same_line_as_start: bool) -> bool {
    // An empty pattern matches at every position; treat it as "no delimiter" instead of
    // scanning forever.
    if delim.is_empty() {
        return false;
    }

    let required = if same_line_as_start { 2 } else { 1 };
    // `take` stops scanning as soon as enough delimiters have been seen.
    trimmed.matches(delim).take(required).count() == required
}
3628
3629/// Tree-sitter-backed adapters (compiled only when the `tree-sitter` feature is enabled).
3630///
3631/// When parsing succeeds the result is used directly; on any failure the caller falls back
3632/// to the lexical state machine.
3633#[cfg(feature = "tree-sitter")]
3634pub mod ts {
3635    use tree_sitter::Node;
3636
3637    use super::{ParseMode, RawFileAnalysis, RawLineCounts};
3638
3639    /// Configuration for which AST node kinds map to symbols in this grammar.
3640    struct SymbolKinds {
3641        /// Node kind name for function definitions (e.g. `"function_definition"`).
3642        function_def: &'static str,
3643        /// Node kind name for class definitions (e.g. `"class_definition"`).
3644        class_def: &'static str,
3645        /// Name field of a function node that, when it starts with this prefix, marks a test.
3646        /// Empty string disables test-prefix detection.
3647        test_fn_prefix: &'static str,
3648        /// Name field of a class node that, when it starts with this prefix, marks a test.
3649        /// Empty string disables test-prefix detection.
3650        test_class_prefix: &'static str,
3651        /// When non-empty, `call` nodes whose `function` is an `attribute` access and whose
3652        /// attribute identifier starts with this prefix are counted as test assertions.
3653        /// Used for Python `self.assertXxx(...)` detection.
3654        assertion_attr_prefix: &'static str,
3655    }
3656
3657    impl SymbolKinds {
3658        const fn none() -> Self {
3659            Self {
3660                function_def: "",
3661                class_def: "",
3662                test_fn_prefix: "",
3663                test_class_prefix: "",
3664                assertion_attr_prefix: "",
3665            }
3666        }
3667    }
3668
3669    /// Classify every line of `text` using a tree-sitter grammar.
3670    ///
3671    /// `comment_node_kinds` — node type names that represent comments in this grammar
3672    /// `docstring_stmt_kind` — optional parent node type whose direct `string` child is a docstring
3673    /// `symbols` — AST node kinds used to populate symbol counters
3674    fn analyze_lines(
3675        text: &str,
3676        ts_language: &tree_sitter::Language,
3677        comment_node_kinds: &[&str],
3678        docstring_stmt_kind: Option<&str>,
3679        symbols: &SymbolKinds,
3680    ) -> Option<RawFileAnalysis> {
3681        let mut parser = tree_sitter::Parser::new();
3682        parser.set_language(ts_language).ok()?;
3683        let tree = parser.parse(text, None)?;
3684
3685        let lines: Vec<&str> = text.split_terminator('\n').collect();
3686        let n = lines.len();
3687
3688        let mut has_code = vec![false; n];
3689        let mut has_comment = vec![false; n];
3690        let mut comment_is_block = vec![false; n];
3691        let mut has_docstring = vec![false; n];
3692
3693        // Walk every node in the tree and mark line arrays.
3694        let mut ctx = VisitCtx {
3695            source: text.as_bytes(),
3696            comment_kinds: comment_node_kinds,
3697            docstring_stmt_kind,
3698            has_code: &mut has_code,
3699            has_comment: &mut has_comment,
3700            comment_is_block: &mut comment_is_block,
3701            has_docstring: &mut has_docstring,
3702        };
3703        visit(tree.root_node(), &mut ctx);
3704
3705        let mut raw = RawLineCounts::default();
3706        classify_ts_lines(
3707            &lines,
3708            &has_code,
3709            &has_comment,
3710            &comment_is_block,
3711            &has_docstring,
3712            &mut raw,
3713        );
3714
3715        // Symbol counting: walk the AST a second time to collect function/class/test counts.
3716        if !symbols.function_def.is_empty() || !symbols.class_def.is_empty() {
3717            count_symbols(tree.root_node(), text.as_bytes(), symbols, &mut raw);
3718        }
3719
3720        Some(RawFileAnalysis {
3721            raw,
3722            parse_mode: ParseMode::TreeSitter,
3723            warnings: Vec::new(),
3724            style_analysis: None,
3725        })
3726    }
3727
3728    /// Recurse into every direct child of `node`.
3729    fn recurse_children(node: Node, source: &[u8], kinds: &SymbolKinds, raw: &mut RawLineCounts) {
3730        for i in 0..node.child_count() {
3731            #[allow(clippy::cast_possible_truncation)]
3732            if let Some(child) = node.child(i as u32) {
3733                count_symbols(child, source, kinds, raw);
3734            }
3735        }
3736    }
3737
3738    /// Handle a function-definition node. Returns `true` if the node matched.
3739    fn try_count_function(
3740        node: Node,
3741        source: &[u8],
3742        kinds: &SymbolKinds,
3743        raw: &mut RawLineCounts,
3744    ) -> bool {
3745        if kinds.function_def.is_empty() || node.kind() != kinds.function_def {
3746            return false;
3747        }
3748        let name = node
3749            .child_by_field_name("name")
3750            .and_then(|n| n.utf8_text(source).ok())
3751            .unwrap_or("");
3752        if !kinds.test_fn_prefix.is_empty() && name.starts_with(kinds.test_fn_prefix) {
3753            raw.test_count += 1;
3754        } else {
3755            raw.functions += 1;
3756        }
3757        recurse_children(node, source, kinds, raw);
3758        true
3759    }
3760
3761    /// Handle a class-definition node. Returns `true` if the node matched.
3762    fn try_count_class(
3763        node: Node,
3764        source: &[u8],
3765        kinds: &SymbolKinds,
3766        raw: &mut RawLineCounts,
3767    ) -> bool {
3768        if kinds.class_def.is_empty() || node.kind() != kinds.class_def {
3769            return false;
3770        }
3771        let name = node
3772            .child_by_field_name("name")
3773            .and_then(|n| n.utf8_text(source).ok())
3774            .unwrap_or("");
3775        if !kinds.test_class_prefix.is_empty() && name.starts_with(kinds.test_class_prefix) {
3776            raw.test_count += 1;
3777        } else {
3778            raw.classes += 1;
3779        }
3780        recurse_children(node, source, kinds, raw);
3781        true
3782    }
3783
3784    /// Handle an assertion call node. Returns `true` if the node matched (skips recursion
3785    /// into arguments, preserving "don't double-count test bodies" semantics).
3786    fn try_count_assertion(
3787        node: Node,
3788        source: &[u8],
3789        kinds: &SymbolKinds,
3790        raw: &mut RawLineCounts,
3791    ) -> bool {
3792        if kinds.assertion_attr_prefix.is_empty() || node.kind() != "call" {
3793            return false;
3794        }
3795        let Some(func) = node.child_by_field_name("function") else {
3796            return false;
3797        };
3798        if func.kind() != "attribute" {
3799            return false;
3800        }
3801        let attr_text = func
3802            .child_by_field_name("attribute")
3803            .and_then(|n| n.utf8_text(source).ok())
3804            .unwrap_or("");
3805        if !attr_text.starts_with(kinds.assertion_attr_prefix) {
3806            return false;
3807        }
3808        raw.test_assertion_count += 1;
3809        true
3810    }
3811
3812    /// Walk the AST and populate `raw.functions`, `raw.classes`, `raw.test_count`,
3813    /// and `raw.test_assertion_count`.
3814    fn count_symbols(node: Node, source: &[u8], kinds: &SymbolKinds, raw: &mut RawLineCounts) {
3815        if try_count_function(node, source, kinds, raw) {
3816            return;
3817        }
3818        if try_count_class(node, source, kinds, raw) {
3819            return;
3820        }
3821        if try_count_assertion(node, source, kinds, raw) {
3822            return;
3823        }
3824        recurse_children(node, source, kinds, raw);
3825    }
3826
3827    /// Flags describing what kinds of content appear on a single line.
3828    // Four bools are the natural representation for these four independent properties.
3829    #[allow(clippy::struct_excessive_bools)]
3830    #[derive(Clone, Copy)]
3831    struct TsLineFlags {
3832        has_code: bool,
3833        has_comment: bool,
3834        comment_is_block: bool,
3835        has_docstring: bool,
3836    }
3837
3838    /// Classify a single tree-sitter-annotated line and accumulate into `raw`.
3839    const fn classify_ts_line(trimmed: &str, flags: TsLineFlags, raw: &mut RawLineCounts) {
3840        if trimmed.is_empty() {
3841            raw.blank_only_lines += 1;
3842        } else if flags.has_docstring && !flags.has_code {
3843            raw.docstring_comment_lines += 1;
3844        } else if flags.has_code && flags.has_comment {
3845            // Classify the mixed line as single or multi based on what kind of comment is on it.
3846            if flags.comment_is_block {
3847                raw.mixed_code_multi_comment_lines += 1;
3848            } else {
3849                raw.mixed_code_single_comment_lines += 1;
3850            }
3851        } else if flags.has_comment {
3852            if flags.comment_is_block {
3853                raw.multi_comment_only_lines += 1;
3854            } else {
3855                raw.single_comment_only_lines += 1;
3856            }
3857        } else {
3858            raw.code_only_lines += 1;
3859        }
3860    }
3861
3862    /// Classify each tree-sitter-annotated line and accumulate counts into `raw`.
3863    fn classify_ts_lines(
3864        lines: &[&str],
3865        has_code: &[bool],
3866        has_comment: &[bool],
3867        comment_is_block: &[bool],
3868        has_docstring: &[bool],
3869        raw: &mut RawLineCounts,
3870    ) {
3871        for i in 0..lines.len() {
3872            raw.total_physical_lines += 1;
3873            classify_ts_line(
3874                lines[i].trim(),
3875                TsLineFlags {
3876                    has_code: has_code[i],
3877                    has_comment: has_comment[i],
3878                    comment_is_block: comment_is_block[i],
3879                    has_docstring: has_docstring[i],
3880                },
3881                raw,
3882            );
3883        }
3884    }
3885
3886    struct VisitCtx<'a> {
3887        source: &'a [u8],
3888        comment_kinds: &'a [&'a str],
3889        docstring_stmt_kind: Option<&'a str>,
3890        has_code: &'a mut Vec<bool>,
3891        has_comment: &'a mut Vec<bool>,
3892        comment_is_block: &'a mut Vec<bool>,
3893        has_docstring: &'a mut Vec<bool>,
3894    }
3895
3896    /// Mark all rows of a comment node and detect whether it is a block comment.
3897    fn visit_comment_node(node: Node, ctx: &mut VisitCtx<'_>) {
3898        let start_row = node.start_position().row;
3899        let end_row = node.end_position().row;
3900        let first_two = node
3901            .utf8_text(ctx.source)
3902            .unwrap_or("")
3903            .get(..2)
3904            .unwrap_or("");
3905        let is_block = first_two == "/*" || first_two == "<#";
3906        for row in start_row..=end_row {
3907            if row < ctx.has_comment.len() {
3908                ctx.has_comment[row] = true;
3909                if is_block {
3910                    ctx.comment_is_block[row] = true;
3911                }
3912            }
3913        }
3914    }
3915
3916    /// If `node` is an `expression_statement` whose sole named child is a string literal,
3917    /// mark those rows as docstring and return `true`.
3918    fn visit_maybe_docstring(node: Node, kind: &str, ctx: &mut VisitCtx<'_>) -> bool {
3919        let Some(stmt_kind) = ctx.docstring_stmt_kind else {
3920            return false;
3921        };
3922        if kind != stmt_kind || node.named_child_count() != 1 {
3923            return false;
3924        }
3925        let Some(child) = node.named_child(0) else {
3926            return false;
3927        };
3928        if child.kind() != "string" {
3929            return false;
3930        }
3931        let child_start = child.start_position().row;
3932        let child_end = child.end_position().row;
3933        for row in child_start..=child_end {
3934            if row < ctx.has_docstring.len() {
3935                ctx.has_docstring[row] = true;
3936            }
3937        }
3938        true
3939    }
3940
3941    /// Mark all rows of a leaf (non-comment, non-extra) node as code.
3942    fn visit_leaf_code(node: Node, ctx: &mut VisitCtx<'_>) {
3943        let start_row = node.start_position().row;
3944        let end_row = node.end_position().row;
3945        for row in start_row..=end_row {
3946            if row < ctx.has_code.len() {
3947                ctx.has_code[row] = true;
3948            }
3949        }
3950    }
3951
3952    #[allow(clippy::too_many_lines)]
3953    fn visit(node: Node, ctx: &mut VisitCtx<'_>) {
3954        let kind = node.kind();
3955
3956        // Comment node — mark rows as comment, detect block vs. line comment.
3957        if ctx.comment_kinds.contains(&kind) {
3958            visit_comment_node(node, ctx);
3959            return;
3960        }
3961
3962        // Python docstring: expression_statement whose only named child is a string literal.
3963        if visit_maybe_docstring(node, kind, ctx) {
3964            return;
3965        }
3966
3967        // Leaf non-comment node: mark as code.
3968        if node.child_count() == 0 && !node.is_extra() {
3969            visit_leaf_code(node, ctx);
3970            return;
3971        }
3972
3973        for i in 0..node.child_count() {
3974            #[allow(clippy::cast_possible_truncation)]
3975            // child_count bounded by tree-sitter u32 capacity
3976            if let Some(child) = node.child(i as u32) {
3977                visit(child, ctx);
3978            }
3979        }
3980    }
3981
3982    const C_SYMBOLS: SymbolKinds = SymbolKinds::none();
3983
3984    const PYTHON_SYMBOLS: SymbolKinds = SymbolKinds {
3985        function_def: "function_definition",
3986        class_def: "class_definition",
3987        test_fn_prefix: "test_",
3988        test_class_prefix: "Test",
3989        assertion_attr_prefix: "assert",
3990    };
3991
3992    /// Parse C or C++ source with tree-sitter-c.
3993    #[must_use]
3994    pub fn analyze_c(text: &str) -> Option<RawFileAnalysis> {
3995        let lang: tree_sitter::Language = tree_sitter_c::LANGUAGE.into();
3996        analyze_lines(text, &lang, &["comment"], None, &C_SYMBOLS)
3997    }
3998
3999    /// Parse Python source with tree-sitter-python.
4000    #[must_use]
4001    pub fn analyze_python(text: &str) -> Option<RawFileAnalysis> {
4002        let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
4003        analyze_lines(
4004            text,
4005            &lang,
4006            &["comment"],
4007            Some("expression_statement"),
4008            &PYTHON_SYMBOLS,
4009        )
4010    }
4011}
4012
4013#[cfg(test)]
4014mod tests {
4015    use super::*;
4016
4017    #[test]
4018    fn python_docstrings_are_separated() {
4019        let input = r#""""module docs"""
4020
4021
4022def fn_a():
4023    """function docs"""
4024    value = 1  # trailing comment
4025    return value
4026"#;
4027
4028        let result = analyze_text(Language::Python, input, AnalysisOptions::default());
4029        assert_eq!(result.raw.docstring_comment_lines, 2);
4030        assert_eq!(result.raw.mixed_code_single_comment_lines, 1);
4031        assert_eq!(result.raw.code_only_lines, 2);
4032    }
4033
4034    #[test]
4035    fn c_style_mixed_lines_are_captured() {
4036        let input = "int x = 1; // note\n/* block */\n";
4037        let result = analyze_text(Language::C, input, AnalysisOptions::default());
4038        assert_eq!(result.raw.mixed_code_single_comment_lines, 1);
4039        assert_eq!(result.raw.multi_comment_only_lines, 1);
4040    }
4041
4042    #[test]
4043    fn detect_language_by_shebang() {
4044        let language = detect_language(
4045            Path::new("script"),
4046            Some("#!/usr/bin/env bash"),
4047            &BTreeMap::new(),
4048            true,
4049        );
4050        assert_eq!(language, Some(Language::Shell));
4051    }
4052
4053    // ── count_symbols: no double-counting of test functions ──────────────────
4054
4055    fn sym(lang: Language, line: &str) -> (u64, u64, u64, u64, u64, u64, u64) {
4056        let result = analyze_text(lang, &format!("{line}\n"), AnalysisOptions::default());
4057        let r = &result.raw;
4058        (
4059            r.functions,
4060            r.classes,
4061            r.variables,
4062            r.imports,
4063            r.test_count,
4064            r.test_assertion_count,
4065            r.test_suite_count,
4066        )
4067    }
4068
4069    #[test]
4070    fn python_test_fn_not_double_counted() {
4071        // def test_ lines count as tests only, NOT as functions
4072        let (f, c, _, _, t, _, _) = sym(Language::Python, "def test_foo():");
4073        assert_eq!(f, 0, "test fn must not also increment functions");
4074        assert_eq!(t, 1, "must be counted as a test");
4075        assert_eq!(c, 0);
4076    }
4077
4078    #[test]
4079    fn python_test_class_not_double_counted() {
4080        // class Test* lines count as tests only, NOT as classes
4081        let (f, c, _, _, t, _, _) = sym(Language::Python, "class TestFoo:");
4082        assert_eq!(c, 0, "test class must not also increment classes");
4083        assert_eq!(t, 1, "must be counted as a test");
4084        assert_eq!(f, 0);
4085    }
4086
4087    #[test]
4088    fn python_regular_fn_counts_as_function() {
4089        let (f, c, _, _, t, _, _) = sym(Language::Python, "def regular():");
4090        assert_eq!(f, 1, "regular function must be counted");
4091        assert_eq!(t, 0);
4092        assert_eq!(c, 0);
4093    }
4094
4095    #[test]
4096    fn python_regular_class_counts_as_class() {
4097        let (f, c, _, _, t, _, _) = sym(Language::Python, "class Regular:");
4098        assert_eq!(c, 1, "regular class must be counted");
4099        assert_eq!(t, 0);
4100        assert_eq!(f, 0);
4101    }
4102
4103    #[test]
4104    fn go_test_fn_not_double_counted() {
4105        let (f, _, _, _, t, _, _) = sym(Language::Go, "func TestFoo(t *testing.T) {");
4106        assert_eq!(f, 0, "Go test func must not also increment functions");
4107        assert_eq!(t, 1, "must be counted as a test");
4108    }
4109
4110    #[test]
4111    fn go_benchmark_fn_not_double_counted() {
4112        let (f, _, _, _, t, _, _) = sym(Language::Go, "func BenchmarkBar(b *testing.B) {");
4113        assert_eq!(f, 0, "Go benchmark func must not also increment functions");
4114        assert_eq!(t, 1, "must be counted as a test");
4115    }
4116
4117    #[test]
4118    fn go_regular_fn_counts_as_function() {
4119        let (f, _, _, _, t, _, _) = sym(Language::Go, "func doSomething() {");
4120        assert_eq!(f, 1, "regular Go func must be counted");
4121        assert_eq!(t, 0);
4122    }
4123
4124    #[test]
4125    fn rust_test_attr_counts_as_test_not_function() {
4126        // #[test] is a standalone attribute line — counted as a test, never as a function
4127        let (f, _, _, _, t, _, _) = sym(Language::Rust, "#[test]");
4128        assert_eq!(t, 1, "#[test] must be counted as a test");
4129        assert_eq!(f, 0, "#[test] attribute must not be counted as a function");
4130    }
4131
4132    #[test]
4133    fn rust_fn_line_counts_as_function_not_test() {
4134        // The fn declaration after #[test] does NOT match any test pattern
4135        let (f, _, _, _, t, _, _) = sym(Language::Rust, "fn test_something() {");
4136        assert_eq!(f, 1, "fn declaration must count as a function");
4137        assert_eq!(
4138            t, 0,
4139            "fn declaration line must not be double-counted as a test"
4140        );
4141    }
4142
4143    #[test]
4144    fn js_describe_counts_as_test_not_function() {
4145        let (f, _, _, _, t, _, _) = sym(Language::JavaScript, "describe('suite', () => {");
4146        assert_eq!(t, 1, "describe must be counted as a test");
4147        assert_eq!(f, 0, "describe must not be counted as a function");
4148    }
4149
4150    #[test]
4151    fn js_regular_fn_counts_as_function() {
4152        let (f, _, _, _, t, _, _) = sym(Language::JavaScript, "function doWork() {");
4153        assert_eq!(f, 1, "JS function declaration must be counted");
4154        assert_eq!(t, 0);
4155    }
4156
4157    // ── Language detection tests ─────────────────────────────────────────────
4158
4159    use std::collections::BTreeMap;
4160    use std::path::Path;
4161
4162    #[test]
4163    fn detect_language_rs_extension() {
4164        let lang = detect_language(Path::new("foo.rs"), None, &BTreeMap::new(), false);
4165        assert_eq!(lang, Some(Language::Rust));
4166    }
4167
4168    #[test]
4169    fn detect_language_py_extension() {
4170        let lang = detect_language(Path::new("foo.py"), None, &BTreeMap::new(), false);
4171        assert_eq!(lang, Some(Language::Python));
4172    }
4173
4174    #[test]
4175    fn detect_language_ts_extension() {
4176        let lang = detect_language(Path::new("app.ts"), None, &BTreeMap::new(), false);
4177        assert_eq!(lang, Some(Language::TypeScript));
4178    }
4179
4180    #[test]
4181    fn detect_language_js_extension() {
4182        let lang = detect_language(Path::new("app.js"), None, &BTreeMap::new(), false);
4183        assert_eq!(lang, Some(Language::JavaScript));
4184    }
4185
4186    #[test]
4187    fn detect_language_go_extension() {
4188        let lang = detect_language(Path::new("main.go"), None, &BTreeMap::new(), false);
4189        assert_eq!(lang, Some(Language::Go));
4190    }
4191
4192    #[test]
4193    fn detect_language_c_extension() {
4194        let lang = detect_language(Path::new("main.c"), None, &BTreeMap::new(), false);
4195        assert_eq!(lang, Some(Language::C));
4196    }
4197
4198    #[test]
4199    fn detect_language_cpp_extension() {
4200        let lang = detect_language(Path::new("main.cpp"), None, &BTreeMap::new(), false);
4201        assert_eq!(lang, Some(Language::Cpp));
4202    }
4203
4204    #[test]
4205    fn detect_language_java_extension() {
4206        let lang = detect_language(Path::new("Main.java"), None, &BTreeMap::new(), false);
4207        assert_eq!(lang, Some(Language::Java));
4208    }
4209
4210    #[test]
4211    fn detect_language_makefile_exact_name() {
4212        let lang = detect_language(Path::new("Makefile"), None, &BTreeMap::new(), false);
4213        assert_eq!(lang, Some(Language::Makefile));
4214    }
4215
4216    #[test]
4217    fn detect_language_dockerfile_exact_name() {
4218        let lang = detect_language(Path::new("Dockerfile"), None, &BTreeMap::new(), false);
4219        assert_eq!(lang, Some(Language::Dockerfile));
4220    }
4221
4222    #[test]
4223    fn detect_language_rakefile() {
4224        let lang = detect_language(Path::new("Rakefile"), None, &BTreeMap::new(), false);
4225        assert_eq!(lang, Some(Language::Ruby));
4226    }
4227
4228    #[test]
4229    fn detect_language_gemfile() {
4230        let lang = detect_language(Path::new("Gemfile"), None, &BTreeMap::new(), false);
4231        assert_eq!(lang, Some(Language::Ruby));
4232    }
4233
4234    #[test]
4235    fn detect_language_unknown_extension_returns_none() {
4236        let lang = detect_language(Path::new("foo.xyz123"), None, &BTreeMap::new(), false);
4237        assert_eq!(lang, None);
4238    }
4239
4240    #[test]
4241    fn detect_language_extension_override() {
4242        let mut overrides = BTreeMap::new();
4243        overrides.insert("h".into(), "cpp".into());
4244        let lang = detect_language(Path::new("header.h"), None, &overrides, false);
4245        assert_eq!(lang, Some(Language::Cpp));
4246    }
4247
4248    #[test]
4249    fn detect_language_shebang_python() {
4250        let lang = detect_language(
4251            Path::new("script"),
4252            Some("#!/usr/bin/env python3"),
4253            &BTreeMap::new(),
4254            true,
4255        );
4256        assert_eq!(lang, Some(Language::Python));
4257    }
4258
4259    #[test]
4260    fn detect_language_shebang_bash() {
4261        let lang = detect_language(
4262            Path::new("script"),
4263            Some("#!/bin/bash"),
4264            &BTreeMap::new(),
4265            true,
4266        );
4267        assert_eq!(lang, Some(Language::Shell));
4268    }
4269
4270    #[test]
4271    fn detect_language_shebang_ruby() {
4272        let lang = detect_language(
4273            Path::new("script"),
4274            Some("#!/usr/bin/env ruby"),
4275            &BTreeMap::new(),
4276            true,
4277        );
4278        assert_eq!(lang, Some(Language::Ruby));
4279    }
4280
4281    #[test]
4282    fn detect_language_shebang_disabled() {
4283        // When shebang_detection=false, shebang is ignored
4284        let lang = detect_language(
4285            Path::new("script"),
4286            Some("#!/usr/bin/env python3"),
4287            &BTreeMap::new(),
4288            false,
4289        );
4290        assert_eq!(lang, None);
4291    }
4292
4293    #[test]
4294    fn from_name_rust() {
4295        assert_eq!(Language::from_name("rust"), Some(Language::Rust));
4296    }
4297
4298    #[test]
4299    fn from_name_python() {
4300        assert_eq!(Language::from_name("python"), Some(Language::Python));
4301    }
4302
4303    #[test]
4304    fn from_name_unknown() {
4305        assert_eq!(Language::from_name("brainfuck"), None);
4306    }
4307
4308    #[test]
4309    fn from_name_roundtrip_all() {
4310        // Every language's slug should round-trip through from_name
4311        for lang in [
4312            Language::C,
4313            Language::Cpp,
4314            Language::CSharp,
4315            Language::Go,
4316            Language::Java,
4317            Language::JavaScript,
4318            Language::Python,
4319            Language::Rust,
4320            Language::Shell,
4321            Language::PowerShell,
4322            Language::TypeScript,
4323            Language::Assembly,
4324            Language::Clojure,
4325            Language::Css,
4326            Language::Dart,
4327            Language::Dockerfile,
4328            Language::Elixir,
4329            Language::Erlang,
4330            Language::FSharp,
4331            Language::Groovy,
4332            Language::Haskell,
4333            Language::Html,
4334            Language::Julia,
4335            Language::Kotlin,
4336            Language::Lua,
4337            Language::Makefile,
4338            Language::Nim,
4339            Language::ObjectiveC,
4340            Language::Ocaml,
4341            Language::Perl,
4342            Language::Php,
4343            Language::R,
4344            Language::Ruby,
4345            Language::Scala,
4346            Language::Scss,
4347            Language::Sql,
4348            Language::Svelte,
4349            Language::Swift,
4350            Language::Vue,
4351            Language::Xml,
4352            Language::Zig,
4353        ] {
4354            let slug = lang.as_slug();
4355            let roundtripped = Language::from_name(slug);
4356            assert_eq!(
4357                roundtripped,
4358                Some(lang),
4359                "from_name({slug:?}) should return {lang:?}"
4360            );
4361        }
4362    }
4363
4364    // ── blank_in_block_comment_policy behavioral tests ───────────────────────
4365
4366    #[test]
4367    fn blank_in_block_comment_defaults_to_comment() {
4368        // Default: blank lines inside /* */ count as multi-comment lines (IEEE-aligned).
4369        let input = "/*\n\n*/";
4370        let opts = AnalysisOptions {
4371            blank_in_block_comment_as_comment: true,
4372            ..Default::default()
4373        };
4374        let result = analyze_text(Language::C, input, opts);
4375        assert_eq!(
4376            result.raw.multi_comment_only_lines, 3,
4377            "all 3 block-comment lines must count as multi-comment with CountAsComment policy"
4378        );
4379        assert_eq!(
4380            result.raw.blank_only_lines, 0,
4381            "no blank lines expected with CountAsComment policy"
4382        );
4383    }
4384
4385    #[test]
4386    fn blank_in_block_comment_counted_as_blank_when_policy_false() {
4387        // CountAsBlank: blank lines inside /* */ count as blank, not comment.
4388        let input = "/*\n\n*/";
4389        let opts = AnalysisOptions {
4390            blank_in_block_comment_as_comment: false,
4391            ..Default::default()
4392        };
4393        let result = analyze_text(Language::C, input, opts);
4394        assert_eq!(
4395            result.raw.multi_comment_only_lines, 2,
4396            "opener and closer must count as multi-comment with CountAsBlank policy"
4397        );
4398        assert_eq!(
4399            result.raw.blank_only_lines, 1,
4400            "the blank line inside the block comment must count as blank with CountAsBlank policy"
4401        );
4402    }
4403
4404    // ── continuation_line_policy behavioral tests ────────────────────────────
4405
4406    #[test]
4407    fn continuation_lines_each_physical_default() {
4408        // Default (EachPhysicalLine): every physical line counted separately.
4409        let input = "#define FOO \\\n  1 \\\n  + 2\n";
4410        let opts = AnalysisOptions {
4411            collapse_continuation_lines: false,
4412            ..Default::default()
4413        };
4414        let result = analyze_text(Language::C, input, opts);
4415        assert_eq!(
4416            result.raw.total_physical_lines, 3,
4417            "3 physical lines expected"
4418        );
4419        assert_eq!(
4420            result.raw.code_only_lines, 3,
4421            "each physical line must count as code with EachPhysicalLine policy"
4422        );
4423    }
4424
4425    #[test]
4426    fn continuation_lines_collapse_to_logical() {
4427        // CollapseToLogical: 3 backslash-continued lines collapse to 1 logical code line.
4428        let input = "#define FOO \\\n  1 \\\n  + 2\n";
4429        let opts = AnalysisOptions {
4430            collapse_continuation_lines: true,
4431            ..Default::default()
4432        };
4433        let result = analyze_text(Language::C, input, opts);
4434        assert_eq!(
4435            result.raw.total_physical_lines, 3,
4436            "physical line count is always 3 regardless of policy"
4437        );
4438        assert_eq!(
4439            result.raw.code_only_lines, 1,
4440            "3 continuation lines must collapse to 1 logical code line"
4441        );
4442    }
4443}