Skip to main content

sloc_languages/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3
4pub mod style;
5pub use style::{IndentStyle, StyleAnalysis, StyleGuideScore, StyleSignal};
6
7use std::collections::{BTreeMap, BTreeSet, HashSet};
8use std::path::Path;
9
10use serde::{Deserialize, Serialize};
11
/// A source language recognised by this crate.
///
/// Variant declaration order is significant: the derived `Ord` follows it, which in turn
/// fixes the iteration order of ordered collections keyed by `Language` (for example the
/// `BTreeSet` returned by `supported_languages`). Append new variants rather than
/// re-sorting existing ones unless that ordering change is intended.
///
/// Variants are (de)serialized under their serde `snake_case` names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Language {
    C,
    Cpp,
    CSharp,
    Go,
    Java,
    JavaScript,
    Python,
    Rust,
    Shell,
    PowerShell,
    TypeScript,
    // --- Extended language support ---
    Assembly,
    Clojure,
    Css,
    Dart,
    Dockerfile,
    Elixir,
    Erlang,
    FSharp,
    Groovy,
    Haskell,
    Html,
    Julia,
    Kotlin,
    Lua,
    Makefile,
    Nim,
    ObjectiveC,
    Ocaml,
    Perl,
    Php,
    R,
    Ruby,
    Scala,
    Scss,
    Sql,
    Svelte,
    Swift,
    Vue,
    Xml,
    Zig,
}
58
59impl Language {
60    #[must_use]
61    pub const fn display_name(&self) -> &'static str {
62        match self {
63            Self::C => "C",
64            Self::Cpp => "C++",
65            Self::CSharp => "C#",
66            Self::Go => "Go",
67            Self::Java => "Java",
68            Self::JavaScript => "JavaScript",
69            Self::Python => "Python",
70            Self::Rust => "Rust",
71            Self::Shell => "Shell",
72            Self::PowerShell => "PowerShell",
73            Self::TypeScript => "TypeScript",
74            Self::Assembly => "Assembly",
75            Self::Clojure => "Clojure",
76            Self::Css => "CSS",
77            Self::Dart => "Dart",
78            Self::Dockerfile => "Dockerfile",
79            Self::Elixir => "Elixir",
80            Self::Erlang => "Erlang",
81            Self::FSharp => "F#",
82            Self::Groovy => "Groovy",
83            Self::Haskell => "Haskell",
84            Self::Html => "HTML",
85            Self::Julia => "Julia",
86            Self::Kotlin => "Kotlin",
87            Self::Lua => "Lua",
88            Self::Makefile => "Makefile",
89            Self::Nim => "Nim",
90            Self::ObjectiveC => "Objective-C",
91            Self::Ocaml => "OCaml",
92            Self::Perl => "Perl",
93            Self::Php => "PHP",
94            Self::R => "R",
95            Self::Ruby => "Ruby",
96            Self::Scala => "Scala",
97            Self::Scss => "SCSS",
98            Self::Sql => "SQL",
99            Self::Svelte => "Svelte",
100            Self::Swift => "Swift",
101            Self::Vue => "Vue",
102            Self::Xml => "XML",
103            Self::Zig => "Zig",
104        }
105    }
106
107    #[must_use]
108    pub const fn as_slug(&self) -> &'static str {
109        match self {
110            Self::C => "c",
111            Self::Cpp => "cpp",
112            Self::CSharp => "csharp",
113            Self::Go => "go",
114            Self::Java => "java",
115            Self::JavaScript => "javascript",
116            Self::Python => "python",
117            Self::Rust => "rust",
118            Self::Shell => "shell",
119            Self::PowerShell => "powershell",
120            Self::TypeScript => "typescript",
121            Self::Assembly => "assembly",
122            Self::Clojure => "clojure",
123            Self::Css => "css",
124            Self::Dart => "dart",
125            Self::Dockerfile => "dockerfile",
126            Self::Elixir => "elixir",
127            Self::Erlang => "erlang",
128            Self::FSharp => "fsharp",
129            Self::Groovy => "groovy",
130            Self::Haskell => "haskell",
131            Self::Html => "html",
132            Self::Julia => "julia",
133            Self::Kotlin => "kotlin",
134            Self::Lua => "lua",
135            Self::Makefile => "makefile",
136            Self::Nim => "nim",
137            Self::ObjectiveC => "objectivec",
138            Self::Ocaml => "ocaml",
139            Self::Perl => "perl",
140            Self::Php => "php",
141            Self::R => "r",
142            Self::Ruby => "ruby",
143            Self::Scala => "scala",
144            Self::Scss => "scss",
145            Self::Sql => "sql",
146            Self::Svelte => "svelte",
147            Self::Swift => "swift",
148            Self::Vue => "vue",
149            Self::Xml => "xml",
150            Self::Zig => "zig",
151        }
152    }
153
154    #[must_use]
155    pub fn from_name(name: &str) -> Option<Self> {
156        match name.trim().to_ascii_lowercase().as_str() {
157            "c" => Some(Self::C),
158            "cpp" | "c++" | "cplusplus" => Some(Self::Cpp),
159            "csharp" | "c#" | "cs" => Some(Self::CSharp),
160            "go" | "golang" => Some(Self::Go),
161            "java" => Some(Self::Java),
162            "javascript" | "js" => Some(Self::JavaScript),
163            "python" | "py" => Some(Self::Python),
164            "rust" | "rs" => Some(Self::Rust),
165            "shell" | "sh" | "bash" => Some(Self::Shell),
166            "powershell" | "pwsh" | "ps" => Some(Self::PowerShell),
167            "typescript" | "ts" => Some(Self::TypeScript),
168            "assembly" | "asm" => Some(Self::Assembly),
169            "clojure" | "clj" => Some(Self::Clojure),
170            "css" => Some(Self::Css),
171            "dart" => Some(Self::Dart),
172            "dockerfile" | "docker" => Some(Self::Dockerfile),
173            "elixir" | "ex" => Some(Self::Elixir),
174            "erlang" | "erl" => Some(Self::Erlang),
175            "fsharp" | "f#" | "fs" => Some(Self::FSharp),
176            "groovy" => Some(Self::Groovy),
177            "haskell" | "hs" => Some(Self::Haskell),
178            "html" | "htm" => Some(Self::Html),
179            "julia" | "jl" => Some(Self::Julia),
180            "kotlin" | "kt" => Some(Self::Kotlin),
181            "lua" => Some(Self::Lua),
182            "makefile" | "make" | "mk" => Some(Self::Makefile),
183            "nim" => Some(Self::Nim),
184            "objectivec" | "objc" | "objective-c" => Some(Self::ObjectiveC),
185            "ocaml" | "ml" => Some(Self::Ocaml),
186            "perl" | "pl" => Some(Self::Perl),
187            "php" => Some(Self::Php),
188            "r" => Some(Self::R),
189            "ruby" | "rb" => Some(Self::Ruby),
190            "scala" => Some(Self::Scala),
191            "scss" | "sass" => Some(Self::Scss),
192            "sql" => Some(Self::Sql),
193            "svelte" => Some(Self::Svelte),
194            "swift" => Some(Self::Swift),
195            "vue" => Some(Self::Vue),
196            "xml" => Some(Self::Xml),
197            "zig" => Some(Self::Zig),
198            _ => None,
199        }
200    }
201}
202
/// Line and symbol tallies gathered for one analysed text.
///
/// Every counter is a `u64` and starts at zero via the derived `Default`. Fields marked
/// `#[serde(default)]` may be missing from serialized input, in which case they
/// deserialize as zero.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct RawLineCounts {
    // Physical-line classification counters.
    // NOTE(review): the exact classification rules live in the scanner, which is outside
    // this excerpt; the field names are the only description available here.
    pub total_physical_lines: u64,
    pub blank_only_lines: u64,
    pub code_only_lines: u64,
    pub single_comment_only_lines: u64,
    pub multi_comment_only_lines: u64,
    pub mixed_code_single_comment_lines: u64,
    pub mixed_code_multi_comment_lines: u64,
    pub docstring_comment_lines: u64,
    pub skipped_unknown_lines: u64,
    /// Best-effort count of function/method definition lines detected lexically.
    #[serde(default)]
    pub functions: u64,
    /// Best-effort count of class/struct/trait/type definition lines detected lexically.
    #[serde(default)]
    pub classes: u64,
    /// Best-effort count of variable declaration lines detected lexically.
    #[serde(default)]
    pub variables: u64,
    /// Best-effort count of import/use/include statement lines detected lexically.
    #[serde(default)]
    pub imports: u64,
    /// Lines consisting solely of preprocessor/compiler directives (e.g. `#include`, `#define`
    /// in C/C++/Objective-C). Always a subset of `code_only_lines`. Controlled by
    /// `AnalysisConfig::count_compiler_directives`. IEEE 1045-1992 §4.2.
    #[serde(default)]
    pub compiler_directive_lines: u64,
    /// Best-effort count of test case / test function definition lines detected lexically
    /// (`GTest`, Catch2, `PyTest`, `JUnit`, etc.). Always a subset of `code_only_lines`.
    #[serde(default)]
    pub test_count: u64,
    /// Best-effort count of test assertion call lines detected lexically
    /// (`ASSERT_EQ`, `EXPECT_TRUE`, `assertEquals`, `Assert.AreEqual`, `assert_eq!`, etc.).
    #[serde(default)]
    pub test_assertion_count: u64,
    /// Best-effort count of test suite / fixture / group declaration lines detected lexically
    /// (`TEST_GROUP`, `BOOST_AUTO_TEST_SUITE`, `[TestClass]`, `[TestFixture]`, etc.).
    #[serde(default)]
    pub test_suite_count: u64,
}
244
/// Tag stored in `RawFileAnalysis::parse_mode` naming the strategy that produced the result.
///
/// Serialized in `snake_case`: `lexical`, `lexical_best_effort`, `tree_sitter`.
// NOTE(review): which code path emits which variant is decided outside this excerpt.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ParseMode {
    Lexical,
    LexicalBestEffort,
    TreeSitter,
}
252
/// Result of analysing one text; this is what `analyze_text` returns.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawFileAnalysis {
    /// Line and symbol counters for the analysed text.
    pub raw: RawLineCounts,
    /// Strategy tag recorded by whichever analyser produced this result.
    pub parse_mode: ParseMode,
    /// Free-form warning messages attached by the analyser.
    // NOTE(review): the producers of these messages are outside this excerpt.
    pub warnings: Vec<String>,
    /// Lexical style-guide analysis for supported languages; `None` when no heuristics apply.
    /// Omitted from serialized output when `None`, and defaults to `None` when absent on input.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub style_analysis: Option<StyleAnalysis>,
}
262
/// IEEE 1045-1992 counting options passed from `sloc-core` (built from `AnalysisConfig`).
///
/// `analyze_text` accepts this struct so that the caller can control behaviour that the
/// standard defines as configurable parameters rather than fixed conventions.
///
/// The type is `Copy`, so it is passed by value. `Default` yields the settings described
/// as defaults on the individual fields.
#[derive(Debug, Clone, Copy)]
pub struct AnalysisOptions {
    /// When `true` (IEEE 1045-1992 default), blank lines inside block comments count as
    /// comment lines rather than blank lines.
    pub blank_in_block_comment_as_comment: bool,
    /// When `true`, backslash-continued physical lines are collapsed into a single logical
    /// line for SLOC counting purposes (IEEE logical SLOC mode). Defaults to `false`.
    pub collapse_continuation_lines: bool,
    /// When `true` (default), run lexical style-guide heuristics and populate
    /// `RawFileAnalysis::style_analysis`. Set to `false` to skip style scoring entirely.
    pub enable_style: bool,
    /// Restricts style analysis to a language family. With `StyleLangScope::All` (default)
    /// every language is eligible; with `StyleLangScope::CFamilyOnly` only
    /// C / C++ / Objective-C files are style-analysed.
    pub style_lang_scope: StyleLangScope,
}
282
/// Which language families receive style-guide heuristic analysis.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StyleLangScope {
    /// Every language is eligible for style analysis.
    All,
    /// Only C, C++, and Objective-C are eligible for style analysis.
    CFamilyOnly,
}
289
290impl Default for AnalysisOptions {
291    fn default() -> Self {
292        Self {
293            blank_in_block_comment_as_comment: true,
294            collapse_continuation_lines: false,
295            enable_style: true,
296            style_lang_scope: StyleLangScope::All,
297        }
298    }
299}
300
301#[must_use]
302pub fn supported_languages() -> BTreeSet<Language> {
303    [
304        Language::Assembly,
305        Language::C,
306        Language::Clojure,
307        Language::Cpp,
308        Language::CSharp,
309        Language::Css,
310        Language::Dart,
311        Language::Dockerfile,
312        Language::Elixir,
313        Language::Erlang,
314        Language::FSharp,
315        Language::Go,
316        Language::Groovy,
317        Language::Haskell,
318        Language::Html,
319        Language::Java,
320        Language::JavaScript,
321        Language::Julia,
322        Language::Kotlin,
323        Language::Lua,
324        Language::Makefile,
325        Language::Nim,
326        Language::ObjectiveC,
327        Language::Ocaml,
328        Language::Perl,
329        Language::Php,
330        Language::PowerShell,
331        Language::Python,
332        Language::R,
333        Language::Ruby,
334        Language::Rust,
335        Language::Scala,
336        Language::Scss,
337        Language::Shell,
338        Language::Sql,
339        Language::Svelte,
340        Language::Swift,
341        Language::TypeScript,
342        Language::Vue,
343        Language::Xml,
344        Language::Zig,
345    ]
346    .into_iter()
347    .collect()
348}
349
350/// Detect language from a shebang line (e.g. `#!/usr/bin/env python3`).
351fn detect_by_shebang(line: &str) -> Option<Language> {
352    let lower = line.to_ascii_lowercase();
353    if !lower.starts_with("#!") {
354        return None;
355    }
356    if lower.contains("python") {
357        return Some(Language::Python);
358    }
359    if lower.contains("pwsh") || lower.contains("powershell") {
360        return Some(Language::PowerShell);
361    }
362    if lower.contains("bash")
363        || lower.contains("/sh")
364        || lower.contains("zsh")
365        || lower.contains("ksh")
366    {
367        return Some(Language::Shell);
368    }
369    if lower.contains("ruby") {
370        return Some(Language::Ruby);
371    }
372    if lower.contains("perl") {
373        return Some(Language::Perl);
374    }
375    if lower.contains("php") {
376        return Some(Language::Php);
377    }
378    if lower.contains("node") || lower.contains("nodejs") {
379        return Some(Language::JavaScript);
380    }
381    None
382}
383
384/// Detect language purely from a (lowercased) file extension.
385fn detect_by_extension(ext: &str) -> Option<Language> {
386    // Static table avoids a large match statement; each extension maps 1-to-1 to a language.
387    static EXT_MAP: &[(&str, Language)] = &[
388        ("c", Language::C),
389        ("h", Language::C),
390        ("cc", Language::Cpp),
391        ("cp", Language::Cpp),
392        ("cpp", Language::Cpp),
393        ("cxx", Language::Cpp),
394        ("hh", Language::Cpp),
395        ("hpp", Language::Cpp),
396        ("hxx", Language::Cpp),
397        ("cs", Language::CSharp),
398        ("go", Language::Go),
399        ("java", Language::Java),
400        ("js", Language::JavaScript),
401        ("mjs", Language::JavaScript),
402        ("cjs", Language::JavaScript),
403        ("py", Language::Python),
404        ("rs", Language::Rust),
405        ("sh", Language::Shell),
406        ("bash", Language::Shell),
407        ("zsh", Language::Shell),
408        ("ksh", Language::Shell),
409        ("ps1", Language::PowerShell),
410        ("psm1", Language::PowerShell),
411        ("psd1", Language::PowerShell),
412        ("ts", Language::TypeScript),
413        ("mts", Language::TypeScript),
414        ("cts", Language::TypeScript),
415        ("asm", Language::Assembly),
416        ("s", Language::Assembly),
417        ("clj", Language::Clojure),
418        ("cljs", Language::Clojure),
419        ("cljc", Language::Clojure),
420        ("edn", Language::Clojure),
421        ("css", Language::Css),
422        ("dart", Language::Dart),
423        ("ex", Language::Elixir),
424        ("exs", Language::Elixir),
425        ("erl", Language::Erlang),
426        ("hrl", Language::Erlang),
427        ("fs", Language::FSharp),
428        ("fsi", Language::FSharp),
429        ("fsx", Language::FSharp),
430        ("groovy", Language::Groovy),
431        ("gradle", Language::Groovy),
432        ("hs", Language::Haskell),
433        ("lhs", Language::Haskell),
434        ("html", Language::Html),
435        ("htm", Language::Html),
436        ("xhtml", Language::Html),
437        ("jl", Language::Julia),
438        ("kt", Language::Kotlin),
439        ("kts", Language::Kotlin),
440        ("lua", Language::Lua),
441        ("mk", Language::Makefile),
442        ("nim", Language::Nim),
443        ("nims", Language::Nim),
444        ("m", Language::ObjectiveC),
445        ("mm", Language::ObjectiveC),
446        ("ml", Language::Ocaml),
447        ("mli", Language::Ocaml),
448        ("pl", Language::Perl),
449        ("pm", Language::Perl),
450        ("t", Language::Perl),
451        ("php", Language::Php),
452        ("php3", Language::Php),
453        ("php4", Language::Php),
454        ("php5", Language::Php),
455        ("php7", Language::Php),
456        ("phtml", Language::Php),
457        ("r", Language::R),
458        ("rb", Language::Ruby),
459        ("rake", Language::Ruby),
460        ("scala", Language::Scala),
461        ("sc", Language::Scala),
462        ("scss", Language::Scss),
463        ("sass", Language::Scss),
464        ("sql", Language::Sql),
465        ("svelte", Language::Svelte),
466        ("swift", Language::Swift),
467        ("vue", Language::Vue),
468        ("xml", Language::Xml),
469        ("xsd", Language::Xml),
470        ("xsl", Language::Xml),
471        ("xslt", Language::Xml),
472        ("svg", Language::Xml),
473        ("zig", Language::Zig),
474    ];
475    EXT_MAP.iter().find_map(|&(e, l)| (e == ext).then_some(l))
476}
477
478/// Detect language from an exact filename (no extension) or well-known filename patterns.
479fn detect_by_filename(filename: &str, filename_lower: &str) -> Option<Language> {
480    // Dockerfile: exact name or Dockerfile.* variant
481    if filename == "Dockerfile"
482        || filename.starts_with("Dockerfile.")
483        || filename_lower == "dockerfile"
484    {
485        return Some(Language::Dockerfile);
486    }
487    // Makefile variants
488    if matches!(
489        filename,
490        "Makefile" | "GNUmakefile" | "makefile" | "BSDmakefile"
491    ) {
492        return Some(Language::Makefile);
493    }
494    // Ruby ecosystem files that have no extension
495    if matches!(
496        filename,
497        "Rakefile" | "Gemfile" | "Guardfile" | "Vagrantfile" | "Fastfile" | "Podfile"
498    ) {
499        return Some(Language::Ruby);
500    }
501    None
502}
503
504#[must_use]
505#[allow(clippy::too_many_lines)]
506pub fn detect_language(
507    path: &Path,
508    first_line: Option<&str>,
509    extension_overrides: &BTreeMap<String, String>,
510    shebang_detection: bool,
511) -> Option<Language> {
512    let extension = path
513        .extension()
514        .and_then(|ext| ext.to_str())
515        .map(str::to_ascii_lowercase);
516
517    // Extension override check (user-configured mappings win over everything)
518    if let Some(ext) = extension.as_ref() {
519        if let Some(override_name) = extension_overrides.get(ext.as_str()) {
520            if let Some(lang) = Language::from_name(override_name) {
521                return Some(lang);
522            }
523        }
524    }
525
526    // Filename-based detection for files that have no extension or use exact names
527    let filename = path.file_name().and_then(|s| s.to_str()).unwrap_or("");
528    let filename_lower = filename.to_ascii_lowercase();
529
530    if let Some(lang) = detect_by_filename(filename, &filename_lower) {
531        return Some(lang);
532    }
533
534    // Extension-based detection
535    if let Some(lang) = extension.as_deref().and_then(detect_by_extension) {
536        return Some(lang);
537    }
538
539    // Shebang detection (last resort — only for extensionless scripts)
540    if shebang_detection {
541        if let Some(line) = first_line {
542            if let Some(lang) = detect_by_shebang(line) {
543                return Some(lang);
544            }
545        }
546    }
547
548    None
549}
550
551#[must_use]
552pub fn analyze_text(language: Language, text: &str, options: AnalysisOptions) -> RawFileAnalysis {
553    // tree-sitter fast-paths (compiled out when feature is disabled)
554    #[cfg(feature = "tree-sitter")]
555    {
556        match language {
557            Language::C | Language::Cpp => {
558                if let Some(mut result) = ts::analyze_c(text) {
559                    if options.enable_style
560                        && should_style_analyse(language, options.style_lang_scope)
561                    {
562                        result.style_analysis = style::analyze_style(language, text);
563                    }
564                    return result;
565                }
566            }
567            Language::Python => {
568                if let Some(result) = ts::analyze_python(text) {
569                    return result;
570                }
571            }
572            _ => {}
573        }
574    }
575
576    let (mut config, has_preprocessor) = language_scan_config(language);
577
578    // Python docstring lines are computed from the text and cannot be a static constant.
579    if language == Language::Python {
580        config.skip_lines = detect_python_docstring_lines(text);
581    }
582
583    // C, C++, and Objective-C have a preprocessor whose directive lines are tracked separately
584    // per IEEE 1045-1992 §4.2; every other language uses base flags.
585    let flags = IeeeFlags {
586        has_preprocessor_directives: has_preprocessor,
587        blank_in_block_comment_as_comment: options.blank_in_block_comment_as_comment,
588        collapse_continuation_lines: options.collapse_continuation_lines,
589    };
590    let mut result = analyze_generic(text, config, flags);
591    if options.enable_style && should_style_analyse(language, options.style_lang_scope) {
592        result.style_analysis = style::analyze_style(language, text);
593    }
594    result
595}
596
597/// Returns `true` when `language` should be style-analysed under `scope`.
598fn should_style_analyse(language: Language, scope: StyleLangScope) -> bool {
599    match scope {
600        StyleLangScope::CFamilyOnly => {
601            matches!(language, Language::C | Language::Cpp | Language::ObjectiveC)
602        }
603        StyleLangScope::All => true,
604    }
605}
606
607/// Returns the lexical scan configuration for `language` and whether it uses a C preprocessor.
608/// All fields are static constants except `skip_lines`, which is always empty here; callers that
609/// need non-empty skip sets (currently only Python) must populate the field after this call.
610///
611/// The implementation delegates to `LANG_SCAN_TABLE` (a static `&[(Language, StaticLangConfig)]`)
612/// defined below the `SP_*` symbol-pattern constants.  Each language appears exactly once in the
613/// table, so the linear scan is O(|languages|) but avoids a 41-arm `match` statement.
614fn language_scan_config(language: Language) -> (ScanConfig, bool) {
615    let cfg = LANG_SCAN_TABLE
616        .iter()
617        .find_map(|&(l, c)| (l == language).then_some(c))
618        .unwrap_or_else(|| panic!("language_scan_config: no entry for {language:?}"));
619    (
620        ScanConfig {
621            line_comments: cfg.line_comments,
622            block_comment: cfg.block_comment,
623            allow_single_quote_strings: cfg.allow_single_quote_strings,
624            allow_double_quote_strings: cfg.allow_double_quote_strings,
625            allow_triple_quote_strings: cfg.allow_triple_quote_strings,
626            allow_csharp_verbatim_strings: cfg.allow_csharp_verbatim_strings,
627            skip_lines: HashSet::new(),
628            symbol_patterns: cfg.symbol_patterns,
629        },
630        cfg.has_preprocessor,
631    )
632}
633
/// Per-language keyword prefixes used for best-effort structural symbol detection.
/// Each slice lists line prefixes (after leading whitespace is stripped) that indicate
/// a definition of that category. Empty slice = detection disabled for that category.
///
/// All slices are `'static`, so values of this type can be built in `const` context and
/// copied freely.
#[derive(Debug, Clone, Copy)]
struct SymbolPatterns {
    /// Line prefixes that indicate a function or method definition.
    functions: &'static [&'static str],
    /// Line prefixes that classify as a function only when the line ALSO contains `(`
    /// AND there is no `=` between the prefix and the first `(`.  Used for C/C++ where
    /// function definitions are led by the return type (`void`, `int`, `bool`, …) with
    /// no dedicated keyword, so the paren guard distinguishes `void f(x)` from
    /// `void* p = malloc(n)`.
    functions_prefix_paren: &'static [&'static str],
    /// Line prefixes that indicate a class, struct, trait, or other type definition.
    classes: &'static [&'static str],
    /// Line prefixes that indicate a variable declaration.
    variables: &'static [&'static str],
    /// Line prefixes that indicate an import / use / include statement.
    imports: &'static [&'static str],
    /// Line prefixes (after stripping leading whitespace) that indicate a test case or test
    /// function definition. Matched against code lines only, same as other symbol categories.
    tests: &'static [&'static str],
    /// Line prefixes that indicate a test assertion call (`ASSERT_EQ`, `assertEquals`,
    /// `assert_eq!`, `Assert.AreEqual`, etc.). Matched against code lines only.
    assertions: &'static [&'static str],
    /// Line prefixes that indicate a test suite / fixture / group declaration
    /// (`TEST_GROUP`, `BOOST_AUTO_TEST_SUITE`, `[TestClass]`, `[TestFixture]`, etc.).
    test_suites: &'static [&'static str],
}
659
660impl SymbolPatterns {
661    const fn none() -> Self {
662        Self {
663            functions: &[],
664            functions_prefix_paren: &[],
665            classes: &[],
666            variables: &[],
667            imports: &[],
668            tests: &[],
669            assertions: &[],
670            test_suites: &[],
671        }
672    }
673}
674
/// Pattern set with symbol detection disabled for every category.
const SP_NONE: SymbolPatterns = SymbolPatterns::none(); // all fields are &[]
676
677const SP_RUST: SymbolPatterns = SymbolPatterns {
678    functions: &[
679        "fn ",
680        "pub fn ",
681        "pub(crate) fn ",
682        "pub(super) fn ",
683        "async fn ",
684        "pub async fn ",
685        "pub(crate) async fn ",
686        "unsafe fn ",
687        "pub unsafe fn ",
688        "pub(crate) unsafe fn ",
689        "const fn ",
690        "pub const fn ",
691        "pub(crate) const fn ",
692        "extern fn ",
693        "pub extern fn ",
694    ],
695    functions_prefix_paren: &[],
696    classes: &[
697        "struct ",
698        "pub struct ",
699        "pub(crate) struct ",
700        "enum ",
701        "pub enum ",
702        "pub(crate) enum ",
703        "trait ",
704        "pub trait ",
705        "pub(crate) trait ",
706        "impl ",
707        "impl<",
708        "type ",
709        "pub type ",
710        "pub(crate) type ",
711    ],
712    variables: &["let ", "let mut "],
713    imports: &["use ", "pub use ", "pub(crate) use ", "extern crate "],
714    // Built-in #[test], tokio/actix async test attributes, rstest
715    tests: &[
716        "#[test]",
717        "#[tokio::test]",
718        "#[actix_web::test]",
719        "#[rstest]",
720        "#[test_case",
721    ],
722    assertions: &[
723        "assert_eq!(",
724        "assert_ne!(",
725        "assert!(",
726        "assert_matches!(",
727        "assert_err!(",
728        "assert_ok!(",
729    ],
730    test_suites: &[],
731};
732
733const SP_PYTHON: SymbolPatterns = SymbolPatterns {
734    functions: &["def ", "async def "],
735    functions_prefix_paren: &[],
736    classes: &["class "],
737    variables: &[],
738    imports: &["import ", "from "],
739    // pytest: test_ prefix functions and Test* classes; unittest: test_ methods
740    tests: &["def test_", "async def test_", "class Test"],
741    assertions: &[
742        "self.assertEqual(",
743        "self.assertNotEqual(",
744        "self.assertTrue(",
745        "self.assertFalse(",
746        "self.assertIsNone(",
747        "self.assertIsNotNone(",
748        "self.assertIn(",
749        "self.assertNotIn(",
750        "self.assertRaises(",
751        "self.assertAlmostEqual(",
752    ],
753    test_suites: &[],
754};
755
756const SP_JS: SymbolPatterns = SymbolPatterns {
757    functions: &[
758        "function ",
759        "async function ",
760        "export function ",
761        "export async function ",
762        "export default function ",
763    ],
764    functions_prefix_paren: &[],
765    classes: &["class ", "export class ", "export default class "],
766    variables: &[
767        "var ",
768        "let ",
769        "const ",
770        "export var ",
771        "export let ",
772        "export const ",
773    ],
774    imports: &["import "],
775    // Jest/Mocha/Jasmine: describe/it/test block openers
776    tests: &[
777        "describe(",
778        "it(",
779        "test(",
780        "it.each(",
781        "test.each(",
782        "describe.each(",
783    ],
784    assertions: &["expect("],
785    test_suites: &[],
786};
787
788const SP_TS: SymbolPatterns = SymbolPatterns {
789    functions: &[
790        "function ",
791        "async function ",
792        "export function ",
793        "export async function ",
794        "export default function ",
795    ],
796    functions_prefix_paren: &[],
797    classes: &[
798        "class ",
799        "export class ",
800        "export default class ",
801        "abstract class ",
802        "export abstract class ",
803        "interface ",
804        "export interface ",
805        "declare class ",
806        "declare interface ",
807    ],
808    variables: &[
809        "var ",
810        "let ",
811        "const ",
812        "export var ",
813        "export let ",
814        "export const ",
815    ],
816    imports: &["import "],
817    // Jest/Mocha/Jasmine/Vitest: describe/it/test block openers
818    tests: &[
819        "describe(",
820        "it(",
821        "test(",
822        "it.each(",
823        "test.each(",
824        "describe.each(",
825    ],
826    assertions: &["expect("],
827    test_suites: &[],
828};
829
830const SP_GO: SymbolPatterns = SymbolPatterns {
831    functions: &["func "],
832    functions_prefix_paren: &[],
833    classes: &["type "],
834    variables: &["var "],
835    imports: &["import "],
836    // Go standard testing: Test* functions (convention is practically exclusive to _test.go files)
837    tests: &["func Test", "func Benchmark", "func Fuzz"],
838    assertions: &[],
839    test_suites: &[],
840};
841
842const SP_JAVA: SymbolPatterns = SymbolPatterns {
843    functions: &[],
844    functions_prefix_paren: &[],
845    classes: &[
846        "class ",
847        "public class ",
848        "private class ",
849        "protected class ",
850        "abstract class ",
851        "final class ",
852        "public abstract class ",
853        "public final class ",
854        "interface ",
855        "public interface ",
856        "enum ",
857        "public enum ",
858        "record ",
859        "public record ",
860        "@interface ",
861    ],
862    variables: &[],
863    imports: &["import "],
864    // JUnit 4 & 5, TestNG — annotations appear on their own line before the method
865    tests: &[
866        "@Test",
867        "@ParameterizedTest",
868        "@RepeatedTest",
869        "@TestFactory",
870        "@TestTemplate",
871    ],
872    assertions: &[
873        "assertEquals(",
874        "assertNotEquals(",
875        "assertTrue(",
876        "assertFalse(",
877        "assertNull(",
878        "assertNotNull(",
879        "assertThat(",
880        "assertThrows(",
881        "assertAll(",
882        "assertArrayEquals(",
883        "assertIterableEquals(",
884        "assertLinesMatch(",
885    ],
886    test_suites: &[],
887};
888
889const SP_CSHARP: SymbolPatterns = SymbolPatterns {
890    functions: &[],
891    functions_prefix_paren: &[],
892    classes: &[
893        "class ",
894        "public class ",
895        "private class ",
896        "protected class ",
897        "internal class ",
898        "abstract class ",
899        "sealed class ",
900        "static class ",
901        "partial class ",
902        "public abstract class ",
903        "public sealed class ",
904        "public static class ",
905        "interface ",
906        "public interface ",
907        "internal interface ",
908        "enum ",
909        "public enum ",
910        "struct ",
911        "public struct ",
912        "record ",
913        "public record ",
914    ],
915    variables: &["var "],
916    imports: &["using "],
917    // MSTest, NUnit, xUnit — attributes on their own line before the method
918    tests: &[
919        "[TestMethod]",
920        "[Test]",
921        "[Fact]",
922        "[Theory]",
923        "[TestCase(",
924        "[DataRow(",
925        "[InlineData(",
926        "[MemberData(",
927    ],
928    assertions: &[
929        "Assert.AreEqual(",
930        "Assert.AreNotEqual(",
931        "Assert.IsTrue(",
932        "Assert.IsFalse(",
933        "Assert.IsNull(",
934        "Assert.IsNotNull(",
935        "Assert.Equal(",
936        "Assert.NotEqual(",
937        "Assert.True(",
938        "Assert.False(",
939        "Assert.That(",
940        "Assert.Contains(",
941        "Assert.Throws(",
942        "Assert.ThrowsAsync(",
943        "Assert.IsInstanceOfType(",
944    ],
945    test_suites: &["[TestClass]", "[TestFixture]", "[SetUpFixture]"],
946};
947
// Test declaration / registration patterns shared by C and C++ (used as the `tests` field of
// both `SP_C` and `SP_CPP`).  Frameworks covered: GTest, Catch2/doctest, Boost.Test, CppUnit,
// Unity, Check, CMocka, and CppUTest.
// NOTE(review): `TEST_GROUP(`, `TEST_GROUP_BASE(`, `BOOST_AUTO_TEST_SUITE(` and
// `CPPUNIT_TEST_SUITE(` are also listed in `SUITE_PATTERNS_C_CPP` — confirm the overlap is
// intended.
const TEST_PATTERNS_C_CPP: &[&str] = &[
    // Google Test
    "TEST(",
    "TEST_F(",
    "TEST_P(",
    "TYPED_TEST(",
    "TYPED_TEST_P(",
    "INSTANTIATE_TEST_SUITE_P(",
    "INSTANTIATE_TYPED_TEST_SUITE_P(",
    // Catch2 / doctest
    "TEST_CASE(",
    "SECTION(",
    "SCENARIO(",
    "SCENARIO_METHOD(",
    "TEST_CASE_METHOD(",
    // Boost.Test
    "BOOST_AUTO_TEST_CASE(",
    "BOOST_FIXTURE_TEST_CASE(",
    "BOOST_AUTO_TEST_SUITE(",
    "BOOST_PARAM_TEST_CASE(",
    // CppUnit
    "CPPUNIT_TEST(",
    "CPPUNIT_TEST_SUITE(",
    // Unity (embedded C)
    "RUN_TEST(",
    "TEST_IGNORE(",
    "TEST_FAIL(",
    // Check (libcheck — embedded C)
    "START_TEST(",
    "tcase_add_test(",
    "suite_create(",
    // CMocka (embedded C)
    "cmocka_unit_test(",
    "cmocka_run_group_tests(",
    // CppUTest
    "IGNORE_TEST(",
    "TEST_GROUP(",
    "TEST_GROUP_BASE(",
];
988
// Test assertion patterns shared by C and C++ (used as the `assertions` field of both `SP_C`
// and `SP_CPP`).  Entries are grouped by framework; each ends with `(` so only call sites of
// that exact macro/function name are listed.
const ASSERT_PATTERNS_C_CPP: &[&str] = &[
    // Google Test ASSERT_* (test-stopping failures)
    "ASSERT_EQ(",
    "ASSERT_NE(",
    "ASSERT_LT(",
    "ASSERT_LE(",
    "ASSERT_GT(",
    "ASSERT_GE(",
    "ASSERT_TRUE(",
    "ASSERT_FALSE(",
    "ASSERT_STREQ(",
    "ASSERT_STRNE(",
    "ASSERT_FLOAT_EQ(",
    "ASSERT_DOUBLE_EQ(",
    "ASSERT_NEAR(",
    "ASSERT_THROW(",
    "ASSERT_NO_THROW(",
    "ASSERT_ANY_THROW(",
    // Google Test EXPECT_* (non-stopping failures)
    "EXPECT_EQ(",
    "EXPECT_NE(",
    "EXPECT_LT(",
    "EXPECT_LE(",
    "EXPECT_GT(",
    "EXPECT_GE(",
    "EXPECT_TRUE(",
    "EXPECT_FALSE(",
    "EXPECT_STREQ(",
    "EXPECT_STRNE(",
    "EXPECT_FLOAT_EQ(",
    "EXPECT_DOUBLE_EQ(",
    "EXPECT_NEAR(",
    "EXPECT_THROW(",
    "EXPECT_NO_THROW(",
    "EXPECT_ANY_THROW(",
    // Catch2 / doctest assertions
    "REQUIRE(",
    "CHECK(",
    "REQUIRE_FALSE(",
    "CHECK_FALSE(",
    "REQUIRE_NOTHROW(",
    "CHECK_NOTHROW(",
    "REQUIRE_THROWS(",
    "CHECK_THROWS(",
    "REQUIRE_THAT(",
    "CHECK_THAT(",
    // Unity assertions (embedded C)
    "TEST_ASSERT_EQUAL(",
    "TEST_ASSERT_EQUAL_INT(",
    "TEST_ASSERT_EQUAL_STRING(",
    "TEST_ASSERT_EQUAL_FLOAT(",
    "TEST_ASSERT_EQUAL_DOUBLE(",
    "TEST_ASSERT_EQUAL_PTR(",
    "TEST_ASSERT_TRUE(",
    "TEST_ASSERT_FALSE(",
    "TEST_ASSERT_NULL(",
    "TEST_ASSERT_NOT_NULL(",
    "TEST_ASSERT_BITS_HIGH(",
    "TEST_ASSERT_BITS_LOW(",
    // CMocka assertions (embedded C)
    "assert_int_equal(",
    "assert_int_not_equal(",
    "assert_string_equal(",
    "assert_string_not_equal(",
    "assert_true(",
    "assert_false(",
    "assert_null(",
    "assert_non_null(",
    "assert_ptr_equal(",
    "assert_memory_equal(",
    "assert_return_code(",
];
1062
// Test suite/group declaration patterns for C and C++.
//
// Only the macro that *opens* a suite/group is listed.  A CppUnit fixture contains both
// `CPPUNIT_TEST_SUITE(...)` and `CPPUNIT_TEST_SUITE_END()`, so listing the closing macro as
// well would produce two pattern hits for a single suite (and would be inconsistent with
// Boost.Test, whose `BOOST_AUTO_TEST_SUITE_END()` is not listed either).
const SUITE_PATTERNS_C_CPP: &[&str] = &[
    // CppUTest
    "TEST_GROUP(",
    "TEST_GROUP_BASE(",
    // Boost.Test
    "BOOST_AUTO_TEST_SUITE(",
    // CppUnit
    "CPPUNIT_TEST_SUITE(",
];
1071
1072const SP_C: SymbolPatterns = SymbolPatterns {
1073    // C has no function keyword; detect by common return types that precede `(` with no `=`.
1074    functions: &[],
1075    functions_prefix_paren: &[
1076        "void ",
1077        "int ",
1078        "char ",
1079        "float ",
1080        "double ",
1081        "long ",
1082        "unsigned ",
1083        "size_t ",
1084        "static ",
1085        "inline ",
1086        "const ",
1087        "extern ",
1088    ],
1089    classes: &[
1090        "struct ",
1091        "typedef struct ",
1092        "union ",
1093        "typedef union ",
1094        "typedef enum ",
1095    ],
1096    variables: &[],
1097    imports: &["#include "],
1098    tests: TEST_PATTERNS_C_CPP,
1099    assertions: ASSERT_PATTERNS_C_CPP,
1100    test_suites: SUITE_PATTERNS_C_CPP,
1101};
1102
1103const SP_CPP: SymbolPatterns = SymbolPatterns {
1104    // C++ specific function keyword-prefixes; return-type-led patterns use functions_prefix_paren.
1105    functions: &[
1106        "virtual ",  // virtual method declaration/definition
1107        "explicit ", // explicit constructor modifier
1108        "~",         // destructor (e.g. ~MyClass())
1109        "operator",  // operator overload (operator==, operator+, …)
1110    ],
1111    functions_prefix_paren: &[
1112        "void ",
1113        "bool ",
1114        "int ",
1115        "char ",
1116        "float ",
1117        "double ",
1118        "long ",
1119        "unsigned ",
1120        "size_t ",
1121        "auto ",
1122        "static ",
1123        "inline ",
1124        "constexpr ",
1125        "const ",
1126        "extern ",
1127    ],
1128    // `template<` (no space) is the dominant modern style alongside `template ` (with space).
1129    classes: &["class ", "struct ", "namespace ", "template ", "template<"],
1130    variables: &[],
1131    imports: &["#include "],
1132    tests: TEST_PATTERNS_C_CPP,
1133    assertions: ASSERT_PATTERNS_C_CPP,
1134    test_suites: SUITE_PATTERNS_C_CPP,
1135};
1136
1137const SP_SHELL: SymbolPatterns = SymbolPatterns {
1138    functions: &["function "],
1139    functions_prefix_paren: &[],
1140    classes: &[],
1141    variables: &["declare ", "local ", "export "],
1142    imports: &["source ", ". "],
1143    tests: &[],
1144    assertions: &[],
1145    test_suites: &[],
1146};
1147
1148const SP_POWERSHELL: SymbolPatterns = SymbolPatterns {
1149    functions: &["function ", "Function "],
1150    functions_prefix_paren: &[],
1151    classes: &["class "],
1152    variables: &[],
1153    imports: &["Import-Module ", "using "],
1154    // Pester test framework
1155    tests: &["Describe ", "It ", "Context "],
1156    assertions: &[],
1157    test_suites: &[],
1158};
1159
1160const SP_KOTLIN: SymbolPatterns = SymbolPatterns {
1161    functions: &[
1162        "fun ",
1163        "private fun ",
1164        "public fun ",
1165        "protected fun ",
1166        "internal fun ",
1167        "override fun ",
1168        "suspend fun ",
1169        "abstract fun ",
1170        "open fun ",
1171        "private suspend fun ",
1172        "public suspend fun ",
1173    ],
1174    functions_prefix_paren: &[],
1175    classes: &[
1176        "class ",
1177        "data class ",
1178        "sealed class ",
1179        "abstract class ",
1180        "open class ",
1181        "object ",
1182        "companion object",
1183        "interface ",
1184        "enum class ",
1185        "annotation class ",
1186    ],
1187    variables: &["val ", "var ", "private val ", "private var ", "const val "],
1188    imports: &["import "],
1189    // JUnit 4/5, KotlinTest, Kotest
1190    tests: &[
1191        "@Test",
1192        "@ParameterizedTest",
1193        "@RepeatedTest",
1194        "\"should ",
1195        "\"it ",
1196    ],
1197    assertions: &[
1198        "assertEquals(",
1199        "assertNotEquals(",
1200        "assertTrue(",
1201        "assertFalse(",
1202        "assertNull(",
1203        "assertNotNull(",
1204        "assertThat(",
1205        "assertThrows(",
1206        "shouldBe(",
1207        "shouldNotBe(",
1208        "shouldThrow(",
1209    ],
1210    test_suites: &[],
1211};
1212
1213const SP_SWIFT: SymbolPatterns = SymbolPatterns {
1214    functions: &[
1215        "func ",
1216        "private func ",
1217        "public func ",
1218        "internal func ",
1219        "override func ",
1220        "open func ",
1221        "static func ",
1222        "class func ",
1223        "mutating func ",
1224        "private static func ",
1225        "public static func ",
1226    ],
1227    functions_prefix_paren: &[],
1228    classes: &[
1229        "class ",
1230        "struct ",
1231        "protocol ",
1232        "enum ",
1233        "extension ",
1234        "actor ",
1235        "public class ",
1236        "private class ",
1237        "open class ",
1238        "final class ",
1239        "public struct ",
1240        "private struct ",
1241        "public protocol ",
1242    ],
1243    variables: &[
1244        "var ",
1245        "let ",
1246        "private var ",
1247        "private let ",
1248        "static var ",
1249        "static let ",
1250    ],
1251    imports: &["import "],
1252    // XCTest: test functions are named test* by convention; Swift Testing: @Test attribute
1253    tests: &["func test", "func Test", "@Test"],
1254    assertions: &[
1255        "XCTAssertEqual(",
1256        "XCTAssertNotEqual(",
1257        "XCTAssertTrue(",
1258        "XCTAssertFalse(",
1259        "XCTAssertNil(",
1260        "XCTAssertNotNil(",
1261        "XCTAssertGreaterThan(",
1262        "XCTAssertLessThan(",
1263        "XCTAssertThrowsError(",
1264        "XCTAssertNoThrow(",
1265        "#expect(",
1266    ],
1267    test_suites: &[],
1268};
1269
1270const SP_RUBY: SymbolPatterns = SymbolPatterns {
1271    functions: &["def ", "private def ", "protected def "],
1272    functions_prefix_paren: &[],
1273    classes: &["class ", "module "],
1274    variables: &[],
1275    imports: &["require ", "require_relative "],
1276    // RSpec / minitest
1277    tests: &["it ", "it(", "describe ", "context ", "test "],
1278    assertions: &[],
1279    test_suites: &[],
1280};
1281
1282const SP_SCALA: SymbolPatterns = SymbolPatterns {
1283    functions: &["def ", "private def ", "protected def ", "override def "],
1284    functions_prefix_paren: &[],
1285    classes: &[
1286        "class ",
1287        "case class ",
1288        "abstract class ",
1289        "sealed class ",
1290        "object ",
1291        "trait ",
1292    ],
1293    variables: &["val ", "var ", "lazy val "],
1294    imports: &["import "],
1295    // ScalaTest / MUnit: FunSuite test("..."), FlatSpec it("..."), AnyWordSpec "..." should
1296    tests: &["test(", "it(", "describe("],
1297    assertions: &[],
1298    test_suites: &[],
1299};
1300
1301const SP_PHP: SymbolPatterns = SymbolPatterns {
1302    functions: &[
1303        "function ",
1304        "public function ",
1305        "private function ",
1306        "protected function ",
1307        "static function ",
1308        "abstract function ",
1309        "final function ",
1310        "public static function ",
1311        "private static function ",
1312        "protected static function ",
1313    ],
1314    functions_prefix_paren: &[],
1315    classes: &[
1316        "class ",
1317        "abstract class ",
1318        "final class ",
1319        "interface ",
1320        "trait ",
1321        "enum ",
1322    ],
1323    variables: &[],
1324    imports: &[
1325        "use ",
1326        "require ",
1327        "require_once ",
1328        "include ",
1329        "include_once ",
1330    ],
1331    // PHPUnit: test methods start with test, or use @test annotation
1332    tests: &[
1333        "public function test",
1334        "function test",
1335        "#[Test]",
1336        "#[DataProvider(",
1337    ],
1338    assertions: &[],
1339    test_suites: &[],
1340};
1341
1342const SP_ELIXIR: SymbolPatterns = SymbolPatterns {
1343    functions: &[
1344        "def ",
1345        "defp ",
1346        "defmacro ",
1347        "defmacrop ",
1348        "defguard ",
1349        "defguardp ",
1350    ],
1351    functions_prefix_paren: &[],
1352    classes: &["defmodule ", "defprotocol ", "defimpl "],
1353    variables: &[],
1354    imports: &["import ", "alias ", "use ", "require "],
1355    // ExUnit
1356    tests: &["test ", "describe "],
1357    assertions: &[],
1358    test_suites: &[],
1359};
1360
1361const SP_ERLANG: SymbolPatterns = SymbolPatterns {
1362    functions: &[],
1363    functions_prefix_paren: &[],
1364    classes: &["-module("],
1365    variables: &[],
1366    imports: &["-import(", "-include(", "-include_lib("],
1367    tests: &[],
1368    assertions: &[],
1369    test_suites: &[],
1370};
1371
1372const SP_FSHARP: SymbolPatterns = SymbolPatterns {
1373    functions: &[
1374        "let ",
1375        "let rec ",
1376        "member ",
1377        "override ",
1378        "abstract member ",
1379    ],
1380    functions_prefix_paren: &[],
1381    classes: &["type "],
1382    variables: &["let mutable "],
1383    imports: &["open "],
1384    // NUnit / xUnit attributes on their own line; FsUnit uses [<Test>] / [<Fact>]
1385    tests: &["[<Test>]", "[<Fact>]", "[<Theory>]", "[<TestCase("],
1386    assertions: &[],
1387    test_suites: &[],
1388};
1389
1390const SP_GROOVY: SymbolPatterns = SymbolPatterns {
1391    functions: &["def ", "private def ", "public def ", "protected def "],
1392    functions_prefix_paren: &[],
1393    classes: &["class ", "abstract class ", "interface ", "enum ", "trait "],
1394    variables: &[],
1395    imports: &["import "],
1396    // Spock framework: feature methods; JUnit annotations
1397    tests: &["def \"", "@Test", "given:", "when:", "then:", "expect:"],
1398    assertions: &[],
1399    test_suites: &[],
1400};
1401
1402const SP_HASKELL: SymbolPatterns = SymbolPatterns {
1403    functions: &[],
1404    functions_prefix_paren: &[],
1405    classes: &["class ", "data ", "newtype ", "type "],
1406    variables: &[],
1407    imports: &["import "],
1408    tests: &[],
1409    assertions: &[],
1410    test_suites: &[],
1411};
1412
1413const SP_LUA: SymbolPatterns = SymbolPatterns {
1414    functions: &["function ", "local function "],
1415    functions_prefix_paren: &[],
1416    classes: &[],
1417    variables: &["local "],
1418    imports: &[],
1419    // busted test framework
1420    tests: &["it(", "describe(", "pending("],
1421    assertions: &[],
1422    test_suites: &[],
1423};
1424
1425const SP_NIM: SymbolPatterns = SymbolPatterns {
1426    functions: &[
1427        "proc ",
1428        "func ",
1429        "method ",
1430        "iterator ",
1431        "converter ",
1432        "template ",
1433        "macro ",
1434    ],
1435    functions_prefix_paren: &[],
1436    classes: &["type "],
1437    variables: &["var ", "let ", "const "],
1438    imports: &["import ", "from "],
1439    // unittest module
1440    tests: &["test "],
1441    assertions: &[],
1442    test_suites: &[],
1443};
1444
1445const SP_OBJECTIVEC: SymbolPatterns = SymbolPatterns {
1446    functions: &["- (", "+ ("],
1447    functions_prefix_paren: &[],
1448    classes: &["@interface ", "@implementation ", "@protocol "],
1449    variables: &[],
1450    imports: &["#import ", "#include "],
1451    // XCTest: test methods start with - (void)test
1452    tests: &["- (void)test"],
1453    assertions: &[
1454        "XCTAssertEqual(",
1455        "XCTAssertNotEqual(",
1456        "XCTAssertTrue(",
1457        "XCTAssertFalse(",
1458        "XCTAssertNil(",
1459        "XCTAssertNotNil(",
1460        "XCTAssertGreaterThan(",
1461        "XCTAssertLessThan(",
1462        "XCTAssertThrowsError(",
1463        "XCTAssertNoThrow(",
1464    ],
1465    test_suites: &[],
1466};
1467
1468const SP_OCAML: SymbolPatterns = SymbolPatterns {
1469    functions: &["let ", "let rec "],
1470    functions_prefix_paren: &[],
1471    classes: &["type ", "module ", "class "],
1472    variables: &[],
1473    imports: &["open "],
1474    tests: &[],
1475    assertions: &[],
1476    test_suites: &[],
1477};
1478
1479const SP_PERL: SymbolPatterns = SymbolPatterns {
1480    functions: &["sub "],
1481    functions_prefix_paren: &[],
1482    classes: &["package "],
1483    variables: &["my ", "our ", "local "],
1484    imports: &["use ", "require "],
1485    tests: &[],
1486    assertions: &[],
1487    test_suites: &[],
1488};
1489
1490const SP_CLOJURE: SymbolPatterns = SymbolPatterns {
1491    functions: &["(defn ", "(defn- ", "(defmacro ", "(defmulti "],
1492    functions_prefix_paren: &[],
1493    classes: &[
1494        "(defrecord ",
1495        "(defprotocol ",
1496        "(deftype ",
1497        "(definterface ",
1498    ],
1499    variables: &["(def ", "(defonce "],
1500    imports: &["(ns ", "(require "],
1501    // clojure.test
1502    tests: &["(deftest ", "(testing "],
1503    assertions: &[],
1504    test_suites: &[],
1505};
1506
1507const SP_JULIA: SymbolPatterns = SymbolPatterns {
1508    functions: &["function ", "macro "],
1509    functions_prefix_paren: &[],
1510    classes: &[
1511        "struct ",
1512        "mutable struct ",
1513        "abstract type ",
1514        "primitive type ",
1515    ],
1516    variables: &["const "],
1517    imports: &["import ", "using "],
1518    // Test.jl standard library
1519    tests: &["@test ", "@testset "],
1520    assertions: &[],
1521    test_suites: &[],
1522};
1523
1524const SP_DART: SymbolPatterns = SymbolPatterns {
1525    functions: &[],
1526    functions_prefix_paren: &[],
1527    classes: &["class ", "abstract class ", "mixin ", "extension ", "enum "],
1528    variables: &["var ", "final ", "const ", "late "],
1529    imports: &["import "],
1530    // flutter_test / test package
1531    tests: &["test(", "testWidgets(", "group("],
1532    assertions: &[],
1533    test_suites: &[],
1534};
1535
1536const SP_R: SymbolPatterns = SymbolPatterns {
1537    functions: &[],
1538    functions_prefix_paren: &[],
1539    classes: &[],
1540    variables: &[],
1541    imports: &["library(", "source("],
1542    // testthat
1543    tests: &["test_that(", "it(", "describe(", "expect_"],
1544    assertions: &[],
1545    test_suites: &[],
1546};
1547
1548const SP_SQL: SymbolPatterns = SymbolPatterns {
1549    functions: &[
1550        "create function ",
1551        "create or replace function ",
1552        "create procedure ",
1553        "create or replace procedure ",
1554        "CREATE FUNCTION ",
1555        "CREATE OR REPLACE FUNCTION ",
1556        "CREATE PROCEDURE ",
1557        "CREATE OR REPLACE PROCEDURE ",
1558    ],
1559    functions_prefix_paren: &[],
1560    classes: &[
1561        "create table ",
1562        "create view ",
1563        "create schema ",
1564        "CREATE TABLE ",
1565        "CREATE VIEW ",
1566        "CREATE SCHEMA ",
1567    ],
1568    variables: &["declare ", "DECLARE "],
1569    imports: &[],
1570    tests: &[],
1571    assertions: &[],
1572    test_suites: &[],
1573};
1574
1575const SP_ASSEMBLY: SymbolPatterns = SymbolPatterns {
1576    functions: &["proc ", "PROC "],
1577    functions_prefix_paren: &[],
1578    classes: &[],
1579    variables: &[],
1580    imports: &["include ", "INCLUDE ", "%include "],
1581    tests: &[],
1582    assertions: &[],
1583    test_suites: &[],
1584};
1585
1586const SP_ZIG: SymbolPatterns = SymbolPatterns {
1587    functions: &[
1588        "fn ",
1589        "pub fn ",
1590        "export fn ",
1591        "inline fn ",
1592        "pub inline fn ",
1593    ],
1594    functions_prefix_paren: &[],
1595    classes: &[],
1596    variables: &["var ", "pub var "],
1597    imports: &[],
1598    // Zig built-in test blocks
1599    tests: &["test \"", "test{"],
1600    assertions: &[],
1601    test_suites: &[],
1602};
1603
1604/// Static (non-heap) language scanning parameters.  All fields are `'static` so this struct
1605/// can be stored in a `static` array.  The dynamic `skip_lines` set (used only for Python
1606/// docstring detection) is kept in `ScanConfig` and populated by the caller after lookup.
1607#[allow(clippy::struct_excessive_bools)]
1608#[derive(Clone, Copy)]
1609struct StaticLangConfig {
1610    line_comments: &'static [&'static str],
1611    block_comment: Option<(&'static str, &'static str)>,
1612    allow_single_quote_strings: bool,
1613    allow_double_quote_strings: bool,
1614    allow_triple_quote_strings: bool,
1615    allow_csharp_verbatim_strings: bool,
1616    symbol_patterns: SymbolPatterns,
1617    /// `true` for C, C++, and Objective-C (languages that have a C preprocessor).
1618    has_preprocessor: bool,
1619}
1620
/// Per-scan configuration: the same comment, string, and symbol-pattern settings that
/// `StaticLangConfig` holds (except `has_preprocessor`), plus the heap-allocated `skip_lines`
/// set that cannot live in a `static` table.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
struct ScanConfig {
    /// Tokens that start a line comment.
    line_comments: &'static [&'static str],
    /// `(open, close)` delimiters of the block comment, if any.
    block_comment: Option<(&'static str, &'static str)>,
    /// Whether `'…'` is treated as a string literal.
    allow_single_quote_strings: bool,
    /// Whether `"…"` is treated as a string literal.
    allow_double_quote_strings: bool,
    /// Whether triple-quoted string literals are recognised.
    allow_triple_quote_strings: bool,
    /// Whether C# verbatim string literals are recognised.
    allow_csharp_verbatim_strings: bool,
    /// Dynamic set filled in by the caller after the static lookup; per the
    /// `StaticLangConfig` docs it is used only for Python docstring detection.
    /// NOTE(review): presumably line indices to skip — confirm the indexing base at the caller.
    skip_lines: HashSet<usize>,
    /// Patterns used to detect functions, classes, imports, tests, and similar symbols.
    symbol_patterns: SymbolPatterns,
}
1633
1634// ── Per-family base configurations ───────────────────────────────────────────
1635//
1636// Most languages share one of two comment styles.  Define a base `const` for
1637// each family; table entries override only the fields that differ (symbol
1638// patterns, preprocessor flag, verbatim-string flag, etc.).
1639//
1640// C-slash family: `//` line, `/* */` block, single + double quotes.
1641// Covers C, C++, Obj-C, C#, Go, Java, JS/TS/Svelte/Vue, Dart, Groovy, Kotlin,
1642// Scala, SCSS, Swift, Rust, and Zig (Zig has no block comment → overridden).
1643const C_SLASH_BASE: StaticLangConfig = StaticLangConfig {
1644    line_comments: &["//"],
1645    block_comment: Some(("/*", "*/")),
1646    allow_single_quote_strings: true,
1647    allow_double_quote_strings: true,
1648    allow_triple_quote_strings: false,
1649    allow_csharp_verbatim_strings: false,
1650    symbol_patterns: SP_NONE,
1651    has_preprocessor: false,
1652};
1653
1654// Hash-comment family: `#` line comment, no block comment, single + double
1655// quotes.  Covers Shell, Ruby, R, Perl, Elixir (each overrides only SP_*);
1656// Python overrides triple-quote; PowerShell and Nim override block_comment.
1657const HASH_BASE: StaticLangConfig = StaticLangConfig {
1658    line_comments: &["#"],
1659    block_comment: None,
1660    allow_single_quote_strings: true,
1661    allow_double_quote_strings: true,
1662    allow_triple_quote_strings: false,
1663    allow_csharp_verbatim_strings: false,
1664    symbol_patterns: SP_NONE,
1665    has_preprocessor: false,
1666};
1667
1668/// Static language-scan configuration table — one entry per supported language.
1669/// Used by `language_scan_config` to avoid a 41-arm match.  All `SP_*` constants
1670/// referenced here are defined above in the same module.
1671static LANG_SCAN_TABLE: &[(Language, StaticLangConfig)] = &[
1672    // ── C preprocessor family ─────────────────────────────────────────────────
1673    (
1674        Language::C,
1675        StaticLangConfig {
1676            symbol_patterns: SP_C,
1677            has_preprocessor: true,
1678            ..C_SLASH_BASE
1679        },
1680    ),
1681    (
1682        Language::Cpp,
1683        StaticLangConfig {
1684            symbol_patterns: SP_CPP,
1685            has_preprocessor: true,
1686            ..C_SLASH_BASE
1687        },
1688    ),
1689    (
1690        Language::ObjectiveC,
1691        StaticLangConfig {
1692            symbol_patterns: SP_OBJECTIVEC,
1693            has_preprocessor: true,
1694            ..C_SLASH_BASE
1695        },
1696    ),
1697    // ── C-slash family ────────────────────────────────────────────────────────
1698    (
1699        Language::CSharp,
1700        StaticLangConfig {
1701            symbol_patterns: SP_CSHARP,
1702            allow_csharp_verbatim_strings: true,
1703            ..C_SLASH_BASE
1704        },
1705    ),
1706    (
1707        Language::Go,
1708        StaticLangConfig {
1709            symbol_patterns: SP_GO,
1710            ..C_SLASH_BASE
1711        },
1712    ),
1713    (
1714        Language::Java,
1715        StaticLangConfig {
1716            symbol_patterns: SP_JAVA,
1717            ..C_SLASH_BASE
1718        },
1719    ),
1720    (
1721        Language::JavaScript,
1722        StaticLangConfig {
1723            symbol_patterns: SP_JS,
1724            ..C_SLASH_BASE
1725        },
1726    ),
1727    (
1728        Language::TypeScript,
1729        StaticLangConfig {
1730            symbol_patterns: SP_TS,
1731            ..C_SLASH_BASE
1732        },
1733    ),
1734    (
1735        Language::Svelte,
1736        StaticLangConfig {
1737            symbol_patterns: SP_JS,
1738            ..C_SLASH_BASE
1739        },
1740    ),
1741    (
1742        Language::Vue,
1743        StaticLangConfig {
1744            symbol_patterns: SP_JS,
1745            ..C_SLASH_BASE
1746        },
1747    ),
1748    (
1749        Language::Dart,
1750        StaticLangConfig {
1751            symbol_patterns: SP_DART,
1752            ..C_SLASH_BASE
1753        },
1754    ),
1755    (
1756        Language::Groovy,
1757        StaticLangConfig {
1758            symbol_patterns: SP_GROOVY,
1759            ..C_SLASH_BASE
1760        },
1761    ),
1762    (
1763        Language::Kotlin,
1764        StaticLangConfig {
1765            symbol_patterns: SP_KOTLIN,
1766            ..C_SLASH_BASE
1767        },
1768    ),
1769    (
1770        Language::Scala,
1771        StaticLangConfig {
1772            symbol_patterns: SP_SCALA,
1773            ..C_SLASH_BASE
1774        },
1775    ),
1776    (
1777        Language::Scss,
1778        StaticLangConfig {
1779            symbol_patterns: SP_NONE,
1780            ..C_SLASH_BASE
1781        },
1782    ),
1783    // Rust: no single-quote char literals (they're lifetime annotations)
1784    (
1785        Language::Rust,
1786        StaticLangConfig {
1787            symbol_patterns: SP_RUST,
1788            allow_single_quote_strings: false,
1789            ..C_SLASH_BASE
1790        },
1791    ),
1792    // Swift: no single-quote strings
1793    (
1794        Language::Swift,
1795        StaticLangConfig {
1796            symbol_patterns: SP_SWIFT,
1797            allow_single_quote_strings: false,
1798            ..C_SLASH_BASE
1799        },
1800    ),
1801    // Zig: no block comment
1802    (
1803        Language::Zig,
1804        StaticLangConfig {
1805            symbol_patterns: SP_ZIG,
1806            block_comment: None,
1807            ..C_SLASH_BASE
1808        },
1809    ),
1810    // F#: `(*` … `*)` block comment, no single-quote strings
1811    (
1812        Language::FSharp,
1813        StaticLangConfig {
1814            line_comments: &["//"],
1815            block_comment: Some(("(*", "*)")),
1816            allow_single_quote_strings: false,
1817            allow_double_quote_strings: true,
1818            symbol_patterns: SP_FSHARP,
1819            ..C_SLASH_BASE
1820        },
1821    ),
1822    // ── Hash-comment family ───────────────────────────────────────────────────
1823    (
1824        Language::Shell,
1825        StaticLangConfig {
1826            symbol_patterns: SP_SHELL,
1827            ..HASH_BASE
1828        },
1829    ),
1830    (
1831        Language::Elixir,
1832        StaticLangConfig {
1833            symbol_patterns: SP_ELIXIR,
1834            ..HASH_BASE
1835        },
1836    ),
1837    (
1838        Language::Perl,
1839        StaticLangConfig {
1840            symbol_patterns: SP_PERL,
1841            ..HASH_BASE
1842        },
1843    ),
1844    (
1845        Language::R,
1846        StaticLangConfig {
1847            symbol_patterns: SP_R,
1848            ..HASH_BASE
1849        },
1850    ),
1851    (
1852        Language::Ruby,
1853        StaticLangConfig {
1854            symbol_patterns: SP_RUBY,
1855            ..HASH_BASE
1856        },
1857    ),
1858    // Python: triple-quote string literals
1859    (
1860        Language::Python,
1861        StaticLangConfig {
1862            symbol_patterns: SP_PYTHON,
1863            allow_triple_quote_strings: true,
1864            ..HASH_BASE
1865        },
1866    ),
1867    // PowerShell: `<# … #>` block comment
1868    (
1869        Language::PowerShell,
1870        StaticLangConfig {
1871            symbol_patterns: SP_POWERSHELL,
1872            block_comment: Some(("<#", "#>")),
1873            ..HASH_BASE
1874        },
1875    ),
1876    // Nim: `#[` … `]#` block comment
1877    (
1878        Language::Nim,
1879        StaticLangConfig {
1880            symbol_patterns: SP_NIM,
1881            block_comment: Some(("#[", "]#")),
1882            ..HASH_BASE
1883        },
1884    ),
1885    // Makefile / Dockerfile: `#` only, no string literals
1886    (
1887        Language::Makefile,
1888        StaticLangConfig {
1889            symbol_patterns: SP_NONE,
1890            allow_single_quote_strings: false,
1891            allow_double_quote_strings: false,
1892            ..HASH_BASE
1893        },
1894    ),
1895    (
1896        Language::Dockerfile,
1897        StaticLangConfig {
1898            symbol_patterns: SP_NONE,
1899            allow_single_quote_strings: false,
1900            allow_double_quote_strings: false,
1901            ..HASH_BASE
1902        },
1903    ),
1904    // ── Other unique comment styles ───────────────────────────────────────────
1905    // CSS / SCSS: only `/* */` block, no line comment
1906    (
1907        Language::Css,
1908        StaticLangConfig {
1909            line_comments: &[],
1910            block_comment: Some(("/*", "*/")),
1911            symbol_patterns: SP_NONE,
1912            ..C_SLASH_BASE
1913        },
1914    ),
1915    // HTML / XML: `<!-- -->` block, no line comment, no string literals
1916    (
1917        Language::Html,
1918        StaticLangConfig {
1919            line_comments: &[],
1920            block_comment: Some(("<!--", "-->")),
1921            allow_single_quote_strings: false,
1922            allow_double_quote_strings: false,
1923            symbol_patterns: SP_NONE,
1924            ..C_SLASH_BASE
1925        },
1926    ),
1927    (
1928        Language::Xml,
1929        StaticLangConfig {
1930            line_comments: &[],
1931            block_comment: Some(("<!--", "-->")),
1932            allow_single_quote_strings: false,
1933            allow_double_quote_strings: false,
1934            symbol_patterns: SP_NONE,
1935            ..C_SLASH_BASE
1936        },
1937    ),
1938    // Lua: `--` line, `--[[ ]]` block
1939    (
1940        Language::Lua,
1941        StaticLangConfig {
1942            line_comments: &["--"],
1943            block_comment: Some(("--[[", "]]")),
1944            symbol_patterns: SP_LUA,
1945            ..C_SLASH_BASE
1946        },
1947    ),
1948    // Haskell: `--` line, `{- -}` block
1949    (
1950        Language::Haskell,
1951        StaticLangConfig {
1952            line_comments: &["--"],
1953            block_comment: Some(("{-", "-}")),
1954            symbol_patterns: SP_HASKELL,
1955            ..C_SLASH_BASE
1956        },
1957    ),
1958    // SQL: `--` line, `/* */` block, single quote only
1959    (
1960        Language::Sql,
1961        StaticLangConfig {
1962            line_comments: &["--"],
1963            block_comment: Some(("/*", "*/")),
1964            allow_single_quote_strings: true,
1965            allow_double_quote_strings: false,
1966            symbol_patterns: SP_SQL,
1967            ..C_SLASH_BASE
1968        },
1969    ),
1970    // OCaml: `(*` … `*)` only, no line comment, no single-quote strings
1971    (
1972        Language::Ocaml,
1973        StaticLangConfig {
1974            line_comments: &[],
1975            block_comment: Some(("(*", "*)")),
1976            allow_single_quote_strings: false,
1977            symbol_patterns: SP_OCAML,
1978            ..C_SLASH_BASE
1979        },
1980    ),
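    // Assembly / Clojure: `;` line comment, no block. Assembly disables both quote styles;
    // Clojure disables only single-quote strings and inherits the rest from `C_SLASH_BASE`.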
1982    (
1983        Language::Assembly,
1984        StaticLangConfig {
1985            line_comments: &[";"],
1986            block_comment: None,
1987            allow_single_quote_strings: false,
1988            allow_double_quote_strings: false,
1989            symbol_patterns: SP_ASSEMBLY,
1990            ..C_SLASH_BASE
1991        },
1992    ),
1993    (
1994        Language::Clojure,
1995        StaticLangConfig {
1996            line_comments: &[";"],
1997            block_comment: None,
1998            allow_single_quote_strings: false,
1999            symbol_patterns: SP_CLOJURE,
2000            ..C_SLASH_BASE
2001        },
2002    ),
2003    // Erlang: `%` line comment, no block, no single-quote strings
2004    (
2005        Language::Erlang,
2006        StaticLangConfig {
2007            line_comments: &["%"],
2008            block_comment: None,
2009            allow_single_quote_strings: false,
2010            symbol_patterns: SP_ERLANG,
2011            ..C_SLASH_BASE
2012        },
2013    ),
2014    // PHP: `//` or `#` line, `/* */` block
2015    (
2016        Language::Php,
2017        StaticLangConfig {
2018            line_comments: &["//", "#"],
2019            block_comment: Some(("/*", "*/")),
2020            symbol_patterns: SP_PHP,
2021            ..C_SLASH_BASE
2022        },
2023    ),
2024    // Julia: `#` line, `#= =#` block, double + triple quotes, no single
2025    (
2026        Language::Julia,
2027        StaticLangConfig {
2028            line_comments: &["#"],
2029            block_comment: Some(("#=", "=#")),
2030            allow_single_quote_strings: false,
2031            allow_triple_quote_strings: true,
2032            symbol_patterns: SP_JULIA,
2033            ..C_SLASH_BASE
2034        },
2035    ),
2036];
2037
/// IEEE 1045-1992 switches in effect for a single analysis call.
///
/// Combines settings mirrored from `AnalysisOptions` with one per-language property.
/// Crate-private; built inside `analyze_text`.
#[derive(Debug, Clone, Copy)]
struct IeeeFlags {
    /// Set for languages with a C preprocessor (C, C++, Objective-C); enables counting
    /// of compiler-directive lines.
    has_preprocessor_directives: bool,
    /// Copy of `AnalysisOptions::blank_in_block_comment_as_comment`: when set, a blank
    /// line inside a block comment is classified as a comment line instead of a blank one.
    blank_in_block_comment_as_comment: bool,
    /// Copy of `AnalysisOptions::collapse_continuation_lines`: when set, a line ending in
    /// a backslash is merged with the following line(s) before classification.
    collapse_continuation_lines: bool,
}
2049
/// Kind of string literal the lexer is currently inside.
#[derive(Debug, Clone, Copy)]
enum StringState {
    /// Ordinary quoted string; the payload is the closing quote character and a
    /// backslash escapes the next character.
    Single(char),
    /// Triple-quoted string; the payload is the three-character closing delimiter.
    Triple(&'static str),
    /// C#-style verbatim string (`@"..."`), where `""` is an escaped quote.
    VerbatimDouble,
}
2056
/// What was observed on one physical line (or on a merged continuation sequence).
// Four independent yes/no observations; separate bools keep each one directly addressable.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Default)]
struct LineFacts {
    /// At least one code character or string-literal character was seen.
    has_code: bool,
    /// A line-comment prefix was seen outside any string or block comment.
    has_single_comment: bool,
    /// Part of the line lies inside a block comment.
    has_multi_comment: bool,
    /// The line counts as a docstring; this wins over every other flag when classifying.
    has_docstring: bool,
}
2065
2066/// Process one character while the lexer is inside a string literal.
2067///
2068/// Returns `(new_string_state, advance)` where `advance` is the number of chars to skip.
2069fn process_string_char(
2070    state: StringState,
2071    chars: &[char],
2072    i: usize,
2073) -> (Option<StringState>, usize) {
2074    match state {
2075        StringState::Single(delim) => {
2076            if chars[i] == '\\' {
2077                return (Some(state), 2); // skip escaped character
2078            }
2079            if chars[i] == delim {
2080                (None, 1)
2081            } else {
2082                (Some(state), 1)
2083            }
2084        }
2085        StringState::Triple(delim) => {
2086            if starts_with(chars, i, delim) {
2087                (None, delim.len())
2088            } else {
2089                (Some(state), 1)
2090            }
2091        }
2092        StringState::VerbatimDouble => {
2093            if starts_with(chars, i, "\"\"") {
2094                return (Some(state), 2); // escaped quote-quote inside verbatim string
2095            }
2096            if chars[i] == '"' {
2097                (None, 1)
2098            } else {
2099                (Some(state), 1)
2100            }
2101        }
2102    }
2103}
2104
2105/// Process one character while the lexer is inside a block comment.
2106///
2107/// Returns `(still_in_block_comment, advance)`.
2108fn process_block_comment_char(chars: &[char], i: usize, close: &str) -> (bool, usize) {
2109    if starts_with(chars, i, close) {
2110        (false, close.len())
2111    } else {
2112        (true, 1)
2113    }
2114}
2115
2116/// Attempt to begin a new string literal at position `i`.
2117///
2118/// Returns `Some((new_state, advance))` when a string opener is detected, else `None`.
2119fn try_open_string(chars: &[char], i: usize, config: &ScanConfig) -> Option<(StringState, usize)> {
2120    if config.allow_csharp_verbatim_strings && starts_with(chars, i, "@\"") {
2121        return Some((StringState::VerbatimDouble, 2));
2122    }
2123    if config.allow_triple_quote_strings {
2124        if starts_with(chars, i, "\"\"\"") {
2125            return Some((StringState::Triple("\"\"\""), 3));
2126        }
2127        if starts_with(chars, i, "'''") {
2128            return Some((StringState::Triple("'''"), 3));
2129        }
2130    }
2131    if config.allow_single_quote_strings && chars[i] == '\'' {
2132        return Some((StringState::Single('\''), 1));
2133    }
2134    if config.allow_double_quote_strings && chars[i] == '"' {
2135        return Some((StringState::Single('"'), 1));
2136    }
2137    None
2138}
2139
2140/// Advance past one character position while inside a block comment.
2141///
2142/// Updates `in_block_comment` if the closing delimiter is found and returns the
2143/// number of characters consumed. Returns 0 when no block-comment config is set
2144/// (preserving the caller's `continue`-without-advance behaviour for that impossible state).
2145fn step_through_block_comment(
2146    chars: &[char],
2147    i: usize,
2148    block_comment: Option<(&'static str, &'static str)>,
2149    in_block_comment: &mut bool,
2150) -> usize {
2151    if let Some((_, close)) = block_comment {
2152        let (still_in, advance) = process_block_comment_char(chars, i, close);
2153        *in_block_comment = still_in;
2154        return advance;
2155    }
2156    0
2157}
2158
2159/// If the character at `i` starts a block comment, return the length of the opening
2160/// delimiter so the caller can advance past it. Returns `None` if no match.
2161fn try_open_block_comment(
2162    chars: &[char],
2163    i: usize,
2164    block_comment: Option<(&'static str, &'static str)>,
2165) -> Option<usize> {
2166    let (open, _) = block_comment?;
2167    starts_with(chars, i, open).then_some(open.len())
2168}
2169
2170/// Scan a single physical line and update `facts`, `in_block_comment`, and `string_state`.
2171///
2172/// Returns `true` when the caller should break out of the per-line loop early (line comment hit).
2173fn scan_line(
2174    chars: &[char],
2175    config: &ScanConfig,
2176    facts: &mut LineFacts,
2177    in_block_comment: &mut bool,
2178    string_state: &mut Option<StringState>,
2179) {
2180    let mut i = 0usize;
2181    while i < chars.len() {
2182        // Inside a string literal — advance until the closing delimiter.
2183        if let Some(state) = *string_state {
2184            facts.has_code = true;
2185            let (new_state, advance) = process_string_char(state, chars, i);
2186            *string_state = new_state;
2187            i += advance;
2188            continue;
2189        }
2190
2191        // Inside a block comment — advance until the closing delimiter.
2192        if *in_block_comment {
2193            facts.has_multi_comment = true;
2194            i += step_through_block_comment(chars, i, config.block_comment, in_block_comment);
2195            continue;
2196        }
2197
2198        // Whitespace outside any string/comment — skip.
2199        if chars[i].is_whitespace() {
2200            i += 1;
2201            continue;
2202        }
2203
2204        // Attempt to open a string literal.
2205        if let Some((new_state, advance)) = try_open_string(chars, i, config) {
2206            facts.has_code = true;
2207            *string_state = Some(new_state);
2208            i += advance;
2209            continue;
2210        }
2211
2212        // Attempt to open a block comment.
2213        if let Some(advance) = try_open_block_comment(chars, i, config.block_comment) {
2214            facts.has_multi_comment = true;
2215            *in_block_comment = true;
2216            i += advance;
2217            continue;
2218        }
2219
2220        // Line comment — rest of the line is a comment; stop scanning.
2221        if config
2222            .line_comments
2223            .iter()
2224            .any(|prefix| starts_with(chars, i, prefix))
2225        {
2226            facts.has_single_comment = true;
2227            break;
2228        }
2229
2230        // Plain code character.
2231        facts.has_code = true;
2232        i += 1;
2233    }
2234}
2235
2236/// Apply IEEE 1045-1992 §4.2 preprocessor-directive tracking and continuation-line merging,
2237/// then emit the finalized `LineFacts` for this physical line.
2238///
2239/// Returns `None` when the line is part of a continuation sequence and should be deferred.
2240fn finalize_line_facts(
2241    facts: LineFacts,
2242    trimmed: &str,
2243    raw: &mut RawLineCounts,
2244    ieee: IeeeFlags,
2245    in_block_comment: bool,
2246    string_state: Option<StringState>,
2247    pending_continuation: &mut Option<LineFacts>,
2248) -> Option<LineFacts> {
2249    // IEEE 1045-1992 §4.2: track preprocessor/compiler directive lines (C/C++/ObjC).
2250    // A directive line is a pure code line (no comment on the same physical line) whose
2251    // trimmed content starts with '#'.
2252    if ieee.has_preprocessor_directives
2253        && facts.has_code
2254        && !facts.has_single_comment
2255        && !facts.has_multi_comment
2256        && trimmed.starts_with('#')
2257    {
2258        raw.compiler_directive_lines += 1;
2259    }
2260
2261    // IEEE 1045-1992 continuation-line handling.
2262    // A line is a continuation starter when it ends with '\' outside any comment or string.
2263    let is_continuation = ieee.collapse_continuation_lines
2264        && !in_block_comment
2265        && string_state.is_none()
2266        && trimmed.ends_with('\\');
2267
2268    if is_continuation {
2269        let pending = pending_continuation.get_or_insert_with(LineFacts::default);
2270        pending.has_code |= facts.has_code;
2271        pending.has_single_comment |= facts.has_single_comment;
2272        pending.has_multi_comment |= facts.has_multi_comment;
2273        pending.has_docstring |= facts.has_docstring;
2274        return None; // defer classification until the sequence ends
2275    }
2276
2277    // Merge any accumulated continuation facts into the final line.
2278    let emit = if let Some(pending) = pending_continuation.take() {
2279        LineFacts {
2280            has_code: pending.has_code | facts.has_code,
2281            has_single_comment: pending.has_single_comment | facts.has_single_comment,
2282            has_multi_comment: pending.has_multi_comment | facts.has_multi_comment,
2283            has_docstring: pending.has_docstring | facts.has_docstring,
2284        }
2285    } else {
2286        facts
2287    };
2288    Some(emit)
2289}
2290
2291/// Scan and classify one physical line, updating all running state in place.
2292///
2293/// Pre-classified lines (present in `config.skip_lines`) are counted as docstring-comment
2294/// lines and returned early without further analysis.
2295#[allow(clippy::needless_pass_by_value)]
2296#[allow(clippy::too_many_arguments)]
2297#[allow(clippy::many_single_char_names)] // destructuring return from count_symbols; names match field roles
2298fn process_physical_line(
2299    line: &str,
2300    line_idx: usize,
2301    config: &ScanConfig,
2302    raw: &mut RawLineCounts,
2303    in_block_comment: &mut bool,
2304    string_state: &mut Option<StringState>,
2305    pending_continuation: &mut Option<LineFacts>,
2306    ieee: IeeeFlags,
2307) {
2308    raw.total_physical_lines += 1;
2309
2310    if config.skip_lines.contains(&line_idx) {
2311        raw.docstring_comment_lines += 1;
2312        return;
2313    }
2314
2315    let trimmed = line.trim();
2316    let mut facts = LineFacts::default();
2317
2318    // IEEE 1045-1992: blank lines inside block comments are comment lines by default.
2319    // When blank_in_block_comment_as_comment is false, blank lines keep their blank
2320    // classification even while inside a block comment.
2321    if *in_block_comment && (ieee.blank_in_block_comment_as_comment || !trimmed.is_empty()) {
2322        facts.has_multi_comment = true;
2323    }
2324
2325    let chars: Vec<char> = line.chars().collect();
2326    scan_line(&chars, config, &mut facts, in_block_comment, string_state);
2327
2328    let Some(emit) = finalize_line_facts(
2329        facts,
2330        trimmed,
2331        raw,
2332        ieee,
2333        *in_block_comment,
2334        *string_state,
2335        pending_continuation,
2336    ) else {
2337        return;
2338    };
2339
2340    classify_line(raw, &emit, trimmed);
2341
2342    if emit.has_code {
2343        let (f, c, v, i, t, a, s) = count_symbols(&config.symbol_patterns, trimmed);
2344        raw.functions += f;
2345        raw.classes += c;
2346        raw.variables += v;
2347        raw.imports += i;
2348        raw.test_count += t;
2349        raw.test_assertion_count += a;
2350        raw.test_suite_count += s;
2351    }
2352}
2353
2354#[allow(clippy::needless_pass_by_value)]
2355fn analyze_generic(text: &str, config: ScanConfig, ieee: IeeeFlags) -> RawFileAnalysis {
2356    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
2357    let lines: Vec<&str> = normalized.split_terminator('\n').collect();
2358
2359    let mut raw = RawLineCounts::default();
2360    let mut warnings = Vec::new();
2361
2362    let mut in_block_comment = false;
2363    let mut string_state: Option<StringState> = None;
2364    // IEEE continuation-line state: accumulates facts across a backslash-continued sequence.
2365    let mut pending_continuation: Option<LineFacts> = None;
2366
2367    for (line_idx, line) in lines.iter().enumerate() {
2368        process_physical_line(
2369            line,
2370            line_idx,
2371            &config,
2372            &mut raw,
2373            &mut in_block_comment,
2374            &mut string_state,
2375            &mut pending_continuation,
2376            ieee,
2377        );
2378    }
2379
2380    // Flush any pending continuation that reaches end-of-file without a closing line.
2381    if let Some(pending) = pending_continuation.take() {
2382        classify_line(&mut raw, &pending, "");
2383    }
2384
2385    if in_block_comment {
2386        warnings.push("unclosed block comment detected; result is best effort".into());
2387    }
2388    if string_state.is_some() {
2389        warnings.push("unclosed string literal detected; result is best effort".into());
2390    }
2391
2392    RawFileAnalysis {
2393        raw,
2394        parse_mode: if warnings.is_empty() {
2395            ParseMode::Lexical
2396        } else {
2397            ParseMode::LexicalBestEffort
2398        },
2399        warnings,
2400        style_analysis: None,
2401    }
2402}
2403
2404const fn classify_line(raw: &mut RawLineCounts, facts: &LineFacts, trimmed: &str) {
2405    if facts.has_docstring {
2406        raw.docstring_comment_lines += 1;
2407    } else if !facts.has_code
2408        && !facts.has_single_comment
2409        && !facts.has_multi_comment
2410        && trimmed.is_empty()
2411    {
2412        raw.blank_only_lines += 1;
2413    } else if facts.has_code && facts.has_single_comment {
2414        raw.mixed_code_single_comment_lines += 1;
2415    } else if facts.has_code && facts.has_multi_comment {
2416        raw.mixed_code_multi_comment_lines += 1;
2417    } else if facts.has_code {
2418        raw.code_only_lines += 1;
2419    } else if facts.has_single_comment {
2420        raw.single_comment_only_lines += 1;
2421    } else if facts.has_multi_comment {
2422        raw.multi_comment_only_lines += 1;
2423    } else if trimmed.is_empty() {
2424        raw.blank_only_lines += 1;
2425    } else {
2426        raw.skipped_unknown_lines += 1;
2427    }
2428}
2429
2430fn count_symbols(patterns: &SymbolPatterns, trimmed: &str) -> (u64, u64, u64, u64, u64, u64, u64) {
2431    let hit = |pats: &[&str]| u64::from(pats.iter().any(|p| trimmed.starts_with(p)));
2432    // For return-type-led languages (C/C++): match prefix AND `(` present AND no `=` sits
2433    // between the prefix start and the first `(` (guards against `void* p = malloc(n)`).
2434    let fn_pp = if patterns.functions_prefix_paren.is_empty() {
2435        0
2436    } else if let Some(paren_pos) = trimmed.find('(') {
2437        if trimmed[..paren_pos].contains('=') {
2438            0
2439        } else {
2440            hit(patterns.functions_prefix_paren)
2441        }
2442    } else {
2443        0
2444    };
2445    let test_hit = hit(patterns.tests);
2446    // Lines matching a test pattern count as tests, not as plain functions or classes.
2447    // This prevents double-counting in Python (`def test_` / `class Test`) and Go
2448    // (`func Test` / `func Benchmark` / `func Fuzz`) where the same line satisfies both
2449    // a function/class prefix and a test pattern. Rust is unaffected: `#[test]` is a
2450    // standalone attribute line; the `fn` declaration on the next line does not match any
2451    // test pattern and still increments functions correctly.
2452    let fn_hit = if test_hit == 0 {
2453        hit(patterns.functions) | fn_pp
2454    } else {
2455        0
2456    };
2457    let class_hit = if test_hit == 0 {
2458        hit(patterns.classes)
2459    } else {
2460        0
2461    };
2462    (
2463        fn_hit,
2464        class_hit,
2465        hit(patterns.variables),
2466        hit(patterns.imports),
2467        test_hit,
2468        hit(patterns.assertions),
2469        hit(patterns.test_suites),
2470    )
2471}
2472
/// Report whether `needle` occurs in `chars` starting exactly at `index`.
///
/// Returns `false` when `index` is past the end of `chars` or when the needle would
/// run past the end. An empty needle matches at every position up to and including
/// `chars.len()`.
///
/// The comparison walks both sequences in lockstep and allocates nothing; the scanner
/// calls this for every character of every line.
fn starts_with(chars: &[char], index: usize, needle: &str) -> bool {
    let Some(tail) = chars.get(index..) else {
        return false;
    };
    let mut haystack = tail.iter().copied();
    // `next()` yields `None` once the haystack runs out, which fails the comparison.
    needle
        .chars()
        .all(|expected| haystack.next() == Some(expected))
}
2477
/// One entry of the indentation-scope stack used while looking for Python docstrings.
#[derive(Debug, Clone)]
struct PyContext {
    /// Indentation (leading whitespace characters) of the scope's body.
    indent: usize,
    /// True until the first significant line of the scope has been examined; only that
    /// line can be the scope's docstring.
    expect_docstring: bool,
}
2483
2484/// Update `contexts` to pop any scopes that the current `indent` has outdented past.
2485fn py_pop_outdented_contexts(contexts: &mut Vec<PyContext>, indent: usize) {
2486    while contexts.len() > 1 && indent < contexts.last().map_or(0, |c| c.indent) {
2487        contexts.pop();
2488    }
2489}
2490
2491/// Handle `pending_block_indent` transition: push a new docstring-expecting context when we
2492/// detect the first indented line of a new block, or cancel the pending state otherwise.
2493fn py_handle_pending_indent(
2494    pending_block_indent: &mut Option<usize>,
2495    contexts: &mut Vec<PyContext>,
2496    indent: usize,
2497    trimmed: &str,
2498) {
2499    let Some(base_indent) = *pending_block_indent else {
2500        return;
2501    };
2502    if indent > base_indent {
2503        contexts.push(PyContext {
2504            indent,
2505            expect_docstring: true,
2506        });
2507        *pending_block_indent = None;
2508    } else if !trimmed.starts_with('@') {
2509        *pending_block_indent = None;
2510    }
2511}
2512
2513/// Check whether the current line is a docstring opener in the current context.
2514///
2515/// If it is, records the line, adjusts `ctx.expect_docstring`, and optionally sets
2516/// `active_docstring` for multi-line docstrings. Returns `true` when the caller should
2517/// `continue` to the next line.
2518fn py_try_record_docstring(
2519    ctx: &mut PyContext,
2520    trimmed: &str,
2521    idx: usize,
2522    docstring_lines: &mut HashSet<usize>,
2523    active_docstring: &mut Option<(&'static str, usize)>,
2524) -> bool {
2525    if !ctx.expect_docstring {
2526        return false;
2527    }
2528    if let Some(delim) = docstring_delimiter(trimmed) {
2529        docstring_lines.insert(idx);
2530        ctx.expect_docstring = false;
2531        if !closes_triple_docstring(trimmed, delim, true) {
2532            *active_docstring = Some((delim, idx));
2533        }
2534        return true;
2535    }
2536    ctx.expect_docstring = false;
2537    false
2538}
2539
2540/// Advance through an active multi-line docstring: marks the current line and clears
2541/// `active_docstring` when the closing delimiter is found. Returns `true` when the caller
2542/// should `continue` to the next line (i.e. we were inside a docstring).
2543fn track_active_docstring(
2544    active_docstring: &mut Option<(&'static str, usize)>,
2545    docstring_lines: &mut HashSet<usize>,
2546    idx: usize,
2547    trimmed: &str,
2548) -> bool {
2549    let Some((delim, start_line)) = *active_docstring else {
2550        return false;
2551    };
2552    docstring_lines.insert(idx);
2553    if closes_triple_docstring(trimmed, delim, idx == start_line) {
2554        *active_docstring = None;
2555    }
2556    true
2557}
2558
2559/// Attempt to record a docstring opener using the top of the context stack.
2560/// Returns `true` when the caller should `continue` to the next line.
2561fn try_record_docstring_if_context(
2562    contexts: &mut [PyContext],
2563    trimmed: &str,
2564    idx: usize,
2565    docstring_lines: &mut HashSet<usize>,
2566    active_docstring: &mut Option<(&'static str, usize)>,
2567) -> bool {
2568    let Some(ctx) = contexts.last_mut() else {
2569        return false;
2570    };
2571    py_try_record_docstring(ctx, trimmed, idx, docstring_lines, active_docstring)
2572}
2573
/// Mark every line from the opening line of a still-open docstring to end-of-file.
///
/// Does nothing when `active_docstring` is `None`. `num_lines` is the total line count;
/// indices `start_line..num_lines` are added to `docstring_lines`.
fn mark_unclosed_docstring_lines(
    active_docstring: Option<&(&'static str, usize)>,
    docstring_lines: &mut HashSet<usize>,
    num_lines: usize,
) {
    let Some(&(_, start_line)) = active_docstring else {
        return;
    };
    docstring_lines.extend(start_line..num_lines);
}
2586
2587fn detect_python_docstring_lines(text: &str) -> HashSet<usize> {
2588    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
2589    let lines: Vec<&str> = normalized.split_terminator('\n').collect();
2590
2591    let mut docstring_lines = HashSet::new();
2592    let mut contexts = vec![PyContext {
2593        indent: 0,
2594        expect_docstring: true,
2595    }];
2596    let mut pending_block_indent: Option<usize> = None;
2597    let mut active_docstring: Option<(&'static str, usize)> = None;
2598
2599    for (idx, line) in lines.iter().enumerate() {
2600        let trimmed = line.trim();
2601        let indent = leading_indent(line);
2602
2603        if track_active_docstring(&mut active_docstring, &mut docstring_lines, idx, trimmed) {
2604            continue;
2605        }
2606
2607        // Blank lines and comment lines don't affect docstring detection.
2608        if trimmed.is_empty() || trimmed.starts_with('#') {
2609            continue;
2610        }
2611
2612        py_pop_outdented_contexts(&mut contexts, indent);
2613        py_handle_pending_indent(&mut pending_block_indent, &mut contexts, indent, trimmed);
2614
2615        if try_record_docstring_if_context(
2616            &mut contexts,
2617            trimmed,
2618            idx,
2619            &mut docstring_lines,
2620            &mut active_docstring,
2621        ) {
2622            continue;
2623        }
2624
2625        if is_python_block_header(trimmed) {
2626            pending_block_indent = Some(indent);
2627        }
2628    }
2629
2630    mark_unclosed_docstring_lines(active_docstring.as_ref(), &mut docstring_lines, lines.len());
2631
2632    docstring_lines
2633}
2634
/// Count the whitespace characters at the start of `line`.
///
/// Each whitespace character counts as one, so a tab is one unit, not a tab stop.
fn leading_indent(line: &str) -> usize {
    let mut width = 0usize;
    for ch in line.chars() {
        if !ch.is_whitespace() {
            break;
        }
        width += 1;
    }
    width
}
2638
/// Report whether a trimmed line is a `def`, `async def`, or `class` header that ends
/// with `:` on the same line.
fn is_python_block_header(trimmed: &str) -> bool {
    const HEADER_KEYWORDS: [&str; 3] = ["def ", "async def ", "class "];
    trimmed.ends_with(':')
        && HEADER_KEYWORDS
            .iter()
            .any(|keyword| trimmed.starts_with(*keyword))
}
2645
/// Return the triple-quote delimiter that `trimmed` starts with, if any.
///
/// Any run of string-prefix letters (`r`, `u`, `b`, `f`, in either case) before the
/// quotes is skipped first. `"""` is checked before `'''`.
fn docstring_delimiter(trimmed: &str) -> Option<&'static str> {
    let after_prefix = trimmed
        .trim_start_matches(|c: char| matches!(c, 'r' | 'R' | 'u' | 'U' | 'b' | 'B' | 'f' | 'F'));

    ["\"\"\"", "'''"]
        .iter()
        .copied()
        .find(|fence| after_prefix.starts_with(*fence))
}
2667
/// Report whether a docstring delimited by `delim` is closed on this line.
///
/// On the line that opened the docstring (`same_line_as_start`), the first occurrence
/// of `delim` is the opener, so two non-overlapping occurrences are required. On any
/// later line, one occurrence is enough.
fn closes_triple_docstring(trimmed: &str, delim: &str, same_line_as_start: bool) -> bool {
    let required = if same_line_as_start { 2 } else { 1 };
    // Stop counting as soon as enough delimiters have been found.
    trimmed.matches(delim).take(required).count() == required
}
2682
2683/// Tree-sitter-backed adapters (compiled only when the `tree-sitter` feature is enabled).
2684///
2685/// When parsing succeeds the result is used directly; on any failure the caller falls back
2686/// to the lexical state machine.
2687#[cfg(feature = "tree-sitter")]
2688pub mod ts {
2689    use tree_sitter::Node;
2690
2691    use super::{ParseMode, RawFileAnalysis, RawLineCounts};
2692
2693    /// Configuration for which AST node kinds map to symbols in this grammar.
2694    struct SymbolKinds {
2695        /// Node kind name for function definitions (e.g. `"function_definition"`).
2696        function_def: &'static str,
2697        /// Node kind name for class definitions (e.g. `"class_definition"`).
2698        class_def: &'static str,
2699        /// Name field of a function node that, when it starts with this prefix, marks a test.
2700        /// Empty string disables test-prefix detection.
2701        test_fn_prefix: &'static str,
2702        /// Name field of a class node that, when it starts with this prefix, marks a test.
2703        /// Empty string disables test-prefix detection.
2704        test_class_prefix: &'static str,
2705        /// When non-empty, `call` nodes whose `function` is an `attribute` access and whose
2706        /// attribute identifier starts with this prefix are counted as test assertions.
2707        /// Used for Python `self.assertXxx(...)` detection.
2708        assertion_attr_prefix: &'static str,
2709    }
2710
2711    impl SymbolKinds {
2712        const fn none() -> Self {
2713            Self {
2714                function_def: "",
2715                class_def: "",
2716                test_fn_prefix: "",
2717                test_class_prefix: "",
2718                assertion_attr_prefix: "",
2719            }
2720        }
2721    }
2722
2723    /// Classify every line of `text` using a tree-sitter grammar.
2724    ///
2725    /// `comment_node_kinds` — node type names that represent comments in this grammar
2726    /// `docstring_stmt_kind` — optional parent node type whose direct `string` child is a docstring
2727    /// `symbols` — AST node kinds used to populate symbol counters
2728    fn analyze_lines(
2729        text: &str,
2730        ts_language: &tree_sitter::Language,
2731        comment_node_kinds: &[&str],
2732        docstring_stmt_kind: Option<&str>,
2733        symbols: &SymbolKinds,
2734    ) -> Option<RawFileAnalysis> {
2735        let mut parser = tree_sitter::Parser::new();
2736        parser.set_language(ts_language).ok()?;
2737        let tree = parser.parse(text, None)?;
2738
2739        let lines: Vec<&str> = text.split_terminator('\n').collect();
2740        let n = lines.len();
2741
2742        let mut has_code = vec![false; n];
2743        let mut has_comment = vec![false; n];
2744        let mut comment_is_block = vec![false; n];
2745        let mut has_docstring = vec![false; n];
2746
2747        // Walk every node in the tree and mark line arrays.
2748        let mut ctx = VisitCtx {
2749            source: text.as_bytes(),
2750            comment_kinds: comment_node_kinds,
2751            docstring_stmt_kind,
2752            has_code: &mut has_code,
2753            has_comment: &mut has_comment,
2754            comment_is_block: &mut comment_is_block,
2755            has_docstring: &mut has_docstring,
2756        };
2757        visit(tree.root_node(), &mut ctx);
2758
2759        let mut raw = RawLineCounts::default();
2760        classify_ts_lines(
2761            &lines,
2762            &has_code,
2763            &has_comment,
2764            &comment_is_block,
2765            &has_docstring,
2766            &mut raw,
2767        );
2768
2769        // Symbol counting: walk the AST a second time to collect function/class/test counts.
2770        if !symbols.function_def.is_empty() || !symbols.class_def.is_empty() {
2771            count_symbols(tree.root_node(), text.as_bytes(), symbols, &mut raw);
2772        }
2773
2774        Some(RawFileAnalysis {
2775            raw,
2776            parse_mode: ParseMode::TreeSitter,
2777            warnings: Vec::new(),
2778        })
2779    }
2780
2781    /// Recurse into every direct child of `node`.
2782    fn recurse_children(node: Node, source: &[u8], kinds: &SymbolKinds, raw: &mut RawLineCounts) {
2783        for i in 0..node.child_count() {
2784            #[allow(clippy::cast_possible_truncation)]
2785            if let Some(child) = node.child(i as u32) {
2786                count_symbols(child, source, kinds, raw);
2787            }
2788        }
2789    }
2790
2791    /// Handle a function-definition node. Returns `true` if the node matched.
2792    fn try_count_function(
2793        node: Node,
2794        source: &[u8],
2795        kinds: &SymbolKinds,
2796        raw: &mut RawLineCounts,
2797    ) -> bool {
2798        if kinds.function_def.is_empty() || node.kind() != kinds.function_def {
2799            return false;
2800        }
2801        let name = node
2802            .child_by_field_name("name")
2803            .and_then(|n| n.utf8_text(source).ok())
2804            .unwrap_or("");
2805        if !kinds.test_fn_prefix.is_empty() && name.starts_with(kinds.test_fn_prefix) {
2806            raw.test_count += 1;
2807        } else {
2808            raw.functions += 1;
2809        }
2810        recurse_children(node, source, kinds, raw);
2811        true
2812    }
2813
2814    /// Handle a class-definition node. Returns `true` if the node matched.
2815    fn try_count_class(
2816        node: Node,
2817        source: &[u8],
2818        kinds: &SymbolKinds,
2819        raw: &mut RawLineCounts,
2820    ) -> bool {
2821        if kinds.class_def.is_empty() || node.kind() != kinds.class_def {
2822            return false;
2823        }
2824        let name = node
2825            .child_by_field_name("name")
2826            .and_then(|n| n.utf8_text(source).ok())
2827            .unwrap_or("");
2828        if !kinds.test_class_prefix.is_empty() && name.starts_with(kinds.test_class_prefix) {
2829            raw.test_count += 1;
2830        } else {
2831            raw.classes += 1;
2832        }
2833        recurse_children(node, source, kinds, raw);
2834        true
2835    }
2836
2837    /// Handle an assertion call node. Returns `true` if the node matched (skips recursion
2838    /// into arguments, preserving "don't double-count test bodies" semantics).
2839    fn try_count_assertion(
2840        node: Node,
2841        source: &[u8],
2842        kinds: &SymbolKinds,
2843        raw: &mut RawLineCounts,
2844    ) -> bool {
2845        if kinds.assertion_attr_prefix.is_empty() || node.kind() != "call" {
2846            return false;
2847        }
2848        let Some(func) = node.child_by_field_name("function") else {
2849            return false;
2850        };
2851        if func.kind() != "attribute" {
2852            return false;
2853        }
2854        let attr_text = func
2855            .child_by_field_name("attribute")
2856            .and_then(|n| n.utf8_text(source).ok())
2857            .unwrap_or("");
2858        if !attr_text.starts_with(kinds.assertion_attr_prefix) {
2859            return false;
2860        }
2861        raw.test_assertion_count += 1;
2862        true
2863    }
2864
2865    /// Walk the AST and populate `raw.functions`, `raw.classes`, `raw.test_count`,
2866    /// and `raw.test_assertion_count`.
2867    fn count_symbols(node: Node, source: &[u8], kinds: &SymbolKinds, raw: &mut RawLineCounts) {
2868        if try_count_function(node, source, kinds, raw) {
2869            return;
2870        }
2871        if try_count_class(node, source, kinds, raw) {
2872            return;
2873        }
2874        if try_count_assertion(node, source, kinds, raw) {
2875            return;
2876        }
2877        recurse_children(node, source, kinds, raw);
2878    }
2879
    /// Flags describing what kinds of content appear on a single line.
    ///
    /// One value is built per physical line and handed to `classify_ts_line`,
    /// which turns the combination of flags into exactly one line category.
    // Four bools are the natural representation for these four independent properties.
    #[allow(clippy::struct_excessive_bools)]
    #[derive(Clone, Copy)]
    struct TsLineFlags {
        /// The line carries code.
        has_code: bool,
        /// The line carries (part of) a comment.
        has_comment: bool,
        /// The comment on this line is a block comment rather than a line comment.
        /// Only consulted when `has_comment` is set.
        comment_is_block: bool,
        /// The line carries (part of) a docstring. It is counted as a docstring
        /// line only when `has_code` is not set.
        has_docstring: bool,
    }
2890
2891    /// Classify a single tree-sitter-annotated line and accumulate into `raw`.
2892    const fn classify_ts_line(trimmed: &str, flags: TsLineFlags, raw: &mut RawLineCounts) {
2893        if trimmed.is_empty() {
2894            raw.blank_only_lines += 1;
2895        } else if flags.has_docstring && !flags.has_code {
2896            raw.docstring_comment_lines += 1;
2897        } else if flags.has_code && flags.has_comment {
2898            // Classify the mixed line as single or multi based on what kind of comment is on it.
2899            if flags.comment_is_block {
2900                raw.mixed_code_multi_comment_lines += 1;
2901            } else {
2902                raw.mixed_code_single_comment_lines += 1;
2903            }
2904        } else if flags.has_comment {
2905            if flags.comment_is_block {
2906                raw.multi_comment_only_lines += 1;
2907            } else {
2908                raw.single_comment_only_lines += 1;
2909            }
2910        } else {
2911            raw.code_only_lines += 1;
2912        }
2913    }
2914
2915    /// Classify each tree-sitter-annotated line and accumulate counts into `raw`.
2916    fn classify_ts_lines(
2917        lines: &[&str],
2918        has_code: &[bool],
2919        has_comment: &[bool],
2920        comment_is_block: &[bool],
2921        has_docstring: &[bool],
2922        raw: &mut RawLineCounts,
2923    ) {
2924        for i in 0..lines.len() {
2925            raw.total_physical_lines += 1;
2926            classify_ts_line(
2927                lines[i].trim(),
2928                TsLineFlags {
2929                    has_code: has_code[i],
2930                    has_comment: has_comment[i],
2931                    comment_is_block: comment_is_block[i],
2932                    has_docstring: has_docstring[i],
2933                },
2934                raw,
2935            );
2936        }
2937    }
2938
    /// State threaded through `visit` while walking a parsed syntax tree.
    ///
    /// The four boolean tables are indexed by zero-based source row. The visit
    /// helpers only ever set entries to `true`, and they skip rows that lie
    /// beyond the end of the table they bounds-check against.
    struct VisitCtx<'a> {
        /// Bytes of the parsed source; comment nodes read their text from here.
        source: &'a [u8],
        /// Node kinds that are treated as comments.
        comment_kinds: &'a [&'a str],
        /// Statement kind that may wrap a docstring, or `None` to disable
        /// docstring detection.
        docstring_stmt_kind: Option<&'a str>,
        /// Rows covered by a leaf node that is neither a comment nor an extra.
        has_code: &'a mut Vec<bool>,
        /// Rows covered by a comment node.
        has_comment: &'a mut Vec<bool>,
        /// Rows covered by a comment whose text starts with `/*` or `<#`.
        comment_is_block: &'a mut Vec<bool>,
        /// Rows covered by a string literal recognised as a docstring.
        has_docstring: &'a mut Vec<bool>,
    }
2948
2949    /// Mark all rows of a comment node and detect whether it is a block comment.
2950    fn visit_comment_node(node: Node, ctx: &mut VisitCtx<'_>) {
2951        let start_row = node.start_position().row;
2952        let end_row = node.end_position().row;
2953        let first_two = node
2954            .utf8_text(ctx.source)
2955            .unwrap_or("")
2956            .get(..2)
2957            .unwrap_or("");
2958        let is_block = first_two == "/*" || first_two == "<#";
2959        for row in start_row..=end_row {
2960            if row < ctx.has_comment.len() {
2961                ctx.has_comment[row] = true;
2962                if is_block {
2963                    ctx.comment_is_block[row] = true;
2964                }
2965            }
2966        }
2967    }
2968
2969    /// If `node` is an `expression_statement` whose sole named child is a string literal,
2970    /// mark those rows as docstring and return `true`.
2971    fn visit_maybe_docstring(node: Node, kind: &str, ctx: &mut VisitCtx<'_>) -> bool {
2972        let Some(stmt_kind) = ctx.docstring_stmt_kind else {
2973            return false;
2974        };
2975        if kind != stmt_kind || node.named_child_count() != 1 {
2976            return false;
2977        }
2978        let Some(child) = node.named_child(0) else {
2979            return false;
2980        };
2981        if child.kind() != "string" {
2982            return false;
2983        }
2984        let child_start = child.start_position().row;
2985        let child_end = child.end_position().row;
2986        for row in child_start..=child_end {
2987            if row < ctx.has_docstring.len() {
2988                ctx.has_docstring[row] = true;
2989            }
2990        }
2991        true
2992    }
2993
2994    /// Mark all rows of a leaf (non-comment, non-extra) node as code.
2995    fn visit_leaf_code(node: Node, ctx: &mut VisitCtx<'_>) {
2996        let start_row = node.start_position().row;
2997        let end_row = node.end_position().row;
2998        for row in start_row..=end_row {
2999            if row < ctx.has_code.len() {
3000                ctx.has_code[row] = true;
3001            }
3002        }
3003    }
3004
3005    #[allow(clippy::too_many_lines)]
3006    fn visit(node: Node, ctx: &mut VisitCtx<'_>) {
3007        let kind = node.kind();
3008
3009        // Comment node — mark rows as comment, detect block vs. line comment.
3010        if ctx.comment_kinds.contains(&kind) {
3011            visit_comment_node(node, ctx);
3012            return;
3013        }
3014
3015        // Python docstring: expression_statement whose only named child is a string literal.
3016        if visit_maybe_docstring(node, kind, ctx) {
3017            return;
3018        }
3019
3020        // Leaf non-comment node: mark as code.
3021        if node.child_count() == 0 && !node.is_extra() {
3022            visit_leaf_code(node, ctx);
3023            return;
3024        }
3025
3026        for i in 0..node.child_count() {
3027            #[allow(clippy::cast_possible_truncation)]
3028            // child_count bounded by tree-sitter u32 capacity
3029            if let Some(child) = node.child(i as u32) {
3030                visit(child, ctx);
3031            }
3032        }
3033    }
3034
    /// Symbol-counting rules passed to `analyze_lines` by `analyze_c`.
    // NOTE(review): `SymbolKinds::none()` presumably leaves every kind and prefix
    // empty, which would make the `try_count_*` helpers reject every node —
    // confirm against its definition.
    const C_SYMBOLS: SymbolKinds = SymbolKinds::none();
3036
    /// Symbol-counting rules passed to `analyze_lines` by `analyze_python`.
    const PYTHON_SYMBOLS: SymbolKinds = SymbolKinds {
        // Node kind counted as a function (or as a test, see `test_fn_prefix`).
        function_def: "function_definition",
        // Node kind counted as a class (or as a test, see `test_class_prefix`).
        class_def: "class_definition",
        // Functions whose name starts with this are counted as tests instead.
        test_fn_prefix: "test_",
        // Classes whose name starts with this are counted as tests instead.
        test_class_prefix: "Test",
        // Method calls whose attribute name starts with this count as assertions.
        assertion_attr_prefix: "assert",
    };
3044
3045    /// Parse C or C++ source with tree-sitter-c.
3046    #[must_use]
3047    pub fn analyze_c(text: &str) -> Option<RawFileAnalysis> {
3048        let lang: tree_sitter::Language = tree_sitter_c::LANGUAGE.into();
3049        analyze_lines(text, &lang, &["comment"], None, &C_SYMBOLS)
3050    }
3051
3052    /// Parse Python source with tree-sitter-python.
3053    #[must_use]
3054    pub fn analyze_python(text: &str) -> Option<RawFileAnalysis> {
3055        let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
3056        analyze_lines(
3057            text,
3058            &lang,
3059            &["comment"],
3060            Some("expression_statement"),
3061            &PYTHON_SYMBOLS,
3062        )
3063    }
3064}
3065
#[cfg(test)]
mod tests {
    use super::*;

    use std::collections::BTreeMap;
    use std::path::Path;

    // ── Shared helpers ───────────────────────────────────────────────────────

    /// Run `detect_language` on `file_name` alone: no shebang argument, no
    /// extension overrides, shebang detection disabled.
    fn detect_by_path(file_name: &str) -> Option<Language> {
        detect_language(Path::new(file_name), None, &BTreeMap::new(), false)
    }

    /// Run `detect_language` on an extension-less file called `script`, passing
    /// `shebang` as the shebang argument with shebang detection enabled and no
    /// extension overrides.
    fn detect_by_shebang(shebang: &str) -> Option<Language> {
        detect_language(Path::new("script"), Some(shebang), &BTreeMap::new(), true)
    }

    /// Analyze a single source line (a trailing newline is appended) and return
    /// `(functions, classes, variables, imports, test_count,
    /// test_assertion_count, test_suite_count)`.
    fn sym(lang: Language, line: &str) -> (u64, u64, u64, u64, u64, u64, u64) {
        let result = analyze_text(lang, &format!("{line}\n"), AnalysisOptions::default());
        let r = &result.raw;
        (
            r.functions,
            r.classes,
            r.variables,
            r.imports,
            r.test_count,
            r.test_assertion_count,
            r.test_suite_count,
        )
    }

    // ── Line classification ──────────────────────────────────────────────────

    #[test]
    fn python_docstrings_are_separated() {
        let input = r#""""module docs"""


def fn_a():
    """function docs"""
    value = 1  # trailing comment
    return value
"#;

        let result = analyze_text(Language::Python, input, AnalysisOptions::default());
        assert_eq!(result.raw.docstring_comment_lines, 2);
        assert_eq!(result.raw.mixed_code_single_comment_lines, 1);
        assert_eq!(result.raw.code_only_lines, 2);
    }

    #[test]
    fn c_style_mixed_lines_are_captured() {
        let input = "int x = 1; // note\n/* block */\n";
        let result = analyze_text(Language::C, input, AnalysisOptions::default());
        assert_eq!(result.raw.mixed_code_single_comment_lines, 1);
        assert_eq!(result.raw.multi_comment_only_lines, 1);
    }

    #[test]
    fn detect_language_by_shebang() {
        let language = detect_by_shebang("#!/usr/bin/env bash");
        assert_eq!(language, Some(Language::Shell));
    }

    // ── count_symbols: no double-counting of test functions ──────────────────

    #[test]
    fn python_test_fn_not_double_counted() {
        // def test_ lines count as tests only, NOT as functions
        let (f, c, _, _, t, _, _) = sym(Language::Python, "def test_foo():");
        assert_eq!(f, 0, "test fn must not also increment functions");
        assert_eq!(t, 1, "must be counted as a test");
        assert_eq!(c, 0);
    }

    #[test]
    fn python_test_class_not_double_counted() {
        // class Test* lines count as tests only, NOT as classes
        let (f, c, _, _, t, _, _) = sym(Language::Python, "class TestFoo:");
        assert_eq!(c, 0, "test class must not also increment classes");
        assert_eq!(t, 1, "must be counted as a test");
        assert_eq!(f, 0);
    }

    #[test]
    fn python_regular_fn_counts_as_function() {
        let (f, c, _, _, t, _, _) = sym(Language::Python, "def regular():");
        assert_eq!(f, 1, "regular function must be counted");
        assert_eq!(t, 0);
        assert_eq!(c, 0);
    }

    #[test]
    fn python_regular_class_counts_as_class() {
        let (f, c, _, _, t, _, _) = sym(Language::Python, "class Regular:");
        assert_eq!(c, 1, "regular class must be counted");
        assert_eq!(t, 0);
        assert_eq!(f, 0);
    }

    #[test]
    fn go_test_fn_not_double_counted() {
        let (f, _, _, _, t, _, _) = sym(Language::Go, "func TestFoo(t *testing.T) {");
        assert_eq!(f, 0, "Go test func must not also increment functions");
        assert_eq!(t, 1, "must be counted as a test");
    }

    #[test]
    fn go_benchmark_fn_not_double_counted() {
        let (f, _, _, _, t, _, _) = sym(Language::Go, "func BenchmarkBar(b *testing.B) {");
        assert_eq!(f, 0, "Go benchmark func must not also increment functions");
        assert_eq!(t, 1, "must be counted as a test");
    }

    #[test]
    fn go_regular_fn_counts_as_function() {
        let (f, _, _, _, t, _, _) = sym(Language::Go, "func doSomething() {");
        assert_eq!(f, 1, "regular Go func must be counted");
        assert_eq!(t, 0);
    }

    #[test]
    fn rust_test_attr_counts_as_test_not_function() {
        // #[test] is a standalone attribute line — counted as a test, never as a function
        let (f, _, _, _, t, _, _) = sym(Language::Rust, "#[test]");
        assert_eq!(t, 1, "#[test] must be counted as a test");
        assert_eq!(f, 0, "#[test] attribute must not be counted as a function");
    }

    #[test]
    fn rust_fn_line_counts_as_function_not_test() {
        // The fn declaration after #[test] does NOT match any test pattern
        let (f, _, _, _, t, _, _) = sym(Language::Rust, "fn test_something() {");
        assert_eq!(f, 1, "fn declaration must count as a function");
        assert_eq!(
            t, 0,
            "fn declaration line must not be double-counted as a test"
        );
    }

    #[test]
    fn js_describe_counts_as_test_not_function() {
        let (f, _, _, _, t, _, _) = sym(Language::JavaScript, "describe('suite', () => {");
        assert_eq!(t, 1, "describe must be counted as a test");
        assert_eq!(f, 0, "describe must not be counted as a function");
    }

    #[test]
    fn js_regular_fn_counts_as_function() {
        let (f, _, _, _, t, _, _) = sym(Language::JavaScript, "function doWork() {");
        assert_eq!(f, 1, "JS function declaration must be counted");
        assert_eq!(t, 0);
    }

    // ── Language detection tests ─────────────────────────────────────────────

    #[test]
    fn detect_language_rs_extension() {
        assert_eq!(detect_by_path("foo.rs"), Some(Language::Rust));
    }

    #[test]
    fn detect_language_py_extension() {
        assert_eq!(detect_by_path("foo.py"), Some(Language::Python));
    }

    #[test]
    fn detect_language_ts_extension() {
        assert_eq!(detect_by_path("app.ts"), Some(Language::TypeScript));
    }

    #[test]
    fn detect_language_js_extension() {
        assert_eq!(detect_by_path("app.js"), Some(Language::JavaScript));
    }

    #[test]
    fn detect_language_go_extension() {
        assert_eq!(detect_by_path("main.go"), Some(Language::Go));
    }

    #[test]
    fn detect_language_c_extension() {
        assert_eq!(detect_by_path("main.c"), Some(Language::C));
    }

    #[test]
    fn detect_language_cpp_extension() {
        assert_eq!(detect_by_path("main.cpp"), Some(Language::Cpp));
    }

    #[test]
    fn detect_language_java_extension() {
        assert_eq!(detect_by_path("Main.java"), Some(Language::Java));
    }

    #[test]
    fn detect_language_makefile_exact_name() {
        assert_eq!(detect_by_path("Makefile"), Some(Language::Makefile));
    }

    #[test]
    fn detect_language_dockerfile_exact_name() {
        assert_eq!(detect_by_path("Dockerfile"), Some(Language::Dockerfile));
    }

    #[test]
    fn detect_language_rakefile() {
        assert_eq!(detect_by_path("Rakefile"), Some(Language::Ruby));
    }

    #[test]
    fn detect_language_gemfile() {
        assert_eq!(detect_by_path("Gemfile"), Some(Language::Ruby));
    }

    #[test]
    fn detect_language_unknown_extension_returns_none() {
        assert_eq!(detect_by_path("foo.xyz123"), None);
    }

    #[test]
    fn detect_language_extension_override() {
        // An explicit override for `.h` is expected to win over the built-in mapping.
        let mut overrides = BTreeMap::new();
        overrides.insert("h".into(), "cpp".into());
        let lang = detect_language(Path::new("header.h"), None, &overrides, false);
        assert_eq!(lang, Some(Language::Cpp));
    }

    #[test]
    fn detect_language_shebang_python() {
        assert_eq!(
            detect_by_shebang("#!/usr/bin/env python3"),
            Some(Language::Python)
        );
    }

    #[test]
    fn detect_language_shebang_bash() {
        assert_eq!(detect_by_shebang("#!/bin/bash"), Some(Language::Shell));
    }

    #[test]
    fn detect_language_shebang_ruby() {
        assert_eq!(
            detect_by_shebang("#!/usr/bin/env ruby"),
            Some(Language::Ruby)
        );
    }

    #[test]
    fn detect_language_shebang_disabled() {
        // When shebang_detection=false, shebang is ignored
        let lang = detect_language(
            Path::new("script"),
            Some("#!/usr/bin/env python3"),
            &BTreeMap::new(),
            false,
        );
        assert_eq!(lang, None);
    }

    // ── Language::from_name ──────────────────────────────────────────────────

    #[test]
    fn from_name_rust() {
        assert_eq!(Language::from_name("rust"), Some(Language::Rust));
    }

    #[test]
    fn from_name_python() {
        assert_eq!(Language::from_name("python"), Some(Language::Python));
    }

    #[test]
    fn from_name_unknown() {
        assert_eq!(Language::from_name("brainfuck"), None);
    }

    #[test]
    fn from_name_roundtrip_all() {
        // Every language's slug should round-trip through from_name
        for lang in [
            Language::C,
            Language::Cpp,
            Language::CSharp,
            Language::Go,
            Language::Java,
            Language::JavaScript,
            Language::Python,
            Language::Rust,
            Language::Shell,
            Language::PowerShell,
            Language::TypeScript,
            Language::Assembly,
            Language::Clojure,
            Language::Css,
            Language::Dart,
            Language::Dockerfile,
            Language::Elixir,
            Language::Erlang,
            Language::FSharp,
            Language::Groovy,
            Language::Haskell,
            Language::Html,
            Language::Julia,
            Language::Kotlin,
            Language::Lua,
            Language::Makefile,
            Language::Nim,
            Language::ObjectiveC,
            Language::Ocaml,
            Language::Perl,
            Language::Php,
            Language::R,
            Language::Ruby,
            Language::Scala,
            Language::Scss,
            Language::Sql,
            Language::Svelte,
            Language::Swift,
            Language::Vue,
            Language::Xml,
            Language::Zig,
        ] {
            let slug = lang.as_slug();
            let roundtripped = Language::from_name(slug);
            assert_eq!(
                roundtripped,
                Some(lang),
                "from_name({slug:?}) should return {lang:?}"
            );
        }
    }
}