Skip to main content

sloc_languages/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3
4use std::collections::{BTreeMap, BTreeSet, HashSet};
5use std::path::Path;
6
7use serde::{Deserialize, Serialize};
8
9#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
10#[serde(rename_all = "snake_case")]
11pub enum Language {
12    C,
13    Cpp,
14    CSharp,
15    Go,
16    Java,
17    JavaScript,
18    Python,
19    Rust,
20    Shell,
21    PowerShell,
22    TypeScript,
23    // --- Extended language support ---
24    Assembly,
25    Clojure,
26    Css,
27    Dart,
28    Dockerfile,
29    Elixir,
30    Erlang,
31    FSharp,
32    Groovy,
33    Haskell,
34    Html,
35    Julia,
36    Kotlin,
37    Lua,
38    Makefile,
39    Nim,
40    ObjectiveC,
41    Ocaml,
42    Perl,
43    Php,
44    R,
45    Ruby,
46    Scala,
47    Scss,
48    Sql,
49    Svelte,
50    Swift,
51    Vue,
52    Xml,
53    Zig,
54}
55
56impl Language {
57    #[must_use]
58    pub const fn display_name(&self) -> &'static str {
59        match self {
60            Self::C => "C",
61            Self::Cpp => "C++",
62            Self::CSharp => "C#",
63            Self::Go => "Go",
64            Self::Java => "Java",
65            Self::JavaScript => "JavaScript",
66            Self::Python => "Python",
67            Self::Rust => "Rust",
68            Self::Shell => "Shell",
69            Self::PowerShell => "PowerShell",
70            Self::TypeScript => "TypeScript",
71            Self::Assembly => "Assembly",
72            Self::Clojure => "Clojure",
73            Self::Css => "CSS",
74            Self::Dart => "Dart",
75            Self::Dockerfile => "Dockerfile",
76            Self::Elixir => "Elixir",
77            Self::Erlang => "Erlang",
78            Self::FSharp => "F#",
79            Self::Groovy => "Groovy",
80            Self::Haskell => "Haskell",
81            Self::Html => "HTML",
82            Self::Julia => "Julia",
83            Self::Kotlin => "Kotlin",
84            Self::Lua => "Lua",
85            Self::Makefile => "Makefile",
86            Self::Nim => "Nim",
87            Self::ObjectiveC => "Objective-C",
88            Self::Ocaml => "OCaml",
89            Self::Perl => "Perl",
90            Self::Php => "PHP",
91            Self::R => "R",
92            Self::Ruby => "Ruby",
93            Self::Scala => "Scala",
94            Self::Scss => "SCSS",
95            Self::Sql => "SQL",
96            Self::Svelte => "Svelte",
97            Self::Swift => "Swift",
98            Self::Vue => "Vue",
99            Self::Xml => "XML",
100            Self::Zig => "Zig",
101        }
102    }
103
104    #[must_use]
105    pub const fn as_slug(&self) -> &'static str {
106        match self {
107            Self::C => "c",
108            Self::Cpp => "cpp",
109            Self::CSharp => "csharp",
110            Self::Go => "go",
111            Self::Java => "java",
112            Self::JavaScript => "javascript",
113            Self::Python => "python",
114            Self::Rust => "rust",
115            Self::Shell => "shell",
116            Self::PowerShell => "powershell",
117            Self::TypeScript => "typescript",
118            Self::Assembly => "assembly",
119            Self::Clojure => "clojure",
120            Self::Css => "css",
121            Self::Dart => "dart",
122            Self::Dockerfile => "dockerfile",
123            Self::Elixir => "elixir",
124            Self::Erlang => "erlang",
125            Self::FSharp => "fsharp",
126            Self::Groovy => "groovy",
127            Self::Haskell => "haskell",
128            Self::Html => "html",
129            Self::Julia => "julia",
130            Self::Kotlin => "kotlin",
131            Self::Lua => "lua",
132            Self::Makefile => "makefile",
133            Self::Nim => "nim",
134            Self::ObjectiveC => "objectivec",
135            Self::Ocaml => "ocaml",
136            Self::Perl => "perl",
137            Self::Php => "php",
138            Self::R => "r",
139            Self::Ruby => "ruby",
140            Self::Scala => "scala",
141            Self::Scss => "scss",
142            Self::Sql => "sql",
143            Self::Svelte => "svelte",
144            Self::Swift => "swift",
145            Self::Vue => "vue",
146            Self::Xml => "xml",
147            Self::Zig => "zig",
148        }
149    }
150
151    #[must_use]
152    pub fn from_name(name: &str) -> Option<Self> {
153        match name.trim().to_ascii_lowercase().as_str() {
154            "c" => Some(Self::C),
155            "cpp" | "c++" | "cplusplus" => Some(Self::Cpp),
156            "csharp" | "c#" | "cs" => Some(Self::CSharp),
157            "go" | "golang" => Some(Self::Go),
158            "java" => Some(Self::Java),
159            "javascript" | "js" => Some(Self::JavaScript),
160            "python" | "py" => Some(Self::Python),
161            "rust" | "rs" => Some(Self::Rust),
162            "shell" | "sh" | "bash" => Some(Self::Shell),
163            "powershell" | "pwsh" | "ps" => Some(Self::PowerShell),
164            "typescript" | "ts" => Some(Self::TypeScript),
165            "assembly" | "asm" => Some(Self::Assembly),
166            "clojure" | "clj" => Some(Self::Clojure),
167            "css" => Some(Self::Css),
168            "dart" => Some(Self::Dart),
169            "dockerfile" | "docker" => Some(Self::Dockerfile),
170            "elixir" | "ex" => Some(Self::Elixir),
171            "erlang" | "erl" => Some(Self::Erlang),
172            "fsharp" | "f#" | "fs" => Some(Self::FSharp),
173            "groovy" => Some(Self::Groovy),
174            "haskell" | "hs" => Some(Self::Haskell),
175            "html" | "htm" => Some(Self::Html),
176            "julia" | "jl" => Some(Self::Julia),
177            "kotlin" | "kt" => Some(Self::Kotlin),
178            "lua" => Some(Self::Lua),
179            "makefile" | "make" | "mk" => Some(Self::Makefile),
180            "nim" => Some(Self::Nim),
181            "objectivec" | "objc" | "objective-c" => Some(Self::ObjectiveC),
182            "ocaml" | "ml" => Some(Self::Ocaml),
183            "perl" | "pl" => Some(Self::Perl),
184            "php" => Some(Self::Php),
185            "r" => Some(Self::R),
186            "ruby" | "rb" => Some(Self::Ruby),
187            "scala" => Some(Self::Scala),
188            "scss" | "sass" => Some(Self::Scss),
189            "sql" => Some(Self::Sql),
190            "svelte" => Some(Self::Svelte),
191            "swift" => Some(Self::Swift),
192            "vue" => Some(Self::Vue),
193            "xml" => Some(Self::Xml),
194            "zig" => Some(Self::Zig),
195            _ => None,
196        }
197    }
198}
199
200#[derive(Debug, Clone, Serialize, Deserialize, Default)]
201pub struct RawLineCounts {
202    pub total_physical_lines: u64,
203    pub blank_only_lines: u64,
204    pub code_only_lines: u64,
205    pub single_comment_only_lines: u64,
206    pub multi_comment_only_lines: u64,
207    pub mixed_code_single_comment_lines: u64,
208    pub mixed_code_multi_comment_lines: u64,
209    pub docstring_comment_lines: u64,
210    pub skipped_unknown_lines: u64,
211    /// Best-effort count of function/method definition lines detected lexically.
212    #[serde(default)]
213    pub functions: u64,
214    /// Best-effort count of class/struct/trait/type definition lines detected lexically.
215    #[serde(default)]
216    pub classes: u64,
217    /// Best-effort count of variable declaration lines detected lexically.
218    #[serde(default)]
219    pub variables: u64,
220    /// Best-effort count of import/use/include statement lines detected lexically.
221    #[serde(default)]
222    pub imports: u64,
223    /// Lines consisting solely of preprocessor/compiler directives (e.g. `#include`, `#define`
224    /// in C/C++/Objective-C). Always a subset of `code_only_lines`. Controlled by
225    /// `AnalysisConfig::count_compiler_directives`. IEEE 1045-1992 §4.2.
226    #[serde(default)]
227    pub compiler_directive_lines: u64,
228    /// Best-effort count of test case / test function definition lines detected lexically
229    /// (`GTest`, Catch2, `PyTest`, `JUnit`, etc.). Always a subset of `code_only_lines`.
230    #[serde(default)]
231    pub test_count: u64,
232    /// Best-effort count of test assertion call lines detected lexically
233    /// (`ASSERT_EQ`, `EXPECT_TRUE`, assertEquals, Assert.AreEqual, `assert_eq`!, etc.).
234    #[serde(default)]
235    pub test_assertion_count: u64,
236    /// Best-effort count of test suite / fixture / group declaration lines detected lexically
237    /// (`TEST_GROUP`, `BOOST_AUTO_TEST_SUITE`, [`TestClass`], [`TestFixture`], etc.).
238    #[serde(default)]
239    pub test_suite_count: u64,
240}
241
242#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
243#[serde(rename_all = "snake_case")]
244pub enum ParseMode {
245    Lexical,
246    LexicalBestEffort,
247    TreeSitter,
248}
249
250#[derive(Debug, Clone, Serialize, Deserialize)]
251pub struct RawFileAnalysis {
252    pub raw: RawLineCounts,
253    pub parse_mode: ParseMode,
254    pub warnings: Vec<String>,
255}
256
/// Counting options from IEEE 1045-1992, supplied by `sloc-core` (derived from
/// `AnalysisConfig`).
///
/// `analyze_text` takes this struct so callers can steer the behaviours that the standard
/// treats as configurable parameters instead of fixed conventions.
#[derive(Debug, Clone, Copy)]
pub struct AnalysisOptions {
    /// `true` (the IEEE 1045-1992 default): a blank line inside a block comment is counted
    /// as a comment line instead of a blank line.
    pub blank_in_block_comment_as_comment: bool,
    /// `true`: physical lines joined by a trailing backslash are collapsed into one logical
    /// line for SLOC counting (IEEE logical SLOC mode).
    pub collapse_continuation_lines: bool,
}

impl Default for AnalysisOptions {
    /// Standard defaults: blank lines in block comments count as comments, and continuation
    /// lines are not collapsed.
    fn default() -> Self {
        AnalysisOptions {
            collapse_continuation_lines: false,
            blank_in_block_comment_as_comment: true,
        }
    }
}
279
280#[must_use]
281pub fn supported_languages() -> BTreeSet<Language> {
282    [
283        Language::Assembly,
284        Language::C,
285        Language::Clojure,
286        Language::Cpp,
287        Language::CSharp,
288        Language::Css,
289        Language::Dart,
290        Language::Dockerfile,
291        Language::Elixir,
292        Language::Erlang,
293        Language::FSharp,
294        Language::Go,
295        Language::Groovy,
296        Language::Haskell,
297        Language::Html,
298        Language::Java,
299        Language::JavaScript,
300        Language::Julia,
301        Language::Kotlin,
302        Language::Lua,
303        Language::Makefile,
304        Language::Nim,
305        Language::ObjectiveC,
306        Language::Ocaml,
307        Language::Perl,
308        Language::Php,
309        Language::PowerShell,
310        Language::Python,
311        Language::R,
312        Language::Ruby,
313        Language::Rust,
314        Language::Scala,
315        Language::Scss,
316        Language::Shell,
317        Language::Sql,
318        Language::Svelte,
319        Language::Swift,
320        Language::TypeScript,
321        Language::Vue,
322        Language::Xml,
323        Language::Zig,
324    ]
325    .into_iter()
326    .collect()
327}
328
329/// Detect language from a shebang line (e.g. `#!/usr/bin/env python3`).
330fn detect_by_shebang(line: &str) -> Option<Language> {
331    let lower = line.to_ascii_lowercase();
332    if !lower.starts_with("#!") {
333        return None;
334    }
335    if lower.contains("python") {
336        return Some(Language::Python);
337    }
338    if lower.contains("pwsh") || lower.contains("powershell") {
339        return Some(Language::PowerShell);
340    }
341    if lower.contains("bash")
342        || lower.contains("/sh")
343        || lower.contains("zsh")
344        || lower.contains("ksh")
345    {
346        return Some(Language::Shell);
347    }
348    if lower.contains("ruby") {
349        return Some(Language::Ruby);
350    }
351    if lower.contains("perl") {
352        return Some(Language::Perl);
353    }
354    if lower.contains("php") {
355        return Some(Language::Php);
356    }
357    if lower.contains("node") || lower.contains("nodejs") {
358        return Some(Language::JavaScript);
359    }
360    None
361}
362
363/// Detect language purely from a (lowercased) file extension.
364fn detect_by_extension(ext: &str) -> Option<Language> {
365    // Static table avoids a large match statement; each extension maps 1-to-1 to a language.
366    static EXT_MAP: &[(&str, Language)] = &[
367        ("c", Language::C),
368        ("h", Language::C),
369        ("cc", Language::Cpp),
370        ("cp", Language::Cpp),
371        ("cpp", Language::Cpp),
372        ("cxx", Language::Cpp),
373        ("hh", Language::Cpp),
374        ("hpp", Language::Cpp),
375        ("hxx", Language::Cpp),
376        ("cs", Language::CSharp),
377        ("go", Language::Go),
378        ("java", Language::Java),
379        ("js", Language::JavaScript),
380        ("mjs", Language::JavaScript),
381        ("cjs", Language::JavaScript),
382        ("py", Language::Python),
383        ("rs", Language::Rust),
384        ("sh", Language::Shell),
385        ("bash", Language::Shell),
386        ("zsh", Language::Shell),
387        ("ksh", Language::Shell),
388        ("ps1", Language::PowerShell),
389        ("psm1", Language::PowerShell),
390        ("psd1", Language::PowerShell),
391        ("ts", Language::TypeScript),
392        ("mts", Language::TypeScript),
393        ("cts", Language::TypeScript),
394        ("asm", Language::Assembly),
395        ("s", Language::Assembly),
396        ("clj", Language::Clojure),
397        ("cljs", Language::Clojure),
398        ("cljc", Language::Clojure),
399        ("edn", Language::Clojure),
400        ("css", Language::Css),
401        ("dart", Language::Dart),
402        ("ex", Language::Elixir),
403        ("exs", Language::Elixir),
404        ("erl", Language::Erlang),
405        ("hrl", Language::Erlang),
406        ("fs", Language::FSharp),
407        ("fsi", Language::FSharp),
408        ("fsx", Language::FSharp),
409        ("groovy", Language::Groovy),
410        ("gradle", Language::Groovy),
411        ("hs", Language::Haskell),
412        ("lhs", Language::Haskell),
413        ("html", Language::Html),
414        ("htm", Language::Html),
415        ("xhtml", Language::Html),
416        ("jl", Language::Julia),
417        ("kt", Language::Kotlin),
418        ("kts", Language::Kotlin),
419        ("lua", Language::Lua),
420        ("mk", Language::Makefile),
421        ("nim", Language::Nim),
422        ("nims", Language::Nim),
423        ("m", Language::ObjectiveC),
424        ("mm", Language::ObjectiveC),
425        ("ml", Language::Ocaml),
426        ("mli", Language::Ocaml),
427        ("pl", Language::Perl),
428        ("pm", Language::Perl),
429        ("t", Language::Perl),
430        ("php", Language::Php),
431        ("php3", Language::Php),
432        ("php4", Language::Php),
433        ("php5", Language::Php),
434        ("php7", Language::Php),
435        ("phtml", Language::Php),
436        ("r", Language::R),
437        ("rb", Language::Ruby),
438        ("rake", Language::Ruby),
439        ("scala", Language::Scala),
440        ("sc", Language::Scala),
441        ("scss", Language::Scss),
442        ("sass", Language::Scss),
443        ("sql", Language::Sql),
444        ("svelte", Language::Svelte),
445        ("swift", Language::Swift),
446        ("vue", Language::Vue),
447        ("xml", Language::Xml),
448        ("xsd", Language::Xml),
449        ("xsl", Language::Xml),
450        ("xslt", Language::Xml),
451        ("svg", Language::Xml),
452        ("zig", Language::Zig),
453    ];
454    EXT_MAP.iter().find_map(|&(e, l)| (e == ext).then_some(l))
455}
456
457/// Detect language from an exact filename (no extension) or well-known filename patterns.
458fn detect_by_filename(filename: &str, filename_lower: &str) -> Option<Language> {
459    // Dockerfile: exact name or Dockerfile.* variant
460    if filename == "Dockerfile"
461        || filename.starts_with("Dockerfile.")
462        || filename_lower == "dockerfile"
463    {
464        return Some(Language::Dockerfile);
465    }
466    // Makefile variants
467    if matches!(
468        filename,
469        "Makefile" | "GNUmakefile" | "makefile" | "BSDmakefile"
470    ) {
471        return Some(Language::Makefile);
472    }
473    // Ruby ecosystem files that have no extension
474    if matches!(
475        filename,
476        "Rakefile" | "Gemfile" | "Guardfile" | "Vagrantfile" | "Fastfile" | "Podfile"
477    ) {
478        return Some(Language::Ruby);
479    }
480    None
481}
482
483#[must_use]
484#[allow(clippy::too_many_lines)]
485pub fn detect_language(
486    path: &Path,
487    first_line: Option<&str>,
488    extension_overrides: &BTreeMap<String, String>,
489    shebang_detection: bool,
490) -> Option<Language> {
491    let extension = path
492        .extension()
493        .and_then(|ext| ext.to_str())
494        .map(str::to_ascii_lowercase);
495
496    // Extension override check (user-configured mappings win over everything)
497    if let Some(ext) = extension.as_ref() {
498        if let Some(override_name) = extension_overrides.get(ext.as_str()) {
499            if let Some(lang) = Language::from_name(override_name) {
500                return Some(lang);
501            }
502        }
503    }
504
505    // Filename-based detection for files that have no extension or use exact names
506    let filename = path.file_name().and_then(|s| s.to_str()).unwrap_or("");
507    let filename_lower = filename.to_ascii_lowercase();
508
509    if let Some(lang) = detect_by_filename(filename, &filename_lower) {
510        return Some(lang);
511    }
512
513    // Extension-based detection
514    if let Some(lang) = extension.as_deref().and_then(detect_by_extension) {
515        return Some(lang);
516    }
517
518    // Shebang detection (last resort — only for extensionless scripts)
519    if shebang_detection {
520        if let Some(line) = first_line {
521            if let Some(lang) = detect_by_shebang(line) {
522                return Some(lang);
523            }
524        }
525    }
526
527    None
528}
529
530#[must_use]
531pub fn analyze_text(language: Language, text: &str, options: AnalysisOptions) -> RawFileAnalysis {
532    // tree-sitter fast-paths (compiled out when feature is disabled)
533    #[cfg(feature = "tree-sitter")]
534    {
535        match language {
536            Language::C | Language::Cpp => {
537                if let Some(result) = ts::analyze_c(text) {
538                    return result;
539                }
540            }
541            Language::Python => {
542                if let Some(result) = ts::analyze_python(text) {
543                    return result;
544                }
545            }
546            _ => {}
547        }
548    }
549
550    let (mut config, has_preprocessor) = language_scan_config(language);
551
552    // Python docstring lines are computed from the text and cannot be a static constant.
553    if language == Language::Python {
554        config.skip_lines = detect_python_docstring_lines(text);
555    }
556
557    // C, C++, and Objective-C have a preprocessor whose directive lines are tracked separately
558    // per IEEE 1045-1992 §4.2; every other language uses base flags.
559    let flags = IeeeFlags {
560        has_preprocessor_directives: has_preprocessor,
561        blank_in_block_comment_as_comment: options.blank_in_block_comment_as_comment,
562        collapse_continuation_lines: options.collapse_continuation_lines,
563    };
564    analyze_generic(text, config, flags)
565}
566
567/// Returns the lexical scan configuration for `language` and whether it uses a C preprocessor.
568/// All fields are static constants except `skip_lines`, which is always empty here; callers that
569/// need non-empty skip sets (currently only Python) must populate the field after this call.
570///
571/// The implementation delegates to `LANG_SCAN_TABLE` (a static `&[(Language, StaticLangConfig)]`)
572/// defined below the `SP_*` symbol-pattern constants.  Each language appears exactly once in the
573/// table, so the linear scan is O(|languages|) but avoids a 41-arm `match` statement.
574fn language_scan_config(language: Language) -> (ScanConfig, bool) {
575    let cfg = LANG_SCAN_TABLE
576        .iter()
577        .find_map(|&(l, c)| (l == language).then_some(c))
578        .unwrap_or_else(|| panic!("language_scan_config: no entry for {language:?}"));
579    (
580        ScanConfig {
581            line_comments: cfg.line_comments,
582            block_comment: cfg.block_comment,
583            allow_single_quote_strings: cfg.allow_single_quote_strings,
584            allow_double_quote_strings: cfg.allow_double_quote_strings,
585            allow_triple_quote_strings: cfg.allow_triple_quote_strings,
586            allow_csharp_verbatim_strings: cfg.allow_csharp_verbatim_strings,
587            skip_lines: HashSet::new(),
588            symbol_patterns: cfg.symbol_patterns,
589        },
590        cfg.has_preprocessor,
591    )
592}
593
/// Per-language line prefixes used for best-effort structural symbol detection.
///
/// Each slice holds prefixes that are compared against a line after its leading whitespace
/// has been stripped; a match marks the line as a definition of that category. An empty
/// slice disables detection for the category.
#[derive(Debug, Clone, Copy)]
struct SymbolPatterns {
    /// Prefixes that introduce a function or method definition.
    functions: &'static [&'static str],
    /// Line prefixes that classify as a function only when the line ALSO contains `(`
    /// AND there is no `=` between the prefix and the first `(`.  Used for C/C++ where
    /// function definitions are led by the return type (`void`, `int`, `bool`, …) with
    /// no dedicated keyword, so the paren guard distinguishes `void f(x)` from
    /// `void* p = malloc(n)`.
    functions_prefix_paren: &'static [&'static str],
    /// Prefixes that introduce a class/struct/trait/type definition.
    classes: &'static [&'static str],
    /// Prefixes that introduce a variable declaration.
    variables: &'static [&'static str],
    /// Prefixes that introduce an import/use/include statement.
    imports: &'static [&'static str],
    /// Line prefixes (after stripping leading whitespace) that indicate a test case or test
    /// function definition. Matched against code lines only, same as other symbol categories.
    tests: &'static [&'static str],
    /// Line prefixes that indicate a test assertion call (`ASSERT_EQ`, `assertEquals`,
    /// `assert_eq!`, `Assert.AreEqual`, etc.). Matched against code lines only.
    assertions: &'static [&'static str],
    /// Line prefixes that indicate a test suite / fixture / group declaration
    /// (`TEST_GROUP`, `BOOST_AUTO_TEST_SUITE`, `[TestClass]`, `[TestFixture]`, etc.).
    test_suites: &'static [&'static str],
}

impl SymbolPatterns {
    /// Pattern set with every category empty, i.e. all symbol detection disabled.
    const fn none() -> Self {
        const EMPTY: &[&str] = &[];
        Self {
            functions: EMPTY,
            functions_prefix_paren: EMPTY,
            classes: EMPTY,
            variables: EMPTY,
            imports: EMPTY,
            tests: EMPTY,
            assertions: EMPTY,
            test_suites: EMPTY,
        }
    }
}

/// Shared "detect nothing" pattern set: every category is an empty slice.
const SP_NONE: SymbolPatterns = SymbolPatterns::none();
636
637const SP_RUST: SymbolPatterns = SymbolPatterns {
638    functions: &[
639        "fn ",
640        "pub fn ",
641        "pub(crate) fn ",
642        "pub(super) fn ",
643        "async fn ",
644        "pub async fn ",
645        "pub(crate) async fn ",
646        "unsafe fn ",
647        "pub unsafe fn ",
648        "pub(crate) unsafe fn ",
649        "const fn ",
650        "pub const fn ",
651        "pub(crate) const fn ",
652        "extern fn ",
653        "pub extern fn ",
654    ],
655    functions_prefix_paren: &[],
656    classes: &[
657        "struct ",
658        "pub struct ",
659        "pub(crate) struct ",
660        "enum ",
661        "pub enum ",
662        "pub(crate) enum ",
663        "trait ",
664        "pub trait ",
665        "pub(crate) trait ",
666        "impl ",
667        "impl<",
668        "type ",
669        "pub type ",
670        "pub(crate) type ",
671    ],
672    variables: &["let ", "let mut "],
673    imports: &["use ", "pub use ", "pub(crate) use ", "extern crate "],
674    // Built-in #[test], tokio/actix async test attributes, rstest
675    tests: &[
676        "#[test]",
677        "#[tokio::test]",
678        "#[actix_web::test]",
679        "#[rstest]",
680        "#[test_case",
681    ],
682    assertions: &[
683        "assert_eq!(",
684        "assert_ne!(",
685        "assert!(",
686        "assert_matches!(",
687        "assert_err!(",
688        "assert_ok!(",
689    ],
690    test_suites: &[],
691};
692
693const SP_PYTHON: SymbolPatterns = SymbolPatterns {
694    functions: &["def ", "async def "],
695    functions_prefix_paren: &[],
696    classes: &["class "],
697    variables: &[],
698    imports: &["import ", "from "],
699    // pytest: test_ prefix functions and Test* classes; unittest: test_ methods
700    tests: &["def test_", "async def test_", "class Test"],
701    assertions: &[
702        "self.assertEqual(",
703        "self.assertNotEqual(",
704        "self.assertTrue(",
705        "self.assertFalse(",
706        "self.assertIsNone(",
707        "self.assertIsNotNone(",
708        "self.assertIn(",
709        "self.assertNotIn(",
710        "self.assertRaises(",
711        "self.assertAlmostEqual(",
712    ],
713    test_suites: &[],
714};
715
716const SP_JS: SymbolPatterns = SymbolPatterns {
717    functions: &[
718        "function ",
719        "async function ",
720        "export function ",
721        "export async function ",
722        "export default function ",
723    ],
724    functions_prefix_paren: &[],
725    classes: &["class ", "export class ", "export default class "],
726    variables: &[
727        "var ",
728        "let ",
729        "const ",
730        "export var ",
731        "export let ",
732        "export const ",
733    ],
734    imports: &["import "],
735    // Jest/Mocha/Jasmine: describe/it/test block openers
736    tests: &[
737        "describe(",
738        "it(",
739        "test(",
740        "it.each(",
741        "test.each(",
742        "describe.each(",
743    ],
744    assertions: &["expect("],
745    test_suites: &[],
746};
747
748const SP_TS: SymbolPatterns = SymbolPatterns {
749    functions: &[
750        "function ",
751        "async function ",
752        "export function ",
753        "export async function ",
754        "export default function ",
755    ],
756    functions_prefix_paren: &[],
757    classes: &[
758        "class ",
759        "export class ",
760        "export default class ",
761        "abstract class ",
762        "export abstract class ",
763        "interface ",
764        "export interface ",
765        "declare class ",
766        "declare interface ",
767    ],
768    variables: &[
769        "var ",
770        "let ",
771        "const ",
772        "export var ",
773        "export let ",
774        "export const ",
775    ],
776    imports: &["import "],
777    // Jest/Mocha/Jasmine/Vitest: describe/it/test block openers
778    tests: &[
779        "describe(",
780        "it(",
781        "test(",
782        "it.each(",
783        "test.each(",
784        "describe.each(",
785    ],
786    assertions: &["expect("],
787    test_suites: &[],
788};
789
790const SP_GO: SymbolPatterns = SymbolPatterns {
791    functions: &["func "],
792    functions_prefix_paren: &[],
793    classes: &["type "],
794    variables: &["var "],
795    imports: &["import "],
796    // Go standard testing: Test* functions (convention is practically exclusive to _test.go files)
797    tests: &["func Test", "func Benchmark", "func Fuzz"],
798    assertions: &[],
799    test_suites: &[],
800};
801
802const SP_JAVA: SymbolPatterns = SymbolPatterns {
803    functions: &[],
804    functions_prefix_paren: &[],
805    classes: &[
806        "class ",
807        "public class ",
808        "private class ",
809        "protected class ",
810        "abstract class ",
811        "final class ",
812        "public abstract class ",
813        "public final class ",
814        "interface ",
815        "public interface ",
816        "enum ",
817        "public enum ",
818        "record ",
819        "public record ",
820        "@interface ",
821    ],
822    variables: &[],
823    imports: &["import "],
824    // JUnit 4 & 5, TestNG — annotations appear on their own line before the method
825    tests: &[
826        "@Test",
827        "@ParameterizedTest",
828        "@RepeatedTest",
829        "@TestFactory",
830        "@TestTemplate",
831    ],
832    assertions: &[
833        "assertEquals(",
834        "assertNotEquals(",
835        "assertTrue(",
836        "assertFalse(",
837        "assertNull(",
838        "assertNotNull(",
839        "assertThat(",
840        "assertThrows(",
841        "assertAll(",
842        "assertArrayEquals(",
843        "assertIterableEquals(",
844        "assertLinesMatch(",
845    ],
846    test_suites: &[],
847};
848
849const SP_CSHARP: SymbolPatterns = SymbolPatterns {
850    functions: &[],
851    functions_prefix_paren: &[],
852    classes: &[
853        "class ",
854        "public class ",
855        "private class ",
856        "protected class ",
857        "internal class ",
858        "abstract class ",
859        "sealed class ",
860        "static class ",
861        "partial class ",
862        "public abstract class ",
863        "public sealed class ",
864        "public static class ",
865        "interface ",
866        "public interface ",
867        "internal interface ",
868        "enum ",
869        "public enum ",
870        "struct ",
871        "public struct ",
872        "record ",
873        "public record ",
874    ],
875    variables: &["var "],
876    imports: &["using "],
877    // MSTest, NUnit, xUnit — attributes on their own line before the method
878    tests: &[
879        "[TestMethod]",
880        "[Test]",
881        "[Fact]",
882        "[Theory]",
883        "[TestCase(",
884        "[DataRow(",
885        "[InlineData(",
886        "[MemberData(",
887    ],
888    assertions: &[
889        "Assert.AreEqual(",
890        "Assert.AreNotEqual(",
891        "Assert.IsTrue(",
892        "Assert.IsFalse(",
893        "Assert.IsNull(",
894        "Assert.IsNotNull(",
895        "Assert.Equal(",
896        "Assert.NotEqual(",
897        "Assert.True(",
898        "Assert.False(",
899        "Assert.That(",
900        "Assert.Contains(",
901        "Assert.Throws(",
902        "Assert.ThrowsAsync(",
903        "Assert.IsInstanceOfType(",
904    ],
905    test_suites: &["[TestClass]", "[TestFixture]", "[SetUpFixture]"],
906};
907
// Test-definition line prefixes shared by C and C++, grouped by framework:
// GoogleTest, Catch2/doctest, Boost.Test, CppUnit, Unity, Check, CMocka and CppUTest.
const TEST_PATTERNS_C_CPP: &[&str] = &[
    // --- GoogleTest ---
    "TEST(",
    "TEST_F(",
    "TEST_P(",
    "TYPED_TEST(",
    "TYPED_TEST_P(",
    "INSTANTIATE_TEST_SUITE_P(",
    "INSTANTIATE_TYPED_TEST_SUITE_P(",
    // --- Catch2 / doctest ---
    "TEST_CASE(",
    "SECTION(",
    "SCENARIO(",
    "SCENARIO_METHOD(",
    "TEST_CASE_METHOD(",
    // --- Boost.Test ---
    "BOOST_AUTO_TEST_CASE(",
    "BOOST_FIXTURE_TEST_CASE(",
    "BOOST_AUTO_TEST_SUITE(",
    "BOOST_PARAM_TEST_CASE(",
    // --- CppUnit ---
    "CPPUNIT_TEST(",
    "CPPUNIT_TEST_SUITE(",
    // --- Unity (embedded C) ---
    "RUN_TEST(",
    "TEST_IGNORE(",
    "TEST_FAIL(",
    // --- Check / libcheck (embedded C) ---
    "START_TEST(",
    "tcase_add_test(",
    "suite_create(",
    // --- CMocka (embedded C) ---
    "cmocka_unit_test(",
    "cmocka_run_group_tests(",
    // --- CppUTest ---
    "IGNORE_TEST(",
    "TEST_GROUP(",
    "TEST_GROUP_BASE(",
];
948
// Test assertion patterns shared by C and C++ (`SP_C` and `SP_CPP` both reference this list).
// Entries are grouped by framework: Google Test, Catch2/doctest, Unity, and CMocka.
// Every pattern ends in `(`, so e.g. `CHECK(` does not textually cover `CHECK_FALSE(`;
// each macro variant that should be recognised needs its own entry.
const ASSERT_PATTERNS_C_CPP: &[&str] = &[
    // Google Test ASSERT_* (test-stopping failures)
    "ASSERT_EQ(",
    "ASSERT_NE(",
    "ASSERT_LT(",
    "ASSERT_LE(",
    "ASSERT_GT(",
    "ASSERT_GE(",
    "ASSERT_TRUE(",
    "ASSERT_FALSE(",
    "ASSERT_STREQ(",
    "ASSERT_STRNE(",
    "ASSERT_FLOAT_EQ(",
    "ASSERT_DOUBLE_EQ(",
    "ASSERT_NEAR(",
    "ASSERT_THROW(",
    "ASSERT_NO_THROW(",
    "ASSERT_ANY_THROW(",
    // Google Test EXPECT_* (non-stopping failures)
    "EXPECT_EQ(",
    "EXPECT_NE(",
    "EXPECT_LT(",
    "EXPECT_LE(",
    "EXPECT_GT(",
    "EXPECT_GE(",
    "EXPECT_TRUE(",
    "EXPECT_FALSE(",
    "EXPECT_STREQ(",
    "EXPECT_STRNE(",
    "EXPECT_FLOAT_EQ(",
    "EXPECT_DOUBLE_EQ(",
    "EXPECT_NEAR(",
    "EXPECT_THROW(",
    "EXPECT_NO_THROW(",
    "EXPECT_ANY_THROW(",
    // Catch2 / doctest assertions
    "REQUIRE(",
    "CHECK(",
    "REQUIRE_FALSE(",
    "CHECK_FALSE(",
    "REQUIRE_NOTHROW(",
    "CHECK_NOTHROW(",
    "REQUIRE_THROWS(",
    "CHECK_THROWS(",
    "REQUIRE_THAT(",
    "CHECK_THAT(",
    // Unity assertions (embedded C)
    "TEST_ASSERT_EQUAL(",
    "TEST_ASSERT_EQUAL_INT(",
    "TEST_ASSERT_EQUAL_STRING(",
    "TEST_ASSERT_EQUAL_FLOAT(",
    "TEST_ASSERT_EQUAL_DOUBLE(",
    "TEST_ASSERT_EQUAL_PTR(",
    "TEST_ASSERT_TRUE(",
    "TEST_ASSERT_FALSE(",
    "TEST_ASSERT_NULL(",
    "TEST_ASSERT_NOT_NULL(",
    "TEST_ASSERT_BITS_HIGH(",
    "TEST_ASSERT_BITS_LOW(",
    // CMocka assertions (embedded C)
    "assert_int_equal(",
    "assert_int_not_equal(",
    "assert_string_equal(",
    "assert_string_not_equal(",
    "assert_true(",
    "assert_false(",
    "assert_null(",
    "assert_non_null(",
    "assert_ptr_equal(",
    "assert_memory_equal(",
    "assert_return_code(",
];
1022
// Test suite/group declaration patterns for C and C++ (CppUTest, Boost.Test, CppUnit).
//
// Only macros that *open* a suite are listed, so each suite is matched once.
// `CPPUNIT_TEST_SUITE_END(` merely closes the block opened by `CPPUNIT_TEST_SUITE(` and is
// deliberately absent, just as Boost's `BOOST_AUTO_TEST_SUITE_END(` is not listed.
const SUITE_PATTERNS_C_CPP: &[&str] = &[
    // CppUTest
    "TEST_GROUP(",
    "TEST_GROUP_BASE(",
    // Boost.Test
    "BOOST_AUTO_TEST_SUITE(",
    // CppUnit
    "CPPUNIT_TEST_SUITE(",
];
1031
// Symbol-detection patterns for C sources.
const SP_C: SymbolPatterns = SymbolPatterns {
    // C has no function keyword; detect by common return types that precede `(` with no `=`.
    functions: &[],
    // Leading type names / storage-class specifiers used for that return-type heuristic.
    functions_prefix_paren: &[
        "void ",
        "int ",
        "char ",
        "float ",
        "double ",
        "long ",
        "unsigned ",
        "size_t ",
        "static ",
        "inline ",
        "const ",
        "extern ",
    ],
    // C has no classes; aggregate type declarations are reported in this slot instead.
    classes: &[
        "struct ",
        "typedef struct ",
        "union ",
        "typedef union ",
        "typedef enum ",
    ],
    variables: &[],
    imports: &["#include "],
    // Test, assertion and suite patterns are the lists shared with C++.
    tests: TEST_PATTERNS_C_CPP,
    assertions: ASSERT_PATTERNS_C_CPP,
    test_suites: SUITE_PATTERNS_C_CPP,
};
1062
// Symbol-detection patterns for C++ sources.
const SP_CPP: SymbolPatterns = SymbolPatterns {
    // C++ specific function keyword-prefixes; return-type-led patterns use functions_prefix_paren.
    functions: &[
        "virtual ",  // virtual method declaration/definition
        "explicit ", // explicit constructor modifier
        "~",         // destructor (e.g. ~MyClass())
        "operator",  // operator overload (operator==, operator+, …)
    ],
    // Superset of the C list: adds `bool `, `auto ` and `constexpr `.
    functions_prefix_paren: &[
        "void ",
        "bool ",
        "int ",
        "char ",
        "float ",
        "double ",
        "long ",
        "unsigned ",
        "size_t ",
        "auto ",
        "static ",
        "inline ",
        "constexpr ",
        "const ",
        "extern ",
    ],
    // `template<` (no space) is the dominant modern style alongside `template ` (with space).
    classes: &["class ", "struct ", "namespace ", "template ", "template<"],
    variables: &[],
    imports: &["#include "],
    // Test, assertion and suite patterns are the lists shared with C.
    tests: TEST_PATTERNS_C_CPP,
    assertions: ASSERT_PATTERNS_C_CPP,
    test_suites: SUITE_PATTERNS_C_CPP,
};
1096
// Symbol-detection patterns for shell scripts.  No class or test-framework patterns are defined.
const SP_SHELL: SymbolPatterns = SymbolPatterns {
    // Only the `function name` form is listed; there is no pattern for the `name() {` form.
    functions: &["function "],
    functions_prefix_paren: &[],
    classes: &[],
    variables: &["declare ", "local ", "export "],
    // `source file` and its `. file` (dot command) spelling.
    imports: &["source ", ". "],
    tests: &[],
    assertions: &[],
    test_suites: &[],
};
1107
// Symbol-detection patterns for PowerShell scripts.
const SP_POWERSHELL: SymbolPatterns = SymbolPatterns {
    // Both common capitalisations of the keyword are listed explicitly.
    functions: &["function ", "Function "],
    functions_prefix_paren: &[],
    classes: &["class "],
    variables: &[],
    imports: &["Import-Module ", "using "],
    // Pester test framework
    tests: &["Describe ", "It ", "Context "],
    // No assertion or suite patterns are defined for PowerShell.
    assertions: &[],
    test_suites: &[],
};
1119
// Symbol-detection patterns for Kotlin sources.
const SP_KOTLIN: SymbolPatterns = SymbolPatterns {
    // `fun` with the modifier combinations listed below; other combinations are not covered.
    functions: &[
        "fun ",
        "private fun ",
        "public fun ",
        "protected fun ",
        "internal fun ",
        "override fun ",
        "suspend fun ",
        "abstract fun ",
        "open fun ",
        "private suspend fun ",
        "public suspend fun ",
    ],
    functions_prefix_paren: &[],
    // Note: `companion object` has no trailing space, unlike the other entries.
    classes: &[
        "class ",
        "data class ",
        "sealed class ",
        "abstract class ",
        "open class ",
        "object ",
        "companion object",
        "interface ",
        "enum class ",
        "annotation class ",
    ],
    variables: &["val ", "var ", "private val ", "private var ", "const val "],
    imports: &["import "],
    // JUnit 4/5, KotlinTest, Kotest
    // The two quoted entries match lines that open with a string title such as `"should …`
    // (presumably the string-spec style of Kotest/KotlinTest — confirm).
    tests: &[
        "@Test",
        "@ParameterizedTest",
        "@RepeatedTest",
        "\"should ",
        "\"it ",
    ],
    // JUnit-style `assert*(` calls followed by `should*(` matcher calls.
    assertions: &[
        "assertEquals(",
        "assertNotEquals(",
        "assertTrue(",
        "assertFalse(",
        "assertNull(",
        "assertNotNull(",
        "assertThat(",
        "assertThrows(",
        "shouldBe(",
        "shouldNotBe(",
        "shouldThrow(",
    ],
    test_suites: &[],
};
1172
// Symbol-detection patterns for Swift sources.
const SP_SWIFT: SymbolPatterns = SymbolPatterns {
    // `func` with the modifier combinations listed below; other combinations are not covered.
    functions: &[
        "func ",
        "private func ",
        "public func ",
        "internal func ",
        "override func ",
        "open func ",
        "static func ",
        "class func ",
        "mutating func ",
        "private static func ",
        "public static func ",
    ],
    functions_prefix_paren: &[],
    // Every nominal-type-like declaration (including `protocol`, `extension`, `actor`) is
    // reported in the `classes` slot.
    classes: &[
        "class ",
        "struct ",
        "protocol ",
        "enum ",
        "extension ",
        "actor ",
        "public class ",
        "private class ",
        "open class ",
        "final class ",
        "public struct ",
        "private struct ",
        "public protocol ",
    ],
    variables: &[
        "var ",
        "let ",
        "private var ",
        "private let ",
        "static var ",
        "static let ",
    ],
    imports: &["import "],
    // XCTest: test functions are named test* by convention; Swift Testing: @Test attribute
    tests: &["func test", "func Test", "@Test"],
    // XCTest assertion functions, plus the Swift Testing `#expect(` macro.
    assertions: &[
        "XCTAssertEqual(",
        "XCTAssertNotEqual(",
        "XCTAssertTrue(",
        "XCTAssertFalse(",
        "XCTAssertNil(",
        "XCTAssertNotNil(",
        "XCTAssertGreaterThan(",
        "XCTAssertLessThan(",
        "XCTAssertThrowsError(",
        "XCTAssertNoThrow(",
        "#expect(",
    ],
    test_suites: &[],
};
1229
// Symbol-detection patterns for Ruby sources.
const SP_RUBY: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "private def ", "protected def "],
    functions_prefix_paren: &[],
    // Modules are reported alongside classes.
    classes: &["class ", "module "],
    variables: &[],
    imports: &["require ", "require_relative "],
    // RSpec / minitest
    tests: &["it ", "it(", "describe ", "context ", "test "],
    // No assertion or suite patterns are defined for Ruby.
    assertions: &[],
    test_suites: &[],
};
1241
// Symbol-detection patterns for Scala sources.
const SP_SCALA: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "private def ", "protected def ", "override def "],
    functions_prefix_paren: &[],
    // Objects and traits are reported alongside classes.
    classes: &[
        "class ",
        "case class ",
        "abstract class ",
        "sealed class ",
        "object ",
        "trait ",
    ],
    variables: &["val ", "var ", "lazy val "],
    imports: &["import "],
    // ScalaTest / MUnit: FunSuite test("..."), FlatSpec it("..."), AnyWordSpec "..." should
    // NOTE(review): only the call forms below are listed; no entry covers the quoted
    // `"..." should` WordSpec form mentioned above.
    tests: &["test(", "it(", "describe("],
    assertions: &[],
    test_suites: &[],
};
1260
// Symbol-detection patterns for PHP sources.
const SP_PHP: SymbolPatterns = SymbolPatterns {
    // `function` with the visibility / modifier combinations listed below.
    functions: &[
        "function ",
        "public function ",
        "private function ",
        "protected function ",
        "static function ",
        "abstract function ",
        "final function ",
        "public static function ",
        "private static function ",
        "protected static function ",
    ],
    functions_prefix_paren: &[],
    // Interfaces, traits and enums are reported alongside classes.
    classes: &[
        "class ",
        "abstract class ",
        "final class ",
        "interface ",
        "trait ",
        "enum ",
    ],
    variables: &[],
    imports: &[
        "use ",
        "require ",
        "require_once ",
        "include ",
        "include_once ",
    ],
    // PHPUnit: test methods are named test*, or carry the `#[Test]` / `#[DataProvider(...)]`
    // attributes.  NOTE(review): the docblock `@test` annotation has no pattern here.
    tests: &[
        "public function test",
        "function test",
        "#[Test]",
        "#[DataProvider(",
    ],
    assertions: &[],
    test_suites: &[],
};
1301
// Symbol-detection patterns for Elixir sources.
const SP_ELIXIR: SymbolPatterns = SymbolPatterns {
    // Public and private (`…p`) definitions of functions, macros and guards.
    functions: &[
        "def ",
        "defp ",
        "defmacro ",
        "defmacrop ",
        "defguard ",
        "defguardp ",
    ],
    functions_prefix_paren: &[],
    // Modules, protocols and protocol implementations are reported in the `classes` slot.
    classes: &["defmodule ", "defprotocol ", "defimpl "],
    variables: &[],
    imports: &["import ", "alias ", "use ", "require "],
    // ExUnit
    tests: &["test ", "describe "],
    assertions: &[],
    test_suites: &[],
};
1320
// Symbol-detection patterns for Erlang sources.  Only module attributes are recognised:
// no function, variable or test patterns are defined.
const SP_ERLANG: SymbolPatterns = SymbolPatterns {
    functions: &[],
    functions_prefix_paren: &[],
    // The `-module(` attribute is reported in the `classes` slot.
    classes: &["-module("],
    variables: &[],
    imports: &["-import(", "-include(", "-include_lib("],
    tests: &[],
    assertions: &[],
    test_suites: &[],
};
1331
// Symbol-detection patterns for F# sources.
const SP_FSHARP: SymbolPatterns = SymbolPatterns {
    // Every `let` binding is listed as a function pattern, together with member definitions.
    functions: &[
        "let ",
        "let rec ",
        "member ",
        "override ",
        "abstract member ",
    ],
    functions_prefix_paren: &[],
    classes: &["type "],
    // NOTE(review): `let mutable ` also begins with the `let ` function pattern above; how
    // that overlap resolves depends on the matcher, which is not shown here.
    variables: &["let mutable "],
    imports: &["open "],
    // NUnit / xUnit attributes on their own line; FsUnit uses [<Test>] / [<Fact>]
    tests: &["[<Test>]", "[<Fact>]", "[<Theory>]", "[<TestCase("],
    assertions: &[],
    test_suites: &[],
};
1349
// Symbol-detection patterns for Groovy sources.
const SP_GROOVY: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "private def ", "public def ", "protected def "],
    functions_prefix_paren: &[],
    classes: &["class ", "abstract class ", "interface ", "enum ", "trait "],
    variables: &[],
    imports: &["import "],
    // Spock framework: feature methods; JUnit annotations
    // `def "` matches string-named feature methods; the `given:` / `when:` / `then:` /
    // `expect:` entries are Spock block labels.
    // NOTE(review): the block labels sit in `tests`, so a single feature method can produce
    // several matches — confirm that is the intended counting.
    tests: &["def \"", "@Test", "given:", "when:", "then:", "expect:"],
    assertions: &[],
    test_suites: &[],
};
1361
// Symbol-detection patterns for Haskell sources.  Only type-level declarations and imports
// are recognised; no function, variable or test patterns are defined.
const SP_HASKELL: SymbolPatterns = SymbolPatterns {
    functions: &[],
    functions_prefix_paren: &[],
    // Type classes and data / newtype / type-synonym declarations share the `classes` slot.
    classes: &["class ", "data ", "newtype ", "type "],
    variables: &[],
    imports: &["import "],
    tests: &[],
    assertions: &[],
    test_suites: &[],
};
1372
// Symbol-detection patterns for Lua sources.
const SP_LUA: SymbolPatterns = SymbolPatterns {
    functions: &["function ", "local function "],
    functions_prefix_paren: &[],
    classes: &[],
    // NOTE(review): `local function ` above also begins with `local `; how that overlap
    // resolves depends on the matcher, which is not shown here.
    variables: &["local "],
    // No import patterns are defined (there is no entry for `require`).
    imports: &[],
    // busted test framework
    tests: &["it(", "describe(", "pending("],
    assertions: &[],
    test_suites: &[],
};
1384
// Symbol-detection patterns for Nim sources.
const SP_NIM: SymbolPatterns = SymbolPatterns {
    // All of Nim's routine-declaring keywords are listed as function patterns.
    functions: &[
        "proc ",
        "func ",
        "method ",
        "iterator ",
        "converter ",
        "template ",
        "macro ",
    ],
    functions_prefix_paren: &[],
    classes: &["type "],
    variables: &["var ", "let ", "const "],
    imports: &["import ", "from "],
    // unittest module
    tests: &["test "],
    assertions: &[],
    test_suites: &[],
};
1404
// Symbol-detection patterns for Objective-C sources.
const SP_OBJECTIVEC: SymbolPatterns = SymbolPatterns {
    // Instance (`- (`) and class (`+ (`) method declarations, written with a space before `(`.
    functions: &["- (", "+ ("],
    functions_prefix_paren: &[],
    classes: &["@interface ", "@implementation ", "@protocol "],
    variables: &[],
    imports: &["#import ", "#include "],
    // XCTest: test methods start with - (void)test
    tests: &["- (void)test"],
    // The same ten XCTest assertion calls that the Swift table lists.
    assertions: &[
        "XCTAssertEqual(",
        "XCTAssertNotEqual(",
        "XCTAssertTrue(",
        "XCTAssertFalse(",
        "XCTAssertNil(",
        "XCTAssertNotNil(",
        "XCTAssertGreaterThan(",
        "XCTAssertLessThan(",
        "XCTAssertThrowsError(",
        "XCTAssertNoThrow(",
    ],
    test_suites: &[],
};
1427
// Symbol-detection patterns for OCaml sources.  No test-framework patterns are defined.
const SP_OCAML: SymbolPatterns = SymbolPatterns {
    // Every `let` / `let rec` binding is listed as a function pattern; there are no
    // separate variable patterns.
    functions: &["let ", "let rec "],
    functions_prefix_paren: &[],
    classes: &["type ", "module ", "class "],
    variables: &[],
    imports: &["open "],
    tests: &[],
    assertions: &[],
    test_suites: &[],
};
1438
// Symbol-detection patterns for Perl sources.  No test-framework patterns are defined.
const SP_PERL: SymbolPatterns = SymbolPatterns {
    functions: &["sub "],
    functions_prefix_paren: &[],
    // `package` declarations are reported in the `classes` slot.
    classes: &["package "],
    variables: &["my ", "our ", "local "],
    imports: &["use ", "require "],
    tests: &[],
    assertions: &[],
    test_suites: &[],
};
1449
// Symbol-detection patterns for Clojure sources.  Every pattern includes the opening paren
// of the defining form.
const SP_CLOJURE: SymbolPatterns = SymbolPatterns {
    functions: &["(defn ", "(defn- ", "(defmacro ", "(defmulti "],
    functions_prefix_paren: &[],
    // Records, protocols, types and interfaces are reported in the `classes` slot.
    classes: &[
        "(defrecord ",
        "(defprotocol ",
        "(deftype ",
        "(definterface ",
    ],
    variables: &["(def ", "(defonce "],
    imports: &["(ns ", "(require "],
    // clojure.test
    tests: &["(deftest ", "(testing "],
    assertions: &[],
    test_suites: &[],
};
1466
// Symbol-detection patterns for Julia sources.
const SP_JULIA: SymbolPatterns = SymbolPatterns {
    // Only the `function` / `macro` keyword forms are listed.
    functions: &["function ", "macro "],
    functions_prefix_paren: &[],
    // Struct and abstract / primitive type declarations are reported in the `classes` slot.
    classes: &[
        "struct ",
        "mutable struct ",
        "abstract type ",
        "primitive type ",
    ],
    variables: &["const "],
    imports: &["import ", "using "],
    // Test.jl standard library
    tests: &["@test ", "@testset "],
    assertions: &[],
    test_suites: &[],
};
1483
// Symbol-detection patterns for Dart sources.  No function patterns are defined.
const SP_DART: SymbolPatterns = SymbolPatterns {
    functions: &[],
    functions_prefix_paren: &[],
    classes: &["class ", "abstract class ", "mixin ", "extension ", "enum "],
    variables: &["var ", "final ", "const ", "late "],
    imports: &["import "],
    // flutter_test / test package
    tests: &["test(", "testWidgets(", "group("],
    assertions: &[],
    test_suites: &[],
};
1495
1496const SP_R: SymbolPatterns = SymbolPatterns {
1497    functions: &[],
1498    functions_prefix_paren: &[],
1499    classes: &[],
1500    variables: &[],
1501    imports: &["library(", "source("],
1502    // testthat
1503    tests: &["test_that(", "it(", "describe(", "expect_"],
1504    assertions: &[],
1505    test_suites: &[],
1506};
1507
// Symbol-detection patterns for SQL scripts.
// Each keyword sequence is listed in an all-lowercase and an all-uppercase spelling
// (presumably because matching is case-sensitive — confirm in the matcher); mixed-case
// spellings such as `Create Table` have no entry.
const SP_SQL: SymbolPatterns = SymbolPatterns {
    // Stored functions and procedures.
    functions: &[
        "create function ",
        "create or replace function ",
        "create procedure ",
        "create or replace procedure ",
        "CREATE FUNCTION ",
        "CREATE OR REPLACE FUNCTION ",
        "CREATE PROCEDURE ",
        "CREATE OR REPLACE PROCEDURE ",
    ],
    functions_prefix_paren: &[],
    // Tables, views and schemas are reported in the `classes` slot.
    classes: &[
        "create table ",
        "create view ",
        "create schema ",
        "CREATE TABLE ",
        "CREATE VIEW ",
        "CREATE SCHEMA ",
    ],
    variables: &["declare ", "DECLARE "],
    imports: &[],
    tests: &[],
    assertions: &[],
    test_suites: &[],
};
1534
// Symbol-detection patterns for assembly sources.  Only procedure and include directives
// are recognised.
const SP_ASSEMBLY: SymbolPatterns = SymbolPatterns {
    // NOTE(review): these entries start with the keyword itself (`proc ` / `PROC `) —
    // confirm against the matcher that the `name PROC` directive order is also recognised.
    functions: &["proc ", "PROC "],
    functions_prefix_paren: &[],
    classes: &[],
    variables: &[],
    // `include` in both cases, plus the `%include` spelling.
    imports: &["include ", "INCLUDE ", "%include "],
    tests: &[],
    assertions: &[],
    test_suites: &[],
};
1545
// Symbol-detection patterns for Zig sources.  No class or import patterns are defined.
const SP_ZIG: SymbolPatterns = SymbolPatterns {
    functions: &[
        "fn ",
        "pub fn ",
        "export fn ",
        "inline fn ",
        "pub inline fn ",
    ],
    functions_prefix_paren: &[],
    classes: &[],
    // Only `var` declarations are listed; `const` declarations have no pattern.
    variables: &["var ", "pub var "],
    imports: &[],
    // Zig built-in test blocks
    // Covers named blocks (`test "name"`) and `test{`; `test {` with a space has no entry.
    tests: &["test \"", "test{"],
    assertions: &[],
    test_suites: &[],
};
1563
/// Static (non-heap) language scanning parameters.  All fields are `'static` so this struct
/// can be stored in a `static` array.  The dynamic `skip_lines` set (used only for Python
/// docstring detection) is kept in `ScanConfig` and populated by the caller after lookup.
// The bools are independent per-language toggles set field-by-field in `LANG_SCAN_TABLE`,
// hence the lint allowance below.
#[allow(clippy::struct_excessive_bools)]
#[derive(Clone, Copy)]
struct StaticLangConfig {
    /// Tokens that introduce a line comment (e.g. `//`, `#`); empty when the language has none.
    line_comments: &'static [&'static str],
    /// `(open, close)` delimiters of the block comment, or `None` when the language has none.
    block_comment: Option<(&'static str, &'static str)>,
    /// Whether `'` delimits a string literal.
    allow_single_quote_strings: bool,
    /// Whether `"` delimits a string literal.
    allow_double_quote_strings: bool,
    /// Whether triple-quoted string literals are recognised (enabled for Python and Julia).
    allow_triple_quote_strings: bool,
    /// Whether C# verbatim strings are recognised (enabled for C# only).
    allow_csharp_verbatim_strings: bool,
    /// Symbol-detection pattern table for the language (`SP_NONE` when none applies).
    symbol_patterns: SymbolPatterns,
    /// `true` for C, C++, and Objective-C (languages that have a C preprocessor).
    has_preprocessor: bool,
}
1580
/// Scan configuration handed to the lexer: the same lexical fields as `StaticLangConfig`
/// (minus `has_preprocessor`) plus the heap-allocated `skip_lines` set.
// The bools are independent per-language toggles, hence the lint allowance below.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
struct ScanConfig {
    /// Tokens that introduce a line comment; empty when the language has none.
    line_comments: &'static [&'static str],
    /// `(open, close)` delimiters of the block comment, or `None` when the language has none.
    block_comment: Option<(&'static str, &'static str)>,
    /// Whether `'` delimits a string literal.
    allow_single_quote_strings: bool,
    /// Whether `"` delimits a string literal.
    allow_double_quote_strings: bool,
    /// Whether triple-quoted string literals are recognised.
    allow_triple_quote_strings: bool,
    /// Whether C# verbatim strings are recognised.
    allow_csharp_verbatim_strings: bool,
    /// Line numbers to skip, filled in by the caller (used for Python docstring detection).
    /// NOTE(review): whether these indices are 0- or 1-based is not visible here — confirm.
    skip_lines: HashSet<usize>,
    /// Symbol-detection pattern table for the language.
    symbol_patterns: SymbolPatterns,
}
1593
1594// ── Per-family base configurations ───────────────────────────────────────────
1595//
1596// Most languages share one of two comment styles.  Define a base `const` for
1597// each family; table entries override only the fields that differ (symbol
1598// patterns, preprocessor flag, verbatim-string flag, etc.).
1599//
// C-slash family: `//` line, `/* */` block, single + double quotes.
// Covers C, C++, Obj-C, C#, Go, Java, JS/TS/Svelte/Vue, Dart, Groovy, Kotlin,
// Scala, SCSS, Swift, Rust, and Zig (Zig has no block comment → overridden).
// Entries for languages outside this family (CSS, HTML, Lua, SQL, …) also spread this
// constant, but only to fill in the fields they do not override.
const C_SLASH_BASE: StaticLangConfig = StaticLangConfig {
    line_comments: &["//"],
    block_comment: Some(("/*", "*/")),
    allow_single_quote_strings: true,
    allow_double_quote_strings: true,
    allow_triple_quote_strings: false,
    allow_csharp_verbatim_strings: false,
    // Placeholder: every table entry supplies its own symbol patterns.
    symbol_patterns: SP_NONE,
    has_preprocessor: false,
};
1613
// Hash-comment family: `#` line comment, no block comment, single + double
// quotes.  Covers Shell, Ruby, R, Perl, Elixir (each overrides only SP_*);
// Python overrides triple-quote; PowerShell and Nim override block_comment;
// Makefile and Dockerfile switch both quote styles off.
const HASH_BASE: StaticLangConfig = StaticLangConfig {
    line_comments: &["#"],
    block_comment: None,
    allow_single_quote_strings: true,
    allow_double_quote_strings: true,
    allow_triple_quote_strings: false,
    allow_csharp_verbatim_strings: false,
    // Placeholder: every table entry supplies its own symbol patterns.
    symbol_patterns: SP_NONE,
    has_preprocessor: false,
};
1627
1628/// Static language-scan configuration table — one entry per supported language.
1629/// Used by `language_scan_config` to avoid a 41-arm match.  All `SP_*` constants
1630/// referenced here are defined above in the same module.
1631static LANG_SCAN_TABLE: &[(Language, StaticLangConfig)] = &[
1632    // ── C preprocessor family ─────────────────────────────────────────────────
1633    (
1634        Language::C,
1635        StaticLangConfig {
1636            symbol_patterns: SP_C,
1637            has_preprocessor: true,
1638            ..C_SLASH_BASE
1639        },
1640    ),
1641    (
1642        Language::Cpp,
1643        StaticLangConfig {
1644            symbol_patterns: SP_CPP,
1645            has_preprocessor: true,
1646            ..C_SLASH_BASE
1647        },
1648    ),
1649    (
1650        Language::ObjectiveC,
1651        StaticLangConfig {
1652            symbol_patterns: SP_OBJECTIVEC,
1653            has_preprocessor: true,
1654            ..C_SLASH_BASE
1655        },
1656    ),
1657    // ── C-slash family ────────────────────────────────────────────────────────
1658    (
1659        Language::CSharp,
1660        StaticLangConfig {
1661            symbol_patterns: SP_CSHARP,
1662            allow_csharp_verbatim_strings: true,
1663            ..C_SLASH_BASE
1664        },
1665    ),
1666    (
1667        Language::Go,
1668        StaticLangConfig {
1669            symbol_patterns: SP_GO,
1670            ..C_SLASH_BASE
1671        },
1672    ),
1673    (
1674        Language::Java,
1675        StaticLangConfig {
1676            symbol_patterns: SP_JAVA,
1677            ..C_SLASH_BASE
1678        },
1679    ),
1680    (
1681        Language::JavaScript,
1682        StaticLangConfig {
1683            symbol_patterns: SP_JS,
1684            ..C_SLASH_BASE
1685        },
1686    ),
1687    (
1688        Language::TypeScript,
1689        StaticLangConfig {
1690            symbol_patterns: SP_TS,
1691            ..C_SLASH_BASE
1692        },
1693    ),
1694    (
1695        Language::Svelte,
1696        StaticLangConfig {
1697            symbol_patterns: SP_JS,
1698            ..C_SLASH_BASE
1699        },
1700    ),
1701    (
1702        Language::Vue,
1703        StaticLangConfig {
1704            symbol_patterns: SP_JS,
1705            ..C_SLASH_BASE
1706        },
1707    ),
1708    (
1709        Language::Dart,
1710        StaticLangConfig {
1711            symbol_patterns: SP_DART,
1712            ..C_SLASH_BASE
1713        },
1714    ),
1715    (
1716        Language::Groovy,
1717        StaticLangConfig {
1718            symbol_patterns: SP_GROOVY,
1719            ..C_SLASH_BASE
1720        },
1721    ),
1722    (
1723        Language::Kotlin,
1724        StaticLangConfig {
1725            symbol_patterns: SP_KOTLIN,
1726            ..C_SLASH_BASE
1727        },
1728    ),
1729    (
1730        Language::Scala,
1731        StaticLangConfig {
1732            symbol_patterns: SP_SCALA,
1733            ..C_SLASH_BASE
1734        },
1735    ),
1736    (
1737        Language::Scss,
1738        StaticLangConfig {
1739            symbol_patterns: SP_NONE,
1740            ..C_SLASH_BASE
1741        },
1742    ),
1743    // Rust: no single-quote char literals (they're lifetime annotations)
1744    (
1745        Language::Rust,
1746        StaticLangConfig {
1747            symbol_patterns: SP_RUST,
1748            allow_single_quote_strings: false,
1749            ..C_SLASH_BASE
1750        },
1751    ),
1752    // Swift: no single-quote strings
1753    (
1754        Language::Swift,
1755        StaticLangConfig {
1756            symbol_patterns: SP_SWIFT,
1757            allow_single_quote_strings: false,
1758            ..C_SLASH_BASE
1759        },
1760    ),
1761    // Zig: no block comment
1762    (
1763        Language::Zig,
1764        StaticLangConfig {
1765            symbol_patterns: SP_ZIG,
1766            block_comment: None,
1767            ..C_SLASH_BASE
1768        },
1769    ),
1770    // F#: `(*` … `*)` block comment, no single-quote strings
1771    (
1772        Language::FSharp,
1773        StaticLangConfig {
1774            line_comments: &["//"],
1775            block_comment: Some(("(*", "*)")),
1776            allow_single_quote_strings: false,
1777            allow_double_quote_strings: true,
1778            symbol_patterns: SP_FSHARP,
1779            ..C_SLASH_BASE
1780        },
1781    ),
1782    // ── Hash-comment family ───────────────────────────────────────────────────
1783    (
1784        Language::Shell,
1785        StaticLangConfig {
1786            symbol_patterns: SP_SHELL,
1787            ..HASH_BASE
1788        },
1789    ),
1790    (
1791        Language::Elixir,
1792        StaticLangConfig {
1793            symbol_patterns: SP_ELIXIR,
1794            ..HASH_BASE
1795        },
1796    ),
1797    (
1798        Language::Perl,
1799        StaticLangConfig {
1800            symbol_patterns: SP_PERL,
1801            ..HASH_BASE
1802        },
1803    ),
1804    (
1805        Language::R,
1806        StaticLangConfig {
1807            symbol_patterns: SP_R,
1808            ..HASH_BASE
1809        },
1810    ),
1811    (
1812        Language::Ruby,
1813        StaticLangConfig {
1814            symbol_patterns: SP_RUBY,
1815            ..HASH_BASE
1816        },
1817    ),
1818    // Python: triple-quote string literals
1819    (
1820        Language::Python,
1821        StaticLangConfig {
1822            symbol_patterns: SP_PYTHON,
1823            allow_triple_quote_strings: true,
1824            ..HASH_BASE
1825        },
1826    ),
1827    // PowerShell: `<# … #>` block comment
1828    (
1829        Language::PowerShell,
1830        StaticLangConfig {
1831            symbol_patterns: SP_POWERSHELL,
1832            block_comment: Some(("<#", "#>")),
1833            ..HASH_BASE
1834        },
1835    ),
1836    // Nim: `#[` … `]#` block comment
1837    (
1838        Language::Nim,
1839        StaticLangConfig {
1840            symbol_patterns: SP_NIM,
1841            block_comment: Some(("#[", "]#")),
1842            ..HASH_BASE
1843        },
1844    ),
1845    // Makefile / Dockerfile: `#` only, no string literals
1846    (
1847        Language::Makefile,
1848        StaticLangConfig {
1849            symbol_patterns: SP_NONE,
1850            allow_single_quote_strings: false,
1851            allow_double_quote_strings: false,
1852            ..HASH_BASE
1853        },
1854    ),
1855    (
1856        Language::Dockerfile,
1857        StaticLangConfig {
1858            symbol_patterns: SP_NONE,
1859            allow_single_quote_strings: false,
1860            allow_double_quote_strings: false,
1861            ..HASH_BASE
1862        },
1863    ),
1864    // ── Other unique comment styles ───────────────────────────────────────────
1865    // CSS / SCSS: only `/* */` block, no line comment
1866    (
1867        Language::Css,
1868        StaticLangConfig {
1869            line_comments: &[],
1870            block_comment: Some(("/*", "*/")),
1871            symbol_patterns: SP_NONE,
1872            ..C_SLASH_BASE
1873        },
1874    ),
1875    // HTML / XML: `<!-- -->` block, no line comment, no string literals
1876    (
1877        Language::Html,
1878        StaticLangConfig {
1879            line_comments: &[],
1880            block_comment: Some(("<!--", "-->")),
1881            allow_single_quote_strings: false,
1882            allow_double_quote_strings: false,
1883            symbol_patterns: SP_NONE,
1884            ..C_SLASH_BASE
1885        },
1886    ),
1887    (
1888        Language::Xml,
1889        StaticLangConfig {
1890            line_comments: &[],
1891            block_comment: Some(("<!--", "-->")),
1892            allow_single_quote_strings: false,
1893            allow_double_quote_strings: false,
1894            symbol_patterns: SP_NONE,
1895            ..C_SLASH_BASE
1896        },
1897    ),
1898    // Lua: `--` line, `--[[ ]]` block
1899    (
1900        Language::Lua,
1901        StaticLangConfig {
1902            line_comments: &["--"],
1903            block_comment: Some(("--[[", "]]")),
1904            symbol_patterns: SP_LUA,
1905            ..C_SLASH_BASE
1906        },
1907    ),
1908    // Haskell: `--` line, `{- -}` block
1909    (
1910        Language::Haskell,
1911        StaticLangConfig {
1912            line_comments: &["--"],
1913            block_comment: Some(("{-", "-}")),
1914            symbol_patterns: SP_HASKELL,
1915            ..C_SLASH_BASE
1916        },
1917    ),
1918    // SQL: `--` line, `/* */` block, single quote only
1919    (
1920        Language::Sql,
1921        StaticLangConfig {
1922            line_comments: &["--"],
1923            block_comment: Some(("/*", "*/")),
1924            allow_single_quote_strings: true,
1925            allow_double_quote_strings: false,
1926            symbol_patterns: SP_SQL,
1927            ..C_SLASH_BASE
1928        },
1929    ),
1930    // OCaml: `(*` … `*)` only, no line comment, no single-quote strings
1931    (
1932        Language::Ocaml,
1933        StaticLangConfig {
1934            line_comments: &[],
1935            block_comment: Some(("(*", "*)")),
1936            allow_single_quote_strings: false,
1937            symbol_patterns: SP_OCAML,
1938            ..C_SLASH_BASE
1939        },
1940    ),
1941    // Assembly / Clojure: `;` line comment, no block, no string literals
1942    (
1943        Language::Assembly,
1944        StaticLangConfig {
1945            line_comments: &[";"],
1946            block_comment: None,
1947            allow_single_quote_strings: false,
1948            allow_double_quote_strings: false,
1949            symbol_patterns: SP_ASSEMBLY,
1950            ..C_SLASH_BASE
1951        },
1952    ),
1953    (
1954        Language::Clojure,
1955        StaticLangConfig {
1956            line_comments: &[";"],
1957            block_comment: None,
1958            allow_single_quote_strings: false,
1959            symbol_patterns: SP_CLOJURE,
1960            ..C_SLASH_BASE
1961        },
1962    ),
1963    // Erlang: `%` line comment, no block, no single-quote strings
1964    (
1965        Language::Erlang,
1966        StaticLangConfig {
1967            line_comments: &["%"],
1968            block_comment: None,
1969            allow_single_quote_strings: false,
1970            symbol_patterns: SP_ERLANG,
1971            ..C_SLASH_BASE
1972        },
1973    ),
1974    // PHP: `//` or `#` line, `/* */` block
1975    (
1976        Language::Php,
1977        StaticLangConfig {
1978            line_comments: &["//", "#"],
1979            block_comment: Some(("/*", "*/")),
1980            symbol_patterns: SP_PHP,
1981            ..C_SLASH_BASE
1982        },
1983    ),
1984    // Julia: `#` line, `#= =#` block, double + triple quotes, no single
1985    (
1986        Language::Julia,
1987        StaticLangConfig {
1988            line_comments: &["#"],
1989            block_comment: Some(("#=", "=#")),
1990            allow_single_quote_strings: false,
1991            allow_triple_quote_strings: true,
1992            symbol_patterns: SP_JULIA,
1993            ..C_SLASH_BASE
1994        },
1995    ),
1996];
1997
/// IEEE 1045-1992 counting switches resolved for a single analysis call.
///
/// Combines the relevant `AnalysisOptions` toggles with one per-language property.
/// Crate-private; built inside `analyze_text`.
#[derive(Clone, Copy, Debug)]
struct IeeeFlags {
    /// Set for languages with a C preprocessor: C, C++, and Objective-C.
    has_preprocessor_directives: bool,
    /// Copy of `AnalysisOptions::blank_in_block_comment_as_comment`.
    blank_in_block_comment_as_comment: bool,
    /// Copy of `AnalysisOptions::collapse_continuation_lines`.
    collapse_continuation_lines: bool,
}
2009
/// Kind of string literal the lexer is currently inside.
#[derive(Clone, Copy, Debug)]
enum StringState {
    /// Literal that ends at the stored quote character.
    Single(char),
    /// Literal that ends at the stored multi-character delimiter.
    Triple(&'static str),
    /// Verbatim double-quoted literal.
    VerbatimDouble,
}
2016
/// Observations gathered while scanning one physical line.
// Each flag is an independent yes/no observation, so plain bools are kept.
#[allow(clippy::struct_excessive_bools)]
#[derive(Default, Debug)]
struct LineFacts {
    /// The line contains code (string literals included).
    has_code: bool,
    /// The line contains a line comment.
    has_single_comment: bool,
    /// The line contains, or lies within, a block comment.
    has_multi_comment: bool,
    /// The line was flagged as a docstring line.
    has_docstring: bool,
}
2025
2026/// Process one character while the lexer is inside a string literal.
2027///
2028/// Returns `(new_string_state, advance)` where `advance` is the number of chars to skip.
2029fn process_string_char(
2030    state: StringState,
2031    chars: &[char],
2032    i: usize,
2033) -> (Option<StringState>, usize) {
2034    match state {
2035        StringState::Single(delim) => {
2036            if chars[i] == '\\' {
2037                return (Some(state), 2); // skip escaped character
2038            }
2039            if chars[i] == delim {
2040                (None, 1)
2041            } else {
2042                (Some(state), 1)
2043            }
2044        }
2045        StringState::Triple(delim) => {
2046            if starts_with(chars, i, delim) {
2047                (None, delim.len())
2048            } else {
2049                (Some(state), 1)
2050            }
2051        }
2052        StringState::VerbatimDouble => {
2053            if starts_with(chars, i, "\"\"") {
2054                return (Some(state), 2); // escaped quote-quote inside verbatim string
2055            }
2056            if chars[i] == '"' {
2057                (None, 1)
2058            } else {
2059                (Some(state), 1)
2060            }
2061        }
2062    }
2063}
2064
2065/// Process one character while the lexer is inside a block comment.
2066///
2067/// Returns `(still_in_block_comment, advance)`.
2068fn process_block_comment_char(chars: &[char], i: usize, close: &str) -> (bool, usize) {
2069    if starts_with(chars, i, close) {
2070        (false, close.len())
2071    } else {
2072        (true, 1)
2073    }
2074}
2075
2076/// Attempt to begin a new string literal at position `i`.
2077///
2078/// Returns `Some((new_state, advance))` when a string opener is detected, else `None`.
2079fn try_open_string(chars: &[char], i: usize, config: &ScanConfig) -> Option<(StringState, usize)> {
2080    if config.allow_csharp_verbatim_strings && starts_with(chars, i, "@\"") {
2081        return Some((StringState::VerbatimDouble, 2));
2082    }
2083    if config.allow_triple_quote_strings {
2084        if starts_with(chars, i, "\"\"\"") {
2085            return Some((StringState::Triple("\"\"\""), 3));
2086        }
2087        if starts_with(chars, i, "'''") {
2088            return Some((StringState::Triple("'''"), 3));
2089        }
2090    }
2091    if config.allow_single_quote_strings && chars[i] == '\'' {
2092        return Some((StringState::Single('\''), 1));
2093    }
2094    if config.allow_double_quote_strings && chars[i] == '"' {
2095        return Some((StringState::Single('"'), 1));
2096    }
2097    None
2098}
2099
2100/// Advance past one character position while inside a block comment.
2101///
2102/// Updates `in_block_comment` if the closing delimiter is found and returns the
2103/// number of characters consumed. Returns 0 when no block-comment config is set
2104/// (preserving the caller's `continue`-without-advance behaviour for that impossible state).
2105fn step_through_block_comment(
2106    chars: &[char],
2107    i: usize,
2108    block_comment: Option<(&'static str, &'static str)>,
2109    in_block_comment: &mut bool,
2110) -> usize {
2111    if let Some((_, close)) = block_comment {
2112        let (still_in, advance) = process_block_comment_char(chars, i, close);
2113        *in_block_comment = still_in;
2114        return advance;
2115    }
2116    0
2117}
2118
2119/// If the character at `i` starts a block comment, return the length of the opening
2120/// delimiter so the caller can advance past it. Returns `None` if no match.
2121fn try_open_block_comment(
2122    chars: &[char],
2123    i: usize,
2124    block_comment: Option<(&'static str, &'static str)>,
2125) -> Option<usize> {
2126    let (open, _) = block_comment?;
2127    starts_with(chars, i, open).then_some(open.len())
2128}
2129
2130/// Scan a single physical line and update `facts`, `in_block_comment`, and `string_state`.
2131///
2132/// Returns `true` when the caller should break out of the per-line loop early (line comment hit).
2133fn scan_line(
2134    chars: &[char],
2135    config: &ScanConfig,
2136    facts: &mut LineFacts,
2137    in_block_comment: &mut bool,
2138    string_state: &mut Option<StringState>,
2139) {
2140    let mut i = 0usize;
2141    while i < chars.len() {
2142        // Inside a string literal — advance until the closing delimiter.
2143        if let Some(state) = *string_state {
2144            facts.has_code = true;
2145            let (new_state, advance) = process_string_char(state, chars, i);
2146            *string_state = new_state;
2147            i += advance;
2148            continue;
2149        }
2150
2151        // Inside a block comment — advance until the closing delimiter.
2152        if *in_block_comment {
2153            facts.has_multi_comment = true;
2154            i += step_through_block_comment(chars, i, config.block_comment, in_block_comment);
2155            continue;
2156        }
2157
2158        // Whitespace outside any string/comment — skip.
2159        if chars[i].is_whitespace() {
2160            i += 1;
2161            continue;
2162        }
2163
2164        // Attempt to open a string literal.
2165        if let Some((new_state, advance)) = try_open_string(chars, i, config) {
2166            facts.has_code = true;
2167            *string_state = Some(new_state);
2168            i += advance;
2169            continue;
2170        }
2171
2172        // Attempt to open a block comment.
2173        if let Some(advance) = try_open_block_comment(chars, i, config.block_comment) {
2174            facts.has_multi_comment = true;
2175            *in_block_comment = true;
2176            i += advance;
2177            continue;
2178        }
2179
2180        // Line comment — rest of the line is a comment; stop scanning.
2181        if config
2182            .line_comments
2183            .iter()
2184            .any(|prefix| starts_with(chars, i, prefix))
2185        {
2186            facts.has_single_comment = true;
2187            break;
2188        }
2189
2190        // Plain code character.
2191        facts.has_code = true;
2192        i += 1;
2193    }
2194}
2195
2196/// Apply IEEE 1045-1992 §4.2 preprocessor-directive tracking and continuation-line merging,
2197/// then emit the finalized `LineFacts` for this physical line.
2198///
2199/// Returns `None` when the line is part of a continuation sequence and should be deferred.
2200fn finalize_line_facts(
2201    facts: LineFacts,
2202    trimmed: &str,
2203    raw: &mut RawLineCounts,
2204    ieee: IeeeFlags,
2205    in_block_comment: bool,
2206    string_state: Option<StringState>,
2207    pending_continuation: &mut Option<LineFacts>,
2208) -> Option<LineFacts> {
2209    // IEEE 1045-1992 §4.2: track preprocessor/compiler directive lines (C/C++/ObjC).
2210    // A directive line is a pure code line (no comment on the same physical line) whose
2211    // trimmed content starts with '#'.
2212    if ieee.has_preprocessor_directives
2213        && facts.has_code
2214        && !facts.has_single_comment
2215        && !facts.has_multi_comment
2216        && trimmed.starts_with('#')
2217    {
2218        raw.compiler_directive_lines += 1;
2219    }
2220
2221    // IEEE 1045-1992 continuation-line handling.
2222    // A line is a continuation starter when it ends with '\' outside any comment or string.
2223    let is_continuation = ieee.collapse_continuation_lines
2224        && !in_block_comment
2225        && string_state.is_none()
2226        && trimmed.ends_with('\\');
2227
2228    if is_continuation {
2229        let pending = pending_continuation.get_or_insert_with(LineFacts::default);
2230        pending.has_code |= facts.has_code;
2231        pending.has_single_comment |= facts.has_single_comment;
2232        pending.has_multi_comment |= facts.has_multi_comment;
2233        pending.has_docstring |= facts.has_docstring;
2234        return None; // defer classification until the sequence ends
2235    }
2236
2237    // Merge any accumulated continuation facts into the final line.
2238    let emit = if let Some(pending) = pending_continuation.take() {
2239        LineFacts {
2240            has_code: pending.has_code | facts.has_code,
2241            has_single_comment: pending.has_single_comment | facts.has_single_comment,
2242            has_multi_comment: pending.has_multi_comment | facts.has_multi_comment,
2243            has_docstring: pending.has_docstring | facts.has_docstring,
2244        }
2245    } else {
2246        facts
2247    };
2248    Some(emit)
2249}
2250
2251/// Scan and classify one physical line, updating all running state in place.
2252///
2253/// Pre-classified lines (present in `config.skip_lines`) are counted as docstring-comment
2254/// lines and returned early without further analysis.
2255#[allow(clippy::needless_pass_by_value)]
2256#[allow(clippy::too_many_arguments)]
2257#[allow(clippy::many_single_char_names)] // destructuring return from count_symbols; names match field roles
2258fn process_physical_line(
2259    line: &str,
2260    line_idx: usize,
2261    config: &ScanConfig,
2262    raw: &mut RawLineCounts,
2263    in_block_comment: &mut bool,
2264    string_state: &mut Option<StringState>,
2265    pending_continuation: &mut Option<LineFacts>,
2266    ieee: IeeeFlags,
2267) {
2268    raw.total_physical_lines += 1;
2269
2270    if config.skip_lines.contains(&line_idx) {
2271        raw.docstring_comment_lines += 1;
2272        return;
2273    }
2274
2275    let trimmed = line.trim();
2276    let mut facts = LineFacts::default();
2277
2278    // IEEE 1045-1992: blank lines inside block comments are comment lines by default.
2279    // When blank_in_block_comment_as_comment is false, blank lines keep their blank
2280    // classification even while inside a block comment.
2281    if *in_block_comment && (ieee.blank_in_block_comment_as_comment || !trimmed.is_empty()) {
2282        facts.has_multi_comment = true;
2283    }
2284
2285    let chars: Vec<char> = line.chars().collect();
2286    scan_line(&chars, config, &mut facts, in_block_comment, string_state);
2287
2288    let Some(emit) = finalize_line_facts(
2289        facts,
2290        trimmed,
2291        raw,
2292        ieee,
2293        *in_block_comment,
2294        *string_state,
2295        pending_continuation,
2296    ) else {
2297        return;
2298    };
2299
2300    classify_line(raw, &emit, trimmed);
2301
2302    if emit.has_code {
2303        let (f, c, v, i, t, a, s) = count_symbols(&config.symbol_patterns, trimmed);
2304        raw.functions += f;
2305        raw.classes += c;
2306        raw.variables += v;
2307        raw.imports += i;
2308        raw.test_count += t;
2309        raw.test_assertion_count += a;
2310        raw.test_suite_count += s;
2311    }
2312}
2313
2314#[allow(clippy::needless_pass_by_value)]
2315fn analyze_generic(text: &str, config: ScanConfig, ieee: IeeeFlags) -> RawFileAnalysis {
2316    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
2317    let lines: Vec<&str> = normalized.split_terminator('\n').collect();
2318
2319    let mut raw = RawLineCounts::default();
2320    let mut warnings = Vec::new();
2321
2322    let mut in_block_comment = false;
2323    let mut string_state: Option<StringState> = None;
2324    // IEEE continuation-line state: accumulates facts across a backslash-continued sequence.
2325    let mut pending_continuation: Option<LineFacts> = None;
2326
2327    for (line_idx, line) in lines.iter().enumerate() {
2328        process_physical_line(
2329            line,
2330            line_idx,
2331            &config,
2332            &mut raw,
2333            &mut in_block_comment,
2334            &mut string_state,
2335            &mut pending_continuation,
2336            ieee,
2337        );
2338    }
2339
2340    // Flush any pending continuation that reaches end-of-file without a closing line.
2341    if let Some(pending) = pending_continuation.take() {
2342        classify_line(&mut raw, &pending, "");
2343    }
2344
2345    if in_block_comment {
2346        warnings.push("unclosed block comment detected; result is best effort".into());
2347    }
2348    if string_state.is_some() {
2349        warnings.push("unclosed string literal detected; result is best effort".into());
2350    }
2351
2352    RawFileAnalysis {
2353        raw,
2354        parse_mode: if warnings.is_empty() {
2355            ParseMode::Lexical
2356        } else {
2357            ParseMode::LexicalBestEffort
2358        },
2359        warnings,
2360    }
2361}
2362
2363const fn classify_line(raw: &mut RawLineCounts, facts: &LineFacts, trimmed: &str) {
2364    if facts.has_docstring {
2365        raw.docstring_comment_lines += 1;
2366    } else if !facts.has_code
2367        && !facts.has_single_comment
2368        && !facts.has_multi_comment
2369        && trimmed.is_empty()
2370    {
2371        raw.blank_only_lines += 1;
2372    } else if facts.has_code && facts.has_single_comment {
2373        raw.mixed_code_single_comment_lines += 1;
2374    } else if facts.has_code && facts.has_multi_comment {
2375        raw.mixed_code_multi_comment_lines += 1;
2376    } else if facts.has_code {
2377        raw.code_only_lines += 1;
2378    } else if facts.has_single_comment {
2379        raw.single_comment_only_lines += 1;
2380    } else if facts.has_multi_comment {
2381        raw.multi_comment_only_lines += 1;
2382    } else if trimmed.is_empty() {
2383        raw.blank_only_lines += 1;
2384    } else {
2385        raw.skipped_unknown_lines += 1;
2386    }
2387}
2388
2389fn count_symbols(patterns: &SymbolPatterns, trimmed: &str) -> (u64, u64, u64, u64, u64, u64, u64) {
2390    let hit = |pats: &[&str]| u64::from(pats.iter().any(|p| trimmed.starts_with(p)));
2391    // For return-type-led languages (C/C++): match prefix AND `(` present AND no `=` sits
2392    // between the prefix start and the first `(` (guards against `void* p = malloc(n)`).
2393    let fn_pp = if patterns.functions_prefix_paren.is_empty() {
2394        0
2395    } else if let Some(paren_pos) = trimmed.find('(') {
2396        if trimmed[..paren_pos].contains('=') {
2397            0
2398        } else {
2399            hit(patterns.functions_prefix_paren)
2400        }
2401    } else {
2402        0
2403    };
2404    let test_hit = hit(patterns.tests);
2405    // Lines matching a test pattern count as tests, not as plain functions or classes.
2406    // This prevents double-counting in Python (`def test_` / `class Test`) and Go
2407    // (`func Test` / `func Benchmark` / `func Fuzz`) where the same line satisfies both
2408    // a function/class prefix and a test pattern. Rust is unaffected: `#[test]` is a
2409    // standalone attribute line; the `fn` declaration on the next line does not match any
2410    // test pattern and still increments functions correctly.
2411    let fn_hit = if test_hit == 0 {
2412        hit(patterns.functions) | fn_pp
2413    } else {
2414        0
2415    };
2416    let class_hit = if test_hit == 0 {
2417        hit(patterns.classes)
2418    } else {
2419        0
2420    };
2421    (
2422        fn_hit,
2423        class_hit,
2424        hit(patterns.variables),
2425        hit(patterns.imports),
2426        test_hit,
2427        hit(patterns.assertions),
2428        hit(patterns.test_suites),
2429    )
2430}
2431
/// Report whether `needle` occurs in `chars` beginning exactly at `index`.
///
/// Returns `false` when `index` lies beyond the end of `chars` or when the slice ends
/// before the whole needle has been matched. An empty needle matches at every index up
/// to and including `chars.len()`.
///
/// The comparison walks both sequences in lockstep and never allocates; this function is
/// called for every candidate delimiter at every scanned position.
fn starts_with(chars: &[char], index: usize, needle: &str) -> bool {
    let Some(tail) = chars.get(index..) else {
        return false;
    };
    let mut remaining = tail.iter();
    // `next()` yields `None` once the slice is exhausted, which fails the comparison.
    needle
        .chars()
        .all(|expected| remaining.next() == Some(&expected))
}
2436
/// One entry of the indentation-scope stack used for Python docstring detection.
#[derive(Clone, Debug)]
struct PyContext {
    /// Indentation width (in chars) of the scope's body.
    indent: usize,
    /// True until the first statement of the scope has been examined.
    expect_docstring: bool,
}
2442
2443/// Update `contexts` to pop any scopes that the current `indent` has outdented past.
2444fn py_pop_outdented_contexts(contexts: &mut Vec<PyContext>, indent: usize) {
2445    while contexts.len() > 1 && indent < contexts.last().map_or(0, |c| c.indent) {
2446        contexts.pop();
2447    }
2448}
2449
2450/// Handle `pending_block_indent` transition: push a new docstring-expecting context when we
2451/// detect the first indented line of a new block, or cancel the pending state otherwise.
2452fn py_handle_pending_indent(
2453    pending_block_indent: &mut Option<usize>,
2454    contexts: &mut Vec<PyContext>,
2455    indent: usize,
2456    trimmed: &str,
2457) {
2458    let Some(base_indent) = *pending_block_indent else {
2459        return;
2460    };
2461    if indent > base_indent {
2462        contexts.push(PyContext {
2463            indent,
2464            expect_docstring: true,
2465        });
2466        *pending_block_indent = None;
2467    } else if !trimmed.starts_with('@') {
2468        *pending_block_indent = None;
2469    }
2470}
2471
2472/// Check whether the current line is a docstring opener in the current context.
2473///
2474/// If it is, records the line, adjusts `ctx.expect_docstring`, and optionally sets
2475/// `active_docstring` for multi-line docstrings. Returns `true` when the caller should
2476/// `continue` to the next line.
2477fn py_try_record_docstring(
2478    ctx: &mut PyContext,
2479    trimmed: &str,
2480    idx: usize,
2481    docstring_lines: &mut HashSet<usize>,
2482    active_docstring: &mut Option<(&'static str, usize)>,
2483) -> bool {
2484    if !ctx.expect_docstring {
2485        return false;
2486    }
2487    if let Some(delim) = docstring_delimiter(trimmed) {
2488        docstring_lines.insert(idx);
2489        ctx.expect_docstring = false;
2490        if !closes_triple_docstring(trimmed, delim, true) {
2491            *active_docstring = Some((delim, idx));
2492        }
2493        return true;
2494    }
2495    ctx.expect_docstring = false;
2496    false
2497}
2498
2499/// Advance through an active multi-line docstring: marks the current line and clears
2500/// `active_docstring` when the closing delimiter is found. Returns `true` when the caller
2501/// should `continue` to the next line (i.e. we were inside a docstring).
2502fn track_active_docstring(
2503    active_docstring: &mut Option<(&'static str, usize)>,
2504    docstring_lines: &mut HashSet<usize>,
2505    idx: usize,
2506    trimmed: &str,
2507) -> bool {
2508    let Some((delim, start_line)) = *active_docstring else {
2509        return false;
2510    };
2511    docstring_lines.insert(idx);
2512    if closes_triple_docstring(trimmed, delim, idx == start_line) {
2513        *active_docstring = None;
2514    }
2515    true
2516}
2517
2518/// Attempt to record a docstring opener using the top of the context stack.
2519/// Returns `true` when the caller should `continue` to the next line.
2520fn try_record_docstring_if_context(
2521    contexts: &mut [PyContext],
2522    trimmed: &str,
2523    idx: usize,
2524    docstring_lines: &mut HashSet<usize>,
2525    active_docstring: &mut Option<(&'static str, usize)>,
2526) -> bool {
2527    let Some(ctx) = contexts.last_mut() else {
2528        return false;
2529    };
2530    py_try_record_docstring(ctx, trimmed, idx, docstring_lines, active_docstring)
2531}
2532
/// Mark every line from the start of a still-open docstring through the last line.
///
/// Does nothing when `active_docstring` is `None`.
fn mark_unclosed_docstring_lines(
    active_docstring: Option<&(&'static str, usize)>,
    docstring_lines: &mut HashSet<usize>,
    num_lines: usize,
) {
    if let Some((_, start_line)) = active_docstring {
        docstring_lines.extend(*start_line..num_lines);
    }
}
2545
2546fn detect_python_docstring_lines(text: &str) -> HashSet<usize> {
2547    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
2548    let lines: Vec<&str> = normalized.split_terminator('\n').collect();
2549
2550    let mut docstring_lines = HashSet::new();
2551    let mut contexts = vec![PyContext {
2552        indent: 0,
2553        expect_docstring: true,
2554    }];
2555    let mut pending_block_indent: Option<usize> = None;
2556    let mut active_docstring: Option<(&'static str, usize)> = None;
2557
2558    for (idx, line) in lines.iter().enumerate() {
2559        let trimmed = line.trim();
2560        let indent = leading_indent(line);
2561
2562        if track_active_docstring(&mut active_docstring, &mut docstring_lines, idx, trimmed) {
2563            continue;
2564        }
2565
2566        // Blank lines and comment lines don't affect docstring detection.
2567        if trimmed.is_empty() || trimmed.starts_with('#') {
2568            continue;
2569        }
2570
2571        py_pop_outdented_contexts(&mut contexts, indent);
2572        py_handle_pending_indent(&mut pending_block_indent, &mut contexts, indent, trimmed);
2573
2574        if try_record_docstring_if_context(
2575            &mut contexts,
2576            trimmed,
2577            idx,
2578            &mut docstring_lines,
2579            &mut active_docstring,
2580        ) {
2581            continue;
2582        }
2583
2584        if is_python_block_header(trimmed) {
2585            pending_block_indent = Some(indent);
2586        }
2587    }
2588
2589    mark_unclosed_docstring_lines(active_docstring.as_ref(), &mut docstring_lines, lines.len());
2590
2591    docstring_lines
2592}
2593
/// Count the whitespace characters at the start of `line`.
///
/// The result is a character count, not a byte length or a visual column: a tab counts
/// as one.
fn leading_indent(line: &str) -> usize {
    let mut width = 0usize;
    for ch in line.chars() {
        if !ch.is_whitespace() {
            break;
        }
        width += 1;
    }
    width
}
2597
/// Report whether `trimmed` is a `def`, `async def` or `class` header ending in `:`.
fn is_python_block_header(trimmed: &str) -> bool {
    const HEADER_KEYWORDS: [&str; 3] = ["def ", "async def ", "class "];

    trimmed.ends_with(':')
        && HEADER_KEYWORDS
            .iter()
            .any(|keyword| trimmed.starts_with(keyword))
}
2604
/// Return the triple-quote delimiter that `trimmed` starts with, if any.
///
/// Any run of string-prefix letters (`r`, `u`, `b`, `f`, in either case) in front of the
/// quotes is skipped before the check.
fn docstring_delimiter(trimmed: &str) -> Option<&'static str> {
    const DELIMS: [&str; 2] = ["\"\"\"", "'''"];

    let after_prefix = trimmed.trim_start_matches(|c: char| {
        matches!(c, 'r' | 'R' | 'u' | 'U' | 'b' | 'B' | 'f' | 'F')
    });
    DELIMS
        .iter()
        .copied()
        .find(|delim| after_prefix.starts_with(delim))
}
2626
/// Report whether `trimmed` contains the delimiter that closes a triple-quoted string.
///
/// On the line where the string opened (`same_line_as_start`), the opening delimiter is
/// itself an occurrence, so two non-overlapping occurrences of `delim` are required. On
/// any later line one occurrence is enough.
///
/// Counting stops as soon as enough occurrences have been seen, and the search always
/// terminates, including for an empty `delim`.
fn closes_triple_docstring(trimmed: &str, delim: &str, same_line_as_start: bool) -> bool {
    let required = if same_line_as_start { 2 } else { 1 };
    trimmed.matches(delim).take(required).count() == required
}
2641
2642/// Tree-sitter-backed adapters (compiled only when the `tree-sitter` feature is enabled).
2643///
2644/// When parsing succeeds the result is used directly; on any failure the caller falls back
2645/// to the lexical state machine.
2646#[cfg(feature = "tree-sitter")]
2647pub mod ts {
2648    use tree_sitter::Node;
2649
2650    use super::{ParseMode, RawFileAnalysis, RawLineCounts};
2651
2652    /// Configuration for which AST node kinds map to symbols in this grammar.
2653    struct SymbolKinds {
2654        /// Node kind name for function definitions (e.g. `"function_definition"`).
2655        function_def: &'static str,
2656        /// Node kind name for class definitions (e.g. `"class_definition"`).
2657        class_def: &'static str,
2658        /// Name field of a function node that, when it starts with this prefix, marks a test.
2659        /// Empty string disables test-prefix detection.
2660        test_fn_prefix: &'static str,
2661        /// Name field of a class node that, when it starts with this prefix, marks a test.
2662        /// Empty string disables test-prefix detection.
2663        test_class_prefix: &'static str,
2664        /// When non-empty, `call` nodes whose `function` is an `attribute` access and whose
2665        /// attribute identifier starts with this prefix are counted as test assertions.
2666        /// Used for Python `self.assertXxx(...)` detection.
2667        assertion_attr_prefix: &'static str,
2668    }
2669
2670    impl SymbolKinds {
2671        const fn none() -> Self {
2672            Self {
2673                function_def: "",
2674                class_def: "",
2675                test_fn_prefix: "",
2676                test_class_prefix: "",
2677                assertion_attr_prefix: "",
2678            }
2679        }
2680    }
2681
2682    /// Classify every line of `text` using a tree-sitter grammar.
2683    ///
2684    /// `comment_node_kinds` — node type names that represent comments in this grammar
2685    /// `docstring_stmt_kind` — optional parent node type whose direct `string` child is a docstring
2686    /// `symbols` — AST node kinds used to populate symbol counters
2687    fn analyze_lines(
2688        text: &str,
2689        ts_language: &tree_sitter::Language,
2690        comment_node_kinds: &[&str],
2691        docstring_stmt_kind: Option<&str>,
2692        symbols: &SymbolKinds,
2693    ) -> Option<RawFileAnalysis> {
2694        let mut parser = tree_sitter::Parser::new();
2695        parser.set_language(ts_language).ok()?;
2696        let tree = parser.parse(text, None)?;
2697
2698        let lines: Vec<&str> = text.split_terminator('\n').collect();
2699        let n = lines.len();
2700
2701        let mut has_code = vec![false; n];
2702        let mut has_comment = vec![false; n];
2703        let mut comment_is_block = vec![false; n];
2704        let mut has_docstring = vec![false; n];
2705
2706        // Walk every node in the tree and mark line arrays.
2707        let mut ctx = VisitCtx {
2708            source: text.as_bytes(),
2709            comment_kinds: comment_node_kinds,
2710            docstring_stmt_kind,
2711            has_code: &mut has_code,
2712            has_comment: &mut has_comment,
2713            comment_is_block: &mut comment_is_block,
2714            has_docstring: &mut has_docstring,
2715        };
2716        visit(tree.root_node(), &mut ctx);
2717
2718        let mut raw = RawLineCounts::default();
2719        classify_ts_lines(
2720            &lines,
2721            &has_code,
2722            &has_comment,
2723            &comment_is_block,
2724            &has_docstring,
2725            &mut raw,
2726        );
2727
2728        // Symbol counting: walk the AST a second time to collect function/class/test counts.
2729        if !symbols.function_def.is_empty() || !symbols.class_def.is_empty() {
2730            count_symbols(tree.root_node(), text.as_bytes(), symbols, &mut raw);
2731        }
2732
2733        Some(RawFileAnalysis {
2734            raw,
2735            parse_mode: ParseMode::TreeSitter,
2736            warnings: Vec::new(),
2737        })
2738    }
2739
2740    /// Recurse into every direct child of `node`.
2741    fn recurse_children(node: Node, source: &[u8], kinds: &SymbolKinds, raw: &mut RawLineCounts) {
2742        for i in 0..node.child_count() {
2743            #[allow(clippy::cast_possible_truncation)]
2744            if let Some(child) = node.child(i as u32) {
2745                count_symbols(child, source, kinds, raw);
2746            }
2747        }
2748    }
2749
2750    /// Handle a function-definition node. Returns `true` if the node matched.
2751    fn try_count_function(
2752        node: Node,
2753        source: &[u8],
2754        kinds: &SymbolKinds,
2755        raw: &mut RawLineCounts,
2756    ) -> bool {
2757        if kinds.function_def.is_empty() || node.kind() != kinds.function_def {
2758            return false;
2759        }
2760        let name = node
2761            .child_by_field_name("name")
2762            .and_then(|n| n.utf8_text(source).ok())
2763            .unwrap_or("");
2764        if !kinds.test_fn_prefix.is_empty() && name.starts_with(kinds.test_fn_prefix) {
2765            raw.test_count += 1;
2766        } else {
2767            raw.functions += 1;
2768        }
2769        recurse_children(node, source, kinds, raw);
2770        true
2771    }
2772
2773    /// Handle a class-definition node. Returns `true` if the node matched.
2774    fn try_count_class(
2775        node: Node,
2776        source: &[u8],
2777        kinds: &SymbolKinds,
2778        raw: &mut RawLineCounts,
2779    ) -> bool {
2780        if kinds.class_def.is_empty() || node.kind() != kinds.class_def {
2781            return false;
2782        }
2783        let name = node
2784            .child_by_field_name("name")
2785            .and_then(|n| n.utf8_text(source).ok())
2786            .unwrap_or("");
2787        if !kinds.test_class_prefix.is_empty() && name.starts_with(kinds.test_class_prefix) {
2788            raw.test_count += 1;
2789        } else {
2790            raw.classes += 1;
2791        }
2792        recurse_children(node, source, kinds, raw);
2793        true
2794    }
2795
2796    /// Handle an assertion call node. Returns `true` if the node matched (skips recursion
2797    /// into arguments, preserving "don't double-count test bodies" semantics).
2798    fn try_count_assertion(
2799        node: Node,
2800        source: &[u8],
2801        kinds: &SymbolKinds,
2802        raw: &mut RawLineCounts,
2803    ) -> bool {
2804        if kinds.assertion_attr_prefix.is_empty() || node.kind() != "call" {
2805            return false;
2806        }
2807        let Some(func) = node.child_by_field_name("function") else {
2808            return false;
2809        };
2810        if func.kind() != "attribute" {
2811            return false;
2812        }
2813        let attr_text = func
2814            .child_by_field_name("attribute")
2815            .and_then(|n| n.utf8_text(source).ok())
2816            .unwrap_or("");
2817        if !attr_text.starts_with(kinds.assertion_attr_prefix) {
2818            return false;
2819        }
2820        raw.test_assertion_count += 1;
2821        true
2822    }
2823
2824    /// Walk the AST and populate `raw.functions`, `raw.classes`, `raw.test_count`,
2825    /// and `raw.test_assertion_count`.
2826    fn count_symbols(node: Node, source: &[u8], kinds: &SymbolKinds, raw: &mut RawLineCounts) {
2827        if try_count_function(node, source, kinds, raw) {
2828            return;
2829        }
2830        if try_count_class(node, source, kinds, raw) {
2831            return;
2832        }
2833        if try_count_assertion(node, source, kinds, raw) {
2834            return;
2835        }
2836        recurse_children(node, source, kinds, raw);
2837    }
2838
    /// Flags describing what kinds of content appear on a single line.
    ///
    /// Consumed by `classify_ts_line`, which maps each combination of flags to exactly
    /// one line-count bucket.
    // Four bools are the natural representation for these four independent properties.
    #[allow(clippy::struct_excessive_bools)]
    #[derive(Clone, Copy)]
    struct TsLineFlags {
        /// Code is present on the line.
        has_code: bool,
        /// A comment is present on the line.
        has_comment: bool,
        /// The comment on the line is a block comment; selects the "multi" counters
        /// over the "single" ones.
        comment_is_block: bool,
        /// A docstring is present on the line; only decisive when `has_code` is false.
        has_docstring: bool,
    }
2849
2850    /// Classify a single tree-sitter-annotated line and accumulate into `raw`.
2851    const fn classify_ts_line(trimmed: &str, flags: TsLineFlags, raw: &mut RawLineCounts) {
2852        if trimmed.is_empty() {
2853            raw.blank_only_lines += 1;
2854        } else if flags.has_docstring && !flags.has_code {
2855            raw.docstring_comment_lines += 1;
2856        } else if flags.has_code && flags.has_comment {
2857            // Classify the mixed line as single or multi based on what kind of comment is on it.
2858            if flags.comment_is_block {
2859                raw.mixed_code_multi_comment_lines += 1;
2860            } else {
2861                raw.mixed_code_single_comment_lines += 1;
2862            }
2863        } else if flags.has_comment {
2864            if flags.comment_is_block {
2865                raw.multi_comment_only_lines += 1;
2866            } else {
2867                raw.single_comment_only_lines += 1;
2868            }
2869        } else {
2870            raw.code_only_lines += 1;
2871        }
2872    }
2873
2874    /// Classify each tree-sitter-annotated line and accumulate counts into `raw`.
2875    fn classify_ts_lines(
2876        lines: &[&str],
2877        has_code: &[bool],
2878        has_comment: &[bool],
2879        comment_is_block: &[bool],
2880        has_docstring: &[bool],
2881        raw: &mut RawLineCounts,
2882    ) {
2883        for i in 0..lines.len() {
2884            raw.total_physical_lines += 1;
2885            classify_ts_line(
2886                lines[i].trim(),
2887                TsLineFlags {
2888                    has_code: has_code[i],
2889                    has_comment: has_comment[i],
2890                    comment_is_block: comment_is_block[i],
2891                    has_docstring: has_docstring[i],
2892                },
2893                raw,
2894            );
2895        }
2896    }
2897
    /// Shared state threaded through the recursive `visit` walk.
    ///
    /// The four `Vec<bool>` fields are per-row flag tables indexed by a node's row
    /// number; the visitors only write to rows that fall inside the table.
    struct VisitCtx<'a> {
        /// Raw bytes of the file being analyzed; used to read node text.
        source: &'a [u8],
        /// Node kinds that are treated as comments.
        comment_kinds: &'a [&'a str],
        /// Statement kind that may wrap a docstring; `None` disables docstring detection.
        docstring_stmt_kind: Option<&'a str>,
        /// Rows covered by a leaf node that is neither a comment nor an extra.
        has_code: &'a mut Vec<bool>,
        /// Rows covered by a comment node.
        has_comment: &'a mut Vec<bool>,
        /// Rows covered by a comment whose text starts with `/*` or `<#`.
        comment_is_block: &'a mut Vec<bool>,
        /// Rows covered by a string literal recognized as a docstring.
        has_docstring: &'a mut Vec<bool>,
    }
2907
2908    /// Mark all rows of a comment node and detect whether it is a block comment.
2909    fn visit_comment_node(node: Node, ctx: &mut VisitCtx<'_>) {
2910        let start_row = node.start_position().row;
2911        let end_row = node.end_position().row;
2912        let first_two = node
2913            .utf8_text(ctx.source)
2914            .unwrap_or("")
2915            .get(..2)
2916            .unwrap_or("");
2917        let is_block = first_two == "/*" || first_two == "<#";
2918        for row in start_row..=end_row {
2919            if row < ctx.has_comment.len() {
2920                ctx.has_comment[row] = true;
2921                if is_block {
2922                    ctx.comment_is_block[row] = true;
2923                }
2924            }
2925        }
2926    }
2927
2928    /// If `node` is an `expression_statement` whose sole named child is a string literal,
2929    /// mark those rows as docstring and return `true`.
2930    fn visit_maybe_docstring(node: Node, kind: &str, ctx: &mut VisitCtx<'_>) -> bool {
2931        let Some(stmt_kind) = ctx.docstring_stmt_kind else {
2932            return false;
2933        };
2934        if kind != stmt_kind || node.named_child_count() != 1 {
2935            return false;
2936        }
2937        let Some(child) = node.named_child(0) else {
2938            return false;
2939        };
2940        if child.kind() != "string" {
2941            return false;
2942        }
2943        let child_start = child.start_position().row;
2944        let child_end = child.end_position().row;
2945        for row in child_start..=child_end {
2946            if row < ctx.has_docstring.len() {
2947                ctx.has_docstring[row] = true;
2948            }
2949        }
2950        true
2951    }
2952
2953    /// Mark all rows of a leaf (non-comment, non-extra) node as code.
2954    fn visit_leaf_code(node: Node, ctx: &mut VisitCtx<'_>) {
2955        let start_row = node.start_position().row;
2956        let end_row = node.end_position().row;
2957        for row in start_row..=end_row {
2958            if row < ctx.has_code.len() {
2959                ctx.has_code[row] = true;
2960            }
2961        }
2962    }
2963
2964    #[allow(clippy::too_many_lines)]
2965    fn visit(node: Node, ctx: &mut VisitCtx<'_>) {
2966        let kind = node.kind();
2967
2968        // Comment node — mark rows as comment, detect block vs. line comment.
2969        if ctx.comment_kinds.contains(&kind) {
2970            visit_comment_node(node, ctx);
2971            return;
2972        }
2973
2974        // Python docstring: expression_statement whose only named child is a string literal.
2975        if visit_maybe_docstring(node, kind, ctx) {
2976            return;
2977        }
2978
2979        // Leaf non-comment node: mark as code.
2980        if node.child_count() == 0 && !node.is_extra() {
2981            visit_leaf_code(node, ctx);
2982            return;
2983        }
2984
2985        for i in 0..node.child_count() {
2986            #[allow(clippy::cast_possible_truncation)]
2987            // child_count bounded by tree-sitter u32 capacity
2988            if let Some(child) = node.child(i as u32) {
2989                visit(child, ctx);
2990            }
2991        }
2992    }
2993
    /// Symbol configuration passed to `analyze_lines` by `analyze_c`.
    // NOTE(review): `SymbolKinds::none()` presumably leaves every kind/prefix empty so
    // that no symbols are counted for C — confirm against its definition.
    const C_SYMBOLS: SymbolKinds = SymbolKinds::none();
2995
    /// Symbol configuration passed to `analyze_lines` by `analyze_python`.
    const PYTHON_SYMBOLS: SymbolKinds = SymbolKinds {
        // Presumably the tree-sitter-python node kinds for `def` and `class`
        // statements — confirm against the grammar.
        function_def: "function_definition",
        class_def: "class_definition",
        // NOTE(review): presumably functions whose name starts with this prefix are
        // tallied as tests rather than functions — the consumer is outside this view.
        test_fn_prefix: "test_",
        // Class names starting with this prefix increment `test_count` instead of `classes`.
        test_class_prefix: "Test",
        // Attribute-style calls whose attribute name starts with this prefix are
        // counted by `try_count_assertion`.
        assertion_attr_prefix: "assert",
    };
3003
3004    /// Parse C or C++ source with tree-sitter-c.
3005    #[must_use]
3006    pub fn analyze_c(text: &str) -> Option<RawFileAnalysis> {
3007        let lang: tree_sitter::Language = tree_sitter_c::LANGUAGE.into();
3008        analyze_lines(text, &lang, &["comment"], None, &C_SYMBOLS)
3009    }
3010
3011    /// Parse Python source with tree-sitter-python.
3012    #[must_use]
3013    pub fn analyze_python(text: &str) -> Option<RawFileAnalysis> {
3014        let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
3015        analyze_lines(
3016            text,
3017            &lang,
3018            &["comment"],
3019            Some("expression_statement"),
3020            &PYTHON_SYMBOLS,
3021        )
3022    }
3023}
3024
// Unit tests for line classification, language detection, and symbol counting.
// Every test drives the public entry points (`analyze_text`, `detect_language`) and
// asserts on the resulting raw counters.
#[cfg(test)]
mod tests {
    use super::*;

    // Fixture: a module docstring, two blank lines, then a function with a docstring,
    // one code line carrying a trailing comment, and a `return`. Expected: the two
    // docstring lines are counted separately from code, the trailing-comment line is
    // a mixed line, and `def`/`return` are the two code-only lines.
    #[test]
    fn python_docstrings_are_separated() {
        let input = r#""""module docs"""


def fn_a():
    """function docs"""
    value = 1  # trailing comment
    return value
"#;

        let result = analyze_text(Language::Python, input, AnalysisOptions::default());
        assert_eq!(result.raw.docstring_comment_lines, 2);
        assert_eq!(result.raw.mixed_code_single_comment_lines, 1);
        assert_eq!(result.raw.code_only_lines, 2);
    }

    // Line 1 is code followed by a `//` comment (mixed, single-line comment);
    // line 2 is a lone `/* ... */` comment (multi-line-comment-only).
    #[test]
    fn c_style_mixed_lines_are_captured() {
        let input = "int x = 1; // note\n/* block */\n";
        let result = analyze_text(Language::C, input, AnalysisOptions::default());
        assert_eq!(result.raw.mixed_code_single_comment_lines, 1);
        assert_eq!(result.raw.multi_comment_only_lines, 1);
    }

    // An extensionless path combined with a bash shebang as the first line must be
    // detected as Shell. The empty map and the `true` flag are passed as-is; their
    // meaning is defined by `detect_language`.
    #[test]
    fn detect_language_by_shebang() {
        let language = detect_language(
            Path::new("script"),
            Some("#!/usr/bin/env bash"),
            &BTreeMap::new(),
            true,
        );
        assert_eq!(language, Some(Language::Shell));
    }

    // ── count_symbols: no double-counting of test functions ──────────────────

    // Analyze a single source line (a trailing newline is appended) and return the
    // symbol counters as a tuple, in this order:
    // (functions, classes, variables, imports, test_count, test_assertion_count,
    //  test_suite_count).
    fn sym(lang: Language, line: &str) -> (u64, u64, u64, u64, u64, u64, u64) {
        let result = analyze_text(lang, &format!("{line}\n"), AnalysisOptions::default());
        let r = &result.raw;
        (
            r.functions,
            r.classes,
            r.variables,
            r.imports,
            r.test_count,
            r.test_assertion_count,
            r.test_suite_count,
        )
    }

    #[test]
    fn python_test_fn_not_double_counted() {
        // def test_ lines count as tests only, NOT as functions
        let (f, c, _, _, t, _, _) = sym(Language::Python, "def test_foo():");
        assert_eq!(f, 0, "test fn must not also increment functions");
        assert_eq!(t, 1, "must be counted as a test");
        assert_eq!(c, 0);
    }

    #[test]
    fn python_test_class_not_double_counted() {
        // class Test* lines count as tests only, NOT as classes
        let (f, c, _, _, t, _, _) = sym(Language::Python, "class TestFoo:");
        assert_eq!(c, 0, "test class must not also increment classes");
        assert_eq!(t, 1, "must be counted as a test");
        assert_eq!(f, 0);
    }

    #[test]
    fn python_regular_fn_counts_as_function() {
        // A name without the `test_` prefix is a plain function, not a test
        let (f, c, _, _, t, _, _) = sym(Language::Python, "def regular():");
        assert_eq!(f, 1, "regular function must be counted");
        assert_eq!(t, 0);
        assert_eq!(c, 0);
    }

    #[test]
    fn python_regular_class_counts_as_class() {
        // A name without the `Test` prefix is a plain class, not a test
        let (f, c, _, _, t, _, _) = sym(Language::Python, "class Regular:");
        assert_eq!(c, 1, "regular class must be counted");
        assert_eq!(t, 0);
        assert_eq!(f, 0);
    }

    #[test]
    fn go_test_fn_not_double_counted() {
        // func Test* lines count as tests only, NOT as functions
        let (f, _, _, _, t, _, _) = sym(Language::Go, "func TestFoo(t *testing.T) {");
        assert_eq!(f, 0, "Go test func must not also increment functions");
        assert_eq!(t, 1, "must be counted as a test");
    }

    #[test]
    fn go_benchmark_fn_not_double_counted() {
        // func Benchmark* lines are tallied under the test counter, NOT as functions
        let (f, _, _, _, t, _, _) = sym(Language::Go, "func BenchmarkBar(b *testing.B) {");
        assert_eq!(f, 0, "Go benchmark func must not also increment functions");
        assert_eq!(t, 1, "must be counted as a test");
    }

    #[test]
    fn go_regular_fn_counts_as_function() {
        // A func without a Test/Benchmark prefix is a plain function
        let (f, _, _, _, t, _, _) = sym(Language::Go, "func doSomething() {");
        assert_eq!(f, 1, "regular Go func must be counted");
        assert_eq!(t, 0);
    }

    #[test]
    fn rust_test_attr_counts_as_test_not_function() {
        // #[test] is a standalone attribute line — counted as a test, never as a function
        let (f, _, _, _, t, _, _) = sym(Language::Rust, "#[test]");
        assert_eq!(t, 1, "#[test] must be counted as a test");
        assert_eq!(f, 0, "#[test] attribute must not be counted as a function");
    }

    #[test]
    fn rust_fn_line_counts_as_function_not_test() {
        // The fn declaration after #[test] does NOT match any test pattern
        let (f, _, _, _, t, _, _) = sym(Language::Rust, "fn test_something() {");
        assert_eq!(f, 1, "fn declaration must count as a function");
        assert_eq!(
            t, 0,
            "fn declaration line must not be double-counted as a test"
        );
    }

    #[test]
    fn js_describe_counts_as_test_not_function() {
        // A describe(...) call line is tallied as a test even though it contains an
        // arrow function
        let (f, _, _, _, t, _, _) = sym(Language::JavaScript, "describe('suite', () => {");
        assert_eq!(t, 1, "describe must be counted as a test");
        assert_eq!(f, 0, "describe must not be counted as a function");
    }

    #[test]
    fn js_regular_fn_counts_as_function() {
        // A plain `function` declaration is a function, not a test
        let (f, _, _, _, t, _, _) = sym(Language::JavaScript, "function doWork() {");
        assert_eq!(f, 1, "JS function declaration must be counted");
        assert_eq!(t, 0);
    }
}