Skip to main content

sloc_languages/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3
4pub mod style;
5pub use style::{IndentStyle, StyleAnalysis, StyleGuideScore, StyleSignal};
6
7use std::collections::{BTreeMap, BTreeSet, HashSet};
8use std::path::Path;
9
10use serde::{Deserialize, Serialize};
11
/// Source, markup, and build-file languages this crate can identify and analyze.
///
/// Serialized by serde with `snake_case` variant names, so multi-word variants gain an
/// underscore (`CSharp` -> `c_sharp`, `JavaScript` -> `java_script`). That spelling differs
/// from [`Language::as_slug`], which returns `csharp` / `javascript`.
///
/// The derived `Ord` follows declaration order; reordering variants changes the iteration
/// order of any `BTreeSet<Language>` / `BTreeMap<Language, _>`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Language {
    C,
    Cpp,
    CSharp,
    Go,
    Java,
    JavaScript,
    Python,
    Rust,
    Shell,
    PowerShell,
    TypeScript,
    // --- Extended language support ---
    Assembly,
    Clojure,
    Css,
    Dart,
    Dockerfile,
    Elixir,
    Erlang,
    FSharp,
    Groovy,
    Haskell,
    Html,
    Julia,
    Kotlin,
    Lua,
    Makefile,
    Nim,
    ObjectiveC,
    Ocaml,
    Perl,
    Php,
    R,
    Ruby,
    Scala,
    Scss,
    Sql,
    Svelte,
    Swift,
    Vue,
    Xml,
    Zig,
}
58
59impl Language {
60    #[must_use]
61    pub const fn display_name(&self) -> &'static str {
62        match self {
63            Self::C => "C",
64            Self::Cpp => "C++",
65            Self::CSharp => "C#",
66            Self::Go => "Go",
67            Self::Java => "Java",
68            Self::JavaScript => "JavaScript",
69            Self::Python => "Python",
70            Self::Rust => "Rust",
71            Self::Shell => "Shell",
72            Self::PowerShell => "PowerShell",
73            Self::TypeScript => "TypeScript",
74            Self::Assembly => "Assembly",
75            Self::Clojure => "Clojure",
76            Self::Css => "CSS",
77            Self::Dart => "Dart",
78            Self::Dockerfile => "Dockerfile",
79            Self::Elixir => "Elixir",
80            Self::Erlang => "Erlang",
81            Self::FSharp => "F#",
82            Self::Groovy => "Groovy",
83            Self::Haskell => "Haskell",
84            Self::Html => "HTML",
85            Self::Julia => "Julia",
86            Self::Kotlin => "Kotlin",
87            Self::Lua => "Lua",
88            Self::Makefile => "Makefile",
89            Self::Nim => "Nim",
90            Self::ObjectiveC => "Objective-C",
91            Self::Ocaml => "OCaml",
92            Self::Perl => "Perl",
93            Self::Php => "PHP",
94            Self::R => "R",
95            Self::Ruby => "Ruby",
96            Self::Scala => "Scala",
97            Self::Scss => "SCSS",
98            Self::Sql => "SQL",
99            Self::Svelte => "Svelte",
100            Self::Swift => "Swift",
101            Self::Vue => "Vue",
102            Self::Xml => "XML",
103            Self::Zig => "Zig",
104        }
105    }
106
107    #[must_use]
108    pub const fn as_slug(&self) -> &'static str {
109        match self {
110            Self::C => "c",
111            Self::Cpp => "cpp",
112            Self::CSharp => "csharp",
113            Self::Go => "go",
114            Self::Java => "java",
115            Self::JavaScript => "javascript",
116            Self::Python => "python",
117            Self::Rust => "rust",
118            Self::Shell => "shell",
119            Self::PowerShell => "powershell",
120            Self::TypeScript => "typescript",
121            Self::Assembly => "assembly",
122            Self::Clojure => "clojure",
123            Self::Css => "css",
124            Self::Dart => "dart",
125            Self::Dockerfile => "dockerfile",
126            Self::Elixir => "elixir",
127            Self::Erlang => "erlang",
128            Self::FSharp => "fsharp",
129            Self::Groovy => "groovy",
130            Self::Haskell => "haskell",
131            Self::Html => "html",
132            Self::Julia => "julia",
133            Self::Kotlin => "kotlin",
134            Self::Lua => "lua",
135            Self::Makefile => "makefile",
136            Self::Nim => "nim",
137            Self::ObjectiveC => "objectivec",
138            Self::Ocaml => "ocaml",
139            Self::Perl => "perl",
140            Self::Php => "php",
141            Self::R => "r",
142            Self::Ruby => "ruby",
143            Self::Scala => "scala",
144            Self::Scss => "scss",
145            Self::Sql => "sql",
146            Self::Svelte => "svelte",
147            Self::Swift => "swift",
148            Self::Vue => "vue",
149            Self::Xml => "xml",
150            Self::Zig => "zig",
151        }
152    }
153
154    #[must_use]
155    pub fn from_name(name: &str) -> Option<Self> {
156        match name.trim().to_ascii_lowercase().as_str() {
157            "c" => Some(Self::C),
158            "cpp" | "c++" | "cplusplus" => Some(Self::Cpp),
159            "csharp" | "c#" | "cs" => Some(Self::CSharp),
160            "go" | "golang" => Some(Self::Go),
161            "java" => Some(Self::Java),
162            "javascript" | "js" => Some(Self::JavaScript),
163            "python" | "py" => Some(Self::Python),
164            "rust" | "rs" => Some(Self::Rust),
165            "shell" | "sh" | "bash" => Some(Self::Shell),
166            "powershell" | "pwsh" | "ps" => Some(Self::PowerShell),
167            "typescript" | "ts" => Some(Self::TypeScript),
168            "assembly" | "asm" => Some(Self::Assembly),
169            "clojure" | "clj" => Some(Self::Clojure),
170            "css" => Some(Self::Css),
171            "dart" => Some(Self::Dart),
172            "dockerfile" | "docker" => Some(Self::Dockerfile),
173            "elixir" | "ex" => Some(Self::Elixir),
174            "erlang" | "erl" => Some(Self::Erlang),
175            "fsharp" | "f#" | "fs" => Some(Self::FSharp),
176            "groovy" => Some(Self::Groovy),
177            "haskell" | "hs" => Some(Self::Haskell),
178            "html" | "htm" => Some(Self::Html),
179            "julia" | "jl" => Some(Self::Julia),
180            "kotlin" | "kt" => Some(Self::Kotlin),
181            "lua" => Some(Self::Lua),
182            "makefile" | "make" | "mk" => Some(Self::Makefile),
183            "nim" => Some(Self::Nim),
184            "objectivec" | "objc" | "objective-c" => Some(Self::ObjectiveC),
185            "ocaml" | "ml" => Some(Self::Ocaml),
186            "perl" | "pl" => Some(Self::Perl),
187            "php" => Some(Self::Php),
188            "r" => Some(Self::R),
189            "ruby" | "rb" => Some(Self::Ruby),
190            "scala" => Some(Self::Scala),
191            "scss" | "sass" => Some(Self::Scss),
192            "sql" => Some(Self::Sql),
193            "svelte" => Some(Self::Svelte),
194            "swift" => Some(Self::Swift),
195            "vue" => Some(Self::Vue),
196            "xml" => Some(Self::Xml),
197            "zig" => Some(Self::Zig),
198            _ => None,
199        }
200    }
201}
202
/// Per-file counters produced by the analysis: one `u64` per line category plus best-effort
/// counts of structural symbols and test constructs.
///
/// Fields marked `#[serde(default)]` deserialize to `0` when absent from the input, so
/// serialized data that lacks them can still be loaded. `Default` yields all zeroes.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct RawLineCounts {
    pub total_physical_lines: u64,
    pub blank_only_lines: u64,
    pub code_only_lines: u64,
    pub single_comment_only_lines: u64,
    pub multi_comment_only_lines: u64,
    pub mixed_code_single_comment_lines: u64,
    pub mixed_code_multi_comment_lines: u64,
    pub docstring_comment_lines: u64,
    pub skipped_unknown_lines: u64,
    /// Best-effort count of function/method definition lines detected lexically.
    #[serde(default)]
    pub functions: u64,
    /// Best-effort count of class/struct/trait/type definition lines detected lexically.
    #[serde(default)]
    pub classes: u64,
    /// Best-effort count of variable declaration lines detected lexically.
    #[serde(default)]
    pub variables: u64,
    /// Best-effort count of import/use/include statement lines detected lexically.
    #[serde(default)]
    pub imports: u64,
    /// Lines consisting solely of preprocessor/compiler directives (e.g. `#include`, `#define`
    /// in C/C++/Objective-C). Always a subset of `code_only_lines`. Controlled by
    /// `AnalysisConfig::count_compiler_directives`. IEEE 1045-1992 §4.2.
    #[serde(default)]
    pub compiler_directive_lines: u64,
    /// Best-effort count of test case / test function definition lines detected lexically
    /// (`GTest`, Catch2, `PyTest`, `JUnit`, etc.). Always a subset of `code_only_lines`.
    #[serde(default)]
    pub test_count: u64,
    /// Best-effort count of test assertion call lines detected lexically
    /// (`ASSERT_EQ`, `EXPECT_TRUE`, `assertEquals`, `Assert.AreEqual`, `assert_eq!`, etc.).
    #[serde(default)]
    pub test_assertion_count: u64,
    /// Best-effort count of test suite / fixture / group declaration lines detected lexically
    /// (`TEST_GROUP`, `BOOST_AUTO_TEST_SUITE`, `[TestClass]`, `[TestFixture]`, etc.).
    #[serde(default)]
    pub test_suite_count: u64,
}
244
/// Records which analysis strategy produced a [`RawFileAnalysis`].
///
/// Serialized by serde in `snake_case`: `lexical`, `lexical_best_effort`, `tree_sitter`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ParseMode {
    // NOTE(review): the code that assigns these variants is not in this part of the file;
    // presumably `Lexical`/`LexicalBestEffort` come from the generic scanner and `TreeSitter`
    // from the optional tree-sitter analyzers — confirm before relying on the distinction.
    Lexical,
    LexicalBestEffort,
    TreeSitter,
}
252
/// Result of analyzing one file's text, as returned by [`analyze_text`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawFileAnalysis {
    /// Line-category and symbol counters for the file.
    pub raw: RawLineCounts,
    /// Strategy that produced `raw`.
    pub parse_mode: ParseMode,
    /// Human-readable warnings gathered during analysis.
    pub warnings: Vec<String>,
    /// Lexical style-guide analysis for supported languages; `None` when no heuristics apply.
    /// Omitted from serialized output when `None`, and defaults to `None` when missing on input.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub style_analysis: Option<StyleAnalysis>,
}
262
/// IEEE 1045-1992 counting options passed from `sloc-core` (built from `AnalysisConfig`).
///
/// `analyze_text` accepts this struct so that the caller can control behaviour that the
/// standard defines as configurable parameters rather than fixed conventions.
///
/// The `Default` implementation enables `blank_in_block_comment_as_comment` and disables
/// `collapse_continuation_lines`.
#[derive(Debug, Clone, Copy)]
pub struct AnalysisOptions {
    /// When `true` (IEEE 1045-1992 default), blank lines inside block comments count as
    /// comment lines rather than blank lines.
    pub blank_in_block_comment_as_comment: bool,
    /// When `true`, backslash-continued physical lines are collapsed into a single logical
    /// line for SLOC counting purposes (IEEE logical SLOC mode). Off by default.
    pub collapse_continuation_lines: bool,
}
276
277impl Default for AnalysisOptions {
278    fn default() -> Self {
279        Self {
280            blank_in_block_comment_as_comment: true,
281            collapse_continuation_lines: false,
282        }
283    }
284}
285
286#[must_use]
287pub fn supported_languages() -> BTreeSet<Language> {
288    [
289        Language::Assembly,
290        Language::C,
291        Language::Clojure,
292        Language::Cpp,
293        Language::CSharp,
294        Language::Css,
295        Language::Dart,
296        Language::Dockerfile,
297        Language::Elixir,
298        Language::Erlang,
299        Language::FSharp,
300        Language::Go,
301        Language::Groovy,
302        Language::Haskell,
303        Language::Html,
304        Language::Java,
305        Language::JavaScript,
306        Language::Julia,
307        Language::Kotlin,
308        Language::Lua,
309        Language::Makefile,
310        Language::Nim,
311        Language::ObjectiveC,
312        Language::Ocaml,
313        Language::Perl,
314        Language::Php,
315        Language::PowerShell,
316        Language::Python,
317        Language::R,
318        Language::Ruby,
319        Language::Rust,
320        Language::Scala,
321        Language::Scss,
322        Language::Shell,
323        Language::Sql,
324        Language::Svelte,
325        Language::Swift,
326        Language::TypeScript,
327        Language::Vue,
328        Language::Xml,
329        Language::Zig,
330    ]
331    .into_iter()
332    .collect()
333}
334
335/// Detect language from a shebang line (e.g. `#!/usr/bin/env python3`).
336fn detect_by_shebang(line: &str) -> Option<Language> {
337    let lower = line.to_ascii_lowercase();
338    if !lower.starts_with("#!") {
339        return None;
340    }
341    if lower.contains("python") {
342        return Some(Language::Python);
343    }
344    if lower.contains("pwsh") || lower.contains("powershell") {
345        return Some(Language::PowerShell);
346    }
347    if lower.contains("bash")
348        || lower.contains("/sh")
349        || lower.contains("zsh")
350        || lower.contains("ksh")
351    {
352        return Some(Language::Shell);
353    }
354    if lower.contains("ruby") {
355        return Some(Language::Ruby);
356    }
357    if lower.contains("perl") {
358        return Some(Language::Perl);
359    }
360    if lower.contains("php") {
361        return Some(Language::Php);
362    }
363    if lower.contains("node") || lower.contains("nodejs") {
364        return Some(Language::JavaScript);
365    }
366    None
367}
368
369/// Detect language purely from a (lowercased) file extension.
370fn detect_by_extension(ext: &str) -> Option<Language> {
371    // Static table avoids a large match statement; each extension maps 1-to-1 to a language.
372    static EXT_MAP: &[(&str, Language)] = &[
373        ("c", Language::C),
374        ("h", Language::C),
375        ("cc", Language::Cpp),
376        ("cp", Language::Cpp),
377        ("cpp", Language::Cpp),
378        ("cxx", Language::Cpp),
379        ("hh", Language::Cpp),
380        ("hpp", Language::Cpp),
381        ("hxx", Language::Cpp),
382        ("cs", Language::CSharp),
383        ("go", Language::Go),
384        ("java", Language::Java),
385        ("js", Language::JavaScript),
386        ("mjs", Language::JavaScript),
387        ("cjs", Language::JavaScript),
388        ("py", Language::Python),
389        ("rs", Language::Rust),
390        ("sh", Language::Shell),
391        ("bash", Language::Shell),
392        ("zsh", Language::Shell),
393        ("ksh", Language::Shell),
394        ("ps1", Language::PowerShell),
395        ("psm1", Language::PowerShell),
396        ("psd1", Language::PowerShell),
397        ("ts", Language::TypeScript),
398        ("mts", Language::TypeScript),
399        ("cts", Language::TypeScript),
400        ("asm", Language::Assembly),
401        ("s", Language::Assembly),
402        ("clj", Language::Clojure),
403        ("cljs", Language::Clojure),
404        ("cljc", Language::Clojure),
405        ("edn", Language::Clojure),
406        ("css", Language::Css),
407        ("dart", Language::Dart),
408        ("ex", Language::Elixir),
409        ("exs", Language::Elixir),
410        ("erl", Language::Erlang),
411        ("hrl", Language::Erlang),
412        ("fs", Language::FSharp),
413        ("fsi", Language::FSharp),
414        ("fsx", Language::FSharp),
415        ("groovy", Language::Groovy),
416        ("gradle", Language::Groovy),
417        ("hs", Language::Haskell),
418        ("lhs", Language::Haskell),
419        ("html", Language::Html),
420        ("htm", Language::Html),
421        ("xhtml", Language::Html),
422        ("jl", Language::Julia),
423        ("kt", Language::Kotlin),
424        ("kts", Language::Kotlin),
425        ("lua", Language::Lua),
426        ("mk", Language::Makefile),
427        ("nim", Language::Nim),
428        ("nims", Language::Nim),
429        ("m", Language::ObjectiveC),
430        ("mm", Language::ObjectiveC),
431        ("ml", Language::Ocaml),
432        ("mli", Language::Ocaml),
433        ("pl", Language::Perl),
434        ("pm", Language::Perl),
435        ("t", Language::Perl),
436        ("php", Language::Php),
437        ("php3", Language::Php),
438        ("php4", Language::Php),
439        ("php5", Language::Php),
440        ("php7", Language::Php),
441        ("phtml", Language::Php),
442        ("r", Language::R),
443        ("rb", Language::Ruby),
444        ("rake", Language::Ruby),
445        ("scala", Language::Scala),
446        ("sc", Language::Scala),
447        ("scss", Language::Scss),
448        ("sass", Language::Scss),
449        ("sql", Language::Sql),
450        ("svelte", Language::Svelte),
451        ("swift", Language::Swift),
452        ("vue", Language::Vue),
453        ("xml", Language::Xml),
454        ("xsd", Language::Xml),
455        ("xsl", Language::Xml),
456        ("xslt", Language::Xml),
457        ("svg", Language::Xml),
458        ("zig", Language::Zig),
459    ];
460    EXT_MAP.iter().find_map(|&(e, l)| (e == ext).then_some(l))
461}
462
463/// Detect language from an exact filename (no extension) or well-known filename patterns.
464fn detect_by_filename(filename: &str, filename_lower: &str) -> Option<Language> {
465    // Dockerfile: exact name or Dockerfile.* variant
466    if filename == "Dockerfile"
467        || filename.starts_with("Dockerfile.")
468        || filename_lower == "dockerfile"
469    {
470        return Some(Language::Dockerfile);
471    }
472    // Makefile variants
473    if matches!(
474        filename,
475        "Makefile" | "GNUmakefile" | "makefile" | "BSDmakefile"
476    ) {
477        return Some(Language::Makefile);
478    }
479    // Ruby ecosystem files that have no extension
480    if matches!(
481        filename,
482        "Rakefile" | "Gemfile" | "Guardfile" | "Vagrantfile" | "Fastfile" | "Podfile"
483    ) {
484        return Some(Language::Ruby);
485    }
486    None
487}
488
489#[must_use]
490#[allow(clippy::too_many_lines)]
491pub fn detect_language(
492    path: &Path,
493    first_line: Option<&str>,
494    extension_overrides: &BTreeMap<String, String>,
495    shebang_detection: bool,
496) -> Option<Language> {
497    let extension = path
498        .extension()
499        .and_then(|ext| ext.to_str())
500        .map(str::to_ascii_lowercase);
501
502    // Extension override check (user-configured mappings win over everything)
503    if let Some(ext) = extension.as_ref() {
504        if let Some(override_name) = extension_overrides.get(ext.as_str()) {
505            if let Some(lang) = Language::from_name(override_name) {
506                return Some(lang);
507            }
508        }
509    }
510
511    // Filename-based detection for files that have no extension or use exact names
512    let filename = path.file_name().and_then(|s| s.to_str()).unwrap_or("");
513    let filename_lower = filename.to_ascii_lowercase();
514
515    if let Some(lang) = detect_by_filename(filename, &filename_lower) {
516        return Some(lang);
517    }
518
519    // Extension-based detection
520    if let Some(lang) = extension.as_deref().and_then(detect_by_extension) {
521        return Some(lang);
522    }
523
524    // Shebang detection (last resort — only for extensionless scripts)
525    if shebang_detection {
526        if let Some(line) = first_line {
527            if let Some(lang) = detect_by_shebang(line) {
528                return Some(lang);
529            }
530        }
531    }
532
533    None
534}
535
536#[must_use]
537pub fn analyze_text(language: Language, text: &str, options: AnalysisOptions) -> RawFileAnalysis {
538    // tree-sitter fast-paths (compiled out when feature is disabled)
539    #[cfg(feature = "tree-sitter")]
540    {
541        match language {
542            Language::C | Language::Cpp => {
543                if let Some(mut result) = ts::analyze_c(text) {
544                    result.style_analysis = style::analyze_style(language, text);
545                    return result;
546                }
547            }
548            Language::Python => {
549                if let Some(result) = ts::analyze_python(text) {
550                    return result;
551                }
552            }
553            _ => {}
554        }
555    }
556
557    let (mut config, has_preprocessor) = language_scan_config(language);
558
559    // Python docstring lines are computed from the text and cannot be a static constant.
560    if language == Language::Python {
561        config.skip_lines = detect_python_docstring_lines(text);
562    }
563
564    // C, C++, and Objective-C have a preprocessor whose directive lines are tracked separately
565    // per IEEE 1045-1992 §4.2; every other language uses base flags.
566    let flags = IeeeFlags {
567        has_preprocessor_directives: has_preprocessor,
568        blank_in_block_comment_as_comment: options.blank_in_block_comment_as_comment,
569        collapse_continuation_lines: options.collapse_continuation_lines,
570    };
571    let mut result = analyze_generic(text, config, flags);
572    result.style_analysis = style::analyze_style(language, text);
573    result
574}
575
576/// Returns the lexical scan configuration for `language` and whether it uses a C preprocessor.
577/// All fields are static constants except `skip_lines`, which is always empty here; callers that
578/// need non-empty skip sets (currently only Python) must populate the field after this call.
579///
580/// The implementation delegates to `LANG_SCAN_TABLE` (a static `&[(Language, StaticLangConfig)]`)
581/// defined below the `SP_*` symbol-pattern constants.  Each language appears exactly once in the
582/// table, so the linear scan is O(|languages|) but avoids a 41-arm `match` statement.
583fn language_scan_config(language: Language) -> (ScanConfig, bool) {
584    let cfg = LANG_SCAN_TABLE
585        .iter()
586        .find_map(|&(l, c)| (l == language).then_some(c))
587        .unwrap_or_else(|| panic!("language_scan_config: no entry for {language:?}"));
588    (
589        ScanConfig {
590            line_comments: cfg.line_comments,
591            block_comment: cfg.block_comment,
592            allow_single_quote_strings: cfg.allow_single_quote_strings,
593            allow_double_quote_strings: cfg.allow_double_quote_strings,
594            allow_triple_quote_strings: cfg.allow_triple_quote_strings,
595            allow_csharp_verbatim_strings: cfg.allow_csharp_verbatim_strings,
596            skip_lines: HashSet::new(),
597            symbol_patterns: cfg.symbol_patterns,
598        },
599        cfg.has_preprocessor,
600    )
601}
602
/// Per-language keyword prefixes used for best-effort structural symbol detection.
/// Each slice lists line prefixes (after leading whitespace is stripped) that indicate
/// a definition of that category. Empty slice = detection disabled for that category.
#[derive(Debug, Clone, Copy)]
struct SymbolPatterns {
    /// Line prefixes that indicate a function or method definition.
    functions: &'static [&'static str],
    /// Line prefixes that classify as a function only when the line ALSO contains `(`
    /// AND there is no `=` between the prefix and the first `(`.  Used for C/C++ where
    /// function definitions are led by the return type (`void`, `int`, `bool`, …) with
    /// no dedicated keyword, so the paren guard distinguishes `void f(x)` from
    /// `void* p = malloc(n)`.
    functions_prefix_paren: &'static [&'static str],
    /// Line prefixes that indicate a class/struct/trait/type definition.
    classes: &'static [&'static str],
    /// Line prefixes that indicate a variable declaration.
    variables: &'static [&'static str],
    /// Line prefixes that indicate an import/use/include statement.
    imports: &'static [&'static str],
    /// Line prefixes (after stripping leading whitespace) that indicate a test case or test
    /// function definition. Matched against code lines only, same as other symbol categories.
    tests: &'static [&'static str],
    /// Line prefixes that indicate a test assertion call (`ASSERT_EQ`, `assertEquals`,
    /// `assert_eq!`, `Assert.AreEqual`, etc.). Matched against code lines only.
    assertions: &'static [&'static str],
    /// Line prefixes that indicate a test suite / fixture / group declaration
    /// (`TEST_GROUP`, `BOOST_AUTO_TEST_SUITE`, `[TestClass]`, `[TestFixture]`, etc.).
    test_suites: &'static [&'static str],
}
628
629impl SymbolPatterns {
630    const fn none() -> Self {
631        Self {
632            functions: &[],
633            functions_prefix_paren: &[],
634            classes: &[],
635            variables: &[],
636            imports: &[],
637            tests: &[],
638            assertions: &[],
639            test_suites: &[],
640        }
641    }
642}
643
/// Pattern set with symbol detection disabled in every category.
const SP_NONE: SymbolPatterns = SymbolPatterns::none(); // all fields are &[]
645
/// Symbol-detection prefixes for Rust source.
///
/// Visibility, `async`, `unsafe`, `const`, and `extern` qualifiers are spelled out as separate
/// prefixes; combinations that are not listed here are not covered by these lists.
// NOTE(review): "let mut " is already covered by "let " if matching is purely prefix-based —
// confirm against the matcher before removing either entry.
const SP_RUST: SymbolPatterns = SymbolPatterns {
    functions: &[
        "fn ",
        "pub fn ",
        "pub(crate) fn ",
        "pub(super) fn ",
        "async fn ",
        "pub async fn ",
        "pub(crate) async fn ",
        "unsafe fn ",
        "pub unsafe fn ",
        "pub(crate) unsafe fn ",
        "const fn ",
        "pub const fn ",
        "pub(crate) const fn ",
        "extern fn ",
        "pub extern fn ",
    ],
    functions_prefix_paren: &[],
    // `impl` blocks and type aliases are counted in the class category alongside
    // struct/enum/trait definitions.
    classes: &[
        "struct ",
        "pub struct ",
        "pub(crate) struct ",
        "enum ",
        "pub enum ",
        "pub(crate) enum ",
        "trait ",
        "pub trait ",
        "pub(crate) trait ",
        "impl ",
        "impl<",
        "type ",
        "pub type ",
        "pub(crate) type ",
    ],
    variables: &["let ", "let mut "],
    imports: &["use ", "pub use ", "pub(crate) use ", "extern crate "],
    // Built-in #[test], tokio/actix async test attributes, rstest
    tests: &[
        "#[test]",
        "#[tokio::test]",
        "#[actix_web::test]",
        "#[rstest]",
        "#[test_case",
    ],
    assertions: &[
        "assert_eq!(",
        "assert_ne!(",
        "assert!(",
        "assert_matches!(",
        "assert_err!(",
        "assert_ok!(",
    ],
    test_suites: &[],
};
701
/// Symbol-detection prefixes for Python source.
///
/// Variable detection is disabled (empty slice) and only `unittest`-style `self.assert*`
/// calls are listed as assertions.
const SP_PYTHON: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "async def "],
    functions_prefix_paren: &[],
    classes: &["class "],
    variables: &[],
    imports: &["import ", "from "],
    // pytest: test_ prefix functions and Test* classes; unittest: test_ methods
    tests: &["def test_", "async def test_", "class Test"],
    assertions: &[
        "self.assertEqual(",
        "self.assertNotEqual(",
        "self.assertTrue(",
        "self.assertFalse(",
        "self.assertIsNone(",
        "self.assertIsNotNone(",
        "self.assertIn(",
        "self.assertNotIn(",
        "self.assertRaises(",
        "self.assertAlmostEqual(",
    ],
    test_suites: &[],
};
724
/// Symbol-detection prefixes for JavaScript source.
///
/// Only `function` declarations are listed as functions; arrow functions bound with
/// `const`/`let` match the variable prefixes instead.
const SP_JS: SymbolPatterns = SymbolPatterns {
    functions: &[
        "function ",
        "async function ",
        "export function ",
        "export async function ",
        "export default function ",
    ],
    functions_prefix_paren: &[],
    classes: &["class ", "export class ", "export default class "],
    variables: &[
        "var ",
        "let ",
        "const ",
        "export var ",
        "export let ",
        "export const ",
    ],
    imports: &["import "],
    // Jest/Mocha/Jasmine: describe/it/test block openers
    tests: &[
        "describe(",
        "it(",
        "test(",
        "it.each(",
        "test.each(",
        "describe.each(",
    ],
    assertions: &["expect("],
    test_suites: &[],
};
756
/// Symbol-detection prefixes for TypeScript source.
///
/// Same function, variable, import, test, and assertion prefixes as `SP_JS`; the class list
/// additionally covers `abstract`/`declare` classes and interfaces.
const SP_TS: SymbolPatterns = SymbolPatterns {
    functions: &[
        "function ",
        "async function ",
        "export function ",
        "export async function ",
        "export default function ",
    ],
    functions_prefix_paren: &[],
    classes: &[
        "class ",
        "export class ",
        "export default class ",
        "abstract class ",
        "export abstract class ",
        "interface ",
        "export interface ",
        "declare class ",
        "declare interface ",
    ],
    variables: &[
        "var ",
        "let ",
        "const ",
        "export var ",
        "export let ",
        "export const ",
    ],
    imports: &["import "],
    // Jest/Mocha/Jasmine/Vitest: describe/it/test block openers
    tests: &[
        "describe(",
        "it(",
        "test(",
        "it.each(",
        "test.each(",
        "describe.each(",
    ],
    assertions: &["expect("],
    test_suites: &[],
};
798
/// Symbol-detection prefixes for Go source.
///
/// Assertion detection is disabled (empty slice). Short variable declarations (`x := …`)
/// have no keyword prefix and are therefore not matched by `variables`.
const SP_GO: SymbolPatterns = SymbolPatterns {
    functions: &["func "],
    functions_prefix_paren: &[],
    classes: &["type "],
    variables: &["var "],
    imports: &["import "],
    // Go standard testing: Test*, Benchmark*, and Fuzz* functions (convention is practically
    // exclusive to _test.go files)
    tests: &["func Test", "func Benchmark", "func Fuzz"],
    assertions: &[],
    test_suites: &[],
};
810
/// Symbol-detection prefixes for Java source.
///
/// Function and variable detection are disabled (empty slices). Types are detected through
/// the listed modifier combinations only.
const SP_JAVA: SymbolPatterns = SymbolPatterns {
    functions: &[],
    functions_prefix_paren: &[],
    classes: &[
        "class ",
        "public class ",
        "private class ",
        "protected class ",
        "abstract class ",
        "final class ",
        "public abstract class ",
        "public final class ",
        "interface ",
        "public interface ",
        "enum ",
        "public enum ",
        "record ",
        "public record ",
        "@interface ",
    ],
    variables: &[],
    imports: &["import "],
    // JUnit 4 & 5, TestNG — annotations appear on their own line before the method
    tests: &[
        "@Test",
        "@ParameterizedTest",
        "@RepeatedTest",
        "@TestFactory",
        "@TestTemplate",
    ],
    assertions: &[
        "assertEquals(",
        "assertNotEquals(",
        "assertTrue(",
        "assertFalse(",
        "assertNull(",
        "assertNotNull(",
        "assertThat(",
        "assertThrows(",
        "assertAll(",
        "assertArrayEquals(",
        "assertIterableEquals(",
        "assertLinesMatch(",
    ],
    test_suites: &[],
};
857
/// Symbol-detection prefixes for C# source.
///
/// Function detection is disabled (empty slices); only `var` declarations are matched as
/// variables. This is the only pattern set in this group that populates `test_suites`.
const SP_CSHARP: SymbolPatterns = SymbolPatterns {
    functions: &[],
    functions_prefix_paren: &[],
    classes: &[
        "class ",
        "public class ",
        "private class ",
        "protected class ",
        "internal class ",
        "abstract class ",
        "sealed class ",
        "static class ",
        "partial class ",
        "public abstract class ",
        "public sealed class ",
        "public static class ",
        "interface ",
        "public interface ",
        "internal interface ",
        "enum ",
        "public enum ",
        "struct ",
        "public struct ",
        "record ",
        "public record ",
    ],
    variables: &["var "],
    imports: &["using "],
    // MSTest, NUnit, xUnit — attributes on their own line before the method
    tests: &[
        "[TestMethod]",
        "[Test]",
        "[Fact]",
        "[Theory]",
        "[TestCase(",
        "[DataRow(",
        "[InlineData(",
        "[MemberData(",
    ],
    assertions: &[
        "Assert.AreEqual(",
        "Assert.AreNotEqual(",
        "Assert.IsTrue(",
        "Assert.IsFalse(",
        "Assert.IsNull(",
        "Assert.IsNotNull(",
        "Assert.Equal(",
        "Assert.NotEqual(",
        "Assert.True(",
        "Assert.False(",
        "Assert.That(",
        "Assert.Contains(",
        "Assert.Throws(",
        "Assert.ThrowsAsync(",
        "Assert.IsInstanceOfType(",
    ],
    // MSTest `[TestClass]`, NUnit `[TestFixture]` / `[SetUpFixture]`
    test_suites: &["[TestClass]", "[TestFixture]", "[SetUpFixture]"],
};
916
// Test-definition line prefixes shared by C and C++: GTest, Catch2/doctest, Boost.Test,
// CppUnit, Unity, Check, CMocka, and CppUTest.
// NOTE(review): suite-level macros (`BOOST_AUTO_TEST_SUITE(`, `CPPUNIT_TEST_SUITE(`,
// `TEST_GROUP(`, `suite_create(`) are listed here as test patterns; if the C/C++
// `test_suites` list contains them too they would be counted in both categories — confirm.
const TEST_PATTERNS_C_CPP: &[&str] = &[
    // Google Test
    "TEST(",
    "TEST_F(",
    "TEST_P(",
    "TYPED_TEST(",
    "TYPED_TEST_P(",
    "INSTANTIATE_TEST_SUITE_P(",
    "INSTANTIATE_TYPED_TEST_SUITE_P(",
    // Catch2 / doctest
    "TEST_CASE(",
    "SECTION(",
    "SCENARIO(",
    "SCENARIO_METHOD(",
    "TEST_CASE_METHOD(",
    // Boost.Test
    "BOOST_AUTO_TEST_CASE(",
    "BOOST_FIXTURE_TEST_CASE(",
    "BOOST_AUTO_TEST_SUITE(",
    "BOOST_PARAM_TEST_CASE(",
    // CppUnit
    "CPPUNIT_TEST(",
    "CPPUNIT_TEST_SUITE(",
    // Unity (embedded C)
    "RUN_TEST(",
    "TEST_IGNORE(",
    "TEST_FAIL(",
    // Check (libcheck — embedded C)
    "START_TEST(",
    "tcase_add_test(",
    "suite_create(",
    // CMocka (embedded C)
    "cmocka_unit_test(",
    "cmocka_run_group_tests(",
    // CppUTest
    "IGNORE_TEST(",
    "TEST_GROUP(",
    "TEST_GROUP_BASE(",
];
957
// Test assertion patterns shared by C and C++ (Google Test, Catch2/doctest, Unity, CMocka).
// Used as the `assertions` list of both `SP_C` and `SP_CPP`.
const ASSERT_PATTERNS_C_CPP: &[&str] = &[
    // Google Test ASSERT_* (test-stopping failures)
    "ASSERT_EQ(",
    "ASSERT_NE(",
    "ASSERT_LT(",
    "ASSERT_LE(",
    "ASSERT_GT(",
    "ASSERT_GE(",
    "ASSERT_TRUE(",
    "ASSERT_FALSE(",
    "ASSERT_STREQ(",
    "ASSERT_STRNE(",
    "ASSERT_FLOAT_EQ(",
    "ASSERT_DOUBLE_EQ(",
    "ASSERT_NEAR(",
    "ASSERT_THROW(",
    "ASSERT_NO_THROW(",
    "ASSERT_ANY_THROW(",
    // Google Test EXPECT_* (non-stopping failures)
    "EXPECT_EQ(",
    "EXPECT_NE(",
    "EXPECT_LT(",
    "EXPECT_LE(",
    "EXPECT_GT(",
    "EXPECT_GE(",
    "EXPECT_TRUE(",
    "EXPECT_FALSE(",
    "EXPECT_STREQ(",
    "EXPECT_STRNE(",
    "EXPECT_FLOAT_EQ(",
    "EXPECT_DOUBLE_EQ(",
    "EXPECT_NEAR(",
    "EXPECT_THROW(",
    "EXPECT_NO_THROW(",
    "EXPECT_ANY_THROW(",
    // Catch2 / doctest assertions
    "REQUIRE(",
    "CHECK(",
    "REQUIRE_FALSE(",
    "CHECK_FALSE(",
    "REQUIRE_NOTHROW(",
    "CHECK_NOTHROW(",
    "REQUIRE_THROWS(",
    "CHECK_THROWS(",
    "REQUIRE_THAT(",
    "CHECK_THAT(",
    // Unity assertions (embedded C)
    "TEST_ASSERT_EQUAL(",
    "TEST_ASSERT_EQUAL_INT(",
    "TEST_ASSERT_EQUAL_STRING(",
    "TEST_ASSERT_EQUAL_FLOAT(",
    "TEST_ASSERT_EQUAL_DOUBLE(",
    "TEST_ASSERT_EQUAL_PTR(",
    "TEST_ASSERT_TRUE(",
    "TEST_ASSERT_FALSE(",
    "TEST_ASSERT_NULL(",
    "TEST_ASSERT_NOT_NULL(",
    "TEST_ASSERT_BITS_HIGH(",
    "TEST_ASSERT_BITS_LOW(",
    // CMocka assertions (embedded C)
    "assert_int_equal(",
    "assert_int_not_equal(",
    "assert_string_equal(",
    "assert_string_not_equal(",
    "assert_true(",
    "assert_false(",
    "assert_null(",
    "assert_non_null(",
    "assert_ptr_equal(",
    "assert_memory_equal(",
    "assert_return_code(",
];
1031
// Test suite/group declaration patterns for C and C++.  Used as the `test_suites` list of
// both `SP_C` and `SP_CPP`.
const SUITE_PATTERNS_C_CPP: &[&str] = &[
    // CppUTest
    "TEST_GROUP(",
    "TEST_GROUP_BASE(",
    // Boost.Test
    "BOOST_AUTO_TEST_SUITE(",
    // CppUnit
    "CPPUNIT_TEST_SUITE(",
    "CPPUNIT_TEST_SUITE_END(",
];
1040
// Symbol patterns for C.  Test, assertion and suite lists are the tables shared with C++.
const SP_C: SymbolPatterns = SymbolPatterns {
    // C has no function keyword; detect by common return types that precede `(` with no `=`.
    // The keyword list is therefore empty and `functions_prefix_paren` carries the
    // return-type / specifier prefixes.
    functions: &[],
    functions_prefix_paren: &[
        "void ",
        "int ",
        "char ",
        "float ",
        "double ",
        "long ",
        "unsigned ",
        "size_t ",
        "static ",
        "inline ",
        "const ",
        "extern ",
    ],
    // Aggregate type declarations, with and without a leading `typedef`.
    classes: &[
        "struct ",
        "typedef struct ",
        "union ",
        "typedef union ",
        "typedef enum ",
    ],
    variables: &[],
    imports: &["#include "],
    tests: TEST_PATTERNS_C_CPP,
    assertions: ASSERT_PATTERNS_C_CPP,
    test_suites: SUITE_PATTERNS_C_CPP,
};
1071
// Symbol patterns for C++.  Test, assertion and suite lists are the tables shared with C.
const SP_CPP: SymbolPatterns = SymbolPatterns {
    // C++ specific function keyword-prefixes; return-type-led patterns use functions_prefix_paren.
    functions: &[
        "virtual ",  // virtual method declaration/definition
        "explicit ", // explicit constructor modifier
        "~",         // destructor (e.g. ~MyClass())
        "operator",  // operator overload (operator==, operator+, …)
    ],
    // Same return-type / specifier prefixes as C, plus `bool `, `auto ` and `constexpr `.
    functions_prefix_paren: &[
        "void ",
        "bool ",
        "int ",
        "char ",
        "float ",
        "double ",
        "long ",
        "unsigned ",
        "size_t ",
        "auto ",
        "static ",
        "inline ",
        "constexpr ",
        "const ",
        "extern ",
    ],
    // `template<` (no space) is the dominant modern style alongside `template ` (with space).
    classes: &["class ", "struct ", "namespace ", "template ", "template<"],
    variables: &[],
    imports: &["#include "],
    tests: TEST_PATTERNS_C_CPP,
    assertions: ASSERT_PATTERNS_C_CPP,
    test_suites: SUITE_PATTERNS_C_CPP,
};
1105
// Symbol patterns for shell scripts.  Only the `function` keyword form is listed for
// functions; no class, test, assertion or suite patterns are defined.
const SP_SHELL: SymbolPatterns = SymbolPatterns {
    functions: &["function "],
    functions_prefix_paren: &[],
    classes: &[],
    variables: &["declare ", "local ", "export "],
    // `source file` and the dot form `. file`
    imports: &["source ", ". "],
    tests: &[],
    assertions: &[],
    test_suites: &[],
};
1116
// Symbol patterns for PowerShell.
const SP_POWERSHELL: SymbolPatterns = SymbolPatterns {
    // Both capitalisations of the keyword are listed explicitly.
    functions: &["function ", "Function "],
    functions_prefix_paren: &[],
    classes: &["class "],
    variables: &[],
    imports: &["Import-Module ", "using "],
    // Pester test framework
    tests: &["Describe ", "It ", "Context "],
    assertions: &[],
    test_suites: &[],
};
1128
// Symbol patterns for Kotlin.
const SP_KOTLIN: SymbolPatterns = SymbolPatterns {
    // `fun` on its own plus the listed modifier combinations.
    functions: &[
        "fun ",
        "private fun ",
        "public fun ",
        "protected fun ",
        "internal fun ",
        "override fun ",
        "suspend fun ",
        "abstract fun ",
        "open fun ",
        "private suspend fun ",
        "public suspend fun ",
    ],
    functions_prefix_paren: &[],
    classes: &[
        "class ",
        "data class ",
        "sealed class ",
        "abstract class ",
        "open class ",
        "object ",
        "companion object",
        "interface ",
        "enum class ",
        "annotation class ",
    ],
    variables: &["val ", "var ", "private val ", "private var ", "const val "],
    imports: &["import "],
    // JUnit 4/5, KotlinTest, Kotest
    // The two quoted entries match a string literal that starts with `should ` / `it `
    // (presumably the string-named test styles of KotlinTest/Kotest).
    tests: &[
        "@Test",
        "@ParameterizedTest",
        "@RepeatedTest",
        "\"should ",
        "\"it ",
    ],
    // JUnit-style `assert*(` calls followed by `should*(` matcher calls.
    assertions: &[
        "assertEquals(",
        "assertNotEquals(",
        "assertTrue(",
        "assertFalse(",
        "assertNull(",
        "assertNotNull(",
        "assertThat(",
        "assertThrows(",
        "shouldBe(",
        "shouldNotBe(",
        "shouldThrow(",
    ],
    test_suites: &[],
};
1181
// Symbol patterns for Swift.
const SP_SWIFT: SymbolPatterns = SymbolPatterns {
    // `func` on its own plus the listed access/behaviour modifier combinations.
    functions: &[
        "func ",
        "private func ",
        "public func ",
        "internal func ",
        "override func ",
        "open func ",
        "static func ",
        "class func ",
        "mutating func ",
        "private static func ",
        "public static func ",
    ],
    functions_prefix_paren: &[],
    // Nominal type declarations and extensions.
    classes: &[
        "class ",
        "struct ",
        "protocol ",
        "enum ",
        "extension ",
        "actor ",
        "public class ",
        "private class ",
        "open class ",
        "final class ",
        "public struct ",
        "private struct ",
        "public protocol ",
    ],
    variables: &[
        "var ",
        "let ",
        "private var ",
        "private let ",
        "static var ",
        "static let ",
    ],
    imports: &["import "],
    // XCTest: test functions are named test* by convention; Swift Testing: @Test attribute
    tests: &["func test", "func Test", "@Test"],
    // XCTest `XCTAssert*` calls; `#expect(` is the Swift Testing expectation macro.
    assertions: &[
        "XCTAssertEqual(",
        "XCTAssertNotEqual(",
        "XCTAssertTrue(",
        "XCTAssertFalse(",
        "XCTAssertNil(",
        "XCTAssertNotNil(",
        "XCTAssertGreaterThan(",
        "XCTAssertLessThan(",
        "XCTAssertThrowsError(",
        "XCTAssertNoThrow(",
        "#expect(",
    ],
    test_suites: &[],
};
1238
// Symbol patterns for Ruby.
const SP_RUBY: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "private def ", "protected def "],
    functions_prefix_paren: &[],
    classes: &["class ", "module "],
    variables: &[],
    imports: &["require ", "require_relative "],
    // RSpec / minitest
    tests: &["it ", "it(", "describe ", "context ", "test "],
    assertions: &[],
    test_suites: &[],
};
1250
// Symbol patterns for Scala.
const SP_SCALA: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "private def ", "protected def ", "override def "],
    functions_prefix_paren: &[],
    classes: &[
        "class ",
        "case class ",
        "abstract class ",
        "sealed class ",
        "object ",
        "trait ",
    ],
    variables: &["val ", "var ", "lazy val "],
    imports: &["import "],
    // ScalaTest / MUnit call-style declarations: `test("…")`, `it("…")`, `describe("…")`.
    // Infix spec styles such as AnyWordSpec's `"…" should` have no pattern in this list.
    tests: &["test(", "it(", "describe("],
    assertions: &[],
    test_suites: &[],
};
1269
// Symbol patterns for PHP.
const SP_PHP: SymbolPatterns = SymbolPatterns {
    // `function` on its own plus the listed visibility/static/abstract/final combinations.
    functions: &[
        "function ",
        "public function ",
        "private function ",
        "protected function ",
        "static function ",
        "abstract function ",
        "final function ",
        "public static function ",
        "private static function ",
        "protected static function ",
    ],
    functions_prefix_paren: &[],
    classes: &[
        "class ",
        "abstract class ",
        "final class ",
        "interface ",
        "trait ",
        "enum ",
    ],
    variables: &[],
    imports: &[
        "use ",
        "require ",
        "require_once ",
        "include ",
        "include_once ",
    ],
    // PHPUnit: methods whose name starts with `test`, plus the `#[Test]` and
    // `#[DataProvider(` attributes.
    tests: &[
        "public function test",
        "function test",
        "#[Test]",
        "#[DataProvider(",
    ],
    assertions: &[],
    test_suites: &[],
};
1310
// Symbol patterns for Elixir.
const SP_ELIXIR: SymbolPatterns = SymbolPatterns {
    // Public and private (`…p`) forms of functions, macros and guards.
    functions: &[
        "def ",
        "defp ",
        "defmacro ",
        "defmacrop ",
        "defguard ",
        "defguardp ",
    ],
    functions_prefix_paren: &[],
    classes: &["defmodule ", "defprotocol ", "defimpl "],
    variables: &[],
    imports: &["import ", "alias ", "use ", "require "],
    // ExUnit
    tests: &["test ", "describe "],
    assertions: &[],
    test_suites: &[],
};
1329
// Symbol patterns for Erlang.  Only module-level attributes are listed: `-module(` as the
// class-like unit and the `-import(` / `-include(` / `-include_lib(` forms as imports.
// No function, variable or test patterns are defined.
const SP_ERLANG: SymbolPatterns = SymbolPatterns {
    functions: &[],
    functions_prefix_paren: &[],
    classes: &["-module("],
    variables: &[],
    imports: &["-import(", "-include(", "-include_lib("],
    tests: &[],
    assertions: &[],
    test_suites: &[],
};
1340
// Symbol patterns for F#.
const SP_FSHARP: SymbolPatterns = SymbolPatterns {
    // `let` bindings are listed as functions; only `let mutable ` is listed as a variable.
    functions: &[
        "let ",
        "let rec ",
        "member ",
        "override ",
        "abstract member ",
    ],
    functions_prefix_paren: &[],
    classes: &["type "],
    variables: &["let mutable "],
    imports: &["open "],
    // NUnit / xUnit attributes on their own line; FsUnit uses [<Test>] / [<Fact>]
    tests: &["[<Test>]", "[<Fact>]", "[<Theory>]", "[<TestCase("],
    assertions: &[],
    test_suites: &[],
};
1358
// Symbol patterns for Groovy.
const SP_GROOVY: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "private def ", "public def ", "protected def "],
    functions_prefix_paren: &[],
    classes: &["class ", "abstract class ", "interface ", "enum ", "trait "],
    variables: &[],
    imports: &["import "],
    // Spock framework: string-named feature methods (`def "…"`) and the `given:` / `when:` /
    // `then:` / `expect:` block labels; JUnit: the `@Test` annotation.
    tests: &["def \"", "@Test", "given:", "when:", "then:", "expect:"],
    assertions: &[],
    test_suites: &[],
};
1370
// Symbol patterns for Haskell.  Only type-level declarations and imports are listed; no
// function, variable or test patterns are defined.
const SP_HASKELL: SymbolPatterns = SymbolPatterns {
    functions: &[],
    functions_prefix_paren: &[],
    classes: &["class ", "data ", "newtype ", "type "],
    variables: &[],
    imports: &["import "],
    tests: &[],
    assertions: &[],
    test_suites: &[],
};
1381
// Symbol patterns for Lua.  No class or import patterns are defined.
const SP_LUA: SymbolPatterns = SymbolPatterns {
    functions: &["function ", "local function "],
    functions_prefix_paren: &[],
    classes: &[],
    variables: &["local "],
    imports: &[],
    // busted test framework
    tests: &["it(", "describe(", "pending("],
    assertions: &[],
    test_suites: &[],
};
1393
// Symbol patterns for Nim.
const SP_NIM: SymbolPatterns = SymbolPatterns {
    // Every routine-declaring keyword listed here is counted as a function.
    functions: &[
        "proc ",
        "func ",
        "method ",
        "iterator ",
        "converter ",
        "template ",
        "macro ",
    ],
    functions_prefix_paren: &[],
    classes: &["type "],
    variables: &["var ", "let ", "const "],
    imports: &["import ", "from "],
    // unittest module
    tests: &["test "],
    assertions: &[],
    test_suites: &[],
};
1413
// Symbol patterns for Objective-C.
const SP_OBJECTIVEC: SymbolPatterns = SymbolPatterns {
    // Instance (`- (`) and class (`+ (`) method declarations.
    functions: &["- (", "+ ("],
    functions_prefix_paren: &[],
    classes: &["@interface ", "@implementation ", "@protocol "],
    variables: &[],
    imports: &["#import ", "#include "],
    // XCTest: test methods start with - (void)test
    tests: &["- (void)test"],
    // XCTest assertion macros.
    assertions: &[
        "XCTAssertEqual(",
        "XCTAssertNotEqual(",
        "XCTAssertTrue(",
        "XCTAssertFalse(",
        "XCTAssertNil(",
        "XCTAssertNotNil(",
        "XCTAssertGreaterThan(",
        "XCTAssertLessThan(",
        "XCTAssertThrowsError(",
        "XCTAssertNoThrow(",
    ],
    test_suites: &[],
};
1436
// Symbol patterns for OCaml.  `let` bindings are listed as functions; no variable or test
// patterns are defined.
const SP_OCAML: SymbolPatterns = SymbolPatterns {
    functions: &["let ", "let rec "],
    functions_prefix_paren: &[],
    classes: &["type ", "module ", "class "],
    variables: &[],
    imports: &["open "],
    tests: &[],
    assertions: &[],
    test_suites: &[],
};
1447
// Symbol patterns for Perl.  `package ` is listed as the class-like unit; no test patterns
// are defined.
const SP_PERL: SymbolPatterns = SymbolPatterns {
    functions: &["sub "],
    functions_prefix_paren: &[],
    classes: &["package "],
    variables: &["my ", "our ", "local "],
    imports: &["use ", "require "],
    tests: &[],
    assertions: &[],
    test_suites: &[],
};
1458
// Symbol patterns for Clojure.  Every pattern includes the opening parenthesis of the form.
const SP_CLOJURE: SymbolPatterns = SymbolPatterns {
    functions: &["(defn ", "(defn- ", "(defmacro ", "(defmulti "],
    functions_prefix_paren: &[],
    classes: &[
        "(defrecord ",
        "(defprotocol ",
        "(deftype ",
        "(definterface ",
    ],
    variables: &["(def ", "(defonce "],
    imports: &["(ns ", "(require "],
    // clojure.test
    tests: &["(deftest ", "(testing "],
    assertions: &[],
    test_suites: &[],
};
1475
// Symbol patterns for Julia.
const SP_JULIA: SymbolPatterns = SymbolPatterns {
    functions: &["function ", "macro "],
    functions_prefix_paren: &[],
    // Composite, abstract and primitive type declarations.
    classes: &[
        "struct ",
        "mutable struct ",
        "abstract type ",
        "primitive type ",
    ],
    variables: &["const "],
    imports: &["import ", "using "],
    // Test.jl standard library
    tests: &["@test ", "@testset "],
    assertions: &[],
    test_suites: &[],
};
1492
// Symbol patterns for Dart.  No function patterns are defined.
const SP_DART: SymbolPatterns = SymbolPatterns {
    functions: &[],
    functions_prefix_paren: &[],
    classes: &["class ", "abstract class ", "mixin ", "extension ", "enum "],
    variables: &["var ", "final ", "const ", "late "],
    imports: &["import "],
    // flutter_test / test package
    tests: &["test(", "testWidgets(", "group("],
    assertions: &[],
    test_suites: &[],
};
1504
// Symbol patterns for R.  Only imports and test patterns are defined.
const SP_R: SymbolPatterns = SymbolPatterns {
    functions: &[],
    functions_prefix_paren: &[],
    classes: &[],
    variables: &[],
    imports: &["library(", "source("],
    // testthat
    // NOTE(review): `expect_` is the prefix of testthat expectation calls, yet it is listed
    // under `tests` while `assertions` is empty — confirm this classification is intended.
    tests: &["test_that(", "it(", "describe(", "expect_"],
    assertions: &[],
    test_suites: &[],
};
1516
// Symbol patterns for SQL.  Each statement is listed in an all-lower-case and an
// all-upper-case spelling (presumably because matching is case-sensitive — TODO confirm).
const SP_SQL: SymbolPatterns = SymbolPatterns {
    // Stored functions and procedures, with and without `OR REPLACE`.
    functions: &[
        "create function ",
        "create or replace function ",
        "create procedure ",
        "create or replace procedure ",
        "CREATE FUNCTION ",
        "CREATE OR REPLACE FUNCTION ",
        "CREATE PROCEDURE ",
        "CREATE OR REPLACE PROCEDURE ",
    ],
    functions_prefix_paren: &[],
    // Tables, views and schemas are listed as the class-like units.
    classes: &[
        "create table ",
        "create view ",
        "create schema ",
        "CREATE TABLE ",
        "CREATE VIEW ",
        "CREATE SCHEMA ",
    ],
    variables: &["declare ", "DECLARE "],
    imports: &[],
    tests: &[],
    assertions: &[],
    test_suites: &[],
};
1543
// Symbol patterns for assembly.  Only procedure and include directives are listed, each in
// lower and upper case; `%include ` is the percent-prefixed include form.
const SP_ASSEMBLY: SymbolPatterns = SymbolPatterns {
    functions: &["proc ", "PROC "],
    functions_prefix_paren: &[],
    classes: &[],
    variables: &[],
    imports: &["include ", "INCLUDE ", "%include "],
    tests: &[],
    assertions: &[],
    test_suites: &[],
};
1554
1555const SP_ZIG: SymbolPatterns = SymbolPatterns {
1556    functions: &[
1557        "fn ",
1558        "pub fn ",
1559        "export fn ",
1560        "inline fn ",
1561        "pub inline fn ",
1562    ],
1563    functions_prefix_paren: &[],
1564    classes: &[],
1565    variables: &["var ", "pub var "],
1566    imports: &[],
1567    // Zig built-in test blocks
1568    tests: &["test \"", "test{"],
1569    assertions: &[],
1570    test_suites: &[],
1571};
1572
/// Static (non-heap) language scanning parameters.  All fields are `'static` so this struct
/// can be stored in a `static` array.  The dynamic `skip_lines` set (used only for Python
/// docstring detection) is kept in `ScanConfig` and populated by the caller after lookup.
// Four independent on/off string-literal switches plus the preprocessor flag trip the
// bool-count lint; they are kept as plain bools.
#[allow(clippy::struct_excessive_bools)]
#[derive(Clone, Copy)]
struct StaticLangConfig {
    /// Tokens that start a line comment (e.g. `//`, `#`, `--`); empty when the language has none.
    line_comments: &'static [&'static str],
    /// `(open, close)` delimiters of the block comment, or `None` when the language has none.
    block_comment: Option<(&'static str, &'static str)>,
    /// Whether single-quoted text is scanned as a string literal.
    allow_single_quote_strings: bool,
    /// Whether double-quoted text is scanned as a string literal.
    allow_double_quote_strings: bool,
    /// Whether triple-quoted string literals are recognised (enabled for Python and Julia).
    allow_triple_quote_strings: bool,
    /// Whether C# verbatim string literals are recognised (enabled for C# only).
    allow_csharp_verbatim_strings: bool,
    /// Declaration / test / assertion patterns for the language (`SP_NONE` when it has none).
    symbol_patterns: SymbolPatterns,
    /// `true` for C, C++, and Objective-C (languages that have a C preprocessor).
    has_preprocessor: bool,
}
1589
/// Per-scan configuration: the comment, string-literal and symbol-pattern fields of
/// `StaticLangConfig` plus the heap-allocated `skip_lines` set.  It has no
/// `has_preprocessor` field.
// The four string-literal switches trip the bool-count lint; they mirror `StaticLangConfig`.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
struct ScanConfig {
    /// Tokens that start a line comment; empty when the language has none.
    line_comments: &'static [&'static str],
    /// `(open, close)` delimiters of the block comment, or `None` when the language has none.
    block_comment: Option<(&'static str, &'static str)>,
    /// Whether single-quoted text is scanned as a string literal.
    allow_single_quote_strings: bool,
    /// Whether double-quoted text is scanned as a string literal.
    allow_double_quote_strings: bool,
    /// Whether triple-quoted string literals are recognised.
    allow_triple_quote_strings: bool,
    /// Whether C# verbatim string literals are recognised.
    allow_csharp_verbatim_strings: bool,
    /// Line indices to skip; per the `StaticLangConfig` docs this is filled in by the caller
    /// and used only for Python docstring detection.
    /// NOTE(review): whether indices are 0- or 1-based is not visible here — confirm.
    skip_lines: HashSet<usize>,
    /// Declaration / test / assertion patterns for the language.
    symbol_patterns: SymbolPatterns,
}
1602
// ── Per-family base configurations ───────────────────────────────────────────
//
// Most languages share one of two comment styles.  Define a base `const` for
// each family; table entries override only the fields that differ (symbol
// patterns, preprocessor flag, verbatim-string flag, etc.).
//
// C-slash family: `//` line, `/* */` block, single + double quotes.
// Covers C, C++, Obj-C, C#, Go, Java, JS/TS/Svelte/Vue, Dart, Groovy, Kotlin,
// Scala, SCSS, Swift, Rust, and Zig (Zig has no block comment → overridden;
// Rust and Swift switch single-quote strings off).
//
// Table entries for languages with other comment syntaxes (CSS, HTML, Lua,
// SQL, …) also use this constant as their struct-update base and override the
// comment fields explicitly.
const C_SLASH_BASE: StaticLangConfig = StaticLangConfig {
    line_comments: &["//"],
    block_comment: Some(("/*", "*/")),
    allow_single_quote_strings: true,
    allow_double_quote_strings: true,
    allow_triple_quote_strings: false,
    allow_csharp_verbatim_strings: false,
    // Placeholder: each table entry supplies its own patterns where it has any.
    symbol_patterns: SP_NONE,
    has_preprocessor: false,
};
1622
// Hash-comment family: `#` line comment, no block comment, single + double
// quotes.  Covers Shell, Ruby, R, Perl, Elixir (each overrides only SP_*);
// Python additionally enables triple-quote strings; PowerShell and Nim override
// block_comment; Makefile and Dockerfile switch both quote kinds off.
const HASH_BASE: StaticLangConfig = StaticLangConfig {
    line_comments: &["#"],
    block_comment: None,
    allow_single_quote_strings: true,
    allow_double_quote_strings: true,
    allow_triple_quote_strings: false,
    allow_csharp_verbatim_strings: false,
    // Placeholder: each table entry supplies its own patterns where it has any.
    symbol_patterns: SP_NONE,
    has_preprocessor: false,
};
1636
1637/// Static language-scan configuration table — one entry per supported language.
1638/// Used by `language_scan_config` to avoid a 41-arm match.  All `SP_*` constants
1639/// referenced here are defined above in the same module.
1640static LANG_SCAN_TABLE: &[(Language, StaticLangConfig)] = &[
1641    // ── C preprocessor family ─────────────────────────────────────────────────
1642    (
1643        Language::C,
1644        StaticLangConfig {
1645            symbol_patterns: SP_C,
1646            has_preprocessor: true,
1647            ..C_SLASH_BASE
1648        },
1649    ),
1650    (
1651        Language::Cpp,
1652        StaticLangConfig {
1653            symbol_patterns: SP_CPP,
1654            has_preprocessor: true,
1655            ..C_SLASH_BASE
1656        },
1657    ),
1658    (
1659        Language::ObjectiveC,
1660        StaticLangConfig {
1661            symbol_patterns: SP_OBJECTIVEC,
1662            has_preprocessor: true,
1663            ..C_SLASH_BASE
1664        },
1665    ),
1666    // ── C-slash family ────────────────────────────────────────────────────────
1667    (
1668        Language::CSharp,
1669        StaticLangConfig {
1670            symbol_patterns: SP_CSHARP,
1671            allow_csharp_verbatim_strings: true,
1672            ..C_SLASH_BASE
1673        },
1674    ),
1675    (
1676        Language::Go,
1677        StaticLangConfig {
1678            symbol_patterns: SP_GO,
1679            ..C_SLASH_BASE
1680        },
1681    ),
1682    (
1683        Language::Java,
1684        StaticLangConfig {
1685            symbol_patterns: SP_JAVA,
1686            ..C_SLASH_BASE
1687        },
1688    ),
1689    (
1690        Language::JavaScript,
1691        StaticLangConfig {
1692            symbol_patterns: SP_JS,
1693            ..C_SLASH_BASE
1694        },
1695    ),
1696    (
1697        Language::TypeScript,
1698        StaticLangConfig {
1699            symbol_patterns: SP_TS,
1700            ..C_SLASH_BASE
1701        },
1702    ),
1703    (
1704        Language::Svelte,
1705        StaticLangConfig {
1706            symbol_patterns: SP_JS,
1707            ..C_SLASH_BASE
1708        },
1709    ),
1710    (
1711        Language::Vue,
1712        StaticLangConfig {
1713            symbol_patterns: SP_JS,
1714            ..C_SLASH_BASE
1715        },
1716    ),
1717    (
1718        Language::Dart,
1719        StaticLangConfig {
1720            symbol_patterns: SP_DART,
1721            ..C_SLASH_BASE
1722        },
1723    ),
1724    (
1725        Language::Groovy,
1726        StaticLangConfig {
1727            symbol_patterns: SP_GROOVY,
1728            ..C_SLASH_BASE
1729        },
1730    ),
1731    (
1732        Language::Kotlin,
1733        StaticLangConfig {
1734            symbol_patterns: SP_KOTLIN,
1735            ..C_SLASH_BASE
1736        },
1737    ),
1738    (
1739        Language::Scala,
1740        StaticLangConfig {
1741            symbol_patterns: SP_SCALA,
1742            ..C_SLASH_BASE
1743        },
1744    ),
1745    (
1746        Language::Scss,
1747        StaticLangConfig {
1748            symbol_patterns: SP_NONE,
1749            ..C_SLASH_BASE
1750        },
1751    ),
1752    // Rust: no single-quote char literals (they're lifetime annotations)
1753    (
1754        Language::Rust,
1755        StaticLangConfig {
1756            symbol_patterns: SP_RUST,
1757            allow_single_quote_strings: false,
1758            ..C_SLASH_BASE
1759        },
1760    ),
1761    // Swift: no single-quote strings
1762    (
1763        Language::Swift,
1764        StaticLangConfig {
1765            symbol_patterns: SP_SWIFT,
1766            allow_single_quote_strings: false,
1767            ..C_SLASH_BASE
1768        },
1769    ),
1770    // Zig: no block comment
1771    (
1772        Language::Zig,
1773        StaticLangConfig {
1774            symbol_patterns: SP_ZIG,
1775            block_comment: None,
1776            ..C_SLASH_BASE
1777        },
1778    ),
1779    // F#: `(*` … `*)` block comment, no single-quote strings
1780    (
1781        Language::FSharp,
1782        StaticLangConfig {
1783            line_comments: &["//"],
1784            block_comment: Some(("(*", "*)")),
1785            allow_single_quote_strings: false,
1786            allow_double_quote_strings: true,
1787            symbol_patterns: SP_FSHARP,
1788            ..C_SLASH_BASE
1789        },
1790    ),
1791    // ── Hash-comment family ───────────────────────────────────────────────────
1792    (
1793        Language::Shell,
1794        StaticLangConfig {
1795            symbol_patterns: SP_SHELL,
1796            ..HASH_BASE
1797        },
1798    ),
1799    (
1800        Language::Elixir,
1801        StaticLangConfig {
1802            symbol_patterns: SP_ELIXIR,
1803            ..HASH_BASE
1804        },
1805    ),
1806    (
1807        Language::Perl,
1808        StaticLangConfig {
1809            symbol_patterns: SP_PERL,
1810            ..HASH_BASE
1811        },
1812    ),
1813    (
1814        Language::R,
1815        StaticLangConfig {
1816            symbol_patterns: SP_R,
1817            ..HASH_BASE
1818        },
1819    ),
1820    (
1821        Language::Ruby,
1822        StaticLangConfig {
1823            symbol_patterns: SP_RUBY,
1824            ..HASH_BASE
1825        },
1826    ),
1827    // Python: triple-quote string literals
1828    (
1829        Language::Python,
1830        StaticLangConfig {
1831            symbol_patterns: SP_PYTHON,
1832            allow_triple_quote_strings: true,
1833            ..HASH_BASE
1834        },
1835    ),
1836    // PowerShell: `<# … #>` block comment
1837    (
1838        Language::PowerShell,
1839        StaticLangConfig {
1840            symbol_patterns: SP_POWERSHELL,
1841            block_comment: Some(("<#", "#>")),
1842            ..HASH_BASE
1843        },
1844    ),
1845    // Nim: `#[` … `]#` block comment
1846    (
1847        Language::Nim,
1848        StaticLangConfig {
1849            symbol_patterns: SP_NIM,
1850            block_comment: Some(("#[", "]#")),
1851            ..HASH_BASE
1852        },
1853    ),
1854    // Makefile / Dockerfile: `#` only, no string literals
1855    (
1856        Language::Makefile,
1857        StaticLangConfig {
1858            symbol_patterns: SP_NONE,
1859            allow_single_quote_strings: false,
1860            allow_double_quote_strings: false,
1861            ..HASH_BASE
1862        },
1863    ),
1864    (
1865        Language::Dockerfile,
1866        StaticLangConfig {
1867            symbol_patterns: SP_NONE,
1868            allow_single_quote_strings: false,
1869            allow_double_quote_strings: false,
1870            ..HASH_BASE
1871        },
1872    ),
1873    // ── Other unique comment styles ───────────────────────────────────────────
1874    // CSS / SCSS: only `/* */` block, no line comment
1875    (
1876        Language::Css,
1877        StaticLangConfig {
1878            line_comments: &[],
1879            block_comment: Some(("/*", "*/")),
1880            symbol_patterns: SP_NONE,
1881            ..C_SLASH_BASE
1882        },
1883    ),
1884    // HTML / XML: `<!-- -->` block, no line comment, no string literals
1885    (
1886        Language::Html,
1887        StaticLangConfig {
1888            line_comments: &[],
1889            block_comment: Some(("<!--", "-->")),
1890            allow_single_quote_strings: false,
1891            allow_double_quote_strings: false,
1892            symbol_patterns: SP_NONE,
1893            ..C_SLASH_BASE
1894        },
1895    ),
1896    (
1897        Language::Xml,
1898        StaticLangConfig {
1899            line_comments: &[],
1900            block_comment: Some(("<!--", "-->")),
1901            allow_single_quote_strings: false,
1902            allow_double_quote_strings: false,
1903            symbol_patterns: SP_NONE,
1904            ..C_SLASH_BASE
1905        },
1906    ),
1907    // Lua: `--` line, `--[[ ]]` block
1908    (
1909        Language::Lua,
1910        StaticLangConfig {
1911            line_comments: &["--"],
1912            block_comment: Some(("--[[", "]]")),
1913            symbol_patterns: SP_LUA,
1914            ..C_SLASH_BASE
1915        },
1916    ),
1917    // Haskell: `--` line, `{- -}` block
1918    (
1919        Language::Haskell,
1920        StaticLangConfig {
1921            line_comments: &["--"],
1922            block_comment: Some(("{-", "-}")),
1923            symbol_patterns: SP_HASKELL,
1924            ..C_SLASH_BASE
1925        },
1926    ),
1927    // SQL: `--` line, `/* */` block, single quote only
1928    (
1929        Language::Sql,
1930        StaticLangConfig {
1931            line_comments: &["--"],
1932            block_comment: Some(("/*", "*/")),
1933            allow_single_quote_strings: true,
1934            allow_double_quote_strings: false,
1935            symbol_patterns: SP_SQL,
1936            ..C_SLASH_BASE
1937        },
1938    ),
1939    // OCaml: `(*` … `*)` only, no line comment, no single-quote strings
1940    (
1941        Language::Ocaml,
1942        StaticLangConfig {
1943            line_comments: &[],
1944            block_comment: Some(("(*", "*)")),
1945            allow_single_quote_strings: false,
1946            symbol_patterns: SP_OCAML,
1947            ..C_SLASH_BASE
1948        },
1949    ),
1950    // Assembly / Clojure: `;` line comment, no block, no string literals
1951    (
1952        Language::Assembly,
1953        StaticLangConfig {
1954            line_comments: &[";"],
1955            block_comment: None,
1956            allow_single_quote_strings: false,
1957            allow_double_quote_strings: false,
1958            symbol_patterns: SP_ASSEMBLY,
1959            ..C_SLASH_BASE
1960        },
1961    ),
1962    (
1963        Language::Clojure,
1964        StaticLangConfig {
1965            line_comments: &[";"],
1966            block_comment: None,
1967            allow_single_quote_strings: false,
1968            symbol_patterns: SP_CLOJURE,
1969            ..C_SLASH_BASE
1970        },
1971    ),
1972    // Erlang: `%` line comment, no block, no single-quote strings
1973    (
1974        Language::Erlang,
1975        StaticLangConfig {
1976            line_comments: &["%"],
1977            block_comment: None,
1978            allow_single_quote_strings: false,
1979            symbol_patterns: SP_ERLANG,
1980            ..C_SLASH_BASE
1981        },
1982    ),
1983    // PHP: `//` or `#` line, `/* */` block
1984    (
1985        Language::Php,
1986        StaticLangConfig {
1987            line_comments: &["//", "#"],
1988            block_comment: Some(("/*", "*/")),
1989            symbol_patterns: SP_PHP,
1990            ..C_SLASH_BASE
1991        },
1992    ),
1993    // Julia: `#` line, `#= =#` block, double + triple quotes, no single
1994    (
1995        Language::Julia,
1996        StaticLangConfig {
1997            line_comments: &["#"],
1998            block_comment: Some(("#=", "=#")),
1999            allow_single_quote_strings: false,
2000            allow_triple_quote_strings: true,
2001            symbol_patterns: SP_JULIA,
2002            ..C_SLASH_BASE
2003        },
2004    ),
2005];
2006
/// Per-call IEEE 1045-1992 flags derived from `AnalysisOptions` plus per-language properties.
/// Private to this crate; constructed inside `analyze_text`.
///
/// The struct is `Copy`, so it is handed by value to the per-line helpers.
#[derive(Debug, Clone, Copy)]
struct IeeeFlags {
    /// True for C, C++, and Objective-C — languages with a C preprocessor.
    /// When set, pure code lines whose trimmed text starts with `#` are additionally
    /// tallied in `compiler_directive_lines`.
    has_preprocessor_directives: bool,
    /// Mirrors `AnalysisOptions::blank_in_block_comment_as_comment`.
    /// When set, a blank line that begins inside a block comment is marked as comment content.
    blank_in_block_comment_as_comment: bool,
    /// Mirrors `AnalysisOptions::collapse_continuation_lines`.
    /// When set, a line ending in `\` outside any comment or string is deferred and merged
    /// into the classification of the line that ends the sequence.
    collapse_continuation_lines: bool,
}
2018
/// Kind of string literal the lexer is currently inside.
#[derive(Debug, Clone, Copy)]
enum StringState {
    /// Ordinary quoted string; the payload is the closing quote character.
    /// A backslash consumes the character that follows it.
    Single(char),
    /// Triple-quoted string; the payload is the closing delimiter (`"""` or `'''`).
    /// No escape processing is applied while inside it.
    Triple(&'static str),
    /// String opened with `@"`: `""` is an embedded quote, a lone `"` closes the string.
    VerbatimDouble,
}
2025
/// What the lexer observed on one physical line (or on a merged continuation sequence).
///
/// The flags are independent: a single line can carry code and a comment at the same time.
// Four bools are the natural representation for these independent per-line observations.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Default)]
struct LineFacts {
    /// At least one code character or string-literal character was seen.
    has_code: bool,
    /// A line-comment prefix was seen outside any string or block comment.
    has_single_comment: bool,
    /// Some part of the line lies inside a block comment (or opens one).
    has_multi_comment: bool,
    /// The line is docstring content; `classify_line` checks this flag before all others.
    has_docstring: bool,
}
2034
2035/// Process one character while the lexer is inside a string literal.
2036///
2037/// Returns `(new_string_state, advance)` where `advance` is the number of chars to skip.
2038fn process_string_char(
2039    state: StringState,
2040    chars: &[char],
2041    i: usize,
2042) -> (Option<StringState>, usize) {
2043    match state {
2044        StringState::Single(delim) => {
2045            if chars[i] == '\\' {
2046                return (Some(state), 2); // skip escaped character
2047            }
2048            if chars[i] == delim {
2049                (None, 1)
2050            } else {
2051                (Some(state), 1)
2052            }
2053        }
2054        StringState::Triple(delim) => {
2055            if starts_with(chars, i, delim) {
2056                (None, delim.len())
2057            } else {
2058                (Some(state), 1)
2059            }
2060        }
2061        StringState::VerbatimDouble => {
2062            if starts_with(chars, i, "\"\"") {
2063                return (Some(state), 2); // escaped quote-quote inside verbatim string
2064            }
2065            if chars[i] == '"' {
2066                (None, 1)
2067            } else {
2068                (Some(state), 1)
2069            }
2070        }
2071    }
2072}
2073
2074/// Process one character while the lexer is inside a block comment.
2075///
2076/// Returns `(still_in_block_comment, advance)`.
2077fn process_block_comment_char(chars: &[char], i: usize, close: &str) -> (bool, usize) {
2078    if starts_with(chars, i, close) {
2079        (false, close.len())
2080    } else {
2081        (true, 1)
2082    }
2083}
2084
2085/// Attempt to begin a new string literal at position `i`.
2086///
2087/// Returns `Some((new_state, advance))` when a string opener is detected, else `None`.
2088fn try_open_string(chars: &[char], i: usize, config: &ScanConfig) -> Option<(StringState, usize)> {
2089    if config.allow_csharp_verbatim_strings && starts_with(chars, i, "@\"") {
2090        return Some((StringState::VerbatimDouble, 2));
2091    }
2092    if config.allow_triple_quote_strings {
2093        if starts_with(chars, i, "\"\"\"") {
2094            return Some((StringState::Triple("\"\"\""), 3));
2095        }
2096        if starts_with(chars, i, "'''") {
2097            return Some((StringState::Triple("'''"), 3));
2098        }
2099    }
2100    if config.allow_single_quote_strings && chars[i] == '\'' {
2101        return Some((StringState::Single('\''), 1));
2102    }
2103    if config.allow_double_quote_strings && chars[i] == '"' {
2104        return Some((StringState::Single('"'), 1));
2105    }
2106    None
2107}
2108
2109/// Advance past one character position while inside a block comment.
2110///
2111/// Updates `in_block_comment` if the closing delimiter is found and returns the
2112/// number of characters consumed. Returns 0 when no block-comment config is set
2113/// (preserving the caller's `continue`-without-advance behaviour for that impossible state).
2114fn step_through_block_comment(
2115    chars: &[char],
2116    i: usize,
2117    block_comment: Option<(&'static str, &'static str)>,
2118    in_block_comment: &mut bool,
2119) -> usize {
2120    if let Some((_, close)) = block_comment {
2121        let (still_in, advance) = process_block_comment_char(chars, i, close);
2122        *in_block_comment = still_in;
2123        return advance;
2124    }
2125    0
2126}
2127
2128/// If the character at `i` starts a block comment, return the length of the opening
2129/// delimiter so the caller can advance past it. Returns `None` if no match.
2130fn try_open_block_comment(
2131    chars: &[char],
2132    i: usize,
2133    block_comment: Option<(&'static str, &'static str)>,
2134) -> Option<usize> {
2135    let (open, _) = block_comment?;
2136    starts_with(chars, i, open).then_some(open.len())
2137}
2138
2139/// Scan a single physical line and update `facts`, `in_block_comment`, and `string_state`.
2140///
2141/// Returns `true` when the caller should break out of the per-line loop early (line comment hit).
2142fn scan_line(
2143    chars: &[char],
2144    config: &ScanConfig,
2145    facts: &mut LineFacts,
2146    in_block_comment: &mut bool,
2147    string_state: &mut Option<StringState>,
2148) {
2149    let mut i = 0usize;
2150    while i < chars.len() {
2151        // Inside a string literal — advance until the closing delimiter.
2152        if let Some(state) = *string_state {
2153            facts.has_code = true;
2154            let (new_state, advance) = process_string_char(state, chars, i);
2155            *string_state = new_state;
2156            i += advance;
2157            continue;
2158        }
2159
2160        // Inside a block comment — advance until the closing delimiter.
2161        if *in_block_comment {
2162            facts.has_multi_comment = true;
2163            i += step_through_block_comment(chars, i, config.block_comment, in_block_comment);
2164            continue;
2165        }
2166
2167        // Whitespace outside any string/comment — skip.
2168        if chars[i].is_whitespace() {
2169            i += 1;
2170            continue;
2171        }
2172
2173        // Attempt to open a string literal.
2174        if let Some((new_state, advance)) = try_open_string(chars, i, config) {
2175            facts.has_code = true;
2176            *string_state = Some(new_state);
2177            i += advance;
2178            continue;
2179        }
2180
2181        // Attempt to open a block comment.
2182        if let Some(advance) = try_open_block_comment(chars, i, config.block_comment) {
2183            facts.has_multi_comment = true;
2184            *in_block_comment = true;
2185            i += advance;
2186            continue;
2187        }
2188
2189        // Line comment — rest of the line is a comment; stop scanning.
2190        if config
2191            .line_comments
2192            .iter()
2193            .any(|prefix| starts_with(chars, i, prefix))
2194        {
2195            facts.has_single_comment = true;
2196            break;
2197        }
2198
2199        // Plain code character.
2200        facts.has_code = true;
2201        i += 1;
2202    }
2203}
2204
2205/// Apply IEEE 1045-1992 §4.2 preprocessor-directive tracking and continuation-line merging,
2206/// then emit the finalized `LineFacts` for this physical line.
2207///
2208/// Returns `None` when the line is part of a continuation sequence and should be deferred.
2209fn finalize_line_facts(
2210    facts: LineFacts,
2211    trimmed: &str,
2212    raw: &mut RawLineCounts,
2213    ieee: IeeeFlags,
2214    in_block_comment: bool,
2215    string_state: Option<StringState>,
2216    pending_continuation: &mut Option<LineFacts>,
2217) -> Option<LineFacts> {
2218    // IEEE 1045-1992 §4.2: track preprocessor/compiler directive lines (C/C++/ObjC).
2219    // A directive line is a pure code line (no comment on the same physical line) whose
2220    // trimmed content starts with '#'.
2221    if ieee.has_preprocessor_directives
2222        && facts.has_code
2223        && !facts.has_single_comment
2224        && !facts.has_multi_comment
2225        && trimmed.starts_with('#')
2226    {
2227        raw.compiler_directive_lines += 1;
2228    }
2229
2230    // IEEE 1045-1992 continuation-line handling.
2231    // A line is a continuation starter when it ends with '\' outside any comment or string.
2232    let is_continuation = ieee.collapse_continuation_lines
2233        && !in_block_comment
2234        && string_state.is_none()
2235        && trimmed.ends_with('\\');
2236
2237    if is_continuation {
2238        let pending = pending_continuation.get_or_insert_with(LineFacts::default);
2239        pending.has_code |= facts.has_code;
2240        pending.has_single_comment |= facts.has_single_comment;
2241        pending.has_multi_comment |= facts.has_multi_comment;
2242        pending.has_docstring |= facts.has_docstring;
2243        return None; // defer classification until the sequence ends
2244    }
2245
2246    // Merge any accumulated continuation facts into the final line.
2247    let emit = if let Some(pending) = pending_continuation.take() {
2248        LineFacts {
2249            has_code: pending.has_code | facts.has_code,
2250            has_single_comment: pending.has_single_comment | facts.has_single_comment,
2251            has_multi_comment: pending.has_multi_comment | facts.has_multi_comment,
2252            has_docstring: pending.has_docstring | facts.has_docstring,
2253        }
2254    } else {
2255        facts
2256    };
2257    Some(emit)
2258}
2259
2260/// Scan and classify one physical line, updating all running state in place.
2261///
2262/// Pre-classified lines (present in `config.skip_lines`) are counted as docstring-comment
2263/// lines and returned early without further analysis.
2264#[allow(clippy::needless_pass_by_value)]
2265#[allow(clippy::too_many_arguments)]
2266#[allow(clippy::many_single_char_names)] // destructuring return from count_symbols; names match field roles
2267fn process_physical_line(
2268    line: &str,
2269    line_idx: usize,
2270    config: &ScanConfig,
2271    raw: &mut RawLineCounts,
2272    in_block_comment: &mut bool,
2273    string_state: &mut Option<StringState>,
2274    pending_continuation: &mut Option<LineFacts>,
2275    ieee: IeeeFlags,
2276) {
2277    raw.total_physical_lines += 1;
2278
2279    if config.skip_lines.contains(&line_idx) {
2280        raw.docstring_comment_lines += 1;
2281        return;
2282    }
2283
2284    let trimmed = line.trim();
2285    let mut facts = LineFacts::default();
2286
2287    // IEEE 1045-1992: blank lines inside block comments are comment lines by default.
2288    // When blank_in_block_comment_as_comment is false, blank lines keep their blank
2289    // classification even while inside a block comment.
2290    if *in_block_comment && (ieee.blank_in_block_comment_as_comment || !trimmed.is_empty()) {
2291        facts.has_multi_comment = true;
2292    }
2293
2294    let chars: Vec<char> = line.chars().collect();
2295    scan_line(&chars, config, &mut facts, in_block_comment, string_state);
2296
2297    let Some(emit) = finalize_line_facts(
2298        facts,
2299        trimmed,
2300        raw,
2301        ieee,
2302        *in_block_comment,
2303        *string_state,
2304        pending_continuation,
2305    ) else {
2306        return;
2307    };
2308
2309    classify_line(raw, &emit, trimmed);
2310
2311    if emit.has_code {
2312        let (f, c, v, i, t, a, s) = count_symbols(&config.symbol_patterns, trimmed);
2313        raw.functions += f;
2314        raw.classes += c;
2315        raw.variables += v;
2316        raw.imports += i;
2317        raw.test_count += t;
2318        raw.test_assertion_count += a;
2319        raw.test_suite_count += s;
2320    }
2321}
2322
2323#[allow(clippy::needless_pass_by_value)]
2324fn analyze_generic(text: &str, config: ScanConfig, ieee: IeeeFlags) -> RawFileAnalysis {
2325    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
2326    let lines: Vec<&str> = normalized.split_terminator('\n').collect();
2327
2328    let mut raw = RawLineCounts::default();
2329    let mut warnings = Vec::new();
2330
2331    let mut in_block_comment = false;
2332    let mut string_state: Option<StringState> = None;
2333    // IEEE continuation-line state: accumulates facts across a backslash-continued sequence.
2334    let mut pending_continuation: Option<LineFacts> = None;
2335
2336    for (line_idx, line) in lines.iter().enumerate() {
2337        process_physical_line(
2338            line,
2339            line_idx,
2340            &config,
2341            &mut raw,
2342            &mut in_block_comment,
2343            &mut string_state,
2344            &mut pending_continuation,
2345            ieee,
2346        );
2347    }
2348
2349    // Flush any pending continuation that reaches end-of-file without a closing line.
2350    if let Some(pending) = pending_continuation.take() {
2351        classify_line(&mut raw, &pending, "");
2352    }
2353
2354    if in_block_comment {
2355        warnings.push("unclosed block comment detected; result is best effort".into());
2356    }
2357    if string_state.is_some() {
2358        warnings.push("unclosed string literal detected; result is best effort".into());
2359    }
2360
2361    RawFileAnalysis {
2362        raw,
2363        parse_mode: if warnings.is_empty() {
2364            ParseMode::Lexical
2365        } else {
2366            ParseMode::LexicalBestEffort
2367        },
2368        warnings,
2369        style_analysis: None,
2370    }
2371}
2372
2373const fn classify_line(raw: &mut RawLineCounts, facts: &LineFacts, trimmed: &str) {
2374    if facts.has_docstring {
2375        raw.docstring_comment_lines += 1;
2376    } else if !facts.has_code
2377        && !facts.has_single_comment
2378        && !facts.has_multi_comment
2379        && trimmed.is_empty()
2380    {
2381        raw.blank_only_lines += 1;
2382    } else if facts.has_code && facts.has_single_comment {
2383        raw.mixed_code_single_comment_lines += 1;
2384    } else if facts.has_code && facts.has_multi_comment {
2385        raw.mixed_code_multi_comment_lines += 1;
2386    } else if facts.has_code {
2387        raw.code_only_lines += 1;
2388    } else if facts.has_single_comment {
2389        raw.single_comment_only_lines += 1;
2390    } else if facts.has_multi_comment {
2391        raw.multi_comment_only_lines += 1;
2392    } else if trimmed.is_empty() {
2393        raw.blank_only_lines += 1;
2394    } else {
2395        raw.skipped_unknown_lines += 1;
2396    }
2397}
2398
2399fn count_symbols(patterns: &SymbolPatterns, trimmed: &str) -> (u64, u64, u64, u64, u64, u64, u64) {
2400    let hit = |pats: &[&str]| u64::from(pats.iter().any(|p| trimmed.starts_with(p)));
2401    // For return-type-led languages (C/C++): match prefix AND `(` present AND no `=` sits
2402    // between the prefix start and the first `(` (guards against `void* p = malloc(n)`).
2403    let fn_pp = if patterns.functions_prefix_paren.is_empty() {
2404        0
2405    } else if let Some(paren_pos) = trimmed.find('(') {
2406        if trimmed[..paren_pos].contains('=') {
2407            0
2408        } else {
2409            hit(patterns.functions_prefix_paren)
2410        }
2411    } else {
2412        0
2413    };
2414    let test_hit = hit(patterns.tests);
2415    // Lines matching a test pattern count as tests, not as plain functions or classes.
2416    // This prevents double-counting in Python (`def test_` / `class Test`) and Go
2417    // (`func Test` / `func Benchmark` / `func Fuzz`) where the same line satisfies both
2418    // a function/class prefix and a test pattern. Rust is unaffected: `#[test]` is a
2419    // standalone attribute line; the `fn` declaration on the next line does not match any
2420    // test pattern and still increments functions correctly.
2421    let fn_hit = if test_hit == 0 {
2422        hit(patterns.functions) | fn_pp
2423    } else {
2424        0
2425    };
2426    let class_hit = if test_hit == 0 {
2427        hit(patterns.classes)
2428    } else {
2429        0
2430    };
2431    (
2432        fn_hit,
2433        class_hit,
2434        hit(patterns.variables),
2435        hit(patterns.imports),
2436        test_hit,
2437        hit(patterns.assertions),
2438        hit(patterns.test_suites),
2439    )
2440}
2441
/// Report whether the characters of `needle` appear in `chars` starting at `index`.
///
/// Returns `false` when `index` lies past the end of `chars` or when `needle` would run
/// past the end. An empty `needle` matches at every position up to and including
/// `chars.len()`.
///
/// The comparison streams over `needle` instead of collecting it into a temporary vector,
/// because this helper is called for nearly every character of every scanned line.
fn starts_with(chars: &[char], index: usize, needle: &str) -> bool {
    let Some(tail) = chars.get(index..) else {
        return false;
    };
    let mut haystack = tail.iter();
    // `all` stops at the first mismatch; running out of haystack yields `None != Some(_)`.
    needle.chars().all(|expected| haystack.next() == Some(&expected))
}
2446
/// One entry of the indentation-scope stack used while detecting Python docstrings.
#[derive(Debug, Clone)]
struct PyContext {
    /// Leading-whitespace width of the scope's first line; a line indented less than this
    /// pops the scope.
    indent: usize,
    /// True until the scope's first significant line has been examined; only that line
    /// may be recorded as the scope's docstring.
    expect_docstring: bool,
}
2452
2453/// Update `contexts` to pop any scopes that the current `indent` has outdented past.
2454fn py_pop_outdented_contexts(contexts: &mut Vec<PyContext>, indent: usize) {
2455    while contexts.len() > 1 && indent < contexts.last().map_or(0, |c| c.indent) {
2456        contexts.pop();
2457    }
2458}
2459
2460/// Handle `pending_block_indent` transition: push a new docstring-expecting context when we
2461/// detect the first indented line of a new block, or cancel the pending state otherwise.
2462fn py_handle_pending_indent(
2463    pending_block_indent: &mut Option<usize>,
2464    contexts: &mut Vec<PyContext>,
2465    indent: usize,
2466    trimmed: &str,
2467) {
2468    let Some(base_indent) = *pending_block_indent else {
2469        return;
2470    };
2471    if indent > base_indent {
2472        contexts.push(PyContext {
2473            indent,
2474            expect_docstring: true,
2475        });
2476        *pending_block_indent = None;
2477    } else if !trimmed.starts_with('@') {
2478        *pending_block_indent = None;
2479    }
2480}
2481
2482/// Check whether the current line is a docstring opener in the current context.
2483///
2484/// If it is, records the line, adjusts `ctx.expect_docstring`, and optionally sets
2485/// `active_docstring` for multi-line docstrings. Returns `true` when the caller should
2486/// `continue` to the next line.
2487fn py_try_record_docstring(
2488    ctx: &mut PyContext,
2489    trimmed: &str,
2490    idx: usize,
2491    docstring_lines: &mut HashSet<usize>,
2492    active_docstring: &mut Option<(&'static str, usize)>,
2493) -> bool {
2494    if !ctx.expect_docstring {
2495        return false;
2496    }
2497    if let Some(delim) = docstring_delimiter(trimmed) {
2498        docstring_lines.insert(idx);
2499        ctx.expect_docstring = false;
2500        if !closes_triple_docstring(trimmed, delim, true) {
2501            *active_docstring = Some((delim, idx));
2502        }
2503        return true;
2504    }
2505    ctx.expect_docstring = false;
2506    false
2507}
2508
2509/// Advance through an active multi-line docstring: marks the current line and clears
2510/// `active_docstring` when the closing delimiter is found. Returns `true` when the caller
2511/// should `continue` to the next line (i.e. we were inside a docstring).
2512fn track_active_docstring(
2513    active_docstring: &mut Option<(&'static str, usize)>,
2514    docstring_lines: &mut HashSet<usize>,
2515    idx: usize,
2516    trimmed: &str,
2517) -> bool {
2518    let Some((delim, start_line)) = *active_docstring else {
2519        return false;
2520    };
2521    docstring_lines.insert(idx);
2522    if closes_triple_docstring(trimmed, delim, idx == start_line) {
2523        *active_docstring = None;
2524    }
2525    true
2526}
2527
2528/// Attempt to record a docstring opener using the top of the context stack.
2529/// Returns `true` when the caller should `continue` to the next line.
2530fn try_record_docstring_if_context(
2531    contexts: &mut [PyContext],
2532    trimmed: &str,
2533    idx: usize,
2534    docstring_lines: &mut HashSet<usize>,
2535    active_docstring: &mut Option<(&'static str, usize)>,
2536) -> bool {
2537    let Some(ctx) = contexts.last_mut() else {
2538        return false;
2539    };
2540    py_try_record_docstring(ctx, trimmed, idx, docstring_lines, active_docstring)
2541}
2542
/// Mark every line from the start of a still-open docstring through end-of-file.
///
/// Does nothing when `active_docstring` is `None`. Otherwise all indices in
/// `start_line..num_lines` are inserted into `docstring_lines`.
fn mark_unclosed_docstring_lines(
    active_docstring: Option<&(&'static str, usize)>,
    docstring_lines: &mut HashSet<usize>,
    num_lines: usize,
) {
    if let Some((_, start_line)) = active_docstring {
        docstring_lines.extend(*start_line..num_lines);
    }
}
2555
2556fn detect_python_docstring_lines(text: &str) -> HashSet<usize> {
2557    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
2558    let lines: Vec<&str> = normalized.split_terminator('\n').collect();
2559
2560    let mut docstring_lines = HashSet::new();
2561    let mut contexts = vec![PyContext {
2562        indent: 0,
2563        expect_docstring: true,
2564    }];
2565    let mut pending_block_indent: Option<usize> = None;
2566    let mut active_docstring: Option<(&'static str, usize)> = None;
2567
2568    for (idx, line) in lines.iter().enumerate() {
2569        let trimmed = line.trim();
2570        let indent = leading_indent(line);
2571
2572        if track_active_docstring(&mut active_docstring, &mut docstring_lines, idx, trimmed) {
2573            continue;
2574        }
2575
2576        // Blank lines and comment lines don't affect docstring detection.
2577        if trimmed.is_empty() || trimmed.starts_with('#') {
2578            continue;
2579        }
2580
2581        py_pop_outdented_contexts(&mut contexts, indent);
2582        py_handle_pending_indent(&mut pending_block_indent, &mut contexts, indent, trimmed);
2583
2584        if try_record_docstring_if_context(
2585            &mut contexts,
2586            trimmed,
2587            idx,
2588            &mut docstring_lines,
2589            &mut active_docstring,
2590        ) {
2591            continue;
2592        }
2593
2594        if is_python_block_header(trimmed) {
2595            pending_block_indent = Some(indent);
2596        }
2597    }
2598
2599    mark_unclosed_docstring_lines(active_docstring.as_ref(), &mut docstring_lines, lines.len());
2600
2601    docstring_lines
2602}
2603
/// Count the whitespace characters at the start of `line`.
///
/// Every whitespace character counts as one, so a tab contributes 1.
fn leading_indent(line: &str) -> usize {
    let mut width = 0usize;
    for ch in line.chars() {
        if !ch.is_whitespace() {
            break;
        }
        width += 1;
    }
    width
}
2607
/// Report whether `trimmed` is a one-line `def`, `async def` or `class` header.
///
/// The line must start with one of those keywords followed by a space and must end
/// with `:`.
fn is_python_block_header(trimmed: &str) -> bool {
    const HEADER_KEYWORDS: [&str; 3] = ["def ", "async def ", "class "];
    trimmed.ends_with(':')
        && HEADER_KEYWORDS
            .iter()
            .any(|keyword| trimmed.starts_with(keyword))
}
2614
/// Return the triple-quote delimiter that `trimmed` starts with, if any.
///
/// Any run of string-prefix letters (`r`, `u`, `b`, `f` in either case) before the quotes
/// is skipped. `"""` is checked before `'''`.
fn docstring_delimiter(trimmed: &str) -> Option<&'static str> {
    let after_prefix = trimmed
        .trim_start_matches(|c: char| matches!(c, 'r' | 'R' | 'u' | 'U' | 'b' | 'B' | 'f' | 'F'));

    for delim in ["\"\"\"", "'''"] {
        if after_prefix.starts_with(delim) {
            return Some(delim);
        }
    }
    None
}
2636
/// Report whether a triple-quoted docstring closes on this line.
///
/// Counts the non-overlapping occurrences of `delim` in `trimmed`. On the line that opened
/// the docstring (`same_line_as_start`) the first occurrence is the opener, so two are
/// needed; on any later line one occurrence is enough.
///
/// An empty `delim` cannot close anything, so it yields `false`.
fn closes_triple_docstring(trimmed: &str, delim: &str, same_line_as_start: bool) -> bool {
    if delim.is_empty() {
        return false;
    }
    let required = if same_line_as_start { 2 } else { 1 };
    trimmed.matches(delim).count() >= required
}
2651
2652/// Tree-sitter-backed adapters (compiled only when the `tree-sitter` feature is enabled).
2653///
2654/// When parsing succeeds the result is used directly; on any failure the caller falls back
2655/// to the lexical state machine.
2656#[cfg(feature = "tree-sitter")]
2657pub mod ts {
2658    use tree_sitter::Node;
2659
2660    use super::{ParseMode, RawFileAnalysis, RawLineCounts};
2661
    /// Configuration for which AST node kinds map to symbols in this grammar.
    ///
    /// An empty string in any field disables the corresponding check.
    struct SymbolKinds {
        /// Node kind name for function definitions (e.g. `"function_definition"`).
        function_def: &'static str,
        /// Node kind name for class definitions (e.g. `"class_definition"`).
        class_def: &'static str,
        /// Prefix of a function node's `name` field that marks the function as a test;
        /// such a function is counted as a test instead of as a function.
        /// Empty string disables test-prefix detection.
        test_fn_prefix: &'static str,
        /// Prefix of a class node's `name` field that marks the class as a test;
        /// such a class is counted as a test instead of as a class.
        /// Empty string disables test-prefix detection.
        test_class_prefix: &'static str,
        /// When non-empty, `call` nodes whose `function` is an `attribute` access and whose
        /// attribute identifier starts with this prefix are counted as test assertions.
        /// Used for Python `self.assertXxx(...)` detection.
        assertion_attr_prefix: &'static str,
    }
2679
    impl SymbolKinds {
        /// Build a configuration with every field set to the empty string, which disables
        /// function, class, test-prefix and assertion detection alike.
        const fn none() -> Self {
            Self {
                function_def: "",
                class_def: "",
                test_fn_prefix: "",
                test_class_prefix: "",
                assertion_attr_prefix: "",
            }
        }
    }
2691
2692    /// Classify every line of `text` using a tree-sitter grammar.
2693    ///
2694    /// `comment_node_kinds` — node type names that represent comments in this grammar
2695    /// `docstring_stmt_kind` — optional parent node type whose direct `string` child is a docstring
2696    /// `symbols` — AST node kinds used to populate symbol counters
2697    fn analyze_lines(
2698        text: &str,
2699        ts_language: &tree_sitter::Language,
2700        comment_node_kinds: &[&str],
2701        docstring_stmt_kind: Option<&str>,
2702        symbols: &SymbolKinds,
2703    ) -> Option<RawFileAnalysis> {
2704        let mut parser = tree_sitter::Parser::new();
2705        parser.set_language(ts_language).ok()?;
2706        let tree = parser.parse(text, None)?;
2707
2708        let lines: Vec<&str> = text.split_terminator('\n').collect();
2709        let n = lines.len();
2710
2711        let mut has_code = vec![false; n];
2712        let mut has_comment = vec![false; n];
2713        let mut comment_is_block = vec![false; n];
2714        let mut has_docstring = vec![false; n];
2715
2716        // Walk every node in the tree and mark line arrays.
2717        let mut ctx = VisitCtx {
2718            source: text.as_bytes(),
2719            comment_kinds: comment_node_kinds,
2720            docstring_stmt_kind,
2721            has_code: &mut has_code,
2722            has_comment: &mut has_comment,
2723            comment_is_block: &mut comment_is_block,
2724            has_docstring: &mut has_docstring,
2725        };
2726        visit(tree.root_node(), &mut ctx);
2727
2728        let mut raw = RawLineCounts::default();
2729        classify_ts_lines(
2730            &lines,
2731            &has_code,
2732            &has_comment,
2733            &comment_is_block,
2734            &has_docstring,
2735            &mut raw,
2736        );
2737
2738        // Symbol counting: walk the AST a second time to collect function/class/test counts.
2739        if !symbols.function_def.is_empty() || !symbols.class_def.is_empty() {
2740            count_symbols(tree.root_node(), text.as_bytes(), symbols, &mut raw);
2741        }
2742
2743        Some(RawFileAnalysis {
2744            raw,
2745            parse_mode: ParseMode::TreeSitter,
2746            warnings: Vec::new(),
2747        })
2748    }
2749
2750    /// Recurse into every direct child of `node`.
2751    fn recurse_children(node: Node, source: &[u8], kinds: &SymbolKinds, raw: &mut RawLineCounts) {
2752        for i in 0..node.child_count() {
2753            #[allow(clippy::cast_possible_truncation)]
2754            if let Some(child) = node.child(i as u32) {
2755                count_symbols(child, source, kinds, raw);
2756            }
2757        }
2758    }
2759
2760    /// Handle a function-definition node. Returns `true` if the node matched.
2761    fn try_count_function(
2762        node: Node,
2763        source: &[u8],
2764        kinds: &SymbolKinds,
2765        raw: &mut RawLineCounts,
2766    ) -> bool {
2767        if kinds.function_def.is_empty() || node.kind() != kinds.function_def {
2768            return false;
2769        }
2770        let name = node
2771            .child_by_field_name("name")
2772            .and_then(|n| n.utf8_text(source).ok())
2773            .unwrap_or("");
2774        if !kinds.test_fn_prefix.is_empty() && name.starts_with(kinds.test_fn_prefix) {
2775            raw.test_count += 1;
2776        } else {
2777            raw.functions += 1;
2778        }
2779        recurse_children(node, source, kinds, raw);
2780        true
2781    }
2782
2783    /// Handle a class-definition node. Returns `true` if the node matched.
2784    fn try_count_class(
2785        node: Node,
2786        source: &[u8],
2787        kinds: &SymbolKinds,
2788        raw: &mut RawLineCounts,
2789    ) -> bool {
2790        if kinds.class_def.is_empty() || node.kind() != kinds.class_def {
2791            return false;
2792        }
2793        let name = node
2794            .child_by_field_name("name")
2795            .and_then(|n| n.utf8_text(source).ok())
2796            .unwrap_or("");
2797        if !kinds.test_class_prefix.is_empty() && name.starts_with(kinds.test_class_prefix) {
2798            raw.test_count += 1;
2799        } else {
2800            raw.classes += 1;
2801        }
2802        recurse_children(node, source, kinds, raw);
2803        true
2804    }
2805
2806    /// Handle an assertion call node. Returns `true` if the node matched (skips recursion
2807    /// into arguments, preserving "don't double-count test bodies" semantics).
2808    fn try_count_assertion(
2809        node: Node,
2810        source: &[u8],
2811        kinds: &SymbolKinds,
2812        raw: &mut RawLineCounts,
2813    ) -> bool {
2814        if kinds.assertion_attr_prefix.is_empty() || node.kind() != "call" {
2815            return false;
2816        }
2817        let Some(func) = node.child_by_field_name("function") else {
2818            return false;
2819        };
2820        if func.kind() != "attribute" {
2821            return false;
2822        }
2823        let attr_text = func
2824            .child_by_field_name("attribute")
2825            .and_then(|n| n.utf8_text(source).ok())
2826            .unwrap_or("");
2827        if !attr_text.starts_with(kinds.assertion_attr_prefix) {
2828            return false;
2829        }
2830        raw.test_assertion_count += 1;
2831        true
2832    }
2833
2834    /// Walk the AST and populate `raw.functions`, `raw.classes`, `raw.test_count`,
2835    /// and `raw.test_assertion_count`.
2836    fn count_symbols(node: Node, source: &[u8], kinds: &SymbolKinds, raw: &mut RawLineCounts) {
2837        if try_count_function(node, source, kinds, raw) {
2838            return;
2839        }
2840        if try_count_class(node, source, kinds, raw) {
2841            return;
2842        }
2843        if try_count_assertion(node, source, kinds, raw) {
2844            return;
2845        }
2846        recurse_children(node, source, kinds, raw);
2847    }
2848
2849    /// Flags describing what kinds of content appear on a single line.
2850    // Four bools are the natural representation for these four independent properties.
2851    #[allow(clippy::struct_excessive_bools)]
2852    #[derive(Clone, Copy)]
2853    struct TsLineFlags {
2854        has_code: bool,
2855        has_comment: bool,
2856        comment_is_block: bool,
2857        has_docstring: bool,
2858    }
2859
2860    /// Classify a single tree-sitter-annotated line and accumulate into `raw`.
2861    const fn classify_ts_line(trimmed: &str, flags: TsLineFlags, raw: &mut RawLineCounts) {
2862        if trimmed.is_empty() {
2863            raw.blank_only_lines += 1;
2864        } else if flags.has_docstring && !flags.has_code {
2865            raw.docstring_comment_lines += 1;
2866        } else if flags.has_code && flags.has_comment {
2867            // Classify the mixed line as single or multi based on what kind of comment is on it.
2868            if flags.comment_is_block {
2869                raw.mixed_code_multi_comment_lines += 1;
2870            } else {
2871                raw.mixed_code_single_comment_lines += 1;
2872            }
2873        } else if flags.has_comment {
2874            if flags.comment_is_block {
2875                raw.multi_comment_only_lines += 1;
2876            } else {
2877                raw.single_comment_only_lines += 1;
2878            }
2879        } else {
2880            raw.code_only_lines += 1;
2881        }
2882    }
2883
2884    /// Classify each tree-sitter-annotated line and accumulate counts into `raw`.
2885    fn classify_ts_lines(
2886        lines: &[&str],
2887        has_code: &[bool],
2888        has_comment: &[bool],
2889        comment_is_block: &[bool],
2890        has_docstring: &[bool],
2891        raw: &mut RawLineCounts,
2892    ) {
2893        for i in 0..lines.len() {
2894            raw.total_physical_lines += 1;
2895            classify_ts_line(
2896                lines[i].trim(),
2897                TsLineFlags {
2898                    has_code: has_code[i],
2899                    has_comment: has_comment[i],
2900                    comment_is_block: comment_is_block[i],
2901                    has_docstring: has_docstring[i],
2902                },
2903                raw,
2904            );
2905        }
2906    }
2907
2908    struct VisitCtx<'a> {
2909        source: &'a [u8],
2910        comment_kinds: &'a [&'a str],
2911        docstring_stmt_kind: Option<&'a str>,
2912        has_code: &'a mut Vec<bool>,
2913        has_comment: &'a mut Vec<bool>,
2914        comment_is_block: &'a mut Vec<bool>,
2915        has_docstring: &'a mut Vec<bool>,
2916    }
2917
2918    /// Mark all rows of a comment node and detect whether it is a block comment.
2919    fn visit_comment_node(node: Node, ctx: &mut VisitCtx<'_>) {
2920        let start_row = node.start_position().row;
2921        let end_row = node.end_position().row;
2922        let first_two = node
2923            .utf8_text(ctx.source)
2924            .unwrap_or("")
2925            .get(..2)
2926            .unwrap_or("");
2927        let is_block = first_two == "/*" || first_two == "<#";
2928        for row in start_row..=end_row {
2929            if row < ctx.has_comment.len() {
2930                ctx.has_comment[row] = true;
2931                if is_block {
2932                    ctx.comment_is_block[row] = true;
2933                }
2934            }
2935        }
2936    }
2937
2938    /// If `node` is an `expression_statement` whose sole named child is a string literal,
2939    /// mark those rows as docstring and return `true`.
2940    fn visit_maybe_docstring(node: Node, kind: &str, ctx: &mut VisitCtx<'_>) -> bool {
2941        let Some(stmt_kind) = ctx.docstring_stmt_kind else {
2942            return false;
2943        };
2944        if kind != stmt_kind || node.named_child_count() != 1 {
2945            return false;
2946        }
2947        let Some(child) = node.named_child(0) else {
2948            return false;
2949        };
2950        if child.kind() != "string" {
2951            return false;
2952        }
2953        let child_start = child.start_position().row;
2954        let child_end = child.end_position().row;
2955        for row in child_start..=child_end {
2956            if row < ctx.has_docstring.len() {
2957                ctx.has_docstring[row] = true;
2958            }
2959        }
2960        true
2961    }
2962
2963    /// Mark all rows of a leaf (non-comment, non-extra) node as code.
2964    fn visit_leaf_code(node: Node, ctx: &mut VisitCtx<'_>) {
2965        let start_row = node.start_position().row;
2966        let end_row = node.end_position().row;
2967        for row in start_row..=end_row {
2968            if row < ctx.has_code.len() {
2969                ctx.has_code[row] = true;
2970            }
2971        }
2972    }
2973
2974    #[allow(clippy::too_many_lines)]
2975    fn visit(node: Node, ctx: &mut VisitCtx<'_>) {
2976        let kind = node.kind();
2977
2978        // Comment node — mark rows as comment, detect block vs. line comment.
2979        if ctx.comment_kinds.contains(&kind) {
2980            visit_comment_node(node, ctx);
2981            return;
2982        }
2983
2984        // Python docstring: expression_statement whose only named child is a string literal.
2985        if visit_maybe_docstring(node, kind, ctx) {
2986            return;
2987        }
2988
2989        // Leaf non-comment node: mark as code.
2990        if node.child_count() == 0 && !node.is_extra() {
2991            visit_leaf_code(node, ctx);
2992            return;
2993        }
2994
2995        for i in 0..node.child_count() {
2996            #[allow(clippy::cast_possible_truncation)]
2997            // child_count bounded by tree-sitter u32 capacity
2998            if let Some(child) = node.child(i as u32) {
2999                visit(child, ctx);
3000            }
3001        }
3002    }
3003
3004    const C_SYMBOLS: SymbolKinds = SymbolKinds::none();
3005
3006    const PYTHON_SYMBOLS: SymbolKinds = SymbolKinds {
3007        function_def: "function_definition",
3008        class_def: "class_definition",
3009        test_fn_prefix: "test_",
3010        test_class_prefix: "Test",
3011        assertion_attr_prefix: "assert",
3012    };
3013
3014    /// Parse C or C++ source with tree-sitter-c.
3015    #[must_use]
3016    pub fn analyze_c(text: &str) -> Option<RawFileAnalysis> {
3017        let lang: tree_sitter::Language = tree_sitter_c::LANGUAGE.into();
3018        analyze_lines(text, &lang, &["comment"], None, &C_SYMBOLS)
3019    }
3020
3021    /// Parse Python source with tree-sitter-python.
3022    #[must_use]
3023    pub fn analyze_python(text: &str) -> Option<RawFileAnalysis> {
3024        let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
3025        analyze_lines(
3026            text,
3027            &lang,
3028            &["comment"],
3029            Some("expression_statement"),
3030            &PYTHON_SYMBOLS,
3031        )
3032    }
3033}
3034
#[cfg(test)]
mod tests {
    use super::*;

    // ── line classification ──────────────────────────────────────────────────

    /// Module and function docstrings are counted separately from code and comments.
    #[test]
    fn python_docstrings_are_separated() {
        let source = r#""""module docs"""


def fn_a():
    """function docs"""
    value = 1  # trailing comment
    return value
"#;

        let counts = analyze_text(Language::Python, source, AnalysisOptions::default()).raw;
        assert_eq!(counts.docstring_comment_lines, 2);
        assert_eq!(counts.mixed_code_single_comment_lines, 1);
        assert_eq!(counts.code_only_lines, 2);
    }

    /// A trailing `//` comment makes a mixed line; a lone `/* */` is a block-comment line.
    #[test]
    fn c_style_mixed_lines_are_captured() {
        let source = "int x = 1; // note\n/* block */\n";
        let counts = analyze_text(Language::C, source, AnalysisOptions::default()).raw;
        assert_eq!(counts.mixed_code_single_comment_lines, 1);
        assert_eq!(counts.multi_comment_only_lines, 1);
    }

    // ── language detection ───────────────────────────────────────────────────

    /// An extension-less file is identified from its shebang line.
    #[test]
    fn detect_language_by_shebang() {
        let detected = detect_language(
            Path::new("script"),
            Some("#!/usr/bin/env bash"),
            &BTreeMap::new(),
            true,
        );
        assert_eq!(detected, Some(Language::Shell));
    }

    // ── count_symbols: no double-counting of test functions ──────────────────

    /// The symbol counters the tests below assert on.
    struct SymbolCounts {
        functions: u64,
        classes: u64,
        tests: u64,
    }

    /// Analyse the single line `line` (a trailing newline is appended) as `lang` with
    /// default options and return the resulting symbol counters.
    fn sym(lang: Language, line: &str) -> SymbolCounts {
        let source = format!("{line}\n");
        let analysis = analyze_text(lang, &source, AnalysisOptions::default());
        SymbolCounts {
            functions: analysis.raw.functions,
            classes: analysis.raw.classes,
            tests: analysis.raw.test_count,
        }
    }

    #[test]
    fn python_test_fn_not_double_counted() {
        // A `def test_…` line is a test only; it must not be counted as a function too.
        let counts = sym(Language::Python, "def test_foo():");
        assert_eq!(
            counts.functions, 0,
            "test fn must not also increment functions"
        );
        assert_eq!(counts.tests, 1, "must be counted as a test");
        assert_eq!(counts.classes, 0);
    }

    #[test]
    fn python_test_class_not_double_counted() {
        // A `class Test…` line is a test only; it must not be counted as a class too.
        let counts = sym(Language::Python, "class TestFoo:");
        assert_eq!(
            counts.classes, 0,
            "test class must not also increment classes"
        );
        assert_eq!(counts.tests, 1, "must be counted as a test");
        assert_eq!(counts.functions, 0);
    }

    #[test]
    fn python_regular_fn_counts_as_function() {
        let counts = sym(Language::Python, "def regular():");
        assert_eq!(counts.functions, 1, "regular function must be counted");
        assert_eq!(counts.tests, 0);
        assert_eq!(counts.classes, 0);
    }

    #[test]
    fn python_regular_class_counts_as_class() {
        let counts = sym(Language::Python, "class Regular:");
        assert_eq!(counts.classes, 1, "regular class must be counted");
        assert_eq!(counts.tests, 0);
        assert_eq!(counts.functions, 0);
    }

    #[test]
    fn go_test_fn_not_double_counted() {
        let counts = sym(Language::Go, "func TestFoo(t *testing.T) {");
        assert_eq!(
            counts.functions, 0,
            "Go test func must not also increment functions"
        );
        assert_eq!(counts.tests, 1, "must be counted as a test");
    }

    #[test]
    fn go_benchmark_fn_not_double_counted() {
        let counts = sym(Language::Go, "func BenchmarkBar(b *testing.B) {");
        assert_eq!(
            counts.functions, 0,
            "Go benchmark func must not also increment functions"
        );
        assert_eq!(counts.tests, 1, "must be counted as a test");
    }

    #[test]
    fn go_regular_fn_counts_as_function() {
        let counts = sym(Language::Go, "func doSomething() {");
        assert_eq!(counts.functions, 1, "regular Go func must be counted");
        assert_eq!(counts.tests, 0);
    }

    #[test]
    fn rust_test_attr_counts_as_test_not_function() {
        // `#[test]` sits on its own line: it is a test marker, never a function.
        let counts = sym(Language::Rust, "#[test]");
        assert_eq!(counts.tests, 1, "#[test] must be counted as a test");
        assert_eq!(
            counts.functions, 0,
            "#[test] attribute must not be counted as a function"
        );
    }

    #[test]
    fn rust_fn_line_counts_as_function_not_test() {
        // The `fn` line that follows `#[test]` matches no test pattern by itself.
        let counts = sym(Language::Rust, "fn test_something() {");
        assert_eq!(counts.functions, 1, "fn declaration must count as a function");
        assert_eq!(
            counts.tests, 0,
            "fn declaration line must not be double-counted as a test"
        );
    }

    #[test]
    fn js_describe_counts_as_test_not_function() {
        let counts = sym(Language::JavaScript, "describe('suite', () => {");
        assert_eq!(counts.tests, 1, "describe must be counted as a test");
        assert_eq!(
            counts.functions, 0,
            "describe must not be counted as a function"
        );
    }

    #[test]
    fn js_regular_fn_counts_as_function() {
        let counts = sym(Language::JavaScript, "function doWork() {");
        assert_eq!(counts.functions, 1, "JS function declaration must be counted");
        assert_eq!(counts.tests, 0);
    }
}