Skip to main content

sloc_languages/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3
4pub mod cpp_style;
5pub mod style;
6pub use cpp_style::{BraceStyle, CppStyleAnalysis, IndentStyle, PointerStyle, StyleGuideScore};
7pub use style::{StyleAnalysis, StyleSignal};
8
9use std::collections::{BTreeMap, BTreeSet, HashSet};
10use std::path::Path;
11
12use serde::{Deserialize, Serialize};
13
/// Source languages recognised by this crate.
///
/// Serialized with serde's `snake_case` renaming, so multi-word variants gain an underscore
/// (`CSharp` <-> `"c_sharp"`, `JavaScript` <-> `"java_script"`). Those names differ from the
/// underscore-free slugs returned by [`Language::as_slug`].
///
/// The derived `PartialOrd`/`Ord` follow declaration order, so reordering variants changes the
/// iteration order of any `BTreeSet<Language>` or `BTreeMap<Language, _>`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Language {
    C,
    Cpp,
    CSharp,
    Go,
    Java,
    JavaScript,
    Python,
    Rust,
    Shell,
    PowerShell,
    TypeScript,
    // --- Extended language support (alphabetical from here on) ---
    Assembly,
    Clojure,
    Css,
    Dart,
    Dockerfile,
    Elixir,
    Erlang,
    FSharp,
    Groovy,
    Haskell,
    Html,
    Julia,
    Kotlin,
    Lua,
    Makefile,
    Nim,
    ObjectiveC,
    Ocaml,
    Perl,
    Php,
    R,
    Ruby,
    Scala,
    Scss,
    Sql,
    Svelte,
    Swift,
    Vue,
    Xml,
    Zig,
}
60
61impl Language {
62    #[must_use]
63    pub const fn display_name(&self) -> &'static str {
64        match self {
65            Self::C => "C",
66            Self::Cpp => "C++",
67            Self::CSharp => "C#",
68            Self::Go => "Go",
69            Self::Java => "Java",
70            Self::JavaScript => "JavaScript",
71            Self::Python => "Python",
72            Self::Rust => "Rust",
73            Self::Shell => "Shell",
74            Self::PowerShell => "PowerShell",
75            Self::TypeScript => "TypeScript",
76            Self::Assembly => "Assembly",
77            Self::Clojure => "Clojure",
78            Self::Css => "CSS",
79            Self::Dart => "Dart",
80            Self::Dockerfile => "Dockerfile",
81            Self::Elixir => "Elixir",
82            Self::Erlang => "Erlang",
83            Self::FSharp => "F#",
84            Self::Groovy => "Groovy",
85            Self::Haskell => "Haskell",
86            Self::Html => "HTML",
87            Self::Julia => "Julia",
88            Self::Kotlin => "Kotlin",
89            Self::Lua => "Lua",
90            Self::Makefile => "Makefile",
91            Self::Nim => "Nim",
92            Self::ObjectiveC => "Objective-C",
93            Self::Ocaml => "OCaml",
94            Self::Perl => "Perl",
95            Self::Php => "PHP",
96            Self::R => "R",
97            Self::Ruby => "Ruby",
98            Self::Scala => "Scala",
99            Self::Scss => "SCSS",
100            Self::Sql => "SQL",
101            Self::Svelte => "Svelte",
102            Self::Swift => "Swift",
103            Self::Vue => "Vue",
104            Self::Xml => "XML",
105            Self::Zig => "Zig",
106        }
107    }
108
109    #[must_use]
110    pub const fn as_slug(&self) -> &'static str {
111        match self {
112            Self::C => "c",
113            Self::Cpp => "cpp",
114            Self::CSharp => "csharp",
115            Self::Go => "go",
116            Self::Java => "java",
117            Self::JavaScript => "javascript",
118            Self::Python => "python",
119            Self::Rust => "rust",
120            Self::Shell => "shell",
121            Self::PowerShell => "powershell",
122            Self::TypeScript => "typescript",
123            Self::Assembly => "assembly",
124            Self::Clojure => "clojure",
125            Self::Css => "css",
126            Self::Dart => "dart",
127            Self::Dockerfile => "dockerfile",
128            Self::Elixir => "elixir",
129            Self::Erlang => "erlang",
130            Self::FSharp => "fsharp",
131            Self::Groovy => "groovy",
132            Self::Haskell => "haskell",
133            Self::Html => "html",
134            Self::Julia => "julia",
135            Self::Kotlin => "kotlin",
136            Self::Lua => "lua",
137            Self::Makefile => "makefile",
138            Self::Nim => "nim",
139            Self::ObjectiveC => "objectivec",
140            Self::Ocaml => "ocaml",
141            Self::Perl => "perl",
142            Self::Php => "php",
143            Self::R => "r",
144            Self::Ruby => "ruby",
145            Self::Scala => "scala",
146            Self::Scss => "scss",
147            Self::Sql => "sql",
148            Self::Svelte => "svelte",
149            Self::Swift => "swift",
150            Self::Vue => "vue",
151            Self::Xml => "xml",
152            Self::Zig => "zig",
153        }
154    }
155
156    #[must_use]
157    pub fn from_name(name: &str) -> Option<Self> {
158        match name.trim().to_ascii_lowercase().as_str() {
159            "c" => Some(Self::C),
160            "cpp" | "c++" | "cplusplus" => Some(Self::Cpp),
161            "csharp" | "c#" | "cs" => Some(Self::CSharp),
162            "go" | "golang" => Some(Self::Go),
163            "java" => Some(Self::Java),
164            "javascript" | "js" => Some(Self::JavaScript),
165            "python" | "py" => Some(Self::Python),
166            "rust" | "rs" => Some(Self::Rust),
167            "shell" | "sh" | "bash" => Some(Self::Shell),
168            "powershell" | "pwsh" | "ps" => Some(Self::PowerShell),
169            "typescript" | "ts" => Some(Self::TypeScript),
170            "assembly" | "asm" => Some(Self::Assembly),
171            "clojure" | "clj" => Some(Self::Clojure),
172            "css" => Some(Self::Css),
173            "dart" => Some(Self::Dart),
174            "dockerfile" | "docker" => Some(Self::Dockerfile),
175            "elixir" | "ex" => Some(Self::Elixir),
176            "erlang" | "erl" => Some(Self::Erlang),
177            "fsharp" | "f#" | "fs" => Some(Self::FSharp),
178            "groovy" => Some(Self::Groovy),
179            "haskell" | "hs" => Some(Self::Haskell),
180            "html" | "htm" => Some(Self::Html),
181            "julia" | "jl" => Some(Self::Julia),
182            "kotlin" | "kt" => Some(Self::Kotlin),
183            "lua" => Some(Self::Lua),
184            "makefile" | "make" | "mk" => Some(Self::Makefile),
185            "nim" => Some(Self::Nim),
186            "objectivec" | "objc" | "objective-c" => Some(Self::ObjectiveC),
187            "ocaml" | "ml" => Some(Self::Ocaml),
188            "perl" | "pl" => Some(Self::Perl),
189            "php" => Some(Self::Php),
190            "r" => Some(Self::R),
191            "ruby" | "rb" => Some(Self::Ruby),
192            "scala" => Some(Self::Scala),
193            "scss" | "sass" => Some(Self::Scss),
194            "sql" => Some(Self::Sql),
195            "svelte" => Some(Self::Svelte),
196            "swift" => Some(Self::Swift),
197            "vue" => Some(Self::Vue),
198            "xml" => Some(Self::Xml),
199            "zig" => Some(Self::Zig),
200            _ => None,
201        }
202    }
203}
204
/// Per-file line tallies and best-effort symbol counts.
///
/// Every counter starts at zero via the derived `Default`. Fields marked `#[serde(default)]`
/// may be absent from serialized input, in which case they deserialize as `0`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct RawLineCounts {
    // Physical-line classification buckets. The rules that assign a line to a bucket live in
    // the scanner, not in this struct.
    pub total_physical_lines: u64,
    pub blank_only_lines: u64,
    pub code_only_lines: u64,
    pub single_comment_only_lines: u64,
    pub multi_comment_only_lines: u64,
    pub mixed_code_single_comment_lines: u64,
    pub mixed_code_multi_comment_lines: u64,
    pub docstring_comment_lines: u64,
    pub skipped_unknown_lines: u64,
    /// Best-effort count of function/method definition lines detected lexically.
    #[serde(default)]
    pub functions: u64,
    /// Best-effort count of class/struct/trait/type definition lines detected lexically.
    #[serde(default)]
    pub classes: u64,
    /// Best-effort count of variable declaration lines detected lexically.
    #[serde(default)]
    pub variables: u64,
    /// Best-effort count of import/use/include statement lines detected lexically.
    #[serde(default)]
    pub imports: u64,
    /// Lines consisting solely of preprocessor/compiler directives (e.g. `#include`, `#define`
    /// in C/C++/Objective-C). Always a subset of `code_only_lines`. Controlled by
    /// `AnalysisConfig::count_compiler_directives`. IEEE 1045-1992 §4.2.
    #[serde(default)]
    pub compiler_directive_lines: u64,
    /// Best-effort count of test case / test function definition lines detected lexically
    /// (`GTest`, Catch2, `PyTest`, `JUnit`, etc.). Always a subset of `code_only_lines`.
    #[serde(default)]
    pub test_count: u64,
    /// Best-effort count of test assertion call lines detected lexically
    /// (`ASSERT_EQ`, `EXPECT_TRUE`, `assertEquals`, `Assert.AreEqual`, `assert_eq!`, etc.).
    #[serde(default)]
    pub test_assertion_count: u64,
    /// Best-effort count of test suite / fixture / group declaration lines detected lexically
    /// (`TEST_GROUP`, `BOOST_AUTO_TEST_SUITE`, `[TestClass]`, `[TestFixture]`, etc.).
    #[serde(default)]
    pub test_suite_count: u64,
}
246
/// Records which analysis strategy produced a [`RawFileAnalysis`].
///
/// Serialized in `snake_case`: `"lexical"`, `"lexical_best_effort"`, `"tree_sitter"`.
// NOTE(review): what separates `Lexical` from `LexicalBestEffort` is decided by the scanner,
// which is not visible from this definition — document the criterion once confirmed.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ParseMode {
    Lexical,
    LexicalBestEffort,
    TreeSitter,
}
254
/// Result of analysing the text of a single file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RawFileAnalysis {
    /// Line tallies and best-effort symbol counts for the file.
    pub raw: RawLineCounts,
    /// Strategy that produced `raw`.
    pub parse_mode: ParseMode,
    /// Free-form diagnostic messages collected while analysing the file.
    pub warnings: Vec<String>,
    /// Lexical style-guide analysis for supported languages; `None` when no heuristics apply.
    /// Omitted from serialized output when `None`, and read back as `None` when missing.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub style_analysis: Option<StyleAnalysis>,
}
264
/// Counting options from IEEE 1045-1992, supplied by `sloc-core` (derived from
/// `AnalysisConfig`).
///
/// The standard treats these behaviours as configurable parameters rather than fixed
/// conventions, so `analyze_text` takes them from the caller instead of hard-coding them.
#[derive(Debug, Clone, Copy)]
pub struct AnalysisOptions {
    /// If `true` (the IEEE 1045-1992 default), a blank line inside a block comment is counted
    /// as a comment line instead of a blank line.
    pub blank_in_block_comment_as_comment: bool,
    /// If `true`, physical lines joined by a trailing backslash are collapsed into one logical
    /// line for SLOC counting (IEEE logical SLOC mode).
    pub collapse_continuation_lines: bool,
}

impl Default for AnalysisOptions {
    /// IEEE 1045-1992 defaults: blank lines in block comments count as comments, and
    /// continuation lines are counted physically (not collapsed).
    fn default() -> Self {
        let blank_in_block_comment_as_comment = true;
        let collapse_continuation_lines = false;
        AnalysisOptions {
            blank_in_block_comment_as_comment,
            collapse_continuation_lines,
        }
    }
}
287
288#[must_use]
289pub fn supported_languages() -> BTreeSet<Language> {
290    [
291        Language::Assembly,
292        Language::C,
293        Language::Clojure,
294        Language::Cpp,
295        Language::CSharp,
296        Language::Css,
297        Language::Dart,
298        Language::Dockerfile,
299        Language::Elixir,
300        Language::Erlang,
301        Language::FSharp,
302        Language::Go,
303        Language::Groovy,
304        Language::Haskell,
305        Language::Html,
306        Language::Java,
307        Language::JavaScript,
308        Language::Julia,
309        Language::Kotlin,
310        Language::Lua,
311        Language::Makefile,
312        Language::Nim,
313        Language::ObjectiveC,
314        Language::Ocaml,
315        Language::Perl,
316        Language::Php,
317        Language::PowerShell,
318        Language::Python,
319        Language::R,
320        Language::Ruby,
321        Language::Rust,
322        Language::Scala,
323        Language::Scss,
324        Language::Shell,
325        Language::Sql,
326        Language::Svelte,
327        Language::Swift,
328        Language::TypeScript,
329        Language::Vue,
330        Language::Xml,
331        Language::Zig,
332    ]
333    .into_iter()
334    .collect()
335}
336
337/// Detect language from a shebang line (e.g. `#!/usr/bin/env python3`).
338fn detect_by_shebang(line: &str) -> Option<Language> {
339    let lower = line.to_ascii_lowercase();
340    if !lower.starts_with("#!") {
341        return None;
342    }
343    if lower.contains("python") {
344        return Some(Language::Python);
345    }
346    if lower.contains("pwsh") || lower.contains("powershell") {
347        return Some(Language::PowerShell);
348    }
349    if lower.contains("bash")
350        || lower.contains("/sh")
351        || lower.contains("zsh")
352        || lower.contains("ksh")
353    {
354        return Some(Language::Shell);
355    }
356    if lower.contains("ruby") {
357        return Some(Language::Ruby);
358    }
359    if lower.contains("perl") {
360        return Some(Language::Perl);
361    }
362    if lower.contains("php") {
363        return Some(Language::Php);
364    }
365    if lower.contains("node") || lower.contains("nodejs") {
366        return Some(Language::JavaScript);
367    }
368    None
369}
370
371/// Detect language purely from a (lowercased) file extension.
372fn detect_by_extension(ext: &str) -> Option<Language> {
373    // Static table avoids a large match statement; each extension maps 1-to-1 to a language.
374    static EXT_MAP: &[(&str, Language)] = &[
375        ("c", Language::C),
376        ("h", Language::C),
377        ("cc", Language::Cpp),
378        ("cp", Language::Cpp),
379        ("cpp", Language::Cpp),
380        ("cxx", Language::Cpp),
381        ("hh", Language::Cpp),
382        ("hpp", Language::Cpp),
383        ("hxx", Language::Cpp),
384        ("cs", Language::CSharp),
385        ("go", Language::Go),
386        ("java", Language::Java),
387        ("js", Language::JavaScript),
388        ("mjs", Language::JavaScript),
389        ("cjs", Language::JavaScript),
390        ("py", Language::Python),
391        ("rs", Language::Rust),
392        ("sh", Language::Shell),
393        ("bash", Language::Shell),
394        ("zsh", Language::Shell),
395        ("ksh", Language::Shell),
396        ("ps1", Language::PowerShell),
397        ("psm1", Language::PowerShell),
398        ("psd1", Language::PowerShell),
399        ("ts", Language::TypeScript),
400        ("mts", Language::TypeScript),
401        ("cts", Language::TypeScript),
402        ("asm", Language::Assembly),
403        ("s", Language::Assembly),
404        ("clj", Language::Clojure),
405        ("cljs", Language::Clojure),
406        ("cljc", Language::Clojure),
407        ("edn", Language::Clojure),
408        ("css", Language::Css),
409        ("dart", Language::Dart),
410        ("ex", Language::Elixir),
411        ("exs", Language::Elixir),
412        ("erl", Language::Erlang),
413        ("hrl", Language::Erlang),
414        ("fs", Language::FSharp),
415        ("fsi", Language::FSharp),
416        ("fsx", Language::FSharp),
417        ("groovy", Language::Groovy),
418        ("gradle", Language::Groovy),
419        ("hs", Language::Haskell),
420        ("lhs", Language::Haskell),
421        ("html", Language::Html),
422        ("htm", Language::Html),
423        ("xhtml", Language::Html),
424        ("jl", Language::Julia),
425        ("kt", Language::Kotlin),
426        ("kts", Language::Kotlin),
427        ("lua", Language::Lua),
428        ("mk", Language::Makefile),
429        ("nim", Language::Nim),
430        ("nims", Language::Nim),
431        ("m", Language::ObjectiveC),
432        ("mm", Language::ObjectiveC),
433        ("ml", Language::Ocaml),
434        ("mli", Language::Ocaml),
435        ("pl", Language::Perl),
436        ("pm", Language::Perl),
437        ("t", Language::Perl),
438        ("php", Language::Php),
439        ("php3", Language::Php),
440        ("php4", Language::Php),
441        ("php5", Language::Php),
442        ("php7", Language::Php),
443        ("phtml", Language::Php),
444        ("r", Language::R),
445        ("rb", Language::Ruby),
446        ("rake", Language::Ruby),
447        ("scala", Language::Scala),
448        ("sc", Language::Scala),
449        ("scss", Language::Scss),
450        ("sass", Language::Scss),
451        ("sql", Language::Sql),
452        ("svelte", Language::Svelte),
453        ("swift", Language::Swift),
454        ("vue", Language::Vue),
455        ("xml", Language::Xml),
456        ("xsd", Language::Xml),
457        ("xsl", Language::Xml),
458        ("xslt", Language::Xml),
459        ("svg", Language::Xml),
460        ("zig", Language::Zig),
461    ];
462    EXT_MAP.iter().find_map(|&(e, l)| (e == ext).then_some(l))
463}
464
465/// Detect language from an exact filename (no extension) or well-known filename patterns.
466fn detect_by_filename(filename: &str, filename_lower: &str) -> Option<Language> {
467    // Dockerfile: exact name or Dockerfile.* variant
468    if filename == "Dockerfile"
469        || filename.starts_with("Dockerfile.")
470        || filename_lower == "dockerfile"
471    {
472        return Some(Language::Dockerfile);
473    }
474    // Makefile variants
475    if matches!(
476        filename,
477        "Makefile" | "GNUmakefile" | "makefile" | "BSDmakefile"
478    ) {
479        return Some(Language::Makefile);
480    }
481    // Ruby ecosystem files that have no extension
482    if matches!(
483        filename,
484        "Rakefile" | "Gemfile" | "Guardfile" | "Vagrantfile" | "Fastfile" | "Podfile"
485    ) {
486        return Some(Language::Ruby);
487    }
488    None
489}
490
491#[must_use]
492#[allow(clippy::too_many_lines)]
493pub fn detect_language(
494    path: &Path,
495    first_line: Option<&str>,
496    extension_overrides: &BTreeMap<String, String>,
497    shebang_detection: bool,
498) -> Option<Language> {
499    let extension = path
500        .extension()
501        .and_then(|ext| ext.to_str())
502        .map(str::to_ascii_lowercase);
503
504    // Extension override check (user-configured mappings win over everything)
505    if let Some(ext) = extension.as_ref() {
506        if let Some(override_name) = extension_overrides.get(ext.as_str()) {
507            if let Some(lang) = Language::from_name(override_name) {
508                return Some(lang);
509            }
510        }
511    }
512
513    // Filename-based detection for files that have no extension or use exact names
514    let filename = path.file_name().and_then(|s| s.to_str()).unwrap_or("");
515    let filename_lower = filename.to_ascii_lowercase();
516
517    if let Some(lang) = detect_by_filename(filename, &filename_lower) {
518        return Some(lang);
519    }
520
521    // Extension-based detection
522    if let Some(lang) = extension.as_deref().and_then(detect_by_extension) {
523        return Some(lang);
524    }
525
526    // Shebang detection (last resort — only for extensionless scripts)
527    if shebang_detection {
528        if let Some(line) = first_line {
529            if let Some(lang) = detect_by_shebang(line) {
530                return Some(lang);
531            }
532        }
533    }
534
535    None
536}
537
538#[must_use]
539pub fn analyze_text(language: Language, text: &str, options: AnalysisOptions) -> RawFileAnalysis {
540    // tree-sitter fast-paths (compiled out when feature is disabled)
541    #[cfg(feature = "tree-sitter")]
542    {
543        match language {
544            Language::C | Language::Cpp => {
545                if let Some(mut result) = ts::analyze_c(text) {
546                    result.style_analysis = style::analyze_style(language, text);
547                    return result;
548                }
549            }
550            Language::Python => {
551                if let Some(result) = ts::analyze_python(text) {
552                    return result;
553                }
554            }
555            _ => {}
556        }
557    }
558
559    let (mut config, has_preprocessor) = language_scan_config(language);
560
561    // Python docstring lines are computed from the text and cannot be a static constant.
562    if language == Language::Python {
563        config.skip_lines = detect_python_docstring_lines(text);
564    }
565
566    // C, C++, and Objective-C have a preprocessor whose directive lines are tracked separately
567    // per IEEE 1045-1992 §4.2; every other language uses base flags.
568    let flags = IeeeFlags {
569        has_preprocessor_directives: has_preprocessor,
570        blank_in_block_comment_as_comment: options.blank_in_block_comment_as_comment,
571        collapse_continuation_lines: options.collapse_continuation_lines,
572    };
573    let mut result = analyze_generic(text, config, flags);
574    result.style_analysis = style::analyze_style(language, text);
575    result
576}
577
578/// Returns the lexical scan configuration for `language` and whether it uses a C preprocessor.
579/// All fields are static constants except `skip_lines`, which is always empty here; callers that
580/// need non-empty skip sets (currently only Python) must populate the field after this call.
581///
582/// The implementation delegates to `LANG_SCAN_TABLE` (a static `&[(Language, StaticLangConfig)]`)
583/// defined below the `SP_*` symbol-pattern constants.  Each language appears exactly once in the
584/// table, so the linear scan is O(|languages|) but avoids a 41-arm `match` statement.
585fn language_scan_config(language: Language) -> (ScanConfig, bool) {
586    let cfg = LANG_SCAN_TABLE
587        .iter()
588        .find_map(|&(l, c)| (l == language).then_some(c))
589        .unwrap_or_else(|| panic!("language_scan_config: no entry for {language:?}"));
590    (
591        ScanConfig {
592            line_comments: cfg.line_comments,
593            block_comment: cfg.block_comment,
594            allow_single_quote_strings: cfg.allow_single_quote_strings,
595            allow_double_quote_strings: cfg.allow_double_quote_strings,
596            allow_triple_quote_strings: cfg.allow_triple_quote_strings,
597            allow_csharp_verbatim_strings: cfg.allow_csharp_verbatim_strings,
598            skip_lines: HashSet::new(),
599            symbol_patterns: cfg.symbol_patterns,
600        },
601        cfg.has_preprocessor,
602    )
603}
604
/// Keyword prefixes, per language, that drive best-effort structural symbol detection.
///
/// Every slice holds line prefixes compared against a line after its leading whitespace has
/// been removed; a match marks the line as a definition of that category. An empty slice
/// switches detection off for the category.
#[derive(Debug, Clone, Copy)]
struct SymbolPatterns {
    /// Prefixes that mark a function or method definition.
    functions: &'static [&'static str],
    /// Prefixes that count as a function only when the line ALSO contains `(` AND no `=`
    /// appears between the prefix and the first `(`.  Needed for C/C++, where a definition
    /// starts with its return type (`void`, `int`, `bool`, …) rather than a keyword; the paren
    /// guard separates `void f(x)` from `void* p = malloc(n)`.
    functions_prefix_paren: &'static [&'static str],
    /// Prefixes that mark a class/struct/trait/type definition.
    classes: &'static [&'static str],
    /// Prefixes that mark a variable declaration.
    variables: &'static [&'static str],
    /// Prefixes that mark an import/use/include statement.
    imports: &'static [&'static str],
    /// Prefixes that mark a test case or test function definition. Like the other categories,
    /// these are matched against code lines only.
    tests: &'static [&'static str],
    /// Prefixes that mark a test assertion call (`ASSERT_EQ`, `assertEquals`, `assert_eq!`,
    /// `Assert.AreEqual`, etc.). Matched against code lines only.
    assertions: &'static [&'static str],
    /// Prefixes that mark a test suite / fixture / group declaration
    /// (`TEST_GROUP`, `BOOST_AUTO_TEST_SUITE`, `[TestClass]`, `[TestFixture]`, etc.).
    test_suites: &'static [&'static str],
}

impl SymbolPatterns {
    /// Pattern set with every category empty, i.e. all symbol detection disabled.
    const fn none() -> Self {
        const EMPTY: &[&str] = &[];
        Self {
            functions: EMPTY,
            functions_prefix_paren: EMPTY,
            classes: EMPTY,
            variables: EMPTY,
            imports: EMPTY,
            tests: EMPTY,
            assertions: EMPTY,
            test_suites: EMPTY,
        }
    }
}

const SP_NONE: SymbolPatterns = SymbolPatterns::none(); // all fields are &[]
647
/// Rust-syntax symbol-detection prefixes.
///
/// The `Language` -> pattern-set wiring lives in `LANG_SCAN_TABLE`, which is outside this
/// definition.
const SP_RUST: SymbolPatterns = SymbolPatterns {
    // NOTE(review): `"extern fn "` cannot match the usual `extern "C" fn` spelling because the
    // ABI string sits between the two keywords — confirm whether that form should be listed.
    functions: &[
        "fn ",
        "pub fn ",
        "pub(crate) fn ",
        "pub(super) fn ",
        "async fn ",
        "pub async fn ",
        "pub(crate) async fn ",
        "unsafe fn ",
        "pub unsafe fn ",
        "pub(crate) unsafe fn ",
        "const fn ",
        "pub const fn ",
        "pub(crate) const fn ",
        "extern fn ",
        "pub extern fn ",
    ],
    // Rust has a dedicated `fn` keyword, so the return-type/paren heuristic is not used.
    functions_prefix_paren: &[],
    // `impl` blocks and type aliases are tallied in the same category as struct/enum/trait.
    classes: &[
        "struct ",
        "pub struct ",
        "pub(crate) struct ",
        "enum ",
        "pub enum ",
        "pub(crate) enum ",
        "trait ",
        "pub trait ",
        "pub(crate) trait ",
        "impl ",
        "impl<",
        "type ",
        "pub type ",
        "pub(crate) type ",
    ],
    // Every line starting with "let mut " also starts with "let ".
    variables: &["let ", "let mut "],
    imports: &["use ", "pub use ", "pub(crate) use ", "extern crate "],
    // Built-in #[test], tokio/actix async test attributes, rstest
    tests: &[
        "#[test]",
        "#[tokio::test]",
        "#[actix_web::test]",
        "#[rstest]",
        "#[test_case",
    ],
    assertions: &[
        "assert_eq!(",
        "assert_ne!(",
        "assert!(",
        "assert_matches!(",
        "assert_err!(",
        "assert_ok!(",
    ],
    // No suite/fixture prefixes are defined for Rust; detection is disabled.
    test_suites: &[],
};
703
/// Python-syntax symbol-detection prefixes.
///
/// The `Language` -> pattern-set wiring lives in `LANG_SCAN_TABLE`, which is outside this
/// definition.
const SP_PYTHON: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "async def "],
    functions_prefix_paren: &[],
    classes: &["class "],
    // Python has no declaration keyword, so variable detection is disabled.
    variables: &[],
    imports: &["import ", "from "],
    // pytest: test_ prefix functions and Test* classes; unittest: test_ methods
    tests: &["def test_", "async def test_", "class Test"],
    // Only unittest-style `self.assert*` calls are listed; bare `assert` statements are not.
    assertions: &[
        "self.assertEqual(",
        "self.assertNotEqual(",
        "self.assertTrue(",
        "self.assertFalse(",
        "self.assertIsNone(",
        "self.assertIsNotNone(",
        "self.assertIn(",
        "self.assertNotIn(",
        "self.assertRaises(",
        "self.assertAlmostEqual(",
    ],
    test_suites: &[],
};
726
/// JavaScript-syntax symbol-detection prefixes.
///
/// The `Language` -> pattern-set wiring lives in `LANG_SCAN_TABLE`, which is outside this
/// definition.
const SP_JS: SymbolPatterns = SymbolPatterns {
    // Only `function` declarations are listed; arrow functions and methods have no prefix here.
    functions: &[
        "function ",
        "async function ",
        "export function ",
        "export async function ",
        "export default function ",
    ],
    functions_prefix_paren: &[],
    classes: &["class ", "export class ", "export default class "],
    variables: &[
        "var ",
        "let ",
        "const ",
        "export var ",
        "export let ",
        "export const ",
    ],
    imports: &["import "],
    // Jest/Mocha/Jasmine: describe/it/test block openers
    tests: &[
        "describe(",
        "it(",
        "test(",
        "it.each(",
        "test.each(",
        "describe.each(",
    ],
    assertions: &["expect("],
    test_suites: &[],
};
758
/// TypeScript-syntax symbol-detection prefixes.
///
/// Identical to `SP_JS` except that `classes` also covers abstract classes, interfaces and
/// `declare` forms. The `Language` -> pattern-set wiring lives in `LANG_SCAN_TABLE`, which is
/// outside this definition.
const SP_TS: SymbolPatterns = SymbolPatterns {
    functions: &[
        "function ",
        "async function ",
        "export function ",
        "export async function ",
        "export default function ",
    ],
    functions_prefix_paren: &[],
    classes: &[
        "class ",
        "export class ",
        "export default class ",
        "abstract class ",
        "export abstract class ",
        "interface ",
        "export interface ",
        "declare class ",
        "declare interface ",
    ],
    variables: &[
        "var ",
        "let ",
        "const ",
        "export var ",
        "export let ",
        "export const ",
    ],
    imports: &["import "],
    // Jest/Mocha/Jasmine/Vitest: describe/it/test block openers
    tests: &[
        "describe(",
        "it(",
        "test(",
        "it.each(",
        "test.each(",
        "describe.each(",
    ],
    assertions: &["expect("],
    test_suites: &[],
};
800
/// Go-syntax symbol-detection prefixes.
///
/// The `Language` -> pattern-set wiring lives in `LANG_SCAN_TABLE`, which is outside this
/// definition.
const SP_GO: SymbolPatterns = SymbolPatterns {
    functions: &["func "],
    functions_prefix_paren: &[],
    classes: &["type "],
    // Only `var` declarations are listed; short declarations (`:=`) have no line prefix.
    variables: &["var "],
    imports: &["import "],
    // Go standard testing: Test* functions (convention is practically exclusive to _test.go files)
    tests: &["func Test", "func Benchmark", "func Fuzz"],
    // No assertion or suite prefixes are defined for Go; both detections are disabled.
    assertions: &[],
    test_suites: &[],
};
812
/// Java-syntax symbol-detection prefixes.
///
/// The `Language` -> pattern-set wiring lives in `LANG_SCAN_TABLE`, which is outside this
/// definition.
const SP_JAVA: SymbolPatterns = SymbolPatterns {
    // Function and variable detection are disabled: no prefixes are defined for either.
    functions: &[],
    functions_prefix_paren: &[],
    classes: &[
        "class ",
        "public class ",
        "private class ",
        "protected class ",
        "abstract class ",
        "final class ",
        "public abstract class ",
        "public final class ",
        "interface ",
        "public interface ",
        "enum ",
        "public enum ",
        "record ",
        "public record ",
        "@interface ",
    ],
    variables: &[],
    imports: &["import "],
    // JUnit 4 & 5, TestNG — annotations appear on their own line before the method
    tests: &[
        "@Test",
        "@ParameterizedTest",
        "@RepeatedTest",
        "@TestFactory",
        "@TestTemplate",
    ],
    // Unqualified (statically imported) assertion calls only.
    assertions: &[
        "assertEquals(",
        "assertNotEquals(",
        "assertTrue(",
        "assertFalse(",
        "assertNull(",
        "assertNotNull(",
        "assertThat(",
        "assertThrows(",
        "assertAll(",
        "assertArrayEquals(",
        "assertIterableEquals(",
        "assertLinesMatch(",
    ],
    test_suites: &[],
};
859
/// C#-syntax symbol-detection prefixes.
///
/// The `Language` -> pattern-set wiring lives in `LANG_SCAN_TABLE`, which is outside this
/// definition.
const SP_CSHARP: SymbolPatterns = SymbolPatterns {
    // Function detection is disabled: no prefixes are defined.
    functions: &[],
    functions_prefix_paren: &[],
    classes: &[
        "class ",
        "public class ",
        "private class ",
        "protected class ",
        "internal class ",
        "abstract class ",
        "sealed class ",
        "static class ",
        "partial class ",
        "public abstract class ",
        "public sealed class ",
        "public static class ",
        "interface ",
        "public interface ",
        "internal interface ",
        "enum ",
        "public enum ",
        "struct ",
        "public struct ",
        "record ",
        "public record ",
    ],
    // Only implicitly typed locals (`var x = ...`) are recognised.
    variables: &["var "],
    imports: &["using "],
    // MSTest, NUnit, xUnit — attributes on their own line before the method
    tests: &[
        "[TestMethod]",
        "[Test]",
        "[Fact]",
        "[Theory]",
        "[TestCase(",
        "[DataRow(",
        "[InlineData(",
        "[MemberData(",
    ],
    assertions: &[
        "Assert.AreEqual(",
        "Assert.AreNotEqual(",
        "Assert.IsTrue(",
        "Assert.IsFalse(",
        "Assert.IsNull(",
        "Assert.IsNotNull(",
        "Assert.Equal(",
        "Assert.NotEqual(",
        "Assert.True(",
        "Assert.False(",
        "Assert.That(",
        "Assert.Contains(",
        "Assert.Throws(",
        "Assert.ThrowsAsync(",
        "Assert.IsInstanceOfType(",
    ],
    // Class-level fixture attributes.
    test_suites: &["[TestClass]", "[TestFixture]", "[SetUpFixture]"],
};
918
// Test-definition line prefixes shared by C and C++: GTest, Catch2/doctest, Boost.Test,
// CppUnit, Unity, Check, CMocka and CppUTest.
//
// NOTE(review): several entries declare suites/groups rather than single tests
// (`BOOST_AUTO_TEST_SUITE(`, `CPPUNIT_TEST_SUITE(`, `suite_create(`, `TEST_GROUP(`,
// `TEST_GROUP_BASE(`) — confirm they are not also tallied through a `test_suites` list.
const TEST_PATTERNS_C_CPP: &[&str] = &[
    // Google Test
    "TEST(",
    "TEST_F(",
    "TEST_P(",
    "TYPED_TEST(",
    "TYPED_TEST_P(",
    "INSTANTIATE_TEST_SUITE_P(",
    "INSTANTIATE_TYPED_TEST_SUITE_P(",
    // Catch2 / doctest
    "TEST_CASE(",
    "SECTION(",
    "SCENARIO(",
    "SCENARIO_METHOD(",
    "TEST_CASE_METHOD(",
    // Boost.Test
    "BOOST_AUTO_TEST_CASE(",
    "BOOST_FIXTURE_TEST_CASE(",
    "BOOST_AUTO_TEST_SUITE(",
    "BOOST_PARAM_TEST_CASE(",
    // CppUnit
    "CPPUNIT_TEST(",
    "CPPUNIT_TEST_SUITE(",
    // Unity (embedded C)
    // NOTE(review): `TEST_IGNORE(` / `TEST_FAIL(` look like calls made inside a test body
    // rather than test definitions — confirm they should count towards the test total.
    "RUN_TEST(",
    "TEST_IGNORE(",
    "TEST_FAIL(",
    // Check (libcheck — embedded C)
    "START_TEST(",
    "tcase_add_test(",
    "suite_create(",
    // CMocka (embedded C)
    "cmocka_unit_test(",
    "cmocka_run_group_tests(",
    // CppUTest
    "IGNORE_TEST(",
    "TEST_GROUP(",
    "TEST_GROUP_BASE(",
];
959
/// Assertion-call patterns shared by the C and C++ symbol tables.
///
/// Grouped by framework: Google Test (`ASSERT_*` then `EXPECT_*`),
/// Catch2/doctest, Unity and CMocka.
const ASSERT_PATTERNS_C_CPP: &[&str] = &[
    // --- Google Test: ASSERT_* (a failure stops the test) ---
    "ASSERT_EQ(",
    "ASSERT_NE(",
    "ASSERT_LT(",
    "ASSERT_LE(",
    "ASSERT_GT(",
    "ASSERT_GE(",
    "ASSERT_TRUE(",
    "ASSERT_FALSE(",
    "ASSERT_STREQ(",
    "ASSERT_STRNE(",
    "ASSERT_FLOAT_EQ(",
    "ASSERT_DOUBLE_EQ(",
    "ASSERT_NEAR(",
    "ASSERT_THROW(",
    "ASSERT_NO_THROW(",
    "ASSERT_ANY_THROW(",
    // --- Google Test: EXPECT_* (a failure lets the test continue) ---
    "EXPECT_EQ(",
    "EXPECT_NE(",
    "EXPECT_LT(",
    "EXPECT_LE(",
    "EXPECT_GT(",
    "EXPECT_GE(",
    "EXPECT_TRUE(",
    "EXPECT_FALSE(",
    "EXPECT_STREQ(",
    "EXPECT_STRNE(",
    "EXPECT_FLOAT_EQ(",
    "EXPECT_DOUBLE_EQ(",
    "EXPECT_NEAR(",
    "EXPECT_THROW(",
    "EXPECT_NO_THROW(",
    "EXPECT_ANY_THROW(",
    // --- Catch2 / doctest ---
    "REQUIRE(",
    "CHECK(",
    "REQUIRE_FALSE(",
    "CHECK_FALSE(",
    "REQUIRE_NOTHROW(",
    "CHECK_NOTHROW(",
    "REQUIRE_THROWS(",
    "CHECK_THROWS(",
    "REQUIRE_THAT(",
    "CHECK_THAT(",
    // --- Unity (embedded C) ---
    "TEST_ASSERT_EQUAL(",
    "TEST_ASSERT_EQUAL_INT(",
    "TEST_ASSERT_EQUAL_STRING(",
    "TEST_ASSERT_EQUAL_FLOAT(",
    "TEST_ASSERT_EQUAL_DOUBLE(",
    "TEST_ASSERT_EQUAL_PTR(",
    "TEST_ASSERT_TRUE(",
    "TEST_ASSERT_FALSE(",
    "TEST_ASSERT_NULL(",
    "TEST_ASSERT_NOT_NULL(",
    "TEST_ASSERT_BITS_HIGH(",
    "TEST_ASSERT_BITS_LOW(",
    // --- CMocka (embedded C) ---
    "assert_int_equal(",
    "assert_int_not_equal(",
    "assert_string_equal(",
    "assert_string_not_equal(",
    "assert_true(",
    "assert_false(",
    "assert_null(",
    "assert_non_null(",
    "assert_ptr_equal(",
    "assert_memory_equal(",
    "assert_return_code(",
];
1033
/// Test suite / test group declaration patterns shared by the C and C++
/// symbol tables (CppUTest, Boost.Test and CppUnit macros).
const SUITE_PATTERNS_C_CPP: &[&str] = &[
    // CppUTest
    "TEST_GROUP(",
    "TEST_GROUP_BASE(",
    // Boost.Test
    "BOOST_AUTO_TEST_SUITE(",
    // CppUnit (both the opening and the closing macro are listed)
    "CPPUNIT_TEST_SUITE(",
    "CPPUNIT_TEST_SUITE_END(",
];
1042
1043const SP_C: SymbolPatterns = SymbolPatterns {
1044    // C has no function keyword; detect by common return types that precede `(` with no `=`.
1045    functions: &[],
1046    functions_prefix_paren: &[
1047        "void ",
1048        "int ",
1049        "char ",
1050        "float ",
1051        "double ",
1052        "long ",
1053        "unsigned ",
1054        "size_t ",
1055        "static ",
1056        "inline ",
1057        "const ",
1058        "extern ",
1059    ],
1060    classes: &[
1061        "struct ",
1062        "typedef struct ",
1063        "union ",
1064        "typedef union ",
1065        "typedef enum ",
1066    ],
1067    variables: &[],
1068    imports: &["#include "],
1069    tests: TEST_PATTERNS_C_CPP,
1070    assertions: ASSERT_PATTERNS_C_CPP,
1071    test_suites: SUITE_PATTERNS_C_CPP,
1072};
1073
1074const SP_CPP: SymbolPatterns = SymbolPatterns {
1075    // C++ specific function keyword-prefixes; return-type-led patterns use functions_prefix_paren.
1076    functions: &[
1077        "virtual ",  // virtual method declaration/definition
1078        "explicit ", // explicit constructor modifier
1079        "~",         // destructor (e.g. ~MyClass())
1080        "operator",  // operator overload (operator==, operator+, …)
1081    ],
1082    functions_prefix_paren: &[
1083        "void ",
1084        "bool ",
1085        "int ",
1086        "char ",
1087        "float ",
1088        "double ",
1089        "long ",
1090        "unsigned ",
1091        "size_t ",
1092        "auto ",
1093        "static ",
1094        "inline ",
1095        "constexpr ",
1096        "const ",
1097        "extern ",
1098    ],
1099    // `template<` (no space) is the dominant modern style alongside `template ` (with space).
1100    classes: &["class ", "struct ", "namespace ", "template ", "template<"],
1101    variables: &[],
1102    imports: &["#include "],
1103    tests: TEST_PATTERNS_C_CPP,
1104    assertions: ASSERT_PATTERNS_C_CPP,
1105    test_suites: SUITE_PATTERNS_C_CPP,
1106};
1107
1108const SP_SHELL: SymbolPatterns = SymbolPatterns {
1109    functions: &["function "],
1110    functions_prefix_paren: &[],
1111    classes: &[],
1112    variables: &["declare ", "local ", "export "],
1113    imports: &["source ", ". "],
1114    tests: &[],
1115    assertions: &[],
1116    test_suites: &[],
1117};
1118
1119const SP_POWERSHELL: SymbolPatterns = SymbolPatterns {
1120    functions: &["function ", "Function "],
1121    functions_prefix_paren: &[],
1122    classes: &["class "],
1123    variables: &[],
1124    imports: &["Import-Module ", "using "],
1125    // Pester test framework
1126    tests: &["Describe ", "It ", "Context "],
1127    assertions: &[],
1128    test_suites: &[],
1129};
1130
1131const SP_KOTLIN: SymbolPatterns = SymbolPatterns {
1132    functions: &[
1133        "fun ",
1134        "private fun ",
1135        "public fun ",
1136        "protected fun ",
1137        "internal fun ",
1138        "override fun ",
1139        "suspend fun ",
1140        "abstract fun ",
1141        "open fun ",
1142        "private suspend fun ",
1143        "public suspend fun ",
1144    ],
1145    functions_prefix_paren: &[],
1146    classes: &[
1147        "class ",
1148        "data class ",
1149        "sealed class ",
1150        "abstract class ",
1151        "open class ",
1152        "object ",
1153        "companion object",
1154        "interface ",
1155        "enum class ",
1156        "annotation class ",
1157    ],
1158    variables: &["val ", "var ", "private val ", "private var ", "const val "],
1159    imports: &["import "],
1160    // JUnit 4/5, KotlinTest, Kotest
1161    tests: &[
1162        "@Test",
1163        "@ParameterizedTest",
1164        "@RepeatedTest",
1165        "\"should ",
1166        "\"it ",
1167    ],
1168    assertions: &[
1169        "assertEquals(",
1170        "assertNotEquals(",
1171        "assertTrue(",
1172        "assertFalse(",
1173        "assertNull(",
1174        "assertNotNull(",
1175        "assertThat(",
1176        "assertThrows(",
1177        "shouldBe(",
1178        "shouldNotBe(",
1179        "shouldThrow(",
1180    ],
1181    test_suites: &[],
1182};
1183
1184const SP_SWIFT: SymbolPatterns = SymbolPatterns {
1185    functions: &[
1186        "func ",
1187        "private func ",
1188        "public func ",
1189        "internal func ",
1190        "override func ",
1191        "open func ",
1192        "static func ",
1193        "class func ",
1194        "mutating func ",
1195        "private static func ",
1196        "public static func ",
1197    ],
1198    functions_prefix_paren: &[],
1199    classes: &[
1200        "class ",
1201        "struct ",
1202        "protocol ",
1203        "enum ",
1204        "extension ",
1205        "actor ",
1206        "public class ",
1207        "private class ",
1208        "open class ",
1209        "final class ",
1210        "public struct ",
1211        "private struct ",
1212        "public protocol ",
1213    ],
1214    variables: &[
1215        "var ",
1216        "let ",
1217        "private var ",
1218        "private let ",
1219        "static var ",
1220        "static let ",
1221    ],
1222    imports: &["import "],
1223    // XCTest: test functions are named test* by convention; Swift Testing: @Test attribute
1224    tests: &["func test", "func Test", "@Test"],
1225    assertions: &[
1226        "XCTAssertEqual(",
1227        "XCTAssertNotEqual(",
1228        "XCTAssertTrue(",
1229        "XCTAssertFalse(",
1230        "XCTAssertNil(",
1231        "XCTAssertNotNil(",
1232        "XCTAssertGreaterThan(",
1233        "XCTAssertLessThan(",
1234        "XCTAssertThrowsError(",
1235        "XCTAssertNoThrow(",
1236        "#expect(",
1237    ],
1238    test_suites: &[],
1239};
1240
1241const SP_RUBY: SymbolPatterns = SymbolPatterns {
1242    functions: &["def ", "private def ", "protected def "],
1243    functions_prefix_paren: &[],
1244    classes: &["class ", "module "],
1245    variables: &[],
1246    imports: &["require ", "require_relative "],
1247    // RSpec / minitest
1248    tests: &["it ", "it(", "describe ", "context ", "test "],
1249    assertions: &[],
1250    test_suites: &[],
1251};
1252
1253const SP_SCALA: SymbolPatterns = SymbolPatterns {
1254    functions: &["def ", "private def ", "protected def ", "override def "],
1255    functions_prefix_paren: &[],
1256    classes: &[
1257        "class ",
1258        "case class ",
1259        "abstract class ",
1260        "sealed class ",
1261        "object ",
1262        "trait ",
1263    ],
1264    variables: &["val ", "var ", "lazy val "],
1265    imports: &["import "],
1266    // ScalaTest / MUnit: FunSuite test("..."), FlatSpec it("..."), AnyWordSpec "..." should
1267    tests: &["test(", "it(", "describe("],
1268    assertions: &[],
1269    test_suites: &[],
1270};
1271
1272const SP_PHP: SymbolPatterns = SymbolPatterns {
1273    functions: &[
1274        "function ",
1275        "public function ",
1276        "private function ",
1277        "protected function ",
1278        "static function ",
1279        "abstract function ",
1280        "final function ",
1281        "public static function ",
1282        "private static function ",
1283        "protected static function ",
1284    ],
1285    functions_prefix_paren: &[],
1286    classes: &[
1287        "class ",
1288        "abstract class ",
1289        "final class ",
1290        "interface ",
1291        "trait ",
1292        "enum ",
1293    ],
1294    variables: &[],
1295    imports: &[
1296        "use ",
1297        "require ",
1298        "require_once ",
1299        "include ",
1300        "include_once ",
1301    ],
1302    // PHPUnit: test methods start with test, or use @test annotation
1303    tests: &[
1304        "public function test",
1305        "function test",
1306        "#[Test]",
1307        "#[DataProvider(",
1308    ],
1309    assertions: &[],
1310    test_suites: &[],
1311};
1312
1313const SP_ELIXIR: SymbolPatterns = SymbolPatterns {
1314    functions: &[
1315        "def ",
1316        "defp ",
1317        "defmacro ",
1318        "defmacrop ",
1319        "defguard ",
1320        "defguardp ",
1321    ],
1322    functions_prefix_paren: &[],
1323    classes: &["defmodule ", "defprotocol ", "defimpl "],
1324    variables: &[],
1325    imports: &["import ", "alias ", "use ", "require "],
1326    // ExUnit
1327    tests: &["test ", "describe "],
1328    assertions: &[],
1329    test_suites: &[],
1330};
1331
1332const SP_ERLANG: SymbolPatterns = SymbolPatterns {
1333    functions: &[],
1334    functions_prefix_paren: &[],
1335    classes: &["-module("],
1336    variables: &[],
1337    imports: &["-import(", "-include(", "-include_lib("],
1338    tests: &[],
1339    assertions: &[],
1340    test_suites: &[],
1341};
1342
1343const SP_FSHARP: SymbolPatterns = SymbolPatterns {
1344    functions: &[
1345        "let ",
1346        "let rec ",
1347        "member ",
1348        "override ",
1349        "abstract member ",
1350    ],
1351    functions_prefix_paren: &[],
1352    classes: &["type "],
1353    variables: &["let mutable "],
1354    imports: &["open "],
1355    // NUnit / xUnit attributes on their own line; FsUnit uses [<Test>] / [<Fact>]
1356    tests: &["[<Test>]", "[<Fact>]", "[<Theory>]", "[<TestCase("],
1357    assertions: &[],
1358    test_suites: &[],
1359};
1360
1361const SP_GROOVY: SymbolPatterns = SymbolPatterns {
1362    functions: &["def ", "private def ", "public def ", "protected def "],
1363    functions_prefix_paren: &[],
1364    classes: &["class ", "abstract class ", "interface ", "enum ", "trait "],
1365    variables: &[],
1366    imports: &["import "],
1367    // Spock framework: feature methods; JUnit annotations
1368    tests: &["def \"", "@Test", "given:", "when:", "then:", "expect:"],
1369    assertions: &[],
1370    test_suites: &[],
1371};
1372
1373const SP_HASKELL: SymbolPatterns = SymbolPatterns {
1374    functions: &[],
1375    functions_prefix_paren: &[],
1376    classes: &["class ", "data ", "newtype ", "type "],
1377    variables: &[],
1378    imports: &["import "],
1379    tests: &[],
1380    assertions: &[],
1381    test_suites: &[],
1382};
1383
1384const SP_LUA: SymbolPatterns = SymbolPatterns {
1385    functions: &["function ", "local function "],
1386    functions_prefix_paren: &[],
1387    classes: &[],
1388    variables: &["local "],
1389    imports: &[],
1390    // busted test framework
1391    tests: &["it(", "describe(", "pending("],
1392    assertions: &[],
1393    test_suites: &[],
1394};
1395
1396const SP_NIM: SymbolPatterns = SymbolPatterns {
1397    functions: &[
1398        "proc ",
1399        "func ",
1400        "method ",
1401        "iterator ",
1402        "converter ",
1403        "template ",
1404        "macro ",
1405    ],
1406    functions_prefix_paren: &[],
1407    classes: &["type "],
1408    variables: &["var ", "let ", "const "],
1409    imports: &["import ", "from "],
1410    // unittest module
1411    tests: &["test "],
1412    assertions: &[],
1413    test_suites: &[],
1414};
1415
1416const SP_OBJECTIVEC: SymbolPatterns = SymbolPatterns {
1417    functions: &["- (", "+ ("],
1418    functions_prefix_paren: &[],
1419    classes: &["@interface ", "@implementation ", "@protocol "],
1420    variables: &[],
1421    imports: &["#import ", "#include "],
1422    // XCTest: test methods start with - (void)test
1423    tests: &["- (void)test"],
1424    assertions: &[
1425        "XCTAssertEqual(",
1426        "XCTAssertNotEqual(",
1427        "XCTAssertTrue(",
1428        "XCTAssertFalse(",
1429        "XCTAssertNil(",
1430        "XCTAssertNotNil(",
1431        "XCTAssertGreaterThan(",
1432        "XCTAssertLessThan(",
1433        "XCTAssertThrowsError(",
1434        "XCTAssertNoThrow(",
1435    ],
1436    test_suites: &[],
1437};
1438
1439const SP_OCAML: SymbolPatterns = SymbolPatterns {
1440    functions: &["let ", "let rec "],
1441    functions_prefix_paren: &[],
1442    classes: &["type ", "module ", "class "],
1443    variables: &[],
1444    imports: &["open "],
1445    tests: &[],
1446    assertions: &[],
1447    test_suites: &[],
1448};
1449
1450const SP_PERL: SymbolPatterns = SymbolPatterns {
1451    functions: &["sub "],
1452    functions_prefix_paren: &[],
1453    classes: &["package "],
1454    variables: &["my ", "our ", "local "],
1455    imports: &["use ", "require "],
1456    tests: &[],
1457    assertions: &[],
1458    test_suites: &[],
1459};
1460
1461const SP_CLOJURE: SymbolPatterns = SymbolPatterns {
1462    functions: &["(defn ", "(defn- ", "(defmacro ", "(defmulti "],
1463    functions_prefix_paren: &[],
1464    classes: &[
1465        "(defrecord ",
1466        "(defprotocol ",
1467        "(deftype ",
1468        "(definterface ",
1469    ],
1470    variables: &["(def ", "(defonce "],
1471    imports: &["(ns ", "(require "],
1472    // clojure.test
1473    tests: &["(deftest ", "(testing "],
1474    assertions: &[],
1475    test_suites: &[],
1476};
1477
1478const SP_JULIA: SymbolPatterns = SymbolPatterns {
1479    functions: &["function ", "macro "],
1480    functions_prefix_paren: &[],
1481    classes: &[
1482        "struct ",
1483        "mutable struct ",
1484        "abstract type ",
1485        "primitive type ",
1486    ],
1487    variables: &["const "],
1488    imports: &["import ", "using "],
1489    // Test.jl standard library
1490    tests: &["@test ", "@testset "],
1491    assertions: &[],
1492    test_suites: &[],
1493};
1494
1495const SP_DART: SymbolPatterns = SymbolPatterns {
1496    functions: &[],
1497    functions_prefix_paren: &[],
1498    classes: &["class ", "abstract class ", "mixin ", "extension ", "enum "],
1499    variables: &["var ", "final ", "const ", "late "],
1500    imports: &["import "],
1501    // flutter_test / test package
1502    tests: &["test(", "testWidgets(", "group("],
1503    assertions: &[],
1504    test_suites: &[],
1505};
1506
1507const SP_R: SymbolPatterns = SymbolPatterns {
1508    functions: &[],
1509    functions_prefix_paren: &[],
1510    classes: &[],
1511    variables: &[],
1512    imports: &["library(", "source("],
1513    // testthat
1514    tests: &["test_that(", "it(", "describe(", "expect_"],
1515    assertions: &[],
1516    test_suites: &[],
1517};
1518
1519const SP_SQL: SymbolPatterns = SymbolPatterns {
1520    functions: &[
1521        "create function ",
1522        "create or replace function ",
1523        "create procedure ",
1524        "create or replace procedure ",
1525        "CREATE FUNCTION ",
1526        "CREATE OR REPLACE FUNCTION ",
1527        "CREATE PROCEDURE ",
1528        "CREATE OR REPLACE PROCEDURE ",
1529    ],
1530    functions_prefix_paren: &[],
1531    classes: &[
1532        "create table ",
1533        "create view ",
1534        "create schema ",
1535        "CREATE TABLE ",
1536        "CREATE VIEW ",
1537        "CREATE SCHEMA ",
1538    ],
1539    variables: &["declare ", "DECLARE "],
1540    imports: &[],
1541    tests: &[],
1542    assertions: &[],
1543    test_suites: &[],
1544};
1545
1546const SP_ASSEMBLY: SymbolPatterns = SymbolPatterns {
1547    functions: &["proc ", "PROC "],
1548    functions_prefix_paren: &[],
1549    classes: &[],
1550    variables: &[],
1551    imports: &["include ", "INCLUDE ", "%include "],
1552    tests: &[],
1553    assertions: &[],
1554    test_suites: &[],
1555};
1556
1557const SP_ZIG: SymbolPatterns = SymbolPatterns {
1558    functions: &[
1559        "fn ",
1560        "pub fn ",
1561        "export fn ",
1562        "inline fn ",
1563        "pub inline fn ",
1564    ],
1565    functions_prefix_paren: &[],
1566    classes: &[],
1567    variables: &["var ", "pub var "],
1568    imports: &[],
1569    // Zig built-in test blocks
1570    tests: &["test \"", "test{"],
1571    assertions: &[],
1572    test_suites: &[],
1573};
1574
1575/// Static (non-heap) language scanning parameters.  All fields are `'static` so this struct
1576/// can be stored in a `static` array.  The dynamic `skip_lines` set (used only for Python
1577/// docstring detection) is kept in `ScanConfig` and populated by the caller after lookup.
1578#[allow(clippy::struct_excessive_bools)]
1579#[derive(Clone, Copy)]
1580struct StaticLangConfig {
1581    line_comments: &'static [&'static str],
1582    block_comment: Option<(&'static str, &'static str)>,
1583    allow_single_quote_strings: bool,
1584    allow_double_quote_strings: bool,
1585    allow_triple_quote_strings: bool,
1586    allow_csharp_verbatim_strings: bool,
1587    symbol_patterns: SymbolPatterns,
1588    /// `true` for C, C++, and Objective-C (languages that have a C preprocessor).
1589    has_preprocessor: bool,
1590}
1591
1592#[allow(clippy::struct_excessive_bools)]
1593#[derive(Debug, Clone)]
1594struct ScanConfig {
1595    line_comments: &'static [&'static str],
1596    block_comment: Option<(&'static str, &'static str)>,
1597    allow_single_quote_strings: bool,
1598    allow_double_quote_strings: bool,
1599    allow_triple_quote_strings: bool,
1600    allow_csharp_verbatim_strings: bool,
1601    skip_lines: HashSet<usize>,
1602    symbol_patterns: SymbolPatterns,
1603}
1604
1605// ── Per-family base configurations ───────────────────────────────────────────
1606//
1607// Most languages share one of two comment styles.  Define a base `const` for
1608// each family; table entries override only the fields that differ (symbol
1609// patterns, preprocessor flag, verbatim-string flag, etc.).
1610//
1611// C-slash family: `//` line, `/* */` block, single + double quotes.
1612// Covers C, C++, Obj-C, C#, Go, Java, JS/TS/Svelte/Vue, Dart, Groovy, Kotlin,
1613// Scala, SCSS, Swift, Rust, and Zig (Zig has no block comment → overridden).
1614const C_SLASH_BASE: StaticLangConfig = StaticLangConfig {
1615    line_comments: &["//"],
1616    block_comment: Some(("/*", "*/")),
1617    allow_single_quote_strings: true,
1618    allow_double_quote_strings: true,
1619    allow_triple_quote_strings: false,
1620    allow_csharp_verbatim_strings: false,
1621    symbol_patterns: SP_NONE,
1622    has_preprocessor: false,
1623};
1624
1625// Hash-comment family: `#` line comment, no block comment, single + double
1626// quotes.  Covers Shell, Ruby, R, Perl, Elixir (each overrides only SP_*);
1627// Python overrides triple-quote; PowerShell and Nim override block_comment.
1628const HASH_BASE: StaticLangConfig = StaticLangConfig {
1629    line_comments: &["#"],
1630    block_comment: None,
1631    allow_single_quote_strings: true,
1632    allow_double_quote_strings: true,
1633    allow_triple_quote_strings: false,
1634    allow_csharp_verbatim_strings: false,
1635    symbol_patterns: SP_NONE,
1636    has_preprocessor: false,
1637};
1638
1639/// Static language-scan configuration table — one entry per supported language.
1640/// Used by `language_scan_config` to avoid a 41-arm match.  All `SP_*` constants
1641/// referenced here are defined above in the same module.
1642static LANG_SCAN_TABLE: &[(Language, StaticLangConfig)] = &[
1643    // ── C preprocessor family ─────────────────────────────────────────────────
1644    (
1645        Language::C,
1646        StaticLangConfig {
1647            symbol_patterns: SP_C,
1648            has_preprocessor: true,
1649            ..C_SLASH_BASE
1650        },
1651    ),
1652    (
1653        Language::Cpp,
1654        StaticLangConfig {
1655            symbol_patterns: SP_CPP,
1656            has_preprocessor: true,
1657            ..C_SLASH_BASE
1658        },
1659    ),
1660    (
1661        Language::ObjectiveC,
1662        StaticLangConfig {
1663            symbol_patterns: SP_OBJECTIVEC,
1664            has_preprocessor: true,
1665            ..C_SLASH_BASE
1666        },
1667    ),
1668    // ── C-slash family ────────────────────────────────────────────────────────
1669    (
1670        Language::CSharp,
1671        StaticLangConfig {
1672            symbol_patterns: SP_CSHARP,
1673            allow_csharp_verbatim_strings: true,
1674            ..C_SLASH_BASE
1675        },
1676    ),
1677    (
1678        Language::Go,
1679        StaticLangConfig {
1680            symbol_patterns: SP_GO,
1681            ..C_SLASH_BASE
1682        },
1683    ),
1684    (
1685        Language::Java,
1686        StaticLangConfig {
1687            symbol_patterns: SP_JAVA,
1688            ..C_SLASH_BASE
1689        },
1690    ),
1691    (
1692        Language::JavaScript,
1693        StaticLangConfig {
1694            symbol_patterns: SP_JS,
1695            ..C_SLASH_BASE
1696        },
1697    ),
1698    (
1699        Language::TypeScript,
1700        StaticLangConfig {
1701            symbol_patterns: SP_TS,
1702            ..C_SLASH_BASE
1703        },
1704    ),
1705    (
1706        Language::Svelte,
1707        StaticLangConfig {
1708            symbol_patterns: SP_JS,
1709            ..C_SLASH_BASE
1710        },
1711    ),
1712    (
1713        Language::Vue,
1714        StaticLangConfig {
1715            symbol_patterns: SP_JS,
1716            ..C_SLASH_BASE
1717        },
1718    ),
1719    (
1720        Language::Dart,
1721        StaticLangConfig {
1722            symbol_patterns: SP_DART,
1723            ..C_SLASH_BASE
1724        },
1725    ),
1726    (
1727        Language::Groovy,
1728        StaticLangConfig {
1729            symbol_patterns: SP_GROOVY,
1730            ..C_SLASH_BASE
1731        },
1732    ),
1733    (
1734        Language::Kotlin,
1735        StaticLangConfig {
1736            symbol_patterns: SP_KOTLIN,
1737            ..C_SLASH_BASE
1738        },
1739    ),
1740    (
1741        Language::Scala,
1742        StaticLangConfig {
1743            symbol_patterns: SP_SCALA,
1744            ..C_SLASH_BASE
1745        },
1746    ),
1747    (
1748        Language::Scss,
1749        StaticLangConfig {
1750            symbol_patterns: SP_NONE,
1751            ..C_SLASH_BASE
1752        },
1753    ),
1754    // Rust: no single-quote char literals (they're lifetime annotations)
1755    (
1756        Language::Rust,
1757        StaticLangConfig {
1758            symbol_patterns: SP_RUST,
1759            allow_single_quote_strings: false,
1760            ..C_SLASH_BASE
1761        },
1762    ),
1763    // Swift: no single-quote strings
1764    (
1765        Language::Swift,
1766        StaticLangConfig {
1767            symbol_patterns: SP_SWIFT,
1768            allow_single_quote_strings: false,
1769            ..C_SLASH_BASE
1770        },
1771    ),
1772    // Zig: no block comment
1773    (
1774        Language::Zig,
1775        StaticLangConfig {
1776            symbol_patterns: SP_ZIG,
1777            block_comment: None,
1778            ..C_SLASH_BASE
1779        },
1780    ),
1781    // F#: `(*` … `*)` block comment, no single-quote strings
1782    (
1783        Language::FSharp,
1784        StaticLangConfig {
1785            line_comments: &["//"],
1786            block_comment: Some(("(*", "*)")),
1787            allow_single_quote_strings: false,
1788            allow_double_quote_strings: true,
1789            symbol_patterns: SP_FSHARP,
1790            ..C_SLASH_BASE
1791        },
1792    ),
1793    // ── Hash-comment family ───────────────────────────────────────────────────
1794    (
1795        Language::Shell,
1796        StaticLangConfig {
1797            symbol_patterns: SP_SHELL,
1798            ..HASH_BASE
1799        },
1800    ),
1801    (
1802        Language::Elixir,
1803        StaticLangConfig {
1804            symbol_patterns: SP_ELIXIR,
1805            ..HASH_BASE
1806        },
1807    ),
1808    (
1809        Language::Perl,
1810        StaticLangConfig {
1811            symbol_patterns: SP_PERL,
1812            ..HASH_BASE
1813        },
1814    ),
1815    (
1816        Language::R,
1817        StaticLangConfig {
1818            symbol_patterns: SP_R,
1819            ..HASH_BASE
1820        },
1821    ),
1822    (
1823        Language::Ruby,
1824        StaticLangConfig {
1825            symbol_patterns: SP_RUBY,
1826            ..HASH_BASE
1827        },
1828    ),
1829    // Python: triple-quote string literals
1830    (
1831        Language::Python,
1832        StaticLangConfig {
1833            symbol_patterns: SP_PYTHON,
1834            allow_triple_quote_strings: true,
1835            ..HASH_BASE
1836        },
1837    ),
1838    // PowerShell: `<# … #>` block comment
1839    (
1840        Language::PowerShell,
1841        StaticLangConfig {
1842            symbol_patterns: SP_POWERSHELL,
1843            block_comment: Some(("<#", "#>")),
1844            ..HASH_BASE
1845        },
1846    ),
1847    // Nim: `#[` … `]#` block comment
1848    (
1849        Language::Nim,
1850        StaticLangConfig {
1851            symbol_patterns: SP_NIM,
1852            block_comment: Some(("#[", "]#")),
1853            ..HASH_BASE
1854        },
1855    ),
1856    // Makefile / Dockerfile: `#` only, no string literals
1857    (
1858        Language::Makefile,
1859        StaticLangConfig {
1860            symbol_patterns: SP_NONE,
1861            allow_single_quote_strings: false,
1862            allow_double_quote_strings: false,
1863            ..HASH_BASE
1864        },
1865    ),
1866    (
1867        Language::Dockerfile,
1868        StaticLangConfig {
1869            symbol_patterns: SP_NONE,
1870            allow_single_quote_strings: false,
1871            allow_double_quote_strings: false,
1872            ..HASH_BASE
1873        },
1874    ),
1875    // ── Other unique comment styles ───────────────────────────────────────────
1876    // CSS / SCSS: only `/* */` block, no line comment
1877    (
1878        Language::Css,
1879        StaticLangConfig {
1880            line_comments: &[],
1881            block_comment: Some(("/*", "*/")),
1882            symbol_patterns: SP_NONE,
1883            ..C_SLASH_BASE
1884        },
1885    ),
1886    // HTML / XML: `<!-- -->` block, no line comment, no string literals
1887    (
1888        Language::Html,
1889        StaticLangConfig {
1890            line_comments: &[],
1891            block_comment: Some(("<!--", "-->")),
1892            allow_single_quote_strings: false,
1893            allow_double_quote_strings: false,
1894            symbol_patterns: SP_NONE,
1895            ..C_SLASH_BASE
1896        },
1897    ),
1898    (
1899        Language::Xml,
1900        StaticLangConfig {
1901            line_comments: &[],
1902            block_comment: Some(("<!--", "-->")),
1903            allow_single_quote_strings: false,
1904            allow_double_quote_strings: false,
1905            symbol_patterns: SP_NONE,
1906            ..C_SLASH_BASE
1907        },
1908    ),
1909    // Lua: `--` line, `--[[ ]]` block
1910    (
1911        Language::Lua,
1912        StaticLangConfig {
1913            line_comments: &["--"],
1914            block_comment: Some(("--[[", "]]")),
1915            symbol_patterns: SP_LUA,
1916            ..C_SLASH_BASE
1917        },
1918    ),
1919    // Haskell: `--` line, `{- -}` block
1920    (
1921        Language::Haskell,
1922        StaticLangConfig {
1923            line_comments: &["--"],
1924            block_comment: Some(("{-", "-}")),
1925            symbol_patterns: SP_HASKELL,
1926            ..C_SLASH_BASE
1927        },
1928    ),
1929    // SQL: `--` line, `/* */` block, single quote only
1930    (
1931        Language::Sql,
1932        StaticLangConfig {
1933            line_comments: &["--"],
1934            block_comment: Some(("/*", "*/")),
1935            allow_single_quote_strings: true,
1936            allow_double_quote_strings: false,
1937            symbol_patterns: SP_SQL,
1938            ..C_SLASH_BASE
1939        },
1940    ),
1941    // OCaml: `(*` … `*)` only, no line comment, no single-quote strings
1942    (
1943        Language::Ocaml,
1944        StaticLangConfig {
1945            line_comments: &[],
1946            block_comment: Some(("(*", "*)")),
1947            allow_single_quote_strings: false,
1948            symbol_patterns: SP_OCAML,
1949            ..C_SLASH_BASE
1950        },
1951    ),
1952    // Assembly / Clojure: `;` line comment, no block, no string literals
1953    (
1954        Language::Assembly,
1955        StaticLangConfig {
1956            line_comments: &[";"],
1957            block_comment: None,
1958            allow_single_quote_strings: false,
1959            allow_double_quote_strings: false,
1960            symbol_patterns: SP_ASSEMBLY,
1961            ..C_SLASH_BASE
1962        },
1963    ),
1964    (
1965        Language::Clojure,
1966        StaticLangConfig {
1967            line_comments: &[";"],
1968            block_comment: None,
1969            allow_single_quote_strings: false,
1970            symbol_patterns: SP_CLOJURE,
1971            ..C_SLASH_BASE
1972        },
1973    ),
1974    // Erlang: `%` line comment, no block, no single-quote strings
1975    (
1976        Language::Erlang,
1977        StaticLangConfig {
1978            line_comments: &["%"],
1979            block_comment: None,
1980            allow_single_quote_strings: false,
1981            symbol_patterns: SP_ERLANG,
1982            ..C_SLASH_BASE
1983        },
1984    ),
1985    // PHP: `//` or `#` line, `/* */` block
1986    (
1987        Language::Php,
1988        StaticLangConfig {
1989            line_comments: &["//", "#"],
1990            block_comment: Some(("/*", "*/")),
1991            symbol_patterns: SP_PHP,
1992            ..C_SLASH_BASE
1993        },
1994    ),
1995    // Julia: `#` line, `#= =#` block, double + triple quotes, no single
1996    (
1997        Language::Julia,
1998        StaticLangConfig {
1999            line_comments: &["#"],
2000            block_comment: Some(("#=", "=#")),
2001            allow_single_quote_strings: false,
2002            allow_triple_quote_strings: true,
2003            symbol_patterns: SP_JULIA,
2004            ..C_SLASH_BASE
2005        },
2006    ),
2007];
2008
/// Per-call IEEE 1045-1992 flags derived from `AnalysisOptions` plus per-language properties.
/// Private to this crate; constructed inside `analyze_text`.
///
/// The struct is `Copy`, so it is handed by value to the per-line helpers.
#[derive(Debug, Clone, Copy)]
struct IeeeFlags {
    /// True for C, C++, and Objective-C — languages with a C preprocessor.
    ///
    /// When set, `finalize_line_facts` counts a pure-code line (no comment on the same
    /// physical line) whose trimmed text starts with `#` as a compiler directive.
    has_preprocessor_directives: bool,
    /// Mirrors `AnalysisOptions::blank_in_block_comment_as_comment`.
    ///
    /// When set, a blank line met while a block comment is still open is marked as a
    /// block-comment line; when clear, only non-blank lines inside the comment are.
    blank_in_block_comment_as_comment: bool,
    /// Mirrors `AnalysisOptions::collapse_continuation_lines`.
    ///
    /// When set, a line ending in `\` outside any block comment or string has its facts
    /// deferred and merged into the line that ends the continuation sequence.
    collapse_continuation_lines: bool,
}
2020
/// Lexer state recorded while the scanner is inside a string literal.
///
/// The caller keeps this state across physical lines, so a literal that is still open at
/// the end of one line continues on the next.
#[derive(Debug, Clone, Copy)]
enum StringState {
    /// Literal closed by the stored quote character; a backslash skips the following char.
    Single(char),
    /// Triple-quoted literal closed by the stored delimiter (`"""` or `'''`).
    Triple(&'static str),
    /// `@"`-prefixed verbatim literal: `""` is an escaped quote and a lone `"` closes it.
    VerbatimDouble,
}
2027
/// What the scanner observed on one physical line (or on one merged continuation sequence).
// Four independent observations; separate bools keep the classification logic direct.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Default)]
struct LineFacts {
    /// Code was seen, including any part of a string literal.
    has_code: bool,
    /// A line-comment prefix was seen outside strings and block comments.
    has_single_comment: bool,
    /// Part of a block comment lies on the line.
    has_multi_comment: bool,
    /// The line belongs to a docstring; `classify_line` gives this precedence over the rest.
    has_docstring: bool,
}
2036
2037/// Process one character while the lexer is inside a string literal.
2038///
2039/// Returns `(new_string_state, advance)` where `advance` is the number of chars to skip.
2040fn process_string_char(
2041    state: StringState,
2042    chars: &[char],
2043    i: usize,
2044) -> (Option<StringState>, usize) {
2045    match state {
2046        StringState::Single(delim) => {
2047            if chars[i] == '\\' {
2048                return (Some(state), 2); // skip escaped character
2049            }
2050            if chars[i] == delim {
2051                (None, 1)
2052            } else {
2053                (Some(state), 1)
2054            }
2055        }
2056        StringState::Triple(delim) => {
2057            if starts_with(chars, i, delim) {
2058                (None, delim.len())
2059            } else {
2060                (Some(state), 1)
2061            }
2062        }
2063        StringState::VerbatimDouble => {
2064            if starts_with(chars, i, "\"\"") {
2065                return (Some(state), 2); // escaped quote-quote inside verbatim string
2066            }
2067            if chars[i] == '"' {
2068                (None, 1)
2069            } else {
2070                (Some(state), 1)
2071            }
2072        }
2073    }
2074}
2075
2076/// Process one character while the lexer is inside a block comment.
2077///
2078/// Returns `(still_in_block_comment, advance)`.
2079fn process_block_comment_char(chars: &[char], i: usize, close: &str) -> (bool, usize) {
2080    if starts_with(chars, i, close) {
2081        (false, close.len())
2082    } else {
2083        (true, 1)
2084    }
2085}
2086
2087/// Attempt to begin a new string literal at position `i`.
2088///
2089/// Returns `Some((new_state, advance))` when a string opener is detected, else `None`.
2090fn try_open_string(chars: &[char], i: usize, config: &ScanConfig) -> Option<(StringState, usize)> {
2091    if config.allow_csharp_verbatim_strings && starts_with(chars, i, "@\"") {
2092        return Some((StringState::VerbatimDouble, 2));
2093    }
2094    if config.allow_triple_quote_strings {
2095        if starts_with(chars, i, "\"\"\"") {
2096            return Some((StringState::Triple("\"\"\""), 3));
2097        }
2098        if starts_with(chars, i, "'''") {
2099            return Some((StringState::Triple("'''"), 3));
2100        }
2101    }
2102    if config.allow_single_quote_strings && chars[i] == '\'' {
2103        return Some((StringState::Single('\''), 1));
2104    }
2105    if config.allow_double_quote_strings && chars[i] == '"' {
2106        return Some((StringState::Single('"'), 1));
2107    }
2108    None
2109}
2110
2111/// Advance past one character position while inside a block comment.
2112///
2113/// Updates `in_block_comment` if the closing delimiter is found and returns the
2114/// number of characters consumed. Returns 0 when no block-comment config is set
2115/// (preserving the caller's `continue`-without-advance behaviour for that impossible state).
2116fn step_through_block_comment(
2117    chars: &[char],
2118    i: usize,
2119    block_comment: Option<(&'static str, &'static str)>,
2120    in_block_comment: &mut bool,
2121) -> usize {
2122    if let Some((_, close)) = block_comment {
2123        let (still_in, advance) = process_block_comment_char(chars, i, close);
2124        *in_block_comment = still_in;
2125        return advance;
2126    }
2127    0
2128}
2129
2130/// If the character at `i` starts a block comment, return the length of the opening
2131/// delimiter so the caller can advance past it. Returns `None` if no match.
2132fn try_open_block_comment(
2133    chars: &[char],
2134    i: usize,
2135    block_comment: Option<(&'static str, &'static str)>,
2136) -> Option<usize> {
2137    let (open, _) = block_comment?;
2138    starts_with(chars, i, open).then_some(open.len())
2139}
2140
2141/// Scan a single physical line and update `facts`, `in_block_comment`, and `string_state`.
2142///
2143/// Returns `true` when the caller should break out of the per-line loop early (line comment hit).
2144fn scan_line(
2145    chars: &[char],
2146    config: &ScanConfig,
2147    facts: &mut LineFacts,
2148    in_block_comment: &mut bool,
2149    string_state: &mut Option<StringState>,
2150) {
2151    let mut i = 0usize;
2152    while i < chars.len() {
2153        // Inside a string literal — advance until the closing delimiter.
2154        if let Some(state) = *string_state {
2155            facts.has_code = true;
2156            let (new_state, advance) = process_string_char(state, chars, i);
2157            *string_state = new_state;
2158            i += advance;
2159            continue;
2160        }
2161
2162        // Inside a block comment — advance until the closing delimiter.
2163        if *in_block_comment {
2164            facts.has_multi_comment = true;
2165            i += step_through_block_comment(chars, i, config.block_comment, in_block_comment);
2166            continue;
2167        }
2168
2169        // Whitespace outside any string/comment — skip.
2170        if chars[i].is_whitespace() {
2171            i += 1;
2172            continue;
2173        }
2174
2175        // Attempt to open a string literal.
2176        if let Some((new_state, advance)) = try_open_string(chars, i, config) {
2177            facts.has_code = true;
2178            *string_state = Some(new_state);
2179            i += advance;
2180            continue;
2181        }
2182
2183        // Attempt to open a block comment.
2184        if let Some(advance) = try_open_block_comment(chars, i, config.block_comment) {
2185            facts.has_multi_comment = true;
2186            *in_block_comment = true;
2187            i += advance;
2188            continue;
2189        }
2190
2191        // Line comment — rest of the line is a comment; stop scanning.
2192        if config
2193            .line_comments
2194            .iter()
2195            .any(|prefix| starts_with(chars, i, prefix))
2196        {
2197            facts.has_single_comment = true;
2198            break;
2199        }
2200
2201        // Plain code character.
2202        facts.has_code = true;
2203        i += 1;
2204    }
2205}
2206
2207/// Apply IEEE 1045-1992 §4.2 preprocessor-directive tracking and continuation-line merging,
2208/// then emit the finalized `LineFacts` for this physical line.
2209///
2210/// Returns `None` when the line is part of a continuation sequence and should be deferred.
2211fn finalize_line_facts(
2212    facts: LineFacts,
2213    trimmed: &str,
2214    raw: &mut RawLineCounts,
2215    ieee: IeeeFlags,
2216    in_block_comment: bool,
2217    string_state: Option<StringState>,
2218    pending_continuation: &mut Option<LineFacts>,
2219) -> Option<LineFacts> {
2220    // IEEE 1045-1992 §4.2: track preprocessor/compiler directive lines (C/C++/ObjC).
2221    // A directive line is a pure code line (no comment on the same physical line) whose
2222    // trimmed content starts with '#'.
2223    if ieee.has_preprocessor_directives
2224        && facts.has_code
2225        && !facts.has_single_comment
2226        && !facts.has_multi_comment
2227        && trimmed.starts_with('#')
2228    {
2229        raw.compiler_directive_lines += 1;
2230    }
2231
2232    // IEEE 1045-1992 continuation-line handling.
2233    // A line is a continuation starter when it ends with '\' outside any comment or string.
2234    let is_continuation = ieee.collapse_continuation_lines
2235        && !in_block_comment
2236        && string_state.is_none()
2237        && trimmed.ends_with('\\');
2238
2239    if is_continuation {
2240        let pending = pending_continuation.get_or_insert_with(LineFacts::default);
2241        pending.has_code |= facts.has_code;
2242        pending.has_single_comment |= facts.has_single_comment;
2243        pending.has_multi_comment |= facts.has_multi_comment;
2244        pending.has_docstring |= facts.has_docstring;
2245        return None; // defer classification until the sequence ends
2246    }
2247
2248    // Merge any accumulated continuation facts into the final line.
2249    let emit = if let Some(pending) = pending_continuation.take() {
2250        LineFacts {
2251            has_code: pending.has_code | facts.has_code,
2252            has_single_comment: pending.has_single_comment | facts.has_single_comment,
2253            has_multi_comment: pending.has_multi_comment | facts.has_multi_comment,
2254            has_docstring: pending.has_docstring | facts.has_docstring,
2255        }
2256    } else {
2257        facts
2258    };
2259    Some(emit)
2260}
2261
2262/// Scan and classify one physical line, updating all running state in place.
2263///
2264/// Pre-classified lines (present in `config.skip_lines`) are counted as docstring-comment
2265/// lines and returned early without further analysis.
2266#[allow(clippy::needless_pass_by_value)]
2267#[allow(clippy::too_many_arguments)]
2268#[allow(clippy::many_single_char_names)] // destructuring return from count_symbols; names match field roles
2269fn process_physical_line(
2270    line: &str,
2271    line_idx: usize,
2272    config: &ScanConfig,
2273    raw: &mut RawLineCounts,
2274    in_block_comment: &mut bool,
2275    string_state: &mut Option<StringState>,
2276    pending_continuation: &mut Option<LineFacts>,
2277    ieee: IeeeFlags,
2278) {
2279    raw.total_physical_lines += 1;
2280
2281    if config.skip_lines.contains(&line_idx) {
2282        raw.docstring_comment_lines += 1;
2283        return;
2284    }
2285
2286    let trimmed = line.trim();
2287    let mut facts = LineFacts::default();
2288
2289    // IEEE 1045-1992: blank lines inside block comments are comment lines by default.
2290    // When blank_in_block_comment_as_comment is false, blank lines keep their blank
2291    // classification even while inside a block comment.
2292    if *in_block_comment && (ieee.blank_in_block_comment_as_comment || !trimmed.is_empty()) {
2293        facts.has_multi_comment = true;
2294    }
2295
2296    let chars: Vec<char> = line.chars().collect();
2297    scan_line(&chars, config, &mut facts, in_block_comment, string_state);
2298
2299    let Some(emit) = finalize_line_facts(
2300        facts,
2301        trimmed,
2302        raw,
2303        ieee,
2304        *in_block_comment,
2305        *string_state,
2306        pending_continuation,
2307    ) else {
2308        return;
2309    };
2310
2311    classify_line(raw, &emit, trimmed);
2312
2313    if emit.has_code {
2314        let (f, c, v, i, t, a, s) = count_symbols(&config.symbol_patterns, trimmed);
2315        raw.functions += f;
2316        raw.classes += c;
2317        raw.variables += v;
2318        raw.imports += i;
2319        raw.test_count += t;
2320        raw.test_assertion_count += a;
2321        raw.test_suite_count += s;
2322    }
2323}
2324
2325#[allow(clippy::needless_pass_by_value)]
2326fn analyze_generic(text: &str, config: ScanConfig, ieee: IeeeFlags) -> RawFileAnalysis {
2327    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
2328    let lines: Vec<&str> = normalized.split_terminator('\n').collect();
2329
2330    let mut raw = RawLineCounts::default();
2331    let mut warnings = Vec::new();
2332
2333    let mut in_block_comment = false;
2334    let mut string_state: Option<StringState> = None;
2335    // IEEE continuation-line state: accumulates facts across a backslash-continued sequence.
2336    let mut pending_continuation: Option<LineFacts> = None;
2337
2338    for (line_idx, line) in lines.iter().enumerate() {
2339        process_physical_line(
2340            line,
2341            line_idx,
2342            &config,
2343            &mut raw,
2344            &mut in_block_comment,
2345            &mut string_state,
2346            &mut pending_continuation,
2347            ieee,
2348        );
2349    }
2350
2351    // Flush any pending continuation that reaches end-of-file without a closing line.
2352    if let Some(pending) = pending_continuation.take() {
2353        classify_line(&mut raw, &pending, "");
2354    }
2355
2356    if in_block_comment {
2357        warnings.push("unclosed block comment detected; result is best effort".into());
2358    }
2359    if string_state.is_some() {
2360        warnings.push("unclosed string literal detected; result is best effort".into());
2361    }
2362
2363    RawFileAnalysis {
2364        raw,
2365        parse_mode: if warnings.is_empty() {
2366            ParseMode::Lexical
2367        } else {
2368            ParseMode::LexicalBestEffort
2369        },
2370        warnings,
2371        style_analysis: None,
2372    }
2373}
2374
2375const fn classify_line(raw: &mut RawLineCounts, facts: &LineFacts, trimmed: &str) {
2376    if facts.has_docstring {
2377        raw.docstring_comment_lines += 1;
2378    } else if !facts.has_code
2379        && !facts.has_single_comment
2380        && !facts.has_multi_comment
2381        && trimmed.is_empty()
2382    {
2383        raw.blank_only_lines += 1;
2384    } else if facts.has_code && facts.has_single_comment {
2385        raw.mixed_code_single_comment_lines += 1;
2386    } else if facts.has_code && facts.has_multi_comment {
2387        raw.mixed_code_multi_comment_lines += 1;
2388    } else if facts.has_code {
2389        raw.code_only_lines += 1;
2390    } else if facts.has_single_comment {
2391        raw.single_comment_only_lines += 1;
2392    } else if facts.has_multi_comment {
2393        raw.multi_comment_only_lines += 1;
2394    } else if trimmed.is_empty() {
2395        raw.blank_only_lines += 1;
2396    } else {
2397        raw.skipped_unknown_lines += 1;
2398    }
2399}
2400
2401fn count_symbols(patterns: &SymbolPatterns, trimmed: &str) -> (u64, u64, u64, u64, u64, u64, u64) {
2402    let hit = |pats: &[&str]| u64::from(pats.iter().any(|p| trimmed.starts_with(p)));
2403    // For return-type-led languages (C/C++): match prefix AND `(` present AND no `=` sits
2404    // between the prefix start and the first `(` (guards against `void* p = malloc(n)`).
2405    let fn_pp = if patterns.functions_prefix_paren.is_empty() {
2406        0
2407    } else if let Some(paren_pos) = trimmed.find('(') {
2408        if trimmed[..paren_pos].contains('=') {
2409            0
2410        } else {
2411            hit(patterns.functions_prefix_paren)
2412        }
2413    } else {
2414        0
2415    };
2416    let test_hit = hit(patterns.tests);
2417    // Lines matching a test pattern count as tests, not as plain functions or classes.
2418    // This prevents double-counting in Python (`def test_` / `class Test`) and Go
2419    // (`func Test` / `func Benchmark` / `func Fuzz`) where the same line satisfies both
2420    // a function/class prefix and a test pattern. Rust is unaffected: `#[test]` is a
2421    // standalone attribute line; the `fn` declaration on the next line does not match any
2422    // test pattern and still increments functions correctly.
2423    let fn_hit = if test_hit == 0 {
2424        hit(patterns.functions) | fn_pp
2425    } else {
2426        0
2427    };
2428    let class_hit = if test_hit == 0 {
2429        hit(patterns.classes)
2430    } else {
2431        0
2432    };
2433    (
2434        fn_hit,
2435        class_hit,
2436        hit(patterns.variables),
2437        hit(patterns.imports),
2438        test_hit,
2439        hit(patterns.assertions),
2440        hit(patterns.test_suites),
2441    )
2442}
2443
/// Report whether the chars of `needle` appear in `chars` starting at `index`.
///
/// Returns `false` when `index` lies beyond the end of `chars` or when the needle would run
/// past the end. An empty needle matches at any `index <= chars.len()`.
///
/// The comparison walks both sequences in step and allocates nothing; the scanner calls
/// this for every character position, so a per-call allocation would add up.
fn starts_with(chars: &[char], index: usize, needle: &str) -> bool {
    let Some(tail) = chars.get(index..) else {
        return false;
    };
    let mut remaining = tail.iter();
    needle
        .chars()
        .all(|expected| remaining.next() == Some(&expected))
}
2448
/// One entry of the scope stack used by `detect_python_docstring_lines`.
#[derive(Debug, Clone)]
struct PyContext {
    /// Leading-whitespace width (in chars) of the scope's first line; 0 for the module scope.
    indent: usize,
    /// True until the scope's first non-blank, non-comment line has been checked for a
    /// docstring opener; cleared by that check whether or not a docstring was found.
    expect_docstring: bool,
}
2454
2455/// Update `contexts` to pop any scopes that the current `indent` has outdented past.
2456fn py_pop_outdented_contexts(contexts: &mut Vec<PyContext>, indent: usize) {
2457    while contexts.len() > 1 && indent < contexts.last().map_or(0, |c| c.indent) {
2458        contexts.pop();
2459    }
2460}
2461
2462/// Handle `pending_block_indent` transition: push a new docstring-expecting context when we
2463/// detect the first indented line of a new block, or cancel the pending state otherwise.
2464fn py_handle_pending_indent(
2465    pending_block_indent: &mut Option<usize>,
2466    contexts: &mut Vec<PyContext>,
2467    indent: usize,
2468    trimmed: &str,
2469) {
2470    let Some(base_indent) = *pending_block_indent else {
2471        return;
2472    };
2473    if indent > base_indent {
2474        contexts.push(PyContext {
2475            indent,
2476            expect_docstring: true,
2477        });
2478        *pending_block_indent = None;
2479    } else if !trimmed.starts_with('@') {
2480        *pending_block_indent = None;
2481    }
2482}
2483
2484/// Check whether the current line is a docstring opener in the current context.
2485///
2486/// If it is, records the line, adjusts `ctx.expect_docstring`, and optionally sets
2487/// `active_docstring` for multi-line docstrings. Returns `true` when the caller should
2488/// `continue` to the next line.
2489fn py_try_record_docstring(
2490    ctx: &mut PyContext,
2491    trimmed: &str,
2492    idx: usize,
2493    docstring_lines: &mut HashSet<usize>,
2494    active_docstring: &mut Option<(&'static str, usize)>,
2495) -> bool {
2496    if !ctx.expect_docstring {
2497        return false;
2498    }
2499    if let Some(delim) = docstring_delimiter(trimmed) {
2500        docstring_lines.insert(idx);
2501        ctx.expect_docstring = false;
2502        if !closes_triple_docstring(trimmed, delim, true) {
2503            *active_docstring = Some((delim, idx));
2504        }
2505        return true;
2506    }
2507    ctx.expect_docstring = false;
2508    false
2509}
2510
2511/// Advance through an active multi-line docstring: marks the current line and clears
2512/// `active_docstring` when the closing delimiter is found. Returns `true` when the caller
2513/// should `continue` to the next line (i.e. we were inside a docstring).
2514fn track_active_docstring(
2515    active_docstring: &mut Option<(&'static str, usize)>,
2516    docstring_lines: &mut HashSet<usize>,
2517    idx: usize,
2518    trimmed: &str,
2519) -> bool {
2520    let Some((delim, start_line)) = *active_docstring else {
2521        return false;
2522    };
2523    docstring_lines.insert(idx);
2524    if closes_triple_docstring(trimmed, delim, idx == start_line) {
2525        *active_docstring = None;
2526    }
2527    true
2528}
2529
2530/// Attempt to record a docstring opener using the top of the context stack.
2531/// Returns `true` when the caller should `continue` to the next line.
2532fn try_record_docstring_if_context(
2533    contexts: &mut [PyContext],
2534    trimmed: &str,
2535    idx: usize,
2536    docstring_lines: &mut HashSet<usize>,
2537    active_docstring: &mut Option<(&'static str, usize)>,
2538) -> bool {
2539    let Some(ctx) = contexts.last_mut() else {
2540        return false;
2541    };
2542    py_try_record_docstring(ctx, trimmed, idx, docstring_lines, active_docstring)
2543}
2544
/// Handle a docstring that is still open at end-of-file.
///
/// When `active_docstring` is set, every line index from the docstring's starting line up
/// to (but excluding) `num_lines` is added to `docstring_lines`. Does nothing otherwise.
fn mark_unclosed_docstring_lines(
    active_docstring: Option<&(&'static str, usize)>,
    docstring_lines: &mut HashSet<usize>,
    num_lines: usize,
) {
    let Some(&(_, start_line)) = active_docstring else {
        return;
    };
    docstring_lines.extend(start_line..num_lines);
}
2557
2558fn detect_python_docstring_lines(text: &str) -> HashSet<usize> {
2559    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
2560    let lines: Vec<&str> = normalized.split_terminator('\n').collect();
2561
2562    let mut docstring_lines = HashSet::new();
2563    let mut contexts = vec![PyContext {
2564        indent: 0,
2565        expect_docstring: true,
2566    }];
2567    let mut pending_block_indent: Option<usize> = None;
2568    let mut active_docstring: Option<(&'static str, usize)> = None;
2569
2570    for (idx, line) in lines.iter().enumerate() {
2571        let trimmed = line.trim();
2572        let indent = leading_indent(line);
2573
2574        if track_active_docstring(&mut active_docstring, &mut docstring_lines, idx, trimmed) {
2575            continue;
2576        }
2577
2578        // Blank lines and comment lines don't affect docstring detection.
2579        if trimmed.is_empty() || trimmed.starts_with('#') {
2580            continue;
2581        }
2582
2583        py_pop_outdented_contexts(&mut contexts, indent);
2584        py_handle_pending_indent(&mut pending_block_indent, &mut contexts, indent, trimmed);
2585
2586        if try_record_docstring_if_context(
2587            &mut contexts,
2588            trimmed,
2589            idx,
2590            &mut docstring_lines,
2591            &mut active_docstring,
2592        ) {
2593            continue;
2594        }
2595
2596        if is_python_block_header(trimmed) {
2597            pending_block_indent = Some(indent);
2598        }
2599    }
2600
2601    mark_unclosed_docstring_lines(active_docstring.as_ref(), &mut docstring_lines, lines.len());
2602
2603    docstring_lines
2604}
2605
/// Count the whitespace characters at the start of `line`.
///
/// Every whitespace char counts as one, so a tab contributes 1. A line made up entirely of
/// whitespace yields its full char count.
fn leading_indent(line: &str) -> usize {
    line.chars()
        .position(|c| !c.is_whitespace())
        .unwrap_or_else(|| line.chars().count())
}
2609
/// Report whether a trimmed line is a `def`, `async def`, or `class` header.
///
/// The line must start with one of those keywords followed by a space and must end with
/// `:`. A header followed by anything after the colon on the same line does not qualify.
fn is_python_block_header(trimmed: &str) -> bool {
    const HEADER_KEYWORDS: [&str; 3] = ["def ", "async def ", "class "];

    trimmed.ends_with(':')
        && HEADER_KEYWORDS
            .iter()
            .any(|keyword| trimmed.starts_with(keyword))
}
2616
/// Return the triple-quote delimiter that a trimmed line opens with, if any.
///
/// Any run of string-prefix letters (`r`, `u`, `b`, `f` in either case) at the start of the
/// line is skipped before the check. Yields `"""` or `'''`, or `None` when neither follows.
fn docstring_delimiter(trimmed: &str) -> Option<&'static str> {
    let after_prefix = trimmed.trim_start_matches(|c: char| {
        matches!(c, 'r' | 'R' | 'u' | 'U' | 'b' | 'B' | 'f' | 'F')
    });

    ["\"\"\"", "'''"]
        .iter()
        .copied()
        .find(|delim| after_prefix.starts_with(*delim))
}
2638
/// Decide whether a line closes a triple-quoted docstring.
///
/// Non-overlapping occurrences of `delim` in `trimmed` are counted. On the line that opened
/// the docstring (`same_line_as_start`) the opener accounts for one occurrence, so two are
/// required; on any later line a single occurrence closes it.
///
/// Counting uses `str::matches`, which stops as soon as enough occurrences are seen and
/// terminates for every input, including an empty `delim`.
fn closes_triple_docstring(trimmed: &str, delim: &str, same_line_as_start: bool) -> bool {
    let required: usize = if same_line_as_start { 2 } else { 1 };
    trimmed.matches(delim).take(required).count() == required
}
2653
2654/// Tree-sitter-backed adapters (compiled only when the `tree-sitter` feature is enabled).
2655///
2656/// When parsing succeeds the result is used directly; on any failure the caller falls back
2657/// to the lexical state machine.
2658#[cfg(feature = "tree-sitter")]
2659pub mod ts {
2660    use tree_sitter::Node;
2661
2662    use super::{ParseMode, RawFileAnalysis, RawLineCounts};
2663
    /// Configuration for which AST node kinds map to symbols in this grammar.
    ///
    /// Every field is a plain string; an empty string switches the corresponding detection
    /// off.
    struct SymbolKinds {
        /// Node kind name for function definitions (e.g. `"function_definition"`).
        /// Empty string disables function counting.
        function_def: &'static str,
        /// Node kind name for class definitions (e.g. `"class_definition"`).
        /// Empty string disables class counting.
        class_def: &'static str,
        /// Prefix of a function node's `name` field that marks the function as a test; such
        /// a function is added to `test_count` instead of `functions`.
        /// Empty string disables test-prefix detection.
        test_fn_prefix: &'static str,
        /// Prefix of a class node's `name` field that marks the class as a test; such a
        /// class is added to `test_count` instead of `classes`.
        /// Empty string disables test-prefix detection.
        test_class_prefix: &'static str,
        /// When non-empty, `call` nodes whose `function` is an `attribute` access and whose
        /// attribute identifier starts with this prefix are counted as test assertions.
        /// Used for Python `self.assertXxx(...)` detection.
        assertion_attr_prefix: &'static str,
    }
2681
    impl SymbolKinds {
        /// Configuration with every field empty, i.e. no symbol detection at all.
        ///
        /// With both `function_def` and `class_def` empty, `analyze_lines` skips the
        /// symbol-counting walk entirely.
        const fn none() -> Self {
            Self {
                function_def: "",
                class_def: "",
                test_fn_prefix: "",
                test_class_prefix: "",
                assertion_attr_prefix: "",
            }
        }
    }
2693
2694    /// Classify every line of `text` using a tree-sitter grammar.
2695    ///
2696    /// `comment_node_kinds` — node type names that represent comments in this grammar
2697    /// `docstring_stmt_kind` — optional parent node type whose direct `string` child is a docstring
2698    /// `symbols` — AST node kinds used to populate symbol counters
2699    fn analyze_lines(
2700        text: &str,
2701        ts_language: &tree_sitter::Language,
2702        comment_node_kinds: &[&str],
2703        docstring_stmt_kind: Option<&str>,
2704        symbols: &SymbolKinds,
2705    ) -> Option<RawFileAnalysis> {
2706        let mut parser = tree_sitter::Parser::new();
2707        parser.set_language(ts_language).ok()?;
2708        let tree = parser.parse(text, None)?;
2709
2710        let lines: Vec<&str> = text.split_terminator('\n').collect();
2711        let n = lines.len();
2712
2713        let mut has_code = vec![false; n];
2714        let mut has_comment = vec![false; n];
2715        let mut comment_is_block = vec![false; n];
2716        let mut has_docstring = vec![false; n];
2717
2718        // Walk every node in the tree and mark line arrays.
2719        let mut ctx = VisitCtx {
2720            source: text.as_bytes(),
2721            comment_kinds: comment_node_kinds,
2722            docstring_stmt_kind,
2723            has_code: &mut has_code,
2724            has_comment: &mut has_comment,
2725            comment_is_block: &mut comment_is_block,
2726            has_docstring: &mut has_docstring,
2727        };
2728        visit(tree.root_node(), &mut ctx);
2729
2730        let mut raw = RawLineCounts::default();
2731        classify_ts_lines(
2732            &lines,
2733            &has_code,
2734            &has_comment,
2735            &comment_is_block,
2736            &has_docstring,
2737            &mut raw,
2738        );
2739
2740        // Symbol counting: walk the AST a second time to collect function/class/test counts.
2741        if !symbols.function_def.is_empty() || !symbols.class_def.is_empty() {
2742            count_symbols(tree.root_node(), text.as_bytes(), symbols, &mut raw);
2743        }
2744
2745        Some(RawFileAnalysis {
2746            raw,
2747            parse_mode: ParseMode::TreeSitter,
2748            warnings: Vec::new(),
2749        })
2750    }
2751
2752    /// Recurse into every direct child of `node`.
2753    fn recurse_children(node: Node, source: &[u8], kinds: &SymbolKinds, raw: &mut RawLineCounts) {
2754        for i in 0..node.child_count() {
2755            #[allow(clippy::cast_possible_truncation)]
2756            if let Some(child) = node.child(i as u32) {
2757                count_symbols(child, source, kinds, raw);
2758            }
2759        }
2760    }
2761
2762    /// Handle a function-definition node. Returns `true` if the node matched.
2763    fn try_count_function(
2764        node: Node,
2765        source: &[u8],
2766        kinds: &SymbolKinds,
2767        raw: &mut RawLineCounts,
2768    ) -> bool {
2769        if kinds.function_def.is_empty() || node.kind() != kinds.function_def {
2770            return false;
2771        }
2772        let name = node
2773            .child_by_field_name("name")
2774            .and_then(|n| n.utf8_text(source).ok())
2775            .unwrap_or("");
2776        if !kinds.test_fn_prefix.is_empty() && name.starts_with(kinds.test_fn_prefix) {
2777            raw.test_count += 1;
2778        } else {
2779            raw.functions += 1;
2780        }
2781        recurse_children(node, source, kinds, raw);
2782        true
2783    }
2784
2785    /// Handle a class-definition node. Returns `true` if the node matched.
2786    fn try_count_class(
2787        node: Node,
2788        source: &[u8],
2789        kinds: &SymbolKinds,
2790        raw: &mut RawLineCounts,
2791    ) -> bool {
2792        if kinds.class_def.is_empty() || node.kind() != kinds.class_def {
2793            return false;
2794        }
2795        let name = node
2796            .child_by_field_name("name")
2797            .and_then(|n| n.utf8_text(source).ok())
2798            .unwrap_or("");
2799        if !kinds.test_class_prefix.is_empty() && name.starts_with(kinds.test_class_prefix) {
2800            raw.test_count += 1;
2801        } else {
2802            raw.classes += 1;
2803        }
2804        recurse_children(node, source, kinds, raw);
2805        true
2806    }
2807
2808    /// Handle an assertion call node. Returns `true` if the node matched (skips recursion
2809    /// into arguments, preserving "don't double-count test bodies" semantics).
2810    fn try_count_assertion(
2811        node: Node,
2812        source: &[u8],
2813        kinds: &SymbolKinds,
2814        raw: &mut RawLineCounts,
2815    ) -> bool {
2816        if kinds.assertion_attr_prefix.is_empty() || node.kind() != "call" {
2817            return false;
2818        }
2819        let Some(func) = node.child_by_field_name("function") else {
2820            return false;
2821        };
2822        if func.kind() != "attribute" {
2823            return false;
2824        }
2825        let attr_text = func
2826            .child_by_field_name("attribute")
2827            .and_then(|n| n.utf8_text(source).ok())
2828            .unwrap_or("");
2829        if !attr_text.starts_with(kinds.assertion_attr_prefix) {
2830            return false;
2831        }
2832        raw.test_assertion_count += 1;
2833        true
2834    }
2835
2836    /// Walk the AST and populate `raw.functions`, `raw.classes`, `raw.test_count`,
2837    /// and `raw.test_assertion_count`.
2838    fn count_symbols(node: Node, source: &[u8], kinds: &SymbolKinds, raw: &mut RawLineCounts) {
2839        if try_count_function(node, source, kinds, raw) {
2840            return;
2841        }
2842        if try_count_class(node, source, kinds, raw) {
2843            return;
2844        }
2845        if try_count_assertion(node, source, kinds, raw) {
2846            return;
2847        }
2848        recurse_children(node, source, kinds, raw);
2849    }
2850
2851    /// Flags describing what kinds of content appear on a single line.
2852    // Four bools are the natural representation for these four independent properties.
2853    #[allow(clippy::struct_excessive_bools)]
2854    #[derive(Clone, Copy)]
2855    struct TsLineFlags {
2856        has_code: bool,
2857        has_comment: bool,
2858        comment_is_block: bool,
2859        has_docstring: bool,
2860    }
2861
2862    /// Classify a single tree-sitter-annotated line and accumulate into `raw`.
2863    const fn classify_ts_line(trimmed: &str, flags: TsLineFlags, raw: &mut RawLineCounts) {
2864        if trimmed.is_empty() {
2865            raw.blank_only_lines += 1;
2866        } else if flags.has_docstring && !flags.has_code {
2867            raw.docstring_comment_lines += 1;
2868        } else if flags.has_code && flags.has_comment {
2869            // Classify the mixed line as single or multi based on what kind of comment is on it.
2870            if flags.comment_is_block {
2871                raw.mixed_code_multi_comment_lines += 1;
2872            } else {
2873                raw.mixed_code_single_comment_lines += 1;
2874            }
2875        } else if flags.has_comment {
2876            if flags.comment_is_block {
2877                raw.multi_comment_only_lines += 1;
2878            } else {
2879                raw.single_comment_only_lines += 1;
2880            }
2881        } else {
2882            raw.code_only_lines += 1;
2883        }
2884    }
2885
2886    /// Classify each tree-sitter-annotated line and accumulate counts into `raw`.
2887    fn classify_ts_lines(
2888        lines: &[&str],
2889        has_code: &[bool],
2890        has_comment: &[bool],
2891        comment_is_block: &[bool],
2892        has_docstring: &[bool],
2893        raw: &mut RawLineCounts,
2894    ) {
2895        for i in 0..lines.len() {
2896            raw.total_physical_lines += 1;
2897            classify_ts_line(
2898                lines[i].trim(),
2899                TsLineFlags {
2900                    has_code: has_code[i],
2901                    has_comment: has_comment[i],
2902                    comment_is_block: comment_is_block[i],
2903                    has_docstring: has_docstring[i],
2904                },
2905                raw,
2906            );
2907        }
2908    }
2909
2910    struct VisitCtx<'a> {
2911        source: &'a [u8],
2912        comment_kinds: &'a [&'a str],
2913        docstring_stmt_kind: Option<&'a str>,
2914        has_code: &'a mut Vec<bool>,
2915        has_comment: &'a mut Vec<bool>,
2916        comment_is_block: &'a mut Vec<bool>,
2917        has_docstring: &'a mut Vec<bool>,
2918    }
2919
2920    /// Mark all rows of a comment node and detect whether it is a block comment.
2921    fn visit_comment_node(node: Node, ctx: &mut VisitCtx<'_>) {
2922        let start_row = node.start_position().row;
2923        let end_row = node.end_position().row;
2924        let first_two = node
2925            .utf8_text(ctx.source)
2926            .unwrap_or("")
2927            .get(..2)
2928            .unwrap_or("");
2929        let is_block = first_two == "/*" || first_two == "<#";
2930        for row in start_row..=end_row {
2931            if row < ctx.has_comment.len() {
2932                ctx.has_comment[row] = true;
2933                if is_block {
2934                    ctx.comment_is_block[row] = true;
2935                }
2936            }
2937        }
2938    }
2939
2940    /// If `node` is an `expression_statement` whose sole named child is a string literal,
2941    /// mark those rows as docstring and return `true`.
2942    fn visit_maybe_docstring(node: Node, kind: &str, ctx: &mut VisitCtx<'_>) -> bool {
2943        let Some(stmt_kind) = ctx.docstring_stmt_kind else {
2944            return false;
2945        };
2946        if kind != stmt_kind || node.named_child_count() != 1 {
2947            return false;
2948        }
2949        let Some(child) = node.named_child(0) else {
2950            return false;
2951        };
2952        if child.kind() != "string" {
2953            return false;
2954        }
2955        let child_start = child.start_position().row;
2956        let child_end = child.end_position().row;
2957        for row in child_start..=child_end {
2958            if row < ctx.has_docstring.len() {
2959                ctx.has_docstring[row] = true;
2960            }
2961        }
2962        true
2963    }
2964
2965    /// Mark all rows of a leaf (non-comment, non-extra) node as code.
2966    fn visit_leaf_code(node: Node, ctx: &mut VisitCtx<'_>) {
2967        let start_row = node.start_position().row;
2968        let end_row = node.end_position().row;
2969        for row in start_row..=end_row {
2970            if row < ctx.has_code.len() {
2971                ctx.has_code[row] = true;
2972            }
2973        }
2974    }
2975
2976    #[allow(clippy::too_many_lines)]
2977    fn visit(node: Node, ctx: &mut VisitCtx<'_>) {
2978        let kind = node.kind();
2979
2980        // Comment node — mark rows as comment, detect block vs. line comment.
2981        if ctx.comment_kinds.contains(&kind) {
2982            visit_comment_node(node, ctx);
2983            return;
2984        }
2985
2986        // Python docstring: expression_statement whose only named child is a string literal.
2987        if visit_maybe_docstring(node, kind, ctx) {
2988            return;
2989        }
2990
2991        // Leaf non-comment node: mark as code.
2992        if node.child_count() == 0 && !node.is_extra() {
2993            visit_leaf_code(node, ctx);
2994            return;
2995        }
2996
2997        for i in 0..node.child_count() {
2998            #[allow(clippy::cast_possible_truncation)]
2999            // child_count bounded by tree-sitter u32 capacity
3000            if let Some(child) = node.child(i as u32) {
3001                visit(child, ctx);
3002            }
3003        }
3004    }
3005
3006    const C_SYMBOLS: SymbolKinds = SymbolKinds::none();
3007
3008    const PYTHON_SYMBOLS: SymbolKinds = SymbolKinds {
3009        function_def: "function_definition",
3010        class_def: "class_definition",
3011        test_fn_prefix: "test_",
3012        test_class_prefix: "Test",
3013        assertion_attr_prefix: "assert",
3014    };
3015
3016    /// Parse C or C++ source with tree-sitter-c.
3017    #[must_use]
3018    pub fn analyze_c(text: &str) -> Option<RawFileAnalysis> {
3019        let lang: tree_sitter::Language = tree_sitter_c::LANGUAGE.into();
3020        analyze_lines(text, &lang, &["comment"], None, &C_SYMBOLS)
3021    }
3022
3023    /// Parse Python source with tree-sitter-python.
3024    #[must_use]
3025    pub fn analyze_python(text: &str) -> Option<RawFileAnalysis> {
3026        let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
3027        analyze_lines(
3028            text,
3029            &lang,
3030            &["comment"],
3031            Some("expression_statement"),
3032            &PYTHON_SYMBOLS,
3033        )
3034    }
3035}
3036
#[cfg(test)]
mod tests {
    use super::*;

    /// Docstring lines are tallied separately from code and trailing-comment lines.
    #[test]
    fn python_docstrings_are_separated() {
        let source = r#""""module docs"""


def fn_a():
    """function docs"""
    value = 1  # trailing comment
    return value
"#;

        let analysis = analyze_text(Language::Python, source, AnalysisOptions::default());
        let raw = &analysis.raw;
        assert_eq!(raw.docstring_comment_lines, 2);
        assert_eq!(raw.mixed_code_single_comment_lines, 1);
        assert_eq!(raw.code_only_lines, 2);
    }

    /// A trailing `//` comment makes a mixed line; a lone `/* */` is a block-comment line.
    #[test]
    fn c_style_mixed_lines_are_captured() {
        let source = "int x = 1; // note\n/* block */\n";
        let analysis = analyze_text(Language::C, source, AnalysisOptions::default());
        let raw = &analysis.raw;
        assert_eq!(raw.mixed_code_single_comment_lines, 1);
        assert_eq!(raw.multi_comment_only_lines, 1);
    }

    /// An extension-less path is recognised from its shebang line.
    #[test]
    fn detect_language_by_shebang() {
        let empty_map = BTreeMap::new();
        let detected = detect_language(
            Path::new("script"),
            Some("#!/usr/bin/env bash"),
            &empty_map,
            true,
        );
        assert_eq!(detected, Some(Language::Shell));
    }

    // ── count_symbols: no double-counting of test functions ──────────────────

    /// Analyze `line` (newline-terminated) as `lang` and return, in order: functions, classes,
    /// variables, imports, test count, test assertion count, test suite count.
    fn sym(lang: Language, line: &str) -> (u64, u64, u64, u64, u64, u64, u64) {
        let mut source = String::from(line);
        source.push('\n');
        let analysis = analyze_text(lang, &source, AnalysisOptions::default());
        let raw = &analysis.raw;
        (
            raw.functions,
            raw.classes,
            raw.variables,
            raw.imports,
            raw.test_count,
            raw.test_assertion_count,
            raw.test_suite_count,
        )
    }

    #[test]
    fn python_test_fn_not_double_counted() {
        // def test_ lines count as tests only, NOT as functions
        let (functions, classes, _, _, tests, ..) = sym(Language::Python, "def test_foo():");
        assert_eq!(functions, 0, "test fn must not also increment functions");
        assert_eq!(tests, 1, "must be counted as a test");
        assert_eq!(classes, 0);
    }

    #[test]
    fn python_test_class_not_double_counted() {
        // class Test* lines count as tests only, NOT as classes
        let (functions, classes, _, _, tests, ..) = sym(Language::Python, "class TestFoo:");
        assert_eq!(classes, 0, "test class must not also increment classes");
        assert_eq!(tests, 1, "must be counted as a test");
        assert_eq!(functions, 0);
    }

    #[test]
    fn python_regular_fn_counts_as_function() {
        let (functions, classes, _, _, tests, ..) = sym(Language::Python, "def regular():");
        assert_eq!(functions, 1, "regular function must be counted");
        assert_eq!(tests, 0);
        assert_eq!(classes, 0);
    }

    #[test]
    fn python_regular_class_counts_as_class() {
        let (functions, classes, _, _, tests, ..) = sym(Language::Python, "class Regular:");
        assert_eq!(classes, 1, "regular class must be counted");
        assert_eq!(tests, 0);
        assert_eq!(functions, 0);
    }

    #[test]
    fn go_test_fn_not_double_counted() {
        let (functions, _, _, _, tests, ..) = sym(Language::Go, "func TestFoo(t *testing.T) {");
        assert_eq!(functions, 0, "Go test func must not also increment functions");
        assert_eq!(tests, 1, "must be counted as a test");
    }

    #[test]
    fn go_benchmark_fn_not_double_counted() {
        let (functions, _, _, _, tests, ..) =
            sym(Language::Go, "func BenchmarkBar(b *testing.B) {");
        assert_eq!(functions, 0, "Go benchmark func must not also increment functions");
        assert_eq!(tests, 1, "must be counted as a test");
    }

    #[test]
    fn go_regular_fn_counts_as_function() {
        let (functions, _, _, _, tests, ..) = sym(Language::Go, "func doSomething() {");
        assert_eq!(functions, 1, "regular Go func must be counted");
        assert_eq!(tests, 0);
    }

    #[test]
    fn rust_test_attr_counts_as_test_not_function() {
        // #[test] is a standalone attribute line — counted as a test, never as a function
        let (functions, _, _, _, tests, ..) = sym(Language::Rust, "#[test]");
        assert_eq!(tests, 1, "#[test] must be counted as a test");
        assert_eq!(functions, 0, "#[test] attribute must not be counted as a function");
    }

    #[test]
    fn rust_fn_line_counts_as_function_not_test() {
        // The fn declaration after #[test] does NOT match any test pattern
        let (functions, _, _, _, tests, ..) = sym(Language::Rust, "fn test_something() {");
        assert_eq!(functions, 1, "fn declaration must count as a function");
        assert_eq!(
            tests, 0,
            "fn declaration line must not be double-counted as a test"
        );
    }

    #[test]
    fn js_describe_counts_as_test_not_function() {
        let (functions, _, _, _, tests, ..) =
            sym(Language::JavaScript, "describe('suite', () => {");
        assert_eq!(tests, 1, "describe must be counted as a test");
        assert_eq!(functions, 0, "describe must not be counted as a function");
    }

    #[test]
    fn js_regular_fn_counts_as_function() {
        let (functions, _, _, _, tests, ..) = sym(Language::JavaScript, "function doWork() {");
        assert_eq!(functions, 1, "JS function declaration must be counted");
        assert_eq!(tests, 0);
    }
}