Skip to main content

sloc_languages/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3
4use std::collections::{BTreeMap, BTreeSet, HashSet};
5use std::path::Path;
6
7use serde::{Deserialize, Serialize};
8
9#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
10#[serde(rename_all = "snake_case")]
11pub enum Language {
12    C,
13    Cpp,
14    CSharp,
15    Go,
16    Java,
17    JavaScript,
18    Python,
19    Rust,
20    Shell,
21    PowerShell,
22    TypeScript,
23    // --- Extended language support ---
24    Assembly,
25    Clojure,
26    Css,
27    Dart,
28    Dockerfile,
29    Elixir,
30    Erlang,
31    FSharp,
32    Groovy,
33    Haskell,
34    Html,
35    Julia,
36    Kotlin,
37    Lua,
38    Makefile,
39    Nim,
40    ObjectiveC,
41    Ocaml,
42    Perl,
43    Php,
44    R,
45    Ruby,
46    Scala,
47    Scss,
48    Sql,
49    Svelte,
50    Swift,
51    Vue,
52    Xml,
53    Zig,
54}
55
56impl Language {
57    #[must_use]
58    pub const fn display_name(&self) -> &'static str {
59        match self {
60            Self::C => "C",
61            Self::Cpp => "C++",
62            Self::CSharp => "C#",
63            Self::Go => "Go",
64            Self::Java => "Java",
65            Self::JavaScript => "JavaScript",
66            Self::Python => "Python",
67            Self::Rust => "Rust",
68            Self::Shell => "Shell",
69            Self::PowerShell => "PowerShell",
70            Self::TypeScript => "TypeScript",
71            Self::Assembly => "Assembly",
72            Self::Clojure => "Clojure",
73            Self::Css => "CSS",
74            Self::Dart => "Dart",
75            Self::Dockerfile => "Dockerfile",
76            Self::Elixir => "Elixir",
77            Self::Erlang => "Erlang",
78            Self::FSharp => "F#",
79            Self::Groovy => "Groovy",
80            Self::Haskell => "Haskell",
81            Self::Html => "HTML",
82            Self::Julia => "Julia",
83            Self::Kotlin => "Kotlin",
84            Self::Lua => "Lua",
85            Self::Makefile => "Makefile",
86            Self::Nim => "Nim",
87            Self::ObjectiveC => "Objective-C",
88            Self::Ocaml => "OCaml",
89            Self::Perl => "Perl",
90            Self::Php => "PHP",
91            Self::R => "R",
92            Self::Ruby => "Ruby",
93            Self::Scala => "Scala",
94            Self::Scss => "SCSS",
95            Self::Sql => "SQL",
96            Self::Svelte => "Svelte",
97            Self::Swift => "Swift",
98            Self::Vue => "Vue",
99            Self::Xml => "XML",
100            Self::Zig => "Zig",
101        }
102    }
103
104    #[must_use]
105    pub const fn as_slug(&self) -> &'static str {
106        match self {
107            Self::C => "c",
108            Self::Cpp => "cpp",
109            Self::CSharp => "csharp",
110            Self::Go => "go",
111            Self::Java => "java",
112            Self::JavaScript => "javascript",
113            Self::Python => "python",
114            Self::Rust => "rust",
115            Self::Shell => "shell",
116            Self::PowerShell => "powershell",
117            Self::TypeScript => "typescript",
118            Self::Assembly => "assembly",
119            Self::Clojure => "clojure",
120            Self::Css => "css",
121            Self::Dart => "dart",
122            Self::Dockerfile => "dockerfile",
123            Self::Elixir => "elixir",
124            Self::Erlang => "erlang",
125            Self::FSharp => "fsharp",
126            Self::Groovy => "groovy",
127            Self::Haskell => "haskell",
128            Self::Html => "html",
129            Self::Julia => "julia",
130            Self::Kotlin => "kotlin",
131            Self::Lua => "lua",
132            Self::Makefile => "makefile",
133            Self::Nim => "nim",
134            Self::ObjectiveC => "objectivec",
135            Self::Ocaml => "ocaml",
136            Self::Perl => "perl",
137            Self::Php => "php",
138            Self::R => "r",
139            Self::Ruby => "ruby",
140            Self::Scala => "scala",
141            Self::Scss => "scss",
142            Self::Sql => "sql",
143            Self::Svelte => "svelte",
144            Self::Swift => "swift",
145            Self::Vue => "vue",
146            Self::Xml => "xml",
147            Self::Zig => "zig",
148        }
149    }
150
151    #[must_use]
152    pub fn from_name(name: &str) -> Option<Self> {
153        match name.trim().to_ascii_lowercase().as_str() {
154            "c" => Some(Self::C),
155            "cpp" | "c++" | "cplusplus" => Some(Self::Cpp),
156            "csharp" | "c#" | "cs" => Some(Self::CSharp),
157            "go" | "golang" => Some(Self::Go),
158            "java" => Some(Self::Java),
159            "javascript" | "js" => Some(Self::JavaScript),
160            "python" | "py" => Some(Self::Python),
161            "rust" | "rs" => Some(Self::Rust),
162            "shell" | "sh" | "bash" => Some(Self::Shell),
163            "powershell" | "pwsh" | "ps" => Some(Self::PowerShell),
164            "typescript" | "ts" => Some(Self::TypeScript),
165            "assembly" | "asm" => Some(Self::Assembly),
166            "clojure" | "clj" => Some(Self::Clojure),
167            "css" => Some(Self::Css),
168            "dart" => Some(Self::Dart),
169            "dockerfile" | "docker" => Some(Self::Dockerfile),
170            "elixir" | "ex" => Some(Self::Elixir),
171            "erlang" | "erl" => Some(Self::Erlang),
172            "fsharp" | "f#" | "fs" => Some(Self::FSharp),
173            "groovy" => Some(Self::Groovy),
174            "haskell" | "hs" => Some(Self::Haskell),
175            "html" | "htm" => Some(Self::Html),
176            "julia" | "jl" => Some(Self::Julia),
177            "kotlin" | "kt" => Some(Self::Kotlin),
178            "lua" => Some(Self::Lua),
179            "makefile" | "make" | "mk" => Some(Self::Makefile),
180            "nim" => Some(Self::Nim),
181            "objectivec" | "objc" | "objective-c" => Some(Self::ObjectiveC),
182            "ocaml" | "ml" => Some(Self::Ocaml),
183            "perl" | "pl" => Some(Self::Perl),
184            "php" => Some(Self::Php),
185            "r" => Some(Self::R),
186            "ruby" | "rb" => Some(Self::Ruby),
187            "scala" => Some(Self::Scala),
188            "scss" | "sass" => Some(Self::Scss),
189            "sql" => Some(Self::Sql),
190            "svelte" => Some(Self::Svelte),
191            "swift" => Some(Self::Swift),
192            "vue" => Some(Self::Vue),
193            "xml" => Some(Self::Xml),
194            "zig" => Some(Self::Zig),
195            _ => None,
196        }
197    }
198}
199
200#[derive(Debug, Clone, Serialize, Deserialize, Default)]
201pub struct RawLineCounts {
202    pub total_physical_lines: u64,
203    pub blank_only_lines: u64,
204    pub code_only_lines: u64,
205    pub single_comment_only_lines: u64,
206    pub multi_comment_only_lines: u64,
207    pub mixed_code_single_comment_lines: u64,
208    pub mixed_code_multi_comment_lines: u64,
209    pub docstring_comment_lines: u64,
210    pub skipped_unknown_lines: u64,
211    /// Best-effort count of function/method definition lines detected lexically.
212    #[serde(default)]
213    pub functions: u64,
214    /// Best-effort count of class/struct/trait/type definition lines detected lexically.
215    #[serde(default)]
216    pub classes: u64,
217    /// Best-effort count of variable declaration lines detected lexically.
218    #[serde(default)]
219    pub variables: u64,
220    /// Best-effort count of import/use/include statement lines detected lexically.
221    #[serde(default)]
222    pub imports: u64,
223    /// Lines consisting solely of preprocessor/compiler directives (e.g. `#include`, `#define`
224    /// in C/C++/Objective-C). Always a subset of `code_only_lines`. Controlled by
225    /// `AnalysisConfig::count_compiler_directives`. IEEE 1045-1992 ยง4.2.
226    #[serde(default)]
227    pub compiler_directive_lines: u64,
228}
229
230#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
231#[serde(rename_all = "snake_case")]
232pub enum ParseMode {
233    Lexical,
234    LexicalBestEffort,
235    TreeSitter,
236}
237
238#[derive(Debug, Clone, Serialize, Deserialize)]
239pub struct RawFileAnalysis {
240    pub raw: RawLineCounts,
241    pub parse_mode: ParseMode,
242    pub warnings: Vec<String>,
243}
244
/// Counting options from IEEE 1045-1992, supplied by `sloc-core` (which derives them from
/// `AnalysisConfig`).
///
/// `analyze_text` takes this struct by value so callers can tune the behaviours the
/// standard treats as configurable parameters instead of fixed conventions.
#[derive(Clone, Copy, Debug)]
pub struct AnalysisOptions {
    /// If `true` (the IEEE 1045-1992 default), a blank line inside a block comment is
    /// counted as a comment line instead of a blank line.
    pub blank_in_block_comment_as_comment: bool,
    /// If `true`, physical lines joined by a trailing backslash are folded into one logical
    /// line for SLOC counting (IEEE logical SLOC mode).
    pub collapse_continuation_lines: bool,
}
258
259impl Default for AnalysisOptions {
260    fn default() -> Self {
261        Self {
262            blank_in_block_comment_as_comment: true,
263            collapse_continuation_lines: false,
264        }
265    }
266}
267
268#[must_use]
269pub fn supported_languages() -> BTreeSet<Language> {
270    [
271        Language::Assembly,
272        Language::C,
273        Language::Clojure,
274        Language::Cpp,
275        Language::CSharp,
276        Language::Css,
277        Language::Dart,
278        Language::Dockerfile,
279        Language::Elixir,
280        Language::Erlang,
281        Language::FSharp,
282        Language::Go,
283        Language::Groovy,
284        Language::Haskell,
285        Language::Html,
286        Language::Java,
287        Language::JavaScript,
288        Language::Julia,
289        Language::Kotlin,
290        Language::Lua,
291        Language::Makefile,
292        Language::Nim,
293        Language::ObjectiveC,
294        Language::Ocaml,
295        Language::Perl,
296        Language::Php,
297        Language::PowerShell,
298        Language::Python,
299        Language::R,
300        Language::Ruby,
301        Language::Rust,
302        Language::Scala,
303        Language::Scss,
304        Language::Shell,
305        Language::Sql,
306        Language::Svelte,
307        Language::Swift,
308        Language::TypeScript,
309        Language::Vue,
310        Language::Xml,
311        Language::Zig,
312    ]
313    .into_iter()
314    .collect()
315}
316
317/// Detect language from a shebang line (e.g. `#!/usr/bin/env python3`).
318fn detect_by_shebang(line: &str) -> Option<Language> {
319    let lower = line.to_ascii_lowercase();
320    if !lower.starts_with("#!") {
321        return None;
322    }
323    if lower.contains("python") {
324        return Some(Language::Python);
325    }
326    if lower.contains("pwsh") || lower.contains("powershell") {
327        return Some(Language::PowerShell);
328    }
329    if lower.contains("bash")
330        || lower.contains("/sh")
331        || lower.contains("zsh")
332        || lower.contains("ksh")
333    {
334        return Some(Language::Shell);
335    }
336    if lower.contains("ruby") {
337        return Some(Language::Ruby);
338    }
339    if lower.contains("perl") {
340        return Some(Language::Perl);
341    }
342    if lower.contains("php") {
343        return Some(Language::Php);
344    }
345    if lower.contains("node") || lower.contains("nodejs") {
346        return Some(Language::JavaScript);
347    }
348    None
349}
350
351/// Detect language purely from a (lowercased) file extension.
352fn detect_by_extension(ext: &str) -> Option<Language> {
353    match ext {
354        // --- Original 11 ---
355        "c" | "h" => Some(Language::C),
356        "cc" | "cp" | "cpp" | "cxx" | "hh" | "hpp" | "hxx" => Some(Language::Cpp),
357        "cs" => Some(Language::CSharp),
358        "go" => Some(Language::Go),
359        "java" => Some(Language::Java),
360        "js" | "mjs" | "cjs" => Some(Language::JavaScript),
361        "py" => Some(Language::Python),
362        "rs" => Some(Language::Rust),
363        "sh" | "bash" | "zsh" | "ksh" => Some(Language::Shell),
364        "ps1" | "psm1" | "psd1" => Some(Language::PowerShell),
365        "ts" | "mts" | "cts" => Some(Language::TypeScript),
366        // --- Extended 30 ---
367        "asm" | "s" => Some(Language::Assembly),
368        "clj" | "cljs" | "cljc" | "edn" => Some(Language::Clojure),
369        "css" => Some(Language::Css),
370        "dart" => Some(Language::Dart),
371        "ex" | "exs" => Some(Language::Elixir),
372        "erl" | "hrl" => Some(Language::Erlang),
373        "fs" | "fsi" | "fsx" => Some(Language::FSharp),
374        "groovy" | "gradle" => Some(Language::Groovy),
375        "hs" | "lhs" => Some(Language::Haskell),
376        "html" | "htm" | "xhtml" => Some(Language::Html),
377        "jl" => Some(Language::Julia),
378        "kt" | "kts" => Some(Language::Kotlin),
379        "lua" => Some(Language::Lua),
380        "mk" => Some(Language::Makefile),
381        "nim" | "nims" => Some(Language::Nim),
382        "m" | "mm" => Some(Language::ObjectiveC),
383        "ml" | "mli" => Some(Language::Ocaml),
384        "pl" | "pm" | "t" => Some(Language::Perl),
385        "php" | "php3" | "php4" | "php5" | "php7" | "phtml" => Some(Language::Php),
386        "r" => Some(Language::R),
387        "rb" | "rake" => Some(Language::Ruby),
388        "scala" | "sc" => Some(Language::Scala),
389        "scss" | "sass" => Some(Language::Scss),
390        "sql" => Some(Language::Sql),
391        "svelte" => Some(Language::Svelte),
392        "swift" => Some(Language::Swift),
393        "vue" => Some(Language::Vue),
394        "xml" | "xsd" | "xsl" | "xslt" | "svg" => Some(Language::Xml),
395        "zig" => Some(Language::Zig),
396        _ => None,
397    }
398}
399
400/// Detect language from an exact filename (no extension) or well-known filename patterns.
401fn detect_by_filename(filename: &str, filename_lower: &str) -> Option<Language> {
402    // Dockerfile: exact name or Dockerfile.* variant
403    if filename == "Dockerfile"
404        || filename.starts_with("Dockerfile.")
405        || filename_lower == "dockerfile"
406    {
407        return Some(Language::Dockerfile);
408    }
409    // Makefile variants
410    if matches!(
411        filename,
412        "Makefile" | "GNUmakefile" | "makefile" | "BSDmakefile"
413    ) {
414        return Some(Language::Makefile);
415    }
416    // Ruby ecosystem files that have no extension
417    if matches!(
418        filename,
419        "Rakefile" | "Gemfile" | "Guardfile" | "Vagrantfile" | "Fastfile" | "Podfile"
420    ) {
421        return Some(Language::Ruby);
422    }
423    None
424}
425
426#[must_use]
427#[allow(clippy::too_many_lines)]
428pub fn detect_language(
429    path: &Path,
430    first_line: Option<&str>,
431    extension_overrides: &BTreeMap<String, String>,
432    shebang_detection: bool,
433) -> Option<Language> {
434    let extension = path
435        .extension()
436        .and_then(|ext| ext.to_str())
437        .map(str::to_ascii_lowercase);
438
439    // Extension override check (user-configured mappings win over everything)
440    if let Some(ext) = extension.as_ref() {
441        if let Some(override_name) = extension_overrides.get(ext.as_str()) {
442            if let Some(lang) = Language::from_name(override_name) {
443                return Some(lang);
444            }
445        }
446    }
447
448    // Filename-based detection for files that have no extension or use exact names
449    let filename = path.file_name().and_then(|s| s.to_str()).unwrap_or("");
450    let filename_lower = filename.to_ascii_lowercase();
451
452    if let Some(lang) = detect_by_filename(filename, &filename_lower) {
453        return Some(lang);
454    }
455
456    // Extension-based detection
457    if let Some(lang) = extension.as_deref().and_then(detect_by_extension) {
458        return Some(lang);
459    }
460
461    // Shebang detection (last resort โ€” only for extensionless scripts)
462    if shebang_detection {
463        if let Some(line) = first_line {
464            if let Some(lang) = detect_by_shebang(line) {
465                return Some(lang);
466            }
467        }
468    }
469
470    None
471}
472
473#[must_use]
474#[allow(clippy::too_many_lines)]
475pub fn analyze_text(language: Language, text: &str, options: AnalysisOptions) -> RawFileAnalysis {
476    // IEEE flags shared by all non-preprocessor languages.
477    let base = IeeeFlags {
478        has_preprocessor_directives: false,
479        blank_in_block_comment_as_comment: options.blank_in_block_comment_as_comment,
480        collapse_continuation_lines: options.collapse_continuation_lines,
481    };
482    // C, C++, and Objective-C have a preprocessor whose directive lines are tracked separately
483    // per IEEE 1045-1992 §4.2.
484    let cpp = IeeeFlags {
485        has_preprocessor_directives: true,
486        ..base
487    };
488
489    match language {
490        Language::C => {
491            #[cfg(feature = "tree-sitter")]
492            if let Some(result) = ts::analyze_c(text) {
493                return result;
494            }
495            analyze_generic(
496                text,
497                ScanConfig {
498                    line_comments: &["//"],
499                    block_comment: Some(("/*", "*/")),
500                    allow_single_quote_strings: true,
501                    allow_double_quote_strings: true,
502                    allow_triple_quote_strings: false,
503                    allow_csharp_verbatim_strings: false,
504                    skip_lines: HashSet::new(),
505                    symbol_patterns: SP_C,
506                },
507                cpp,
508            )
509        }
510        Language::Cpp => {
511            // tree-sitter-c also parses C++ with acceptable accuracy for SLOC counting.
512            #[cfg(feature = "tree-sitter")]
513            if let Some(result) = ts::analyze_c(text) {
514                return result;
515            }
516            analyze_generic(
517                text,
518                ScanConfig {
519                    line_comments: &["//"],
520                    block_comment: Some(("/*", "*/")),
521                    allow_single_quote_strings: true,
522                    allow_double_quote_strings: true,
523                    allow_triple_quote_strings: false,
524                    allow_csharp_verbatim_strings: false,
525                    skip_lines: HashSet::new(),
526                    symbol_patterns: SP_CPP,
527                },
528                cpp,
529            )
530        }
531        Language::CSharp => analyze_generic(
532            text,
533            ScanConfig {
534                line_comments: &["//"],
535                block_comment: Some(("/*", "*/")),
536                allow_single_quote_strings: true,
537                allow_double_quote_strings: true,
538                allow_triple_quote_strings: false,
539                allow_csharp_verbatim_strings: true,
540                skip_lines: HashSet::new(),
541                symbol_patterns: SP_CSHARP,
542            },
543            base,
544        ),
545        Language::Go => analyze_generic(
546            text,
547            ScanConfig {
548                line_comments: &["//"],
549                block_comment: Some(("/*", "*/")),
550                allow_single_quote_strings: true,
551                allow_double_quote_strings: true,
552                allow_triple_quote_strings: false,
553                allow_csharp_verbatim_strings: false,
554                skip_lines: HashSet::new(),
555                symbol_patterns: SP_GO,
556            },
557            base,
558        ),
559        Language::Java => analyze_generic(
560            text,
561            ScanConfig {
562                line_comments: &["//"],
563                block_comment: Some(("/*", "*/")),
564                allow_single_quote_strings: true,
565                allow_double_quote_strings: true,
566                allow_triple_quote_strings: false,
567                allow_csharp_verbatim_strings: false,
568                skip_lines: HashSet::new(),
569                symbol_patterns: SP_JAVA,
570            },
571            base,
572        ),
573        Language::JavaScript | Language::Svelte | Language::Vue => analyze_generic(
574            text,
575            ScanConfig {
576                line_comments: &["//"],
577                block_comment: Some(("/*", "*/")),
578                allow_single_quote_strings: true,
579                allow_double_quote_strings: true,
580                allow_triple_quote_strings: false,
581                allow_csharp_verbatim_strings: false,
582                skip_lines: HashSet::new(),
583                symbol_patterns: SP_JS,
584            },
585            base,
586        ),
587        Language::Rust => analyze_generic(
588            text,
589            ScanConfig {
590                // Rust also has //! and /// doc comments — they parse the same as //
591                line_comments: &["//"],
592                block_comment: Some(("/*", "*/")),
593                allow_single_quote_strings: false,
594                allow_double_quote_strings: true,
595                allow_triple_quote_strings: false,
596                allow_csharp_verbatim_strings: false,
597                skip_lines: HashSet::new(),
598                symbol_patterns: SP_RUST,
599            },
600            base,
601        ),
602        Language::Shell => analyze_generic(
603            text,
604            ScanConfig {
605                line_comments: &["#"],
606                block_comment: None,
607                allow_single_quote_strings: true,
608                allow_double_quote_strings: true,
609                allow_triple_quote_strings: false,
610                allow_csharp_verbatim_strings: false,
611                skip_lines: HashSet::new(),
612                symbol_patterns: SP_SHELL,
613            },
614            base,
615        ),
616        Language::PowerShell => analyze_generic(
617            text,
618            ScanConfig {
619                line_comments: &["#"],
620                block_comment: Some(("<#", "#>")),
621                allow_single_quote_strings: true,
622                allow_double_quote_strings: true,
623                allow_triple_quote_strings: false,
624                allow_csharp_verbatim_strings: false,
625                skip_lines: HashSet::new(),
626                symbol_patterns: SP_POWERSHELL,
627            },
628            base,
629        ),
630        Language::TypeScript => analyze_generic(
631            text,
632            ScanConfig {
633                line_comments: &["//"],
634                block_comment: Some(("/*", "*/")),
635                allow_single_quote_strings: true,
636                allow_double_quote_strings: true,
637                allow_triple_quote_strings: false,
638                allow_csharp_verbatim_strings: false,
639                skip_lines: HashSet::new(),
640                symbol_patterns: SP_TS,
641            },
642            base,
643        ),
644        Language::Python => {
645            #[cfg(feature = "tree-sitter")]
646            if let Some(result) = ts::analyze_python(text) {
647                return result;
648            }
649            let docstring_lines = detect_python_docstring_lines(text);
650            analyze_generic(
651                text,
652                ScanConfig {
653                    line_comments: &["#"],
654                    block_comment: None,
655                    allow_single_quote_strings: true,
656                    allow_double_quote_strings: true,
657                    allow_triple_quote_strings: true,
658                    allow_csharp_verbatim_strings: false,
659                    skip_lines: docstring_lines,
660                    symbol_patterns: SP_PYTHON,
661                },
662                base,
663            )
664        }
665        // --- Extended language analyzers ---
666        Language::Assembly => analyze_generic(
667            text,
668            ScanConfig {
669                line_comments: &[";"],
670                block_comment: None,
671                allow_single_quote_strings: false,
672                allow_double_quote_strings: false,
673                allow_triple_quote_strings: false,
674                allow_csharp_verbatim_strings: false,
675                skip_lines: HashSet::new(),
676                symbol_patterns: SP_ASSEMBLY,
677            },
678            base,
679        ),
680        Language::Clojure => analyze_generic(
681            text,
682            ScanConfig {
683                line_comments: &[";"],
684                block_comment: None,
685                allow_single_quote_strings: false,
686                allow_double_quote_strings: true,
687                allow_triple_quote_strings: false,
688                allow_csharp_verbatim_strings: false,
689                skip_lines: HashSet::new(),
690                symbol_patterns: SP_CLOJURE,
691            },
692            base,
693        ),
694        Language::Css => analyze_generic(
695            text,
696            ScanConfig {
697                line_comments: &[],
698                block_comment: Some(("/*", "*/")),
699                allow_single_quote_strings: true,
700                allow_double_quote_strings: true,
701                allow_triple_quote_strings: false,
702                allow_csharp_verbatim_strings: false,
703                skip_lines: HashSet::new(),
704                symbol_patterns: SP_NONE,
705            },
706            base,
707        ),
708        Language::Dart => analyze_generic(
709            text,
710            ScanConfig {
711                line_comments: &["//"],
712                block_comment: Some(("/*", "*/")),
713                allow_single_quote_strings: true,
714                allow_double_quote_strings: true,
715                allow_triple_quote_strings: false,
716                allow_csharp_verbatim_strings: false,
717                skip_lines: HashSet::new(),
718                symbol_patterns: SP_DART,
719            },
720            base,
721        ),
722        Language::Dockerfile | Language::Makefile => analyze_generic(
723            text,
724            ScanConfig {
725                line_comments: &["#"],
726                block_comment: None,
727                allow_single_quote_strings: false,
728                allow_double_quote_strings: false,
729                allow_triple_quote_strings: false,
730                allow_csharp_verbatim_strings: false,
731                skip_lines: HashSet::new(),
732                symbol_patterns: SP_NONE,
733            },
734            base,
735        ),
736        Language::Elixir => analyze_generic(
737            text,
738            ScanConfig {
739                line_comments: &["#"],
740                block_comment: None,
741                allow_single_quote_strings: true,
742                allow_double_quote_strings: true,
743                allow_triple_quote_strings: false,
744                allow_csharp_verbatim_strings: false,
745                skip_lines: HashSet::new(),
746                symbol_patterns: SP_ELIXIR,
747            },
748            base,
749        ),
750        Language::Erlang => analyze_generic(
751            text,
752            ScanConfig {
753                line_comments: &["%"],
754                block_comment: None,
755                allow_single_quote_strings: false,
756                allow_double_quote_strings: true,
757                allow_triple_quote_strings: false,
758                allow_csharp_verbatim_strings: false,
759                skip_lines: HashSet::new(),
760                symbol_patterns: SP_ERLANG,
761            },
762            base,
763        ),
764        Language::FSharp => analyze_generic(
765            text,
766            ScanConfig {
767                line_comments: &["//"],
768                block_comment: Some(("(*", "*)")),
769                allow_single_quote_strings: false,
770                allow_double_quote_strings: true,
771                allow_triple_quote_strings: false,
772                allow_csharp_verbatim_strings: false,
773                skip_lines: HashSet::new(),
774                symbol_patterns: SP_FSHARP,
775            },
776            base,
777        ),
778        Language::Groovy => analyze_generic(
779            text,
780            ScanConfig {
781                line_comments: &["//"],
782                block_comment: Some(("/*", "*/")),
783                allow_single_quote_strings: true,
784                allow_double_quote_strings: true,
785                allow_triple_quote_strings: false,
786                allow_csharp_verbatim_strings: false,
787                skip_lines: HashSet::new(),
788                symbol_patterns: SP_GROOVY,
789            },
790            base,
791        ),
792        Language::Haskell => analyze_generic(
793            text,
794            ScanConfig {
795                line_comments: &["--"],
796                block_comment: Some(("{-", "-}")),
797                allow_single_quote_strings: true,
798                allow_double_quote_strings: true,
799                allow_triple_quote_strings: false,
800                allow_csharp_verbatim_strings: false,
801                skip_lines: HashSet::new(),
802                symbol_patterns: SP_HASKELL,
803            },
804            base,
805        ),
806        Language::Html | Language::Xml => analyze_generic(
807            text,
808            ScanConfig {
809                line_comments: &[],
810                block_comment: Some(("<!--", "-->")),
811                allow_single_quote_strings: false,
812                allow_double_quote_strings: false,
813                allow_triple_quote_strings: false,
814                allow_csharp_verbatim_strings: false,
815                skip_lines: HashSet::new(),
816                symbol_patterns: SP_NONE,
817            },
818            base,
819        ),
820        Language::Julia => analyze_generic(
821            text,
822            ScanConfig {
823                line_comments: &["#"],
824                block_comment: Some(("#=", "=#")),
825                allow_single_quote_strings: false,
826                allow_double_quote_strings: true,
827                allow_triple_quote_strings: true,
828                allow_csharp_verbatim_strings: false,
829                skip_lines: HashSet::new(),
830                symbol_patterns: SP_JULIA,
831            },
832            base,
833        ),
834        Language::Kotlin => analyze_generic(
835            text,
836            ScanConfig {
837                line_comments: &["//"],
838                block_comment: Some(("/*", "*/")),
839                allow_single_quote_strings: true,
840                allow_double_quote_strings: true,
841                allow_triple_quote_strings: false,
842                allow_csharp_verbatim_strings: false,
843                skip_lines: HashSet::new(),
844                symbol_patterns: SP_KOTLIN,
845            },
846            base,
847        ),
848        Language::Lua => analyze_generic(
849            text,
850            ScanConfig {
851                line_comments: &["--"],
852                block_comment: Some(("--[[", "]]")),
853                allow_single_quote_strings: true,
854                allow_double_quote_strings: true,
855                allow_triple_quote_strings: false,
856                allow_csharp_verbatim_strings: false,
857                skip_lines: HashSet::new(),
858                symbol_patterns: SP_LUA,
859            },
860            base,
861        ),
862        Language::Nim => analyze_generic(
863            text,
864            ScanConfig {
865                line_comments: &["#"],
866                block_comment: Some(("#[", "]#")),
867                allow_single_quote_strings: true,
868                allow_double_quote_strings: true,
869                allow_triple_quote_strings: false,
870                allow_csharp_verbatim_strings: false,
871                skip_lines: HashSet::new(),
872                symbol_patterns: SP_NIM,
873            },
874            base,
875        ),
876        Language::ObjectiveC => analyze_generic(
877            text,
878            ScanConfig {
879                line_comments: &["//"],
880                block_comment: Some(("/*", "*/")),
881                allow_single_quote_strings: true,
882                allow_double_quote_strings: true,
883                allow_triple_quote_strings: false,
884                allow_csharp_verbatim_strings: false,
885                skip_lines: HashSet::new(),
886                symbol_patterns: SP_OBJECTIVEC,
887            },
888            cpp,
889        ),
890        Language::Ocaml => analyze_generic(
891            text,
892            ScanConfig {
893                line_comments: &[],
894                block_comment: Some(("(*", "*)")),
895                allow_single_quote_strings: false,
896                allow_double_quote_strings: true,
897                allow_triple_quote_strings: false,
898                allow_csharp_verbatim_strings: false,
899                skip_lines: HashSet::new(),
900                symbol_patterns: SP_OCAML,
901            },
902            base,
903        ),
904        Language::Perl => analyze_generic(
905            text,
906            ScanConfig {
907                line_comments: &["#"],
908                block_comment: None,
909                allow_single_quote_strings: true,
910                allow_double_quote_strings: true,
911                allow_triple_quote_strings: false,
912                allow_csharp_verbatim_strings: false,
913                skip_lines: HashSet::new(),
914                symbol_patterns: SP_PERL,
915            },
916            base,
917        ),
918        Language::Php => analyze_generic(
919            text,
920            ScanConfig {
921                line_comments: &["//", "#"],
922                block_comment: Some(("/*", "*/")),
923                allow_single_quote_strings: true,
924                allow_double_quote_strings: true,
925                allow_triple_quote_strings: false,
926                allow_csharp_verbatim_strings: false,
927                skip_lines: HashSet::new(),
928                symbol_patterns: SP_PHP,
929            },
930            base,
931        ),
932        Language::R => analyze_generic(
933            text,
934            ScanConfig {
935                line_comments: &["#"],
936                block_comment: None,
937                allow_single_quote_strings: true,
938                allow_double_quote_strings: true,
939                allow_triple_quote_strings: false,
940                allow_csharp_verbatim_strings: false,
941                skip_lines: HashSet::new(),
942                symbol_patterns: SP_R,
943            },
944            base,
945        ),
946        Language::Ruby => analyze_generic(
947            text,
948            ScanConfig {
949                line_comments: &["#"],
950                block_comment: None,
951                allow_single_quote_strings: true,
952                allow_double_quote_strings: true,
953                allow_triple_quote_strings: false,
954                allow_csharp_verbatim_strings: false,
955                skip_lines: HashSet::new(),
956                symbol_patterns: SP_RUBY,
957            },
958            base,
959        ),
960        Language::Scala => analyze_generic(
961            text,
962            ScanConfig {
963                line_comments: &["//"],
964                block_comment: Some(("/*", "*/")),
965                allow_single_quote_strings: true,
966                allow_double_quote_strings: true,
967                allow_triple_quote_strings: false,
968                allow_csharp_verbatim_strings: false,
969                skip_lines: HashSet::new(),
970                symbol_patterns: SP_SCALA,
971            },
972            base,
973        ),
974        Language::Scss => analyze_generic(
975            text,
976            ScanConfig {
977                line_comments: &["//"],
978                block_comment: Some(("/*", "*/")),
979                allow_single_quote_strings: true,
980                allow_double_quote_strings: true,
981                allow_triple_quote_strings: false,
982                allow_csharp_verbatim_strings: false,
983                skip_lines: HashSet::new(),
984                symbol_patterns: SP_NONE,
985            },
986            base,
987        ),
988        Language::Sql => analyze_generic(
989            text,
990            ScanConfig {
991                line_comments: &["--"],
992                block_comment: Some(("/*", "*/")),
993                allow_single_quote_strings: true,
994                allow_double_quote_strings: false,
995                allow_triple_quote_strings: false,
996                allow_csharp_verbatim_strings: false,
997                skip_lines: HashSet::new(),
998                symbol_patterns: SP_SQL,
999            },
1000            base,
1001        ),
1002        Language::Swift => analyze_generic(
1003            text,
1004            ScanConfig {
1005                line_comments: &["//"],
1006                block_comment: Some(("/*", "*/")),
1007                allow_single_quote_strings: false,
1008                allow_double_quote_strings: true,
1009                allow_triple_quote_strings: false,
1010                allow_csharp_verbatim_strings: false,
1011                skip_lines: HashSet::new(),
1012                symbol_patterns: SP_SWIFT,
1013            },
1014            base,
1015        ),
1016        Language::Zig => analyze_generic(
1017            text,
1018            ScanConfig {
1019                line_comments: &["//"],
1020                block_comment: None,
1021                allow_single_quote_strings: true,
1022                allow_double_quote_strings: true,
1023                allow_triple_quote_strings: false,
1024                allow_csharp_verbatim_strings: false,
1025                skip_lines: HashSet::new(),
1026                symbol_patterns: SP_ZIG,
1027            },
1028            base,
1029        ),
1030    }
1031}
1032
/// Per-language keyword prefixes used for best-effort structural symbol detection.
///
/// Each slice lists line prefixes (compared after leading whitespace is stripped) that
/// indicate a definition of that category. An empty slice disables detection for that
/// category.
#[derive(Debug, Clone, Copy)]
struct SymbolPatterns {
    /// Prefixes that mark a function-like definition.
    functions: &'static [&'static str],
    /// Prefixes that mark a class-like definition (structs, traits, interfaces, ...).
    classes: &'static [&'static str],
    /// Prefixes that mark a variable declaration.
    variables: &'static [&'static str],
    /// Prefixes that mark an import/include statement.
    imports: &'static [&'static str],
}
1043
1044impl SymbolPatterns {
1045    const fn none() -> Self {
1046        Self {
1047            functions: &[],
1048            classes: &[],
1049            variables: &[],
1050            imports: &[],
1051        }
1052    }
1053}
1054
1055const SP_NONE: SymbolPatterns = SymbolPatterns::none();
1056
1057const SP_RUST: SymbolPatterns = SymbolPatterns {
1058    functions: &[
1059        "fn ",
1060        "pub fn ",
1061        "pub(crate) fn ",
1062        "pub(super) fn ",
1063        "async fn ",
1064        "pub async fn ",
1065        "pub(crate) async fn ",
1066        "unsafe fn ",
1067        "pub unsafe fn ",
1068        "pub(crate) unsafe fn ",
1069        "const fn ",
1070        "pub const fn ",
1071        "pub(crate) const fn ",
1072        "extern fn ",
1073        "pub extern fn ",
1074    ],
1075    classes: &[
1076        "struct ",
1077        "pub struct ",
1078        "pub(crate) struct ",
1079        "enum ",
1080        "pub enum ",
1081        "pub(crate) enum ",
1082        "trait ",
1083        "pub trait ",
1084        "pub(crate) trait ",
1085        "impl ",
1086        "impl<",
1087        "type ",
1088        "pub type ",
1089        "pub(crate) type ",
1090    ],
1091    variables: &["let ", "let mut "],
1092    imports: &["use ", "pub use ", "pub(crate) use ", "extern crate "],
1093};
1094
1095const SP_PYTHON: SymbolPatterns = SymbolPatterns {
1096    functions: &["def ", "async def "],
1097    classes: &["class "],
1098    variables: &[],
1099    imports: &["import ", "from "],
1100};
1101
1102const SP_JS: SymbolPatterns = SymbolPatterns {
1103    functions: &[
1104        "function ",
1105        "async function ",
1106        "export function ",
1107        "export async function ",
1108        "export default function ",
1109    ],
1110    classes: &["class ", "export class ", "export default class "],
1111    variables: &[
1112        "var ",
1113        "let ",
1114        "const ",
1115        "export var ",
1116        "export let ",
1117        "export const ",
1118    ],
1119    imports: &["import "],
1120};
1121
1122const SP_TS: SymbolPatterns = SymbolPatterns {
1123    functions: &[
1124        "function ",
1125        "async function ",
1126        "export function ",
1127        "export async function ",
1128        "export default function ",
1129    ],
1130    classes: &[
1131        "class ",
1132        "export class ",
1133        "export default class ",
1134        "abstract class ",
1135        "export abstract class ",
1136        "interface ",
1137        "export interface ",
1138        "declare class ",
1139        "declare interface ",
1140    ],
1141    variables: &[
1142        "var ",
1143        "let ",
1144        "const ",
1145        "export var ",
1146        "export let ",
1147        "export const ",
1148    ],
1149    imports: &["import "],
1150};
1151
1152const SP_GO: SymbolPatterns = SymbolPatterns {
1153    functions: &["func "],
1154    classes: &["type "],
1155    variables: &["var "],
1156    imports: &["import "],
1157};
1158
1159const SP_JAVA: SymbolPatterns = SymbolPatterns {
1160    functions: &[],
1161    classes: &[
1162        "class ",
1163        "public class ",
1164        "private class ",
1165        "protected class ",
1166        "abstract class ",
1167        "final class ",
1168        "public abstract class ",
1169        "public final class ",
1170        "interface ",
1171        "public interface ",
1172        "enum ",
1173        "public enum ",
1174        "record ",
1175        "public record ",
1176        "@interface ",
1177    ],
1178    variables: &[],
1179    imports: &["import "],
1180};
1181
1182const SP_CSHARP: SymbolPatterns = SymbolPatterns {
1183    functions: &[],
1184    classes: &[
1185        "class ",
1186        "public class ",
1187        "private class ",
1188        "protected class ",
1189        "internal class ",
1190        "abstract class ",
1191        "sealed class ",
1192        "static class ",
1193        "partial class ",
1194        "public abstract class ",
1195        "public sealed class ",
1196        "public static class ",
1197        "interface ",
1198        "public interface ",
1199        "internal interface ",
1200        "enum ",
1201        "public enum ",
1202        "struct ",
1203        "public struct ",
1204        "record ",
1205        "public record ",
1206    ],
1207    variables: &["var "],
1208    imports: &["using "],
1209};
1210
1211const SP_C: SymbolPatterns = SymbolPatterns {
1212    functions: &[],
1213    classes: &[
1214        "struct ",
1215        "typedef struct ",
1216        "union ",
1217        "typedef union ",
1218        "typedef enum ",
1219    ],
1220    variables: &[],
1221    imports: &["#include "],
1222};
1223
1224const SP_CPP: SymbolPatterns = SymbolPatterns {
1225    functions: &[],
1226    classes: &["class ", "struct ", "namespace ", "template "],
1227    variables: &[],
1228    imports: &["#include "],
1229};
1230
1231const SP_SHELL: SymbolPatterns = SymbolPatterns {
1232    functions: &["function "],
1233    classes: &[],
1234    variables: &["declare ", "local ", "export "],
1235    imports: &["source ", ". "],
1236};
1237
1238const SP_POWERSHELL: SymbolPatterns = SymbolPatterns {
1239    functions: &["function ", "Function "],
1240    classes: &["class "],
1241    variables: &[],
1242    imports: &["Import-Module ", "using "],
1243};
1244
1245const SP_KOTLIN: SymbolPatterns = SymbolPatterns {
1246    functions: &[
1247        "fun ",
1248        "private fun ",
1249        "public fun ",
1250        "protected fun ",
1251        "internal fun ",
1252        "override fun ",
1253        "suspend fun ",
1254        "abstract fun ",
1255        "open fun ",
1256        "private suspend fun ",
1257        "public suspend fun ",
1258    ],
1259    classes: &[
1260        "class ",
1261        "data class ",
1262        "sealed class ",
1263        "abstract class ",
1264        "open class ",
1265        "object ",
1266        "companion object",
1267        "interface ",
1268        "enum class ",
1269        "annotation class ",
1270    ],
1271    variables: &["val ", "var ", "private val ", "private var ", "const val "],
1272    imports: &["import "],
1273};
1274
1275const SP_SWIFT: SymbolPatterns = SymbolPatterns {
1276    functions: &[
1277        "func ",
1278        "private func ",
1279        "public func ",
1280        "internal func ",
1281        "override func ",
1282        "open func ",
1283        "static func ",
1284        "class func ",
1285        "mutating func ",
1286        "private static func ",
1287        "public static func ",
1288    ],
1289    classes: &[
1290        "class ",
1291        "struct ",
1292        "protocol ",
1293        "enum ",
1294        "extension ",
1295        "actor ",
1296        "public class ",
1297        "private class ",
1298        "open class ",
1299        "final class ",
1300        "public struct ",
1301        "private struct ",
1302        "public protocol ",
1303    ],
1304    variables: &[
1305        "var ",
1306        "let ",
1307        "private var ",
1308        "private let ",
1309        "static var ",
1310        "static let ",
1311    ],
1312    imports: &["import "],
1313};
1314
1315const SP_RUBY: SymbolPatterns = SymbolPatterns {
1316    functions: &["def ", "private def ", "protected def "],
1317    classes: &["class ", "module "],
1318    variables: &[],
1319    imports: &["require ", "require_relative "],
1320};
1321
1322const SP_SCALA: SymbolPatterns = SymbolPatterns {
1323    functions: &["def ", "private def ", "protected def ", "override def "],
1324    classes: &[
1325        "class ",
1326        "case class ",
1327        "abstract class ",
1328        "sealed class ",
1329        "object ",
1330        "trait ",
1331    ],
1332    variables: &["val ", "var ", "lazy val "],
1333    imports: &["import "],
1334};
1335
1336const SP_PHP: SymbolPatterns = SymbolPatterns {
1337    functions: &[
1338        "function ",
1339        "public function ",
1340        "private function ",
1341        "protected function ",
1342        "static function ",
1343        "abstract function ",
1344        "final function ",
1345        "public static function ",
1346        "private static function ",
1347        "protected static function ",
1348    ],
1349    classes: &[
1350        "class ",
1351        "abstract class ",
1352        "final class ",
1353        "interface ",
1354        "trait ",
1355        "enum ",
1356    ],
1357    variables: &[],
1358    imports: &[
1359        "use ",
1360        "require ",
1361        "require_once ",
1362        "include ",
1363        "include_once ",
1364    ],
1365};
1366
1367const SP_ELIXIR: SymbolPatterns = SymbolPatterns {
1368    functions: &[
1369        "def ",
1370        "defp ",
1371        "defmacro ",
1372        "defmacrop ",
1373        "defguard ",
1374        "defguardp ",
1375    ],
1376    classes: &["defmodule ", "defprotocol ", "defimpl "],
1377    variables: &[],
1378    imports: &["import ", "alias ", "use ", "require "],
1379};
1380
1381const SP_ERLANG: SymbolPatterns = SymbolPatterns {
1382    functions: &[],
1383    classes: &["-module("],
1384    variables: &[],
1385    imports: &["-import(", "-include(", "-include_lib("],
1386};
1387
1388const SP_FSHARP: SymbolPatterns = SymbolPatterns {
1389    functions: &[
1390        "let ",
1391        "let rec ",
1392        "member ",
1393        "override ",
1394        "abstract member ",
1395    ],
1396    classes: &["type "],
1397    variables: &["let mutable "],
1398    imports: &["open "],
1399};
1400
1401const SP_GROOVY: SymbolPatterns = SymbolPatterns {
1402    functions: &["def ", "private def ", "public def ", "protected def "],
1403    classes: &["class ", "abstract class ", "interface ", "enum ", "trait "],
1404    variables: &[],
1405    imports: &["import "],
1406};
1407
1408const SP_HASKELL: SymbolPatterns = SymbolPatterns {
1409    functions: &[],
1410    classes: &["class ", "data ", "newtype ", "type "],
1411    variables: &[],
1412    imports: &["import "],
1413};
1414
1415const SP_LUA: SymbolPatterns = SymbolPatterns {
1416    functions: &["function ", "local function "],
1417    classes: &[],
1418    variables: &["local "],
1419    imports: &[],
1420};
1421
1422const SP_NIM: SymbolPatterns = SymbolPatterns {
1423    functions: &[
1424        "proc ",
1425        "func ",
1426        "method ",
1427        "iterator ",
1428        "converter ",
1429        "template ",
1430        "macro ",
1431    ],
1432    classes: &["type "],
1433    variables: &["var ", "let ", "const "],
1434    imports: &["import ", "from "],
1435};
1436
1437const SP_OBJECTIVEC: SymbolPatterns = SymbolPatterns {
1438    functions: &["- (", "+ ("],
1439    classes: &["@interface ", "@implementation ", "@protocol "],
1440    variables: &[],
1441    imports: &["#import ", "#include "],
1442};
1443
1444const SP_OCAML: SymbolPatterns = SymbolPatterns {
1445    functions: &["let ", "let rec "],
1446    classes: &["type ", "module ", "class "],
1447    variables: &[],
1448    imports: &["open "],
1449};
1450
1451const SP_PERL: SymbolPatterns = SymbolPatterns {
1452    functions: &["sub "],
1453    classes: &["package "],
1454    variables: &["my ", "our ", "local "],
1455    imports: &["use ", "require "],
1456};
1457
1458const SP_CLOJURE: SymbolPatterns = SymbolPatterns {
1459    functions: &["(defn ", "(defn- ", "(defmacro ", "(defmulti "],
1460    classes: &[
1461        "(defrecord ",
1462        "(defprotocol ",
1463        "(deftype ",
1464        "(definterface ",
1465    ],
1466    variables: &["(def ", "(defonce "],
1467    imports: &["(ns ", "(require "],
1468};
1469
1470const SP_JULIA: SymbolPatterns = SymbolPatterns {
1471    functions: &["function ", "macro "],
1472    classes: &[
1473        "struct ",
1474        "mutable struct ",
1475        "abstract type ",
1476        "primitive type ",
1477    ],
1478    variables: &["const "],
1479    imports: &["import ", "using "],
1480};
1481
1482const SP_DART: SymbolPatterns = SymbolPatterns {
1483    functions: &[],
1484    classes: &["class ", "abstract class ", "mixin ", "extension ", "enum "],
1485    variables: &["var ", "final ", "const ", "late "],
1486    imports: &["import "],
1487};
1488
1489const SP_R: SymbolPatterns = SymbolPatterns {
1490    functions: &[],
1491    classes: &[],
1492    variables: &[],
1493    imports: &["library(", "source("],
1494};
1495
1496const SP_SQL: SymbolPatterns = SymbolPatterns {
1497    functions: &[
1498        "create function ",
1499        "create or replace function ",
1500        "create procedure ",
1501        "create or replace procedure ",
1502        "CREATE FUNCTION ",
1503        "CREATE OR REPLACE FUNCTION ",
1504        "CREATE PROCEDURE ",
1505        "CREATE OR REPLACE PROCEDURE ",
1506    ],
1507    classes: &[
1508        "create table ",
1509        "create view ",
1510        "create schema ",
1511        "CREATE TABLE ",
1512        "CREATE VIEW ",
1513        "CREATE SCHEMA ",
1514    ],
1515    variables: &["declare ", "DECLARE "],
1516    imports: &[],
1517};
1518
1519const SP_ASSEMBLY: SymbolPatterns = SymbolPatterns {
1520    functions: &["proc ", "PROC "],
1521    classes: &[],
1522    variables: &[],
1523    imports: &["include ", "INCLUDE ", "%include "],
1524};
1525
1526const SP_ZIG: SymbolPatterns = SymbolPatterns {
1527    functions: &[
1528        "fn ",
1529        "pub fn ",
1530        "export fn ",
1531        "inline fn ",
1532        "pub inline fn ",
1533    ],
1534    classes: &[],
1535    variables: &["var ", "pub var "],
1536    imports: &[],
1537};
1538
1539#[allow(clippy::struct_excessive_bools)]
1540#[derive(Debug, Clone)]
1541struct ScanConfig {
1542    line_comments: &'static [&'static str],
1543    block_comment: Option<(&'static str, &'static str)>,
1544    allow_single_quote_strings: bool,
1545    allow_double_quote_strings: bool,
1546    allow_triple_quote_strings: bool,
1547    allow_csharp_verbatim_strings: bool,
1548    skip_lines: HashSet<usize>,
1549    symbol_patterns: SymbolPatterns,
1550}
1551
/// Per-call IEEE 1045-1992 flags derived from `AnalysisOptions` plus per-language properties.
/// Private to this crate; constructed inside `analyze_text`.
#[derive(Debug, Clone, Copy)]
struct IeeeFlags {
    /// True for C, C++, and Objective-C — languages with a C preprocessor.
    has_preprocessor_directives: bool,
    /// Mirrors `AnalysisOptions::blank_in_block_comment_as_comment`.
    blank_in_block_comment_as_comment: bool,
    /// Mirrors `AnalysisOptions::collapse_continuation_lines`.
    collapse_continuation_lines: bool,
}
1563
/// Which kind of string literal the scanner is currently inside.
#[derive(Debug, Clone, Copy)]
enum StringState {
    /// Ordinary string closed by the stored delimiter character; `\` escapes the next char.
    Single(char),
    /// Triple-quoted string closed by the stored delimiter sequence.
    Triple(&'static str),
    /// C#-style verbatim string (`@"..."`) in which `""` is an escaped quote.
    VerbatimDouble,
}
1570
/// What the scanner observed on one physical line (or on a merged continuation sequence).
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Default)]
struct LineFacts {
    /// The line contains code, including any part of a string literal.
    has_code: bool,
    /// The line contains a line comment.
    has_single_comment: bool,
    /// The line contains, or lies inside, a block comment.
    has_multi_comment: bool,
    /// The line contains docstring text.
    /// NOTE(review): nothing in the scanner shown here sets this flag; it is only
    /// carried through continuation merging — confirm where it is produced.
    has_docstring: bool,
}
1579
1580/// Process one character while the lexer is inside a string literal.
1581///
1582/// Returns `(new_string_state, advance)` where `advance` is the number of chars to skip.
1583fn process_string_char(
1584    state: StringState,
1585    chars: &[char],
1586    i: usize,
1587) -> (Option<StringState>, usize) {
1588    match state {
1589        StringState::Single(delim) => {
1590            if chars[i] == '\\' {
1591                return (Some(state), 2); // skip escaped character
1592            }
1593            if chars[i] == delim {
1594                (None, 1)
1595            } else {
1596                (Some(state), 1)
1597            }
1598        }
1599        StringState::Triple(delim) => {
1600            if starts_with(chars, i, delim) {
1601                (None, delim.len())
1602            } else {
1603                (Some(state), 1)
1604            }
1605        }
1606        StringState::VerbatimDouble => {
1607            if starts_with(chars, i, "\"\"") {
1608                return (Some(state), 2); // escaped quote-quote inside verbatim string
1609            }
1610            if chars[i] == '"' {
1611                (None, 1)
1612            } else {
1613                (Some(state), 1)
1614            }
1615        }
1616    }
1617}
1618
1619/// Process one character while the lexer is inside a block comment.
1620///
1621/// Returns `(still_in_block_comment, advance)`.
1622fn process_block_comment_char(chars: &[char], i: usize, close: &str) -> (bool, usize) {
1623    if starts_with(chars, i, close) {
1624        (false, close.len())
1625    } else {
1626        (true, 1)
1627    }
1628}
1629
1630/// Attempt to begin a new string literal at position `i`.
1631///
1632/// Returns `Some((new_state, advance))` when a string opener is detected, else `None`.
1633fn try_open_string(chars: &[char], i: usize, config: &ScanConfig) -> Option<(StringState, usize)> {
1634    if config.allow_csharp_verbatim_strings && starts_with(chars, i, "@\"") {
1635        return Some((StringState::VerbatimDouble, 2));
1636    }
1637    if config.allow_triple_quote_strings {
1638        if starts_with(chars, i, "\"\"\"") {
1639            return Some((StringState::Triple("\"\"\""), 3));
1640        }
1641        if starts_with(chars, i, "'''") {
1642            return Some((StringState::Triple("'''"), 3));
1643        }
1644    }
1645    if config.allow_single_quote_strings && chars[i] == '\'' {
1646        return Some((StringState::Single('\''), 1));
1647    }
1648    if config.allow_double_quote_strings && chars[i] == '"' {
1649        return Some((StringState::Single('"'), 1));
1650    }
1651    None
1652}
1653
1654/// Advance past one character position while inside a block comment.
1655///
1656/// Updates `in_block_comment` if the closing delimiter is found and returns the
1657/// number of characters consumed. Returns 0 when no block-comment config is set
1658/// (preserving the caller's `continue`-without-advance behaviour for that impossible state).
1659fn step_through_block_comment(
1660    chars: &[char],
1661    i: usize,
1662    block_comment: Option<(&'static str, &'static str)>,
1663    in_block_comment: &mut bool,
1664) -> usize {
1665    if let Some((_, close)) = block_comment {
1666        let (still_in, advance) = process_block_comment_char(chars, i, close);
1667        *in_block_comment = still_in;
1668        return advance;
1669    }
1670    0
1671}
1672
1673/// If the character at `i` starts a block comment, return the length of the opening
1674/// delimiter so the caller can advance past it. Returns `None` if no match.
1675fn try_open_block_comment(
1676    chars: &[char],
1677    i: usize,
1678    block_comment: Option<(&'static str, &'static str)>,
1679) -> Option<usize> {
1680    let (open, _) = block_comment?;
1681    starts_with(chars, i, open).then_some(open.len())
1682}
1683
1684/// Scan a single physical line and update `facts`, `in_block_comment`, and `string_state`.
1685///
1686/// Returns `true` when the caller should break out of the per-line loop early (line comment hit).
1687fn scan_line(
1688    chars: &[char],
1689    config: &ScanConfig,
1690    facts: &mut LineFacts,
1691    in_block_comment: &mut bool,
1692    string_state: &mut Option<StringState>,
1693) {
1694    let mut i = 0usize;
1695    while i < chars.len() {
1696        // Inside a string literal โ€” advance until the closing delimiter.
1697        if let Some(state) = *string_state {
1698            facts.has_code = true;
1699            let (new_state, advance) = process_string_char(state, chars, i);
1700            *string_state = new_state;
1701            i += advance;
1702            continue;
1703        }
1704
1705        // Inside a block comment โ€” advance until the closing delimiter.
1706        if *in_block_comment {
1707            facts.has_multi_comment = true;
1708            i += step_through_block_comment(chars, i, config.block_comment, in_block_comment);
1709            continue;
1710        }
1711
1712        // Whitespace outside any string/comment โ€” skip.
1713        if chars[i].is_whitespace() {
1714            i += 1;
1715            continue;
1716        }
1717
1718        // Attempt to open a string literal.
1719        if let Some((new_state, advance)) = try_open_string(chars, i, config) {
1720            facts.has_code = true;
1721            *string_state = Some(new_state);
1722            i += advance;
1723            continue;
1724        }
1725
1726        // Attempt to open a block comment.
1727        if let Some(advance) = try_open_block_comment(chars, i, config.block_comment) {
1728            facts.has_multi_comment = true;
1729            *in_block_comment = true;
1730            i += advance;
1731            continue;
1732        }
1733
1734        // Line comment โ€” rest of the line is a comment; stop scanning.
1735        if config
1736            .line_comments
1737            .iter()
1738            .any(|prefix| starts_with(chars, i, prefix))
1739        {
1740            facts.has_single_comment = true;
1741            break;
1742        }
1743
1744        // Plain code character.
1745        facts.has_code = true;
1746        i += 1;
1747    }
1748}
1749
1750/// Apply IEEE 1045-1992 ยง4.2 preprocessor-directive tracking and continuation-line merging,
1751/// then emit the finalized `LineFacts` for this physical line.
1752///
1753/// Returns `None` when the line is part of a continuation sequence and should be deferred.
1754fn finalize_line_facts(
1755    facts: LineFacts,
1756    trimmed: &str,
1757    raw: &mut RawLineCounts,
1758    ieee: IeeeFlags,
1759    in_block_comment: bool,
1760    string_state: Option<StringState>,
1761    pending_continuation: &mut Option<LineFacts>,
1762) -> Option<LineFacts> {
1763    // IEEE 1045-1992 ยง4.2: track preprocessor/compiler directive lines (C/C++/ObjC).
1764    // A directive line is a pure code line (no comment on the same physical line) whose
1765    // trimmed content starts with '#'.
1766    if ieee.has_preprocessor_directives
1767        && facts.has_code
1768        && !facts.has_single_comment
1769        && !facts.has_multi_comment
1770        && trimmed.starts_with('#')
1771    {
1772        raw.compiler_directive_lines += 1;
1773    }
1774
1775    // IEEE 1045-1992 continuation-line handling.
1776    // A line is a continuation starter when it ends with '\' outside any comment or string.
1777    let is_continuation = ieee.collapse_continuation_lines
1778        && !in_block_comment
1779        && string_state.is_none()
1780        && trimmed.ends_with('\\');
1781
1782    if is_continuation {
1783        let pending = pending_continuation.get_or_insert_with(LineFacts::default);
1784        pending.has_code |= facts.has_code;
1785        pending.has_single_comment |= facts.has_single_comment;
1786        pending.has_multi_comment |= facts.has_multi_comment;
1787        pending.has_docstring |= facts.has_docstring;
1788        return None; // defer classification until the sequence ends
1789    }
1790
1791    // Merge any accumulated continuation facts into the final line.
1792    let emit = if let Some(pending) = pending_continuation.take() {
1793        LineFacts {
1794            has_code: pending.has_code | facts.has_code,
1795            has_single_comment: pending.has_single_comment | facts.has_single_comment,
1796            has_multi_comment: pending.has_multi_comment | facts.has_multi_comment,
1797            has_docstring: pending.has_docstring | facts.has_docstring,
1798        }
1799    } else {
1800        facts
1801    };
1802    Some(emit)
1803}
1804
1805/// Scan and classify one physical line, updating all running state in place.
1806///
1807/// Pre-classified lines (present in `config.skip_lines`) are counted as docstring-comment
1808/// lines and returned early without further analysis.
1809#[allow(clippy::needless_pass_by_value)]
1810#[allow(clippy::too_many_arguments)]
1811fn process_physical_line(
1812    line: &str,
1813    line_idx: usize,
1814    config: &ScanConfig,
1815    raw: &mut RawLineCounts,
1816    in_block_comment: &mut bool,
1817    string_state: &mut Option<StringState>,
1818    pending_continuation: &mut Option<LineFacts>,
1819    ieee: IeeeFlags,
1820) {
1821    raw.total_physical_lines += 1;
1822
1823    if config.skip_lines.contains(&line_idx) {
1824        raw.docstring_comment_lines += 1;
1825        return;
1826    }
1827
1828    let trimmed = line.trim();
1829    let mut facts = LineFacts::default();
1830
1831    // IEEE 1045-1992: blank lines inside block comments are comment lines by default.
1832    // When blank_in_block_comment_as_comment is false, blank lines keep their blank
1833    // classification even while inside a block comment.
1834    if *in_block_comment && (ieee.blank_in_block_comment_as_comment || !trimmed.is_empty()) {
1835        facts.has_multi_comment = true;
1836    }
1837
1838    let chars: Vec<char> = line.chars().collect();
1839    scan_line(&chars, config, &mut facts, in_block_comment, string_state);
1840
1841    let Some(emit) = finalize_line_facts(
1842        facts,
1843        trimmed,
1844        raw,
1845        ieee,
1846        *in_block_comment,
1847        *string_state,
1848        pending_continuation,
1849    ) else {
1850        return;
1851    };
1852
1853    classify_line(raw, &emit, trimmed);
1854
1855    if emit.has_code {
1856        let (f, c, v, i) = count_symbols(&config.symbol_patterns, trimmed);
1857        raw.functions += f;
1858        raw.classes += c;
1859        raw.variables += v;
1860        raw.imports += i;
1861    }
1862}
1863
1864#[allow(clippy::needless_pass_by_value)]
1865fn analyze_generic(text: &str, config: ScanConfig, ieee: IeeeFlags) -> RawFileAnalysis {
1866    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
1867    let lines: Vec<&str> = normalized.split_terminator('\n').collect();
1868
1869    let mut raw = RawLineCounts::default();
1870    let mut warnings = Vec::new();
1871
1872    let mut in_block_comment = false;
1873    let mut string_state: Option<StringState> = None;
1874    // IEEE continuation-line state: accumulates facts across a backslash-continued sequence.
1875    let mut pending_continuation: Option<LineFacts> = None;
1876
1877    for (line_idx, line) in lines.iter().enumerate() {
1878        process_physical_line(
1879            line,
1880            line_idx,
1881            &config,
1882            &mut raw,
1883            &mut in_block_comment,
1884            &mut string_state,
1885            &mut pending_continuation,
1886            ieee,
1887        );
1888    }
1889
1890    // Flush any pending continuation that reaches end-of-file without a closing line.
1891    if let Some(pending) = pending_continuation.take() {
1892        classify_line(&mut raw, &pending, "");
1893    }
1894
1895    if in_block_comment {
1896        warnings.push("unclosed block comment detected; result is best effort".into());
1897    }
1898    if string_state.is_some() {
1899        warnings.push("unclosed string literal detected; result is best effort".into());
1900    }
1901
1902    RawFileAnalysis {
1903        raw,
1904        parse_mode: if warnings.is_empty() {
1905            ParseMode::Lexical
1906        } else {
1907            ParseMode::LexicalBestEffort
1908        },
1909        warnings,
1910    }
1911}
1912
1913const fn classify_line(raw: &mut RawLineCounts, facts: &LineFacts, trimmed: &str) {
1914    if facts.has_docstring {
1915        raw.docstring_comment_lines += 1;
1916    } else if !facts.has_code
1917        && !facts.has_single_comment
1918        && !facts.has_multi_comment
1919        && trimmed.is_empty()
1920    {
1921        raw.blank_only_lines += 1;
1922    } else if facts.has_code && facts.has_single_comment {
1923        raw.mixed_code_single_comment_lines += 1;
1924    } else if facts.has_code && facts.has_multi_comment {
1925        raw.mixed_code_multi_comment_lines += 1;
1926    } else if facts.has_code {
1927        raw.code_only_lines += 1;
1928    } else if facts.has_single_comment {
1929        raw.single_comment_only_lines += 1;
1930    } else if facts.has_multi_comment {
1931        raw.multi_comment_only_lines += 1;
1932    } else if trimmed.is_empty() {
1933        raw.blank_only_lines += 1;
1934    } else {
1935        raw.skipped_unknown_lines += 1;
1936    }
1937}
1938
1939fn count_symbols(patterns: &SymbolPatterns, trimmed: &str) -> (u64, u64, u64, u64) {
1940    let hit = |pats: &[&str]| u64::from(pats.iter().any(|p| trimmed.starts_with(p)));
1941    (
1942        hit(patterns.functions),
1943        hit(patterns.classes),
1944        hit(patterns.variables),
1945        hit(patterns.imports),
1946    )
1947}
1948
/// Report whether the characters of `needle` occur in `chars` beginning at `index`.
///
/// Returns `false` when `index` lies beyond the end of `chars` or when fewer
/// characters remain than `needle` contains. An empty `needle` matches at every
/// position up to and including `chars.len()`.
///
/// The comparison walks both sequences in place: no temporary buffer is allocated for
/// `needle`, and no `index + len` sum is computed that could overflow.
fn starts_with(chars: &[char], index: usize, needle: &str) -> bool {
    let Some(tail) = chars.get(index..) else {
        return false;
    };
    let mut remaining = tail.iter();
    // `next()` yields `None` once `tail` is exhausted, which fails the comparison and
    // so rejects needles longer than the remaining input.
    needle
        .chars()
        .all(|expected| remaining.next() == Some(&expected))
}
1953
/// One entry of the scope stack maintained while detecting Python docstring lines.
#[derive(Debug, Clone)]
struct PyContext {
    /// Indentation recorded for this scope; compared against the indentation of later
    /// lines to decide when the scope has been left.
    indent: usize,
    /// `true` while the next significant line seen in this scope may still be recorded
    /// as its docstring; cleared once a line has been examined for that purpose.
    expect_docstring: bool,
}
1959
1960/// Update `contexts` to pop any scopes that the current `indent` has outdented past.
1961fn py_pop_outdented_contexts(contexts: &mut Vec<PyContext>, indent: usize) {
1962    while contexts.len() > 1 && indent < contexts.last().map_or(0, |c| c.indent) {
1963        contexts.pop();
1964    }
1965}
1966
1967/// Handle `pending_block_indent` transition: push a new docstring-expecting context when we
1968/// detect the first indented line of a new block, or cancel the pending state otherwise.
1969fn py_handle_pending_indent(
1970    pending_block_indent: &mut Option<usize>,
1971    contexts: &mut Vec<PyContext>,
1972    indent: usize,
1973    trimmed: &str,
1974) {
1975    let Some(base_indent) = *pending_block_indent else {
1976        return;
1977    };
1978    if indent > base_indent {
1979        contexts.push(PyContext {
1980            indent,
1981            expect_docstring: true,
1982        });
1983        *pending_block_indent = None;
1984    } else if !trimmed.starts_with('@') {
1985        *pending_block_indent = None;
1986    }
1987}
1988
1989/// Check whether the current line is a docstring opener in the current context.
1990///
1991/// If it is, records the line, adjusts `ctx.expect_docstring`, and optionally sets
1992/// `active_docstring` for multi-line docstrings. Returns `true` when the caller should
1993/// `continue` to the next line.
1994fn py_try_record_docstring(
1995    ctx: &mut PyContext,
1996    trimmed: &str,
1997    idx: usize,
1998    docstring_lines: &mut HashSet<usize>,
1999    active_docstring: &mut Option<(&'static str, usize)>,
2000) -> bool {
2001    if !ctx.expect_docstring {
2002        return false;
2003    }
2004    if let Some(delim) = docstring_delimiter(trimmed) {
2005        docstring_lines.insert(idx);
2006        ctx.expect_docstring = false;
2007        if !closes_triple_docstring(trimmed, delim, true) {
2008            *active_docstring = Some((delim, idx));
2009        }
2010        return true;
2011    }
2012    ctx.expect_docstring = false;
2013    false
2014}
2015
2016/// Advance through an active multi-line docstring: marks the current line and clears
2017/// `active_docstring` when the closing delimiter is found. Returns `true` when the caller
2018/// should `continue` to the next line (i.e. we were inside a docstring).
2019fn track_active_docstring(
2020    active_docstring: &mut Option<(&'static str, usize)>,
2021    docstring_lines: &mut HashSet<usize>,
2022    idx: usize,
2023    trimmed: &str,
2024) -> bool {
2025    let Some((delim, start_line)) = *active_docstring else {
2026        return false;
2027    };
2028    docstring_lines.insert(idx);
2029    if closes_triple_docstring(trimmed, delim, idx == start_line) {
2030        *active_docstring = None;
2031    }
2032    true
2033}
2034
2035/// Attempt to record a docstring opener using the top of the context stack.
2036/// Returns `true` when the caller should `continue` to the next line.
2037fn try_record_docstring_if_context(
2038    contexts: &mut [PyContext],
2039    trimmed: &str,
2040    idx: usize,
2041    docstring_lines: &mut HashSet<usize>,
2042    active_docstring: &mut Option<(&'static str, usize)>,
2043) -> bool {
2044    let Some(ctx) = contexts.last_mut() else {
2045        return false;
2046    };
2047    py_try_record_docstring(ctx, trimmed, idx, docstring_lines, active_docstring)
2048}
2049
/// When a docstring is still open at end-of-file, add every line index from its
/// opening line up to (but excluding) `num_lines` to `docstring_lines`.
///
/// Does nothing when `active_docstring` is `None`.
fn mark_unclosed_docstring_lines(
    active_docstring: Option<&(&'static str, usize)>,
    docstring_lines: &mut HashSet<usize>,
    num_lines: usize,
) {
    let Some(&(_, start_line)) = active_docstring else {
        return;
    };
    docstring_lines.extend(start_line..num_lines);
}
2062
2063fn detect_python_docstring_lines(text: &str) -> HashSet<usize> {
2064    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
2065    let lines: Vec<&str> = normalized.split_terminator('\n').collect();
2066
2067    let mut docstring_lines = HashSet::new();
2068    let mut contexts = vec![PyContext {
2069        indent: 0,
2070        expect_docstring: true,
2071    }];
2072    let mut pending_block_indent: Option<usize> = None;
2073    let mut active_docstring: Option<(&'static str, usize)> = None;
2074
2075    for (idx, line) in lines.iter().enumerate() {
2076        let trimmed = line.trim();
2077        let indent = leading_indent(line);
2078
2079        if track_active_docstring(&mut active_docstring, &mut docstring_lines, idx, trimmed) {
2080            continue;
2081        }
2082
2083        // Blank lines and comment lines don't affect docstring detection.
2084        if trimmed.is_empty() || trimmed.starts_with('#') {
2085            continue;
2086        }
2087
2088        py_pop_outdented_contexts(&mut contexts, indent);
2089        py_handle_pending_indent(&mut pending_block_indent, &mut contexts, indent, trimmed);
2090
2091        if try_record_docstring_if_context(
2092            &mut contexts,
2093            trimmed,
2094            idx,
2095            &mut docstring_lines,
2096            &mut active_docstring,
2097        ) {
2098            continue;
2099        }
2100
2101        if is_python_block_header(trimmed) {
2102            pending_block_indent = Some(indent);
2103        }
2104    }
2105
2106    mark_unclosed_docstring_lines(active_docstring.as_ref(), &mut docstring_lines, lines.len());
2107
2108    docstring_lines
2109}
2110
/// Count the whitespace characters at the start of `line`.
///
/// The count is in characters (a tab counts as one); a line made up entirely of
/// whitespace yields its full character count.
fn leading_indent(line: &str) -> usize {
    line.chars()
        .position(|c| !c.is_whitespace())
        .unwrap_or_else(|| line.chars().count())
}
2114
/// Report whether `trimmed` is a single-line `def`, `async def` or `class` header,
/// i.e. it starts with one of those keywords (followed by a space) and ends with `:`.
fn is_python_block_header(trimmed: &str) -> bool {
    const HEADER_KEYWORDS: [&str; 3] = ["def ", "async def ", "class "];

    trimmed.ends_with(':')
        && HEADER_KEYWORDS
            .iter()
            .any(|keyword| trimmed.starts_with(keyword))
}
2121
/// Return the triple-quote delimiter that `trimmed` opens with, if any.
///
/// Any run of the letters `r`, `u`, `b`, `f` (either case) at the start is skipped
/// first. The result is `"""` or `'''` when the remainder starts with that delimiter,
/// and `None` otherwise.
fn docstring_delimiter(trimmed: &str) -> Option<&'static str> {
    const TRIPLE_QUOTES: [&str; 2] = ["\"\"\"", "'''"];

    let after_prefix = trimmed
        .trim_start_matches(|c: char| matches!(c, 'r' | 'R' | 'u' | 'U' | 'b' | 'B' | 'f' | 'F'));

    TRIPLE_QUOTES
        .iter()
        .copied()
        .find(|quote| after_prefix.starts_with(quote))
}
2143
/// Report whether `trimmed` contains the closing delimiter of a triple-quoted string.
///
/// Occurrences of `delim` are counted without overlap. On the line that also opens the
/// string (`same_line_as_start`), the first occurrence is the opener, so two are
/// required; on any later line a single occurrence closes it.
///
/// An empty `delim` never closes anything: it returns `false` rather than being
/// searched for, since an empty pattern matches at every position.
fn closes_triple_docstring(trimmed: &str, delim: &str, same_line_as_start: bool) -> bool {
    if delim.is_empty() {
        return false;
    }
    let required = if same_line_as_start { 2 } else { 1 };
    // `nth` is zero-based, so the `required`-th match sits at index `required - 1`.
    trimmed.matches(delim).nth(required - 1).is_some()
}
2158
/// Tree-sitter-backed adapters (compiled only when the `tree-sitter` feature is enabled).
///
/// When parsing succeeds the result is used directly; on any failure the caller falls back
/// to the lexical state machine.
#[cfg(feature = "tree-sitter")]
pub mod ts {
    use tree_sitter::Node;

    use super::{ParseMode, RawFileAnalysis, RawLineCounts};

    /// Classify every line of `text` using a tree-sitter grammar.
    ///
    /// * `comment_node_kinds` — node type names that represent comments in this grammar.
    /// * `docstring_stmt_kind` — optional statement node type; a node of that type whose
    ///   only named child is a `string` is treated as a docstring.
    ///
    /// Returns `None` when the grammar cannot be installed on the parser or when the
    /// parser produces no tree. On success the analysis carries `ParseMode::TreeSitter`
    /// and no warnings.
    fn analyze_lines(
        text: &str,
        ts_language: &tree_sitter::Language,
        comment_node_kinds: &[&str],
        docstring_stmt_kind: Option<&str>,
    ) -> Option<RawFileAnalysis> {
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(ts_language).ok()?;
        let tree = parser.parse(text, None)?;

        let lines: Vec<&str> = text.split_terminator('\n').collect();
        let n = lines.len();

        // One flag per line for each property; filled in by the tree walk below.
        let mut has_code = vec![false; n];
        let mut has_comment = vec![false; n];
        let mut comment_is_block = vec![false; n];
        let mut has_docstring = vec![false; n];

        // Walk every node in the tree and mark line arrays.
        let mut ctx = VisitCtx {
            source: text.as_bytes(),
            comment_kinds: comment_node_kinds,
            docstring_stmt_kind,
            has_code: &mut has_code,
            has_comment: &mut has_comment,
            comment_is_block: &mut comment_is_block,
            has_docstring: &mut has_docstring,
        };
        visit(tree.root_node(), &mut ctx);

        let mut raw = RawLineCounts::default();
        classify_ts_lines(
            &lines,
            &has_code,
            &has_comment,
            &comment_is_block,
            &has_docstring,
            &mut raw,
        );

        Some(RawFileAnalysis {
            raw,
            parse_mode: ParseMode::TreeSitter,
            warnings: Vec::new(),
        })
    }

    /// Flags describing what kinds of content appear on a single line.
    // Four bools are the natural representation for these four independent properties.
    #[allow(clippy::struct_excessive_bools)]
    #[derive(Clone, Copy)]
    struct TsLineFlags {
        has_code: bool,
        has_comment: bool,
        comment_is_block: bool,
        has_docstring: bool,
    }

    /// Classify a single tree-sitter-annotated line and accumulate into `raw`.
    ///
    /// Precedence: blank line, docstring without code, code mixed with a comment,
    /// comment only, and finally code only (which is also where a non-blank line with
    /// no flags at all ends up).
    const fn classify_ts_line(trimmed: &str, flags: TsLineFlags, raw: &mut RawLineCounts) {
        if trimmed.is_empty() {
            raw.blank_only_lines += 1;
        } else if flags.has_docstring && !flags.has_code {
            raw.docstring_comment_lines += 1;
        } else if flags.has_code && flags.has_comment {
            // Classify the mixed line as single or multi based on what kind of comment is on it.
            if flags.comment_is_block {
                raw.mixed_code_multi_comment_lines += 1;
            } else {
                raw.mixed_code_single_comment_lines += 1;
            }
        } else if flags.has_comment {
            if flags.comment_is_block {
                raw.multi_comment_only_lines += 1;
            } else {
                raw.single_comment_only_lines += 1;
            }
        } else {
            raw.code_only_lines += 1;
        }
    }

    /// Classify each tree-sitter-annotated line and accumulate counts into `raw`.
    ///
    /// The four flag slices are parallel to `lines` and are expected to have the same
    /// length; this is checked in debug builds.
    fn classify_ts_lines(
        lines: &[&str],
        has_code: &[bool],
        has_comment: &[bool],
        comment_is_block: &[bool],
        has_docstring: &[bool],
        raw: &mut RawLineCounts,
    ) {
        debug_assert!(
            has_code.len() == lines.len()
                && has_comment.len() == lines.len()
                && comment_is_block.len() == lines.len()
                && has_docstring.len() == lines.len()
        );

        let per_line_flags = has_code
            .iter()
            .zip(has_comment)
            .zip(comment_is_block)
            .zip(has_docstring)
            .map(|(((&code, &comment), &block), &docstring)| TsLineFlags {
                has_code: code,
                has_comment: comment,
                comment_is_block: block,
                has_docstring: docstring,
            });

        for (line, flags) in lines.iter().zip(per_line_flags) {
            raw.total_physical_lines += 1;
            classify_ts_line(line.trim(), flags, raw);
        }
    }

    /// Shared state for the tree walk: the source bytes, the grammar-specific node kinds
    /// to look for, and the per-line flag arrays being filled in.
    struct VisitCtx<'a> {
        source: &'a [u8],
        comment_kinds: &'a [&'a str],
        docstring_stmt_kind: Option<&'a str>,
        has_code: &'a mut Vec<bool>,
        has_comment: &'a mut Vec<bool>,
        comment_is_block: &'a mut Vec<bool>,
        has_docstring: &'a mut Vec<bool>,
    }

    /// Mark all rows of a comment node and detect whether it is a block comment.
    ///
    /// A comment whose text starts with `/*` or `<#` is treated as a block comment; any
    /// other comment is treated as a line comment.
    fn visit_comment_node(node: Node, ctx: &mut VisitCtx<'_>) {
        let start_row = node.start_position().row;
        let end_row = node.end_position().row;
        let first_two = node
            .utf8_text(ctx.source)
            .unwrap_or("")
            .get(..2)
            .unwrap_or("");
        let is_block = first_two == "/*" || first_two == "<#";
        for row in start_row..=end_row {
            if row < ctx.has_comment.len() {
                ctx.has_comment[row] = true;
                if is_block {
                    ctx.comment_is_block[row] = true;
                }
            }
        }
    }

    /// If `node` is an `expression_statement` whose sole named child is a string literal,
    /// mark those rows as docstring and return `true`.
    ///
    /// More precisely, the statement kind compared against is `ctx.docstring_stmt_kind`;
    /// when that is `None` the function always returns `false`.
    fn visit_maybe_docstring(node: Node, kind: &str, ctx: &mut VisitCtx<'_>) -> bool {
        let Some(stmt_kind) = ctx.docstring_stmt_kind else {
            return false;
        };
        if kind != stmt_kind || node.named_child_count() != 1 {
            return false;
        }
        let Some(child) = node.named_child(0) else {
            return false;
        };
        if child.kind() != "string" {
            return false;
        }
        let child_start = child.start_position().row;
        let child_end = child.end_position().row;
        for row in child_start..=child_end {
            if row < ctx.has_docstring.len() {
                ctx.has_docstring[row] = true;
            }
        }
        true
    }

    /// Mark all rows of a leaf (non-comment, non-extra) node as code.
    fn visit_leaf_code(node: Node, ctx: &mut VisitCtx<'_>) {
        let start_row = node.start_position().row;
        let end_row = node.end_position().row;
        for row in start_row..=end_row {
            if row < ctx.has_code.len() {
                ctx.has_code[row] = true;
            }
        }
    }

    /// Walk the subtree rooted at `node` and fill in the per-line flag arrays of `ctx`.
    ///
    /// The walk uses an explicit work stack rather than recursion, so the depth of the
    /// parse tree is bounded by heap memory instead of the call stack. Siblings are
    /// processed in a different order than a recursive walk would use; that is harmless
    /// because every visit only ever sets flags to `true`.
    fn visit(node: Node, ctx: &mut VisitCtx<'_>) {
        // NOSONAR
        let mut pending = vec![node];

        while let Some(current) = pending.pop() {
            let kind = current.kind();

            // Comment node — mark rows as comment, detect block vs. line comment.
            if ctx.comment_kinds.contains(&kind) {
                visit_comment_node(current, ctx);
                continue;
            }

            // Python docstring: expression_statement whose only named child is a string literal.
            if visit_maybe_docstring(current, kind, ctx) {
                continue;
            }

            // Leaf non-comment node: mark as code.
            if current.child_count() == 0 && !current.is_extra() {
                visit_leaf_code(current, ctx);
                continue;
            }

            // Interior node (or childless extra node): queue its children, if any.
            pending.extend((0..current.child_count()).filter_map(|i| current.child(i)));
        }
    }

    /// Parse C or C++ source with tree-sitter-c.
    ///
    /// Returns `None` when tree-sitter analysis is not possible for `text`.
    #[must_use]
    pub fn analyze_c(text: &str) -> Option<RawFileAnalysis> {
        let lang: tree_sitter::Language = tree_sitter_c::LANGUAGE.into();
        analyze_lines(text, &lang, &["comment"], None)
    }

    /// Parse Python source with tree-sitter-python.
    ///
    /// An `expression_statement` consisting solely of a string is counted as a
    /// docstring. Returns `None` when tree-sitter analysis is not possible for `text`.
    #[must_use]
    pub fn analyze_python(text: &str) -> Option<RawFileAnalysis> {
        let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
        analyze_lines(text, &lang, &["comment"], Some("expression_statement"))
    }
}
2388
/// Smoke tests for line classification and language detection.
#[cfg(test)]
mod tests {
    use super::*;

    /// Docstring lines in Python input are counted separately from code lines and from
    /// lines that mix code with a trailing comment.
    #[test]
    fn python_docstrings_are_separated() {
        let input = r#""""module docs"""


def fn_a():
    """function docs"""
    value = 1  # trailing comment
    return value
"#;

        let result = analyze_text(Language::Python, input, AnalysisOptions::default());
        // The module-level docstring plus the one inside `fn_a`.
        assert_eq!(result.raw.docstring_comment_lines, 2);
        // `value = 1  # trailing comment` carries both code and a `#` comment.
        assert_eq!(result.raw.mixed_code_single_comment_lines, 1);
        // `def fn_a():` and `return value`.
        assert_eq!(result.raw.code_only_lines, 2);
    }

    /// In C input, a line with code followed by `//` is a mixed line, and a line holding
    /// only `/* ... */` is a block-comment-only line.
    #[test]
    fn c_style_mixed_lines_are_captured() {
        let input = "int x = 1; // note\n/* block */\n";
        let result = analyze_text(Language::C, input, AnalysisOptions::default());
        assert_eq!(result.raw.mixed_code_single_comment_lines, 1);
        assert_eq!(result.raw.multi_comment_only_lines, 1);
    }

    /// A file without an extension is identified from its shebang line.
    #[test]
    fn detect_language_by_shebang() {
        let language = detect_language(
            Path::new("script"),
            Some("#!/usr/bin/env bash"),
            &BTreeMap::new(),
            true,
        );
        assert_eq!(language, Some(Language::Shell));
    }
}