Skip to main content

sloc_languages/
lib.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright (C) 2026 Nima Shafie <nimzshafie@gmail.com>
3
4use std::collections::{BTreeMap, BTreeSet, HashSet};
5use std::path::Path;
6
7use serde::{Deserialize, Serialize};
8
9#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
10#[serde(rename_all = "snake_case")]
11pub enum Language {
12    C,
13    Cpp,
14    CSharp,
15    Go,
16    Java,
17    JavaScript,
18    Python,
19    Rust,
20    Shell,
21    PowerShell,
22    TypeScript,
23    // --- Extended language support ---
24    Assembly,
25    Clojure,
26    Css,
27    Dart,
28    Dockerfile,
29    Elixir,
30    Erlang,
31    FSharp,
32    Groovy,
33    Haskell,
34    Html,
35    Julia,
36    Kotlin,
37    Lua,
38    Makefile,
39    Nim,
40    ObjectiveC,
41    Ocaml,
42    Perl,
43    Php,
44    R,
45    Ruby,
46    Scala,
47    Scss,
48    Sql,
49    Svelte,
50    Swift,
51    Vue,
52    Xml,
53    Zig,
54}
55
56impl Language {
57    pub fn display_name(&self) -> &'static str {
58        match self {
59            Language::C => "C",
60            Language::Cpp => "C++",
61            Language::CSharp => "C#",
62            Language::Go => "Go",
63            Language::Java => "Java",
64            Language::JavaScript => "JavaScript",
65            Language::Python => "Python",
66            Language::Rust => "Rust",
67            Language::Shell => "Shell",
68            Language::PowerShell => "PowerShell",
69            Language::TypeScript => "TypeScript",
70            Language::Assembly => "Assembly",
71            Language::Clojure => "Clojure",
72            Language::Css => "CSS",
73            Language::Dart => "Dart",
74            Language::Dockerfile => "Dockerfile",
75            Language::Elixir => "Elixir",
76            Language::Erlang => "Erlang",
77            Language::FSharp => "F#",
78            Language::Groovy => "Groovy",
79            Language::Haskell => "Haskell",
80            Language::Html => "HTML",
81            Language::Julia => "Julia",
82            Language::Kotlin => "Kotlin",
83            Language::Lua => "Lua",
84            Language::Makefile => "Makefile",
85            Language::Nim => "Nim",
86            Language::ObjectiveC => "Objective-C",
87            Language::Ocaml => "OCaml",
88            Language::Perl => "Perl",
89            Language::Php => "PHP",
90            Language::R => "R",
91            Language::Ruby => "Ruby",
92            Language::Scala => "Scala",
93            Language::Scss => "SCSS",
94            Language::Sql => "SQL",
95            Language::Svelte => "Svelte",
96            Language::Swift => "Swift",
97            Language::Vue => "Vue",
98            Language::Xml => "XML",
99            Language::Zig => "Zig",
100        }
101    }
102
103    pub fn as_slug(&self) -> &'static str {
104        match self {
105            Language::C => "c",
106            Language::Cpp => "cpp",
107            Language::CSharp => "csharp",
108            Language::Go => "go",
109            Language::Java => "java",
110            Language::JavaScript => "javascript",
111            Language::Python => "python",
112            Language::Rust => "rust",
113            Language::Shell => "shell",
114            Language::PowerShell => "powershell",
115            Language::TypeScript => "typescript",
116            Language::Assembly => "assembly",
117            Language::Clojure => "clojure",
118            Language::Css => "css",
119            Language::Dart => "dart",
120            Language::Dockerfile => "dockerfile",
121            Language::Elixir => "elixir",
122            Language::Erlang => "erlang",
123            Language::FSharp => "fsharp",
124            Language::Groovy => "groovy",
125            Language::Haskell => "haskell",
126            Language::Html => "html",
127            Language::Julia => "julia",
128            Language::Kotlin => "kotlin",
129            Language::Lua => "lua",
130            Language::Makefile => "makefile",
131            Language::Nim => "nim",
132            Language::ObjectiveC => "objectivec",
133            Language::Ocaml => "ocaml",
134            Language::Perl => "perl",
135            Language::Php => "php",
136            Language::R => "r",
137            Language::Ruby => "ruby",
138            Language::Scala => "scala",
139            Language::Scss => "scss",
140            Language::Sql => "sql",
141            Language::Svelte => "svelte",
142            Language::Swift => "swift",
143            Language::Vue => "vue",
144            Language::Xml => "xml",
145            Language::Zig => "zig",
146        }
147    }
148
149    pub fn from_name(name: &str) -> Option<Self> {
150        match name.trim().to_ascii_lowercase().as_str() {
151            "c" => Some(Language::C),
152            "cpp" | "c++" | "cplusplus" => Some(Language::Cpp),
153            "csharp" | "c#" | "cs" => Some(Language::CSharp),
154            "go" | "golang" => Some(Language::Go),
155            "java" => Some(Language::Java),
156            "javascript" | "js" => Some(Language::JavaScript),
157            "python" | "py" => Some(Language::Python),
158            "rust" | "rs" => Some(Language::Rust),
159            "shell" | "sh" | "bash" => Some(Language::Shell),
160            "powershell" | "pwsh" | "ps" => Some(Language::PowerShell),
161            "typescript" | "ts" => Some(Language::TypeScript),
162            "assembly" | "asm" => Some(Language::Assembly),
163            "clojure" | "clj" => Some(Language::Clojure),
164            "css" => Some(Language::Css),
165            "dart" => Some(Language::Dart),
166            "dockerfile" | "docker" => Some(Language::Dockerfile),
167            "elixir" | "ex" => Some(Language::Elixir),
168            "erlang" | "erl" => Some(Language::Erlang),
169            "fsharp" | "f#" | "fs" => Some(Language::FSharp),
170            "groovy" => Some(Language::Groovy),
171            "haskell" | "hs" => Some(Language::Haskell),
172            "html" | "htm" => Some(Language::Html),
173            "julia" | "jl" => Some(Language::Julia),
174            "kotlin" | "kt" => Some(Language::Kotlin),
175            "lua" => Some(Language::Lua),
176            "makefile" | "make" | "mk" => Some(Language::Makefile),
177            "nim" => Some(Language::Nim),
178            "objectivec" | "objc" | "objective-c" => Some(Language::ObjectiveC),
179            "ocaml" | "ml" => Some(Language::Ocaml),
180            "perl" | "pl" => Some(Language::Perl),
181            "php" => Some(Language::Php),
182            "r" => Some(Language::R),
183            "ruby" | "rb" => Some(Language::Ruby),
184            "scala" => Some(Language::Scala),
185            "scss" | "sass" => Some(Language::Scss),
186            "sql" => Some(Language::Sql),
187            "svelte" => Some(Language::Svelte),
188            "swift" => Some(Language::Swift),
189            "vue" => Some(Language::Vue),
190            "xml" => Some(Language::Xml),
191            "zig" => Some(Language::Zig),
192            _ => None,
193        }
194    }
195}
196
197#[derive(Debug, Clone, Serialize, Deserialize, Default)]
198pub struct RawLineCounts {
199    pub total_physical_lines: u64,
200    pub blank_only_lines: u64,
201    pub code_only_lines: u64,
202    pub single_comment_only_lines: u64,
203    pub multi_comment_only_lines: u64,
204    pub mixed_code_single_comment_lines: u64,
205    pub mixed_code_multi_comment_lines: u64,
206    pub docstring_comment_lines: u64,
207    pub skipped_unknown_lines: u64,
208    /// Best-effort count of function/method definition lines detected lexically.
209    #[serde(default)]
210    pub functions: u64,
211    /// Best-effort count of class/struct/trait/type definition lines detected lexically.
212    #[serde(default)]
213    pub classes: u64,
214    /// Best-effort count of variable declaration lines detected lexically.
215    #[serde(default)]
216    pub variables: u64,
217    /// Best-effort count of import/use/include statement lines detected lexically.
218    #[serde(default)]
219    pub imports: u64,
220    /// Lines consisting solely of preprocessor/compiler directives (e.g. `#include`, `#define`
221    /// in C/C++/Objective-C). Always a subset of `code_only_lines`. Controlled by
222    /// `AnalysisConfig::count_compiler_directives`. IEEE 1045-1992 §4.2.
223    #[serde(default)]
224    pub compiler_directive_lines: u64,
225}
226
227#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
228#[serde(rename_all = "snake_case")]
229pub enum ParseMode {
230    Lexical,
231    LexicalBestEffort,
232    TreeSitter,
233}
234
235#[derive(Debug, Clone, Serialize, Deserialize)]
236pub struct RawFileAnalysis {
237    pub raw: RawLineCounts,
238    pub parse_mode: ParseMode,
239    pub warnings: Vec<String>,
240}
241
/// IEEE 1045-1992 counting options passed from `sloc-core` (built from `AnalysisConfig`).
///
/// `analyze_text` accepts this struct so that the caller can control behaviour that the
/// standard defines as configurable parameters rather than fixed conventions.
///
/// The type is a plain pair of flags, so it is `Copy` and comparable with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct AnalysisOptions {
    /// When `true` (IEEE 1045-1992 default), blank lines inside block comments count as
    /// comment lines rather than blank lines.
    pub blank_in_block_comment_as_comment: bool,
    /// When `true`, backslash-continued physical lines are collapsed into a single logical
    /// line for SLOC counting purposes (IEEE logical SLOC mode).
    pub collapse_continuation_lines: bool,
}

impl Default for AnalysisOptions {
    /// Blank lines inside block comments count as comments; continuation lines
    /// are not collapsed.
    fn default() -> Self {
        Self {
            blank_in_block_comment_as_comment: true,
            collapse_continuation_lines: false,
        }
    }
}
264
265pub fn supported_languages() -> BTreeSet<Language> {
266    [
267        Language::Assembly,
268        Language::C,
269        Language::Clojure,
270        Language::Cpp,
271        Language::CSharp,
272        Language::Css,
273        Language::Dart,
274        Language::Dockerfile,
275        Language::Elixir,
276        Language::Erlang,
277        Language::FSharp,
278        Language::Go,
279        Language::Groovy,
280        Language::Haskell,
281        Language::Html,
282        Language::Java,
283        Language::JavaScript,
284        Language::Julia,
285        Language::Kotlin,
286        Language::Lua,
287        Language::Makefile,
288        Language::Nim,
289        Language::ObjectiveC,
290        Language::Ocaml,
291        Language::Perl,
292        Language::Php,
293        Language::PowerShell,
294        Language::Python,
295        Language::R,
296        Language::Ruby,
297        Language::Rust,
298        Language::Scala,
299        Language::Scss,
300        Language::Shell,
301        Language::Sql,
302        Language::Svelte,
303        Language::Swift,
304        Language::TypeScript,
305        Language::Vue,
306        Language::Xml,
307        Language::Zig,
308    ]
309    .into_iter()
310    .collect()
311}
312
313pub fn detect_language(
314    path: &Path,
315    first_line: Option<&str>,
316    extension_overrides: &BTreeMap<String, String>,
317    shebang_detection: bool,
318) -> Option<Language> {
319    let extension = path
320        .extension()
321        .and_then(|ext| ext.to_str())
322        .map(|ext| ext.to_ascii_lowercase());
323
324    // Extension override check (user-configured mappings win over everything)
325    if let Some(ext) = extension.as_ref() {
326        if let Some(override_name) = extension_overrides.get(ext.as_str()) {
327            if let Some(lang) = Language::from_name(override_name) {
328                return Some(lang);
329            }
330        }
331    }
332
333    // Filename-based detection for files that have no extension or use exact names
334    let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
335    let filename = path.file_name().and_then(|s| s.to_str()).unwrap_or("");
336    let filename_lower = filename.to_ascii_lowercase();
337
338    // Dockerfile: exact name or Dockerfile.* variant
339    if filename == "Dockerfile"
340        || filename.starts_with("Dockerfile.")
341        || filename_lower == "dockerfile"
342    {
343        return Some(Language::Dockerfile);
344    }
345
346    // Makefile variants
347    if matches!(
348        filename,
349        "Makefile" | "GNUmakefile" | "makefile" | "BSDmakefile"
350    ) {
351        return Some(Language::Makefile);
352    }
353
354    // Ruby ecosystem files that have no extension
355    if matches!(
356        filename,
357        "Rakefile" | "Gemfile" | "Guardfile" | "Vagrantfile" | "Fastfile" | "Podfile"
358    ) {
359        return Some(Language::Ruby);
360    }
361
362    let _ = stem; // suppress unused warning
363
364    // Extension-based detection
365    if let Some(ext) = extension.as_deref() {
366        let by_ext = match ext {
367            // --- Original 11 ---
368            "c" | "h" => Some(Language::C),
369            "cc" | "cp" | "cpp" | "cxx" | "hh" | "hpp" | "hxx" => Some(Language::Cpp),
370            "cs" => Some(Language::CSharp),
371            "go" => Some(Language::Go),
372            "java" => Some(Language::Java),
373            "js" | "mjs" | "cjs" => Some(Language::JavaScript),
374            "py" => Some(Language::Python),
375            "rs" => Some(Language::Rust),
376            "sh" | "bash" | "zsh" | "ksh" => Some(Language::Shell),
377            "ps1" | "psm1" | "psd1" => Some(Language::PowerShell),
378            "ts" | "mts" | "cts" => Some(Language::TypeScript),
379            // --- Extended 30 ---
380            "asm" | "s" => Some(Language::Assembly),
381            "clj" | "cljs" | "cljc" | "edn" => Some(Language::Clojure),
382            "css" => Some(Language::Css),
383            "dart" => Some(Language::Dart),
384            "ex" | "exs" => Some(Language::Elixir),
385            "erl" | "hrl" => Some(Language::Erlang),
386            "fs" | "fsi" | "fsx" => Some(Language::FSharp),
387            "groovy" | "gradle" => Some(Language::Groovy),
388            "hs" | "lhs" => Some(Language::Haskell),
389            "html" | "htm" | "xhtml" => Some(Language::Html),
390            "jl" => Some(Language::Julia),
391            "kt" | "kts" => Some(Language::Kotlin),
392            "lua" => Some(Language::Lua),
393            "mk" => Some(Language::Makefile),
394            "nim" | "nims" => Some(Language::Nim),
395            "m" | "mm" => Some(Language::ObjectiveC),
396            "ml" | "mli" => Some(Language::Ocaml),
397            "pl" | "pm" | "t" => Some(Language::Perl),
398            "php" | "php3" | "php4" | "php5" | "php7" | "phtml" => Some(Language::Php),
399            "r" => Some(Language::R),
400            "rb" | "rake" => Some(Language::Ruby),
401            "scala" | "sc" => Some(Language::Scala),
402            "scss" | "sass" => Some(Language::Scss),
403            "sql" => Some(Language::Sql),
404            "svelte" => Some(Language::Svelte),
405            "swift" => Some(Language::Swift),
406            "vue" => Some(Language::Vue),
407            "xml" | "xsd" | "xsl" | "xslt" | "svg" => Some(Language::Xml),
408            "zig" => Some(Language::Zig),
409            _ => None,
410        };
411
412        if by_ext.is_some() {
413            return by_ext;
414        }
415    }
416
417    if shebang_detection {
418        if let Some(line) = first_line {
419            let lower = line.to_ascii_lowercase();
420            if lower.starts_with("#!") {
421                if lower.contains("python") {
422                    return Some(Language::Python);
423                }
424                if lower.contains("pwsh") || lower.contains("powershell") {
425                    return Some(Language::PowerShell);
426                }
427                if lower.contains("bash")
428                    || lower.contains("/sh")
429                    || lower.contains("zsh")
430                    || lower.contains("ksh")
431                {
432                    return Some(Language::Shell);
433                }
434                if lower.contains("ruby") {
435                    return Some(Language::Ruby);
436                }
437                if lower.contains("perl") {
438                    return Some(Language::Perl);
439                }
440                if lower.contains("php") {
441                    return Some(Language::Php);
442                }
443                if lower.contains("node") || lower.contains("nodejs") {
444                    return Some(Language::JavaScript);
445                }
446            }
447        }
448    }
449
450    None
451}
452
453pub fn analyze_text(language: Language, text: &str, options: AnalysisOptions) -> RawFileAnalysis {
454    // IEEE flags shared by all non-preprocessor languages.
455    let base = IeeeFlags {
456        has_preprocessor_directives: false,
457        blank_in_block_comment_as_comment: options.blank_in_block_comment_as_comment,
458        collapse_continuation_lines: options.collapse_continuation_lines,
459    };
460    // C, C++, and Objective-C have a preprocessor whose directive lines are tracked separately
461    // per IEEE 1045-1992 §4.2.
462    let cpp = IeeeFlags {
463        has_preprocessor_directives: true,
464        ..base
465    };
466
467    match language {
468        Language::C => {
469            #[cfg(feature = "tree-sitter")]
470            if let Some(result) = ts::analyze_c(text) {
471                return result;
472            }
473            analyze_generic(
474                text,
475                ScanConfig {
476                    line_comments: &["//"],
477                    block_comment: Some(("/*", "*/")),
478                    allow_single_quote_strings: true,
479                    allow_double_quote_strings: true,
480                    allow_triple_quote_strings: false,
481                    allow_csharp_verbatim_strings: false,
482                    skip_lines: HashSet::new(),
483                    symbol_patterns: SP_C,
484                },
485                cpp,
486            )
487        }
488        Language::Cpp => {
489            // tree-sitter-c also parses C++ with acceptable accuracy for SLOC counting.
490            #[cfg(feature = "tree-sitter")]
491            if let Some(result) = ts::analyze_c(text) {
492                return result;
493            }
494            analyze_generic(
495                text,
496                ScanConfig {
497                    line_comments: &["//"],
498                    block_comment: Some(("/*", "*/")),
499                    allow_single_quote_strings: true,
500                    allow_double_quote_strings: true,
501                    allow_triple_quote_strings: false,
502                    allow_csharp_verbatim_strings: false,
503                    skip_lines: HashSet::new(),
504                    symbol_patterns: SP_CPP,
505                },
506                cpp,
507            )
508        }
509        Language::CSharp => analyze_generic(
510            text,
511            ScanConfig {
512                line_comments: &["//"],
513                block_comment: Some(("/*", "*/")),
514                allow_single_quote_strings: true,
515                allow_double_quote_strings: true,
516                allow_triple_quote_strings: false,
517                allow_csharp_verbatim_strings: true,
518                skip_lines: HashSet::new(),
519                symbol_patterns: SP_CSHARP,
520            },
521            base,
522        ),
523        Language::Go => analyze_generic(
524            text,
525            ScanConfig {
526                line_comments: &["//"],
527                block_comment: Some(("/*", "*/")),
528                allow_single_quote_strings: true,
529                allow_double_quote_strings: true,
530                allow_triple_quote_strings: false,
531                allow_csharp_verbatim_strings: false,
532                skip_lines: HashSet::new(),
533                symbol_patterns: SP_GO,
534            },
535            base,
536        ),
537        Language::Java => analyze_generic(
538            text,
539            ScanConfig {
540                line_comments: &["//"],
541                block_comment: Some(("/*", "*/")),
542                allow_single_quote_strings: true,
543                allow_double_quote_strings: true,
544                allow_triple_quote_strings: false,
545                allow_csharp_verbatim_strings: false,
546                skip_lines: HashSet::new(),
547                symbol_patterns: SP_JAVA,
548            },
549            base,
550        ),
551        Language::JavaScript => analyze_generic(
552            text,
553            ScanConfig {
554                line_comments: &["//"],
555                block_comment: Some(("/*", "*/")),
556                allow_single_quote_strings: true,
557                allow_double_quote_strings: true,
558                allow_triple_quote_strings: false,
559                allow_csharp_verbatim_strings: false,
560                skip_lines: HashSet::new(),
561                symbol_patterns: SP_JS,
562            },
563            base,
564        ),
565        Language::Rust => analyze_generic(
566            text,
567            ScanConfig {
568                // Rust also has //! and /// doc comments — they parse the same as //
569                line_comments: &["//"],
570                block_comment: Some(("/*", "*/")),
571                allow_single_quote_strings: false,
572                allow_double_quote_strings: true,
573                allow_triple_quote_strings: false,
574                allow_csharp_verbatim_strings: false,
575                skip_lines: HashSet::new(),
576                symbol_patterns: SP_RUST,
577            },
578            base,
579        ),
580        Language::Shell => analyze_generic(
581            text,
582            ScanConfig {
583                line_comments: &["#"],
584                block_comment: None,
585                allow_single_quote_strings: true,
586                allow_double_quote_strings: true,
587                allow_triple_quote_strings: false,
588                allow_csharp_verbatim_strings: false,
589                skip_lines: HashSet::new(),
590                symbol_patterns: SP_SHELL,
591            },
592            base,
593        ),
594        Language::PowerShell => analyze_generic(
595            text,
596            ScanConfig {
597                line_comments: &["#"],
598                block_comment: Some(("<#", "#>")),
599                allow_single_quote_strings: true,
600                allow_double_quote_strings: true,
601                allow_triple_quote_strings: false,
602                allow_csharp_verbatim_strings: false,
603                skip_lines: HashSet::new(),
604                symbol_patterns: SP_POWERSHELL,
605            },
606            base,
607        ),
608        Language::TypeScript => analyze_generic(
609            text,
610            ScanConfig {
611                line_comments: &["//"],
612                block_comment: Some(("/*", "*/")),
613                allow_single_quote_strings: true,
614                allow_double_quote_strings: true,
615                allow_triple_quote_strings: false,
616                allow_csharp_verbatim_strings: false,
617                skip_lines: HashSet::new(),
618                symbol_patterns: SP_TS,
619            },
620            base,
621        ),
622        Language::Python => {
623            #[cfg(feature = "tree-sitter")]
624            if let Some(result) = ts::analyze_python(text) {
625                return result;
626            }
627            let docstring_lines = detect_python_docstring_lines(text);
628            analyze_generic(
629                text,
630                ScanConfig {
631                    line_comments: &["#"],
632                    block_comment: None,
633                    allow_single_quote_strings: true,
634                    allow_double_quote_strings: true,
635                    allow_triple_quote_strings: true,
636                    allow_csharp_verbatim_strings: false,
637                    skip_lines: docstring_lines,
638                    symbol_patterns: SP_PYTHON,
639                },
640                base,
641            )
642        }
643        // --- Extended language analyzers ---
644        Language::Assembly => analyze_generic(
645            text,
646            ScanConfig {
647                line_comments: &[";"],
648                block_comment: None,
649                allow_single_quote_strings: false,
650                allow_double_quote_strings: false,
651                allow_triple_quote_strings: false,
652                allow_csharp_verbatim_strings: false,
653                skip_lines: HashSet::new(),
654                symbol_patterns: SP_ASSEMBLY,
655            },
656            base,
657        ),
658        Language::Clojure => analyze_generic(
659            text,
660            ScanConfig {
661                line_comments: &[";"],
662                block_comment: None,
663                allow_single_quote_strings: false,
664                allow_double_quote_strings: true,
665                allow_triple_quote_strings: false,
666                allow_csharp_verbatim_strings: false,
667                skip_lines: HashSet::new(),
668                symbol_patterns: SP_CLOJURE,
669            },
670            base,
671        ),
672        Language::Css => analyze_generic(
673            text,
674            ScanConfig {
675                line_comments: &[],
676                block_comment: Some(("/*", "*/")),
677                allow_single_quote_strings: true,
678                allow_double_quote_strings: true,
679                allow_triple_quote_strings: false,
680                allow_csharp_verbatim_strings: false,
681                skip_lines: HashSet::new(),
682                symbol_patterns: SP_NONE,
683            },
684            base,
685        ),
686        Language::Dart => analyze_generic(
687            text,
688            ScanConfig {
689                line_comments: &["//"],
690                block_comment: Some(("/*", "*/")),
691                allow_single_quote_strings: true,
692                allow_double_quote_strings: true,
693                allow_triple_quote_strings: false,
694                allow_csharp_verbatim_strings: false,
695                skip_lines: HashSet::new(),
696                symbol_patterns: SP_DART,
697            },
698            base,
699        ),
700        Language::Dockerfile => analyze_generic(
701            text,
702            ScanConfig {
703                line_comments: &["#"],
704                block_comment: None,
705                allow_single_quote_strings: false,
706                allow_double_quote_strings: false,
707                allow_triple_quote_strings: false,
708                allow_csharp_verbatim_strings: false,
709                skip_lines: HashSet::new(),
710                symbol_patterns: SP_NONE,
711            },
712            base,
713        ),
714        Language::Elixir => analyze_generic(
715            text,
716            ScanConfig {
717                line_comments: &["#"],
718                block_comment: None,
719                allow_single_quote_strings: true,
720                allow_double_quote_strings: true,
721                allow_triple_quote_strings: false,
722                allow_csharp_verbatim_strings: false,
723                skip_lines: HashSet::new(),
724                symbol_patterns: SP_ELIXIR,
725            },
726            base,
727        ),
728        Language::Erlang => analyze_generic(
729            text,
730            ScanConfig {
731                line_comments: &["%"],
732                block_comment: None,
733                allow_single_quote_strings: false,
734                allow_double_quote_strings: true,
735                allow_triple_quote_strings: false,
736                allow_csharp_verbatim_strings: false,
737                skip_lines: HashSet::new(),
738                symbol_patterns: SP_ERLANG,
739            },
740            base,
741        ),
742        Language::FSharp => analyze_generic(
743            text,
744            ScanConfig {
745                line_comments: &["//"],
746                block_comment: Some(("(*", "*)")),
747                allow_single_quote_strings: false,
748                allow_double_quote_strings: true,
749                allow_triple_quote_strings: false,
750                allow_csharp_verbatim_strings: false,
751                skip_lines: HashSet::new(),
752                symbol_patterns: SP_FSHARP,
753            },
754            base,
755        ),
756        Language::Groovy => analyze_generic(
757            text,
758            ScanConfig {
759                line_comments: &["//"],
760                block_comment: Some(("/*", "*/")),
761                allow_single_quote_strings: true,
762                allow_double_quote_strings: true,
763                allow_triple_quote_strings: false,
764                allow_csharp_verbatim_strings: false,
765                skip_lines: HashSet::new(),
766                symbol_patterns: SP_GROOVY,
767            },
768            base,
769        ),
770        Language::Haskell => analyze_generic(
771            text,
772            ScanConfig {
773                line_comments: &["--"],
774                block_comment: Some(("{-", "-}")),
775                allow_single_quote_strings: true,
776                allow_double_quote_strings: true,
777                allow_triple_quote_strings: false,
778                allow_csharp_verbatim_strings: false,
779                skip_lines: HashSet::new(),
780                symbol_patterns: SP_HASKELL,
781            },
782            base,
783        ),
784        Language::Html | Language::Xml => analyze_generic(
785            text,
786            ScanConfig {
787                line_comments: &[],
788                block_comment: Some(("<!--", "-->")),
789                allow_single_quote_strings: false,
790                allow_double_quote_strings: false,
791                allow_triple_quote_strings: false,
792                allow_csharp_verbatim_strings: false,
793                skip_lines: HashSet::new(),
794                symbol_patterns: SP_NONE,
795            },
796            base,
797        ),
798        Language::Julia => analyze_generic(
799            text,
800            ScanConfig {
801                line_comments: &["#"],
802                block_comment: Some(("#=", "=#")),
803                allow_single_quote_strings: false,
804                allow_double_quote_strings: true,
805                allow_triple_quote_strings: true,
806                allow_csharp_verbatim_strings: false,
807                skip_lines: HashSet::new(),
808                symbol_patterns: SP_JULIA,
809            },
810            base,
811        ),
812        Language::Kotlin => analyze_generic(
813            text,
814            ScanConfig {
815                line_comments: &["//"],
816                block_comment: Some(("/*", "*/")),
817                allow_single_quote_strings: true,
818                allow_double_quote_strings: true,
819                allow_triple_quote_strings: false,
820                allow_csharp_verbatim_strings: false,
821                skip_lines: HashSet::new(),
822                symbol_patterns: SP_KOTLIN,
823            },
824            base,
825        ),
826        Language::Lua => analyze_generic(
827            text,
828            ScanConfig {
829                line_comments: &["--"],
830                block_comment: Some(("--[[", "]]")),
831                allow_single_quote_strings: true,
832                allow_double_quote_strings: true,
833                allow_triple_quote_strings: false,
834                allow_csharp_verbatim_strings: false,
835                skip_lines: HashSet::new(),
836                symbol_patterns: SP_LUA,
837            },
838            base,
839        ),
840        Language::Makefile => analyze_generic(
841            text,
842            ScanConfig {
843                line_comments: &["#"],
844                block_comment: None,
845                allow_single_quote_strings: false,
846                allow_double_quote_strings: false,
847                allow_triple_quote_strings: false,
848                allow_csharp_verbatim_strings: false,
849                skip_lines: HashSet::new(),
850                symbol_patterns: SP_NONE,
851            },
852            base,
853        ),
854        Language::Nim => analyze_generic(
855            text,
856            ScanConfig {
857                line_comments: &["#"],
858                block_comment: Some(("#[", "]#")),
859                allow_single_quote_strings: true,
860                allow_double_quote_strings: true,
861                allow_triple_quote_strings: false,
862                allow_csharp_verbatim_strings: false,
863                skip_lines: HashSet::new(),
864                symbol_patterns: SP_NIM,
865            },
866            base,
867        ),
868        Language::ObjectiveC => analyze_generic(
869            text,
870            ScanConfig {
871                line_comments: &["//"],
872                block_comment: Some(("/*", "*/")),
873                allow_single_quote_strings: true,
874                allow_double_quote_strings: true,
875                allow_triple_quote_strings: false,
876                allow_csharp_verbatim_strings: false,
877                skip_lines: HashSet::new(),
878                symbol_patterns: SP_OBJECTIVEC,
879            },
880            cpp,
881        ),
882        Language::Ocaml => analyze_generic(
883            text,
884            ScanConfig {
885                line_comments: &[],
886                block_comment: Some(("(*", "*)")),
887                allow_single_quote_strings: false,
888                allow_double_quote_strings: true,
889                allow_triple_quote_strings: false,
890                allow_csharp_verbatim_strings: false,
891                skip_lines: HashSet::new(),
892                symbol_patterns: SP_OCAML,
893            },
894            base,
895        ),
896        Language::Perl => analyze_generic(
897            text,
898            ScanConfig {
899                line_comments: &["#"],
900                block_comment: None,
901                allow_single_quote_strings: true,
902                allow_double_quote_strings: true,
903                allow_triple_quote_strings: false,
904                allow_csharp_verbatim_strings: false,
905                skip_lines: HashSet::new(),
906                symbol_patterns: SP_PERL,
907            },
908            base,
909        ),
910        Language::Php => analyze_generic(
911            text,
912            ScanConfig {
913                line_comments: &["//", "#"],
914                block_comment: Some(("/*", "*/")),
915                allow_single_quote_strings: true,
916                allow_double_quote_strings: true,
917                allow_triple_quote_strings: false,
918                allow_csharp_verbatim_strings: false,
919                skip_lines: HashSet::new(),
920                symbol_patterns: SP_PHP,
921            },
922            base,
923        ),
924        Language::R => analyze_generic(
925            text,
926            ScanConfig {
927                line_comments: &["#"],
928                block_comment: None,
929                allow_single_quote_strings: true,
930                allow_double_quote_strings: true,
931                allow_triple_quote_strings: false,
932                allow_csharp_verbatim_strings: false,
933                skip_lines: HashSet::new(),
934                symbol_patterns: SP_R,
935            },
936            base,
937        ),
938        Language::Ruby => analyze_generic(
939            text,
940            ScanConfig {
941                line_comments: &["#"],
942                block_comment: None,
943                allow_single_quote_strings: true,
944                allow_double_quote_strings: true,
945                allow_triple_quote_strings: false,
946                allow_csharp_verbatim_strings: false,
947                skip_lines: HashSet::new(),
948                symbol_patterns: SP_RUBY,
949            },
950            base,
951        ),
952        Language::Scala => analyze_generic(
953            text,
954            ScanConfig {
955                line_comments: &["//"],
956                block_comment: Some(("/*", "*/")),
957                allow_single_quote_strings: true,
958                allow_double_quote_strings: true,
959                allow_triple_quote_strings: false,
960                allow_csharp_verbatim_strings: false,
961                skip_lines: HashSet::new(),
962                symbol_patterns: SP_SCALA,
963            },
964            base,
965        ),
966        Language::Scss => analyze_generic(
967            text,
968            ScanConfig {
969                line_comments: &["//"],
970                block_comment: Some(("/*", "*/")),
971                allow_single_quote_strings: true,
972                allow_double_quote_strings: true,
973                allow_triple_quote_strings: false,
974                allow_csharp_verbatim_strings: false,
975                skip_lines: HashSet::new(),
976                symbol_patterns: SP_NONE,
977            },
978            base,
979        ),
980        Language::Sql => analyze_generic(
981            text,
982            ScanConfig {
983                line_comments: &["--"],
984                block_comment: Some(("/*", "*/")),
985                allow_single_quote_strings: true,
986                allow_double_quote_strings: false,
987                allow_triple_quote_strings: false,
988                allow_csharp_verbatim_strings: false,
989                skip_lines: HashSet::new(),
990                symbol_patterns: SP_SQL,
991            },
992            base,
993        ),
994        Language::Svelte => analyze_generic(
995            text,
996            ScanConfig {
997                line_comments: &["//"],
998                block_comment: Some(("/*", "*/")),
999                allow_single_quote_strings: true,
1000                allow_double_quote_strings: true,
1001                allow_triple_quote_strings: false,
1002                allow_csharp_verbatim_strings: false,
1003                skip_lines: HashSet::new(),
1004                symbol_patterns: SP_JS,
1005            },
1006            base,
1007        ),
1008        Language::Swift => analyze_generic(
1009            text,
1010            ScanConfig {
1011                line_comments: &["//"],
1012                block_comment: Some(("/*", "*/")),
1013                allow_single_quote_strings: false,
1014                allow_double_quote_strings: true,
1015                allow_triple_quote_strings: false,
1016                allow_csharp_verbatim_strings: false,
1017                skip_lines: HashSet::new(),
1018                symbol_patterns: SP_SWIFT,
1019            },
1020            base,
1021        ),
1022        Language::Vue => analyze_generic(
1023            text,
1024            ScanConfig {
1025                line_comments: &["//"],
1026                block_comment: Some(("/*", "*/")),
1027                allow_single_quote_strings: true,
1028                allow_double_quote_strings: true,
1029                allow_triple_quote_strings: false,
1030                allow_csharp_verbatim_strings: false,
1031                skip_lines: HashSet::new(),
1032                symbol_patterns: SP_JS,
1033            },
1034            base,
1035        ),
1036        Language::Zig => analyze_generic(
1037            text,
1038            ScanConfig {
1039                line_comments: &["//"],
1040                block_comment: None,
1041                allow_single_quote_strings: true,
1042                allow_double_quote_strings: true,
1043                allow_triple_quote_strings: false,
1044                allow_csharp_verbatim_strings: false,
1045                skip_lines: HashSet::new(),
1046                symbol_patterns: SP_ZIG,
1047            },
1048            base,
1049        ),
1050    }
1051}
1052
/// Per-language keyword prefixes used for best-effort structural symbol detection.
/// Each slice lists line prefixes (after leading whitespace is stripped) that indicate
/// a definition of that category. Empty slice = detection disabled for that category.
///
/// Prefixes are literal text, so most entries carry their trailing space (or a trailing
/// `(` / `<`) as part of the pattern.
#[derive(Debug, Clone, Copy)]
struct SymbolPatterns {
    /// Prefixes that mark a function-like definition.
    functions: &'static [&'static str],
    /// Prefixes that mark a type-like definition (classes, structs, modules, tables, ...).
    classes: &'static [&'static str],
    /// Prefixes that mark a variable or constant declaration.
    variables: &'static [&'static str],
    /// Prefixes that mark an import/include-style statement.
    imports: &'static [&'static str],
}
1063
1064impl SymbolPatterns {
1065    const fn none() -> Self {
1066        Self {
1067            functions: &[],
1068            classes: &[],
1069            variables: &[],
1070            imports: &[],
1071        }
1072    }
1073}
1074
/// Shared all-empty table: symbol detection is disabled in every category.
/// Referenced by scan configurations such as Html/Xml, Makefile and Scss.
const SP_NONE: SymbolPatterns = SymbolPatterns::none();
1076
/// Prefix table named for Rust.
///
/// Visibility/qualifier combinations are spelled out one by one because entries are
/// literal line prefixes; combinations that are not listed (for example
/// `pub(super) struct ` or `pub(crate) extern fn `) are not in the table.
const SP_RUST: SymbolPatterns = SymbolPatterns {
    functions: &[
        "fn ",
        "pub fn ",
        "pub(crate) fn ",
        "pub(super) fn ",
        "async fn ",
        "pub async fn ",
        "pub(crate) async fn ",
        "unsafe fn ",
        "pub unsafe fn ",
        "pub(crate) unsafe fn ",
        "const fn ",
        "pub const fn ",
        "pub(crate) const fn ",
        "extern fn ",
        "pub extern fn ",
    ],
    // `impl` blocks and `type` aliases are grouped with the type-like definitions.
    classes: &[
        "struct ",
        "pub struct ",
        "pub(crate) struct ",
        "enum ",
        "pub enum ",
        "pub(crate) enum ",
        "trait ",
        "pub trait ",
        "pub(crate) trait ",
        "impl ",
        "impl<",
        "type ",
        "pub type ",
        "pub(crate) type ",
    ],
    variables: &["let ", "let mut "],
    imports: &["use ", "pub use ", "pub(crate) use ", "extern crate "],
};
1114
/// Prefix table named for Python: `def`/`async def`, `class`, and `import`/`from`.
/// `variables` is empty, so no variable prefixes are listed.
const SP_PYTHON: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "async def "],
    classes: &["class "],
    variables: &[],
    imports: &["import ", "from "],
};
1121
/// Prefix table for JavaScript-style syntax; the Svelte and Vue scan configurations
/// reference it.
///
/// Only `function` declaration forms are listed under `functions`. A line such as
/// `const f = () => ...` begins with the `const ` variable prefix, not a function prefix.
const SP_JS: SymbolPatterns = SymbolPatterns {
    functions: &[
        "function ",
        "async function ",
        "export function ",
        "export async function ",
        "export default function ",
    ],
    classes: &["class ", "export class ", "export default class "],
    variables: &[
        "var ",
        "let ",
        "const ",
        "export var ",
        "export let ",
        "export const ",
    ],
    imports: &["import "],
};
1141
/// Prefix table named for TypeScript.
///
/// The function, variable and import prefixes are the same as in `SP_JS`; the class
/// list additionally carries `abstract`, `interface` and `declare` forms.
const SP_TS: SymbolPatterns = SymbolPatterns {
    functions: &[
        "function ",
        "async function ",
        "export function ",
        "export async function ",
        "export default function ",
    ],
    classes: &[
        "class ",
        "export class ",
        "export default class ",
        "abstract class ",
        "export abstract class ",
        "interface ",
        "export interface ",
        "declare class ",
        "declare interface ",
    ],
    variables: &[
        "var ",
        "let ",
        "const ",
        "export var ",
        "export let ",
        "export const ",
    ],
    imports: &["import "],
};
1171
/// Prefix table named for Go: exactly one prefix per category
/// (`func `, `type `, `var `, `import `).
const SP_GO: SymbolPatterns = SymbolPatterns {
    functions: &["func "],
    classes: &["type "],
    variables: &["var "],
    imports: &["import "],
};
1178
/// Prefix table named for Java.
///
/// `functions` and `variables` are empty, so only type declarations and imports have
/// prefixes. Modifier combinations are listed explicitly; combinations that are absent
/// (for example `private static class `) are not in the table.
const SP_JAVA: SymbolPatterns = SymbolPatterns {
    functions: &[],
    classes: &[
        "class ",
        "public class ",
        "private class ",
        "protected class ",
        "abstract class ",
        "final class ",
        "public abstract class ",
        "public final class ",
        "interface ",
        "public interface ",
        "enum ",
        "public enum ",
        "record ",
        "public record ",
        "@interface ",
    ],
    variables: &[],
    imports: &["import "],
};
1201
/// Prefix table named for C#.
///
/// `functions` is empty. Type declarations are listed per modifier combination, `var `
/// is the only variable prefix, and `using ` is the only import prefix.
const SP_CSHARP: SymbolPatterns = SymbolPatterns {
    functions: &[],
    classes: &[
        "class ",
        "public class ",
        "private class ",
        "protected class ",
        "internal class ",
        "abstract class ",
        "sealed class ",
        "static class ",
        "partial class ",
        "public abstract class ",
        "public sealed class ",
        "public static class ",
        "interface ",
        "public interface ",
        "internal interface ",
        "enum ",
        "public enum ",
        "struct ",
        "public struct ",
        "record ",
        "public record ",
    ],
    variables: &["var "],
    imports: &["using "],
};
1230
/// Prefix table named for C.
///
/// `functions` and `variables` are empty. Aggregate declarations (`struct`, `union`,
/// and their `typedef` forms, plus `typedef enum`) are listed under `classes`; a bare
/// `enum ` prefix is not in the table. `#include ` is the only import prefix.
const SP_C: SymbolPatterns = SymbolPatterns {
    functions: &[],
    classes: &[
        "struct ",
        "typedef struct ",
        "union ",
        "typedef union ",
        "typedef enum ",
    ],
    variables: &[],
    imports: &["#include "],
};
1243
/// Prefix table named for C++.
///
/// `functions` and `variables` are empty. `namespace ` and `template ` are grouped with
/// `class ` and `struct ` under `classes`; `#include ` is the only import prefix.
const SP_CPP: SymbolPatterns = SymbolPatterns {
    functions: &[],
    classes: &["class ", "struct ", "namespace ", "template "],
    variables: &[],
    imports: &["#include "],
};
1250
/// Prefix table named for shell scripts.
///
/// Only the `function ` keyword form is listed under `functions`. `. ` (dot followed by
/// a space) is listed next to `source ` as an import prefix.
const SP_SHELL: SymbolPatterns = SymbolPatterns {
    functions: &["function "],
    classes: &[],
    variables: &["declare ", "local ", "export "],
    imports: &["source ", ". "],
};
1257
/// Prefix table named for PowerShell.
///
/// `function ` is listed in two capitalizations (`function `, `Function `); other
/// spellings such as `FUNCTION ` are not in the table.
const SP_POWERSHELL: SymbolPatterns = SymbolPatterns {
    functions: &["function ", "Function "],
    classes: &["class "],
    variables: &[],
    imports: &["Import-Module ", "using "],
};
1264
/// Prefix table used by the Kotlin scan configuration.
///
/// Modifier combinations are listed explicitly. Note that `companion object` is the one
/// entry without a trailing space.
const SP_KOTLIN: SymbolPatterns = SymbolPatterns {
    functions: &[
        "fun ",
        "private fun ",
        "public fun ",
        "protected fun ",
        "internal fun ",
        "override fun ",
        "suspend fun ",
        "abstract fun ",
        "open fun ",
        "private suspend fun ",
        "public suspend fun ",
    ],
    classes: &[
        "class ",
        "data class ",
        "sealed class ",
        "abstract class ",
        "open class ",
        "object ",
        "companion object",
        "interface ",
        "enum class ",
        "annotation class ",
    ],
    variables: &["val ", "var ", "private val ", "private var ", "const val "],
    imports: &["import "],
};
1294
/// Prefix table used by the Swift scan configuration.
///
/// NOTE(review): `class ` (classes) is also a prefix of `class func ` (functions); how
/// overlapping matches are counted is decided by `count_symbols` — confirm there.
const SP_SWIFT: SymbolPatterns = SymbolPatterns {
    functions: &[
        "func ",
        "private func ",
        "public func ",
        "internal func ",
        "override func ",
        "open func ",
        "static func ",
        "class func ",
        "mutating func ",
        "private static func ",
        "public static func ",
    ],
    classes: &[
        "class ",
        "struct ",
        "protocol ",
        "enum ",
        "extension ",
        "actor ",
        "public class ",
        "private class ",
        "open class ",
        "final class ",
        "public struct ",
        "private struct ",
        "public protocol ",
    ],
    variables: &[
        "var ",
        "let ",
        "private var ",
        "private let ",
        "static var ",
        "static let ",
    ],
    imports: &["import "],
};
1334
/// Prefix table used by the Ruby scan configuration.
/// `module ` is grouped with `class ` under `classes`; `variables` is empty.
const SP_RUBY: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "private def ", "protected def "],
    classes: &["class ", "module "],
    variables: &[],
    imports: &["require ", "require_relative "],
};
1341
/// Prefix table used by the Scala scan configuration.
/// `object ` and `trait ` are grouped with the class forms under `classes`.
const SP_SCALA: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "private def ", "protected def ", "override def "],
    classes: &[
        "class ",
        "case class ",
        "abstract class ",
        "sealed class ",
        "object ",
        "trait ",
    ],
    variables: &["val ", "var ", "lazy val "],
    imports: &["import "],
};
1355
/// Prefix table used by the PHP scan configuration.
///
/// `variables` is empty. `use ` is listed alongside the `require`/`include` family as an
/// import prefix.
const SP_PHP: SymbolPatterns = SymbolPatterns {
    functions: &[
        "function ",
        "public function ",
        "private function ",
        "protected function ",
        "static function ",
        "abstract function ",
        "final function ",
        "public static function ",
        "private static function ",
        "protected static function ",
    ],
    classes: &[
        "class ",
        "abstract class ",
        "final class ",
        "interface ",
        "trait ",
        "enum ",
    ],
    variables: &[],
    imports: &[
        "use ",
        "require ",
        "require_once ",
        "include ",
        "include_once ",
    ],
};
1386
/// Prefix table named for Elixir.
///
/// The `def*` definition macros are split between `functions` (`def`, `defp`, macros,
/// guards) and `classes` (`defmodule`, `defprotocol`, `defimpl`); `variables` is empty.
const SP_ELIXIR: SymbolPatterns = SymbolPatterns {
    functions: &[
        "def ",
        "defp ",
        "defmacro ",
        "defmacrop ",
        "defguard ",
        "defguardp ",
    ],
    classes: &["defmodule ", "defprotocol ", "defimpl "],
    variables: &[],
    imports: &["import ", "alias ", "use ", "require "],
};
1400
/// Prefix table named for Erlang.
///
/// Only module attributes are listed (each pattern ends in `(` rather than a space);
/// `functions` and `variables` are empty.
const SP_ERLANG: SymbolPatterns = SymbolPatterns {
    functions: &[],
    classes: &["-module("],
    variables: &[],
    imports: &["-import(", "-include(", "-include_lib("],
};
1407
/// Prefix table named for F#.
///
/// NOTE(review): `let ` (functions) is also a prefix of `let mutable ` (variables), and
/// of every plain `let` value binding; how overlapping matches are counted is decided by
/// `count_symbols` — confirm there.
const SP_FSHARP: SymbolPatterns = SymbolPatterns {
    functions: &[
        "let ",
        "let rec ",
        "member ",
        "override ",
        "abstract member ",
    ],
    classes: &["type "],
    variables: &["let mutable "],
    imports: &["open "],
};
1420
/// Prefix table named for Groovy: `def` forms as functions, class-like keywords as
/// classes, `import ` as the only import prefix. `variables` is empty.
const SP_GROOVY: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "private def ", "public def ", "protected def "],
    classes: &["class ", "abstract class ", "interface ", "enum ", "trait "],
    variables: &[],
    imports: &["import "],
};
1427
/// Prefix table named for Haskell.
///
/// `functions` and `variables` are empty; only type-level declarations (`class`, `data`,
/// `newtype`, `type`) and `import ` have prefixes.
const SP_HASKELL: SymbolPatterns = SymbolPatterns {
    functions: &[],
    classes: &["class ", "data ", "newtype ", "type "],
    variables: &[],
    imports: &["import "],
};
1434
/// Prefix table used by the Lua scan configuration. `classes` and `imports` are empty.
///
/// NOTE(review): `local ` (variables) is also a prefix of `local function ` (functions);
/// how overlapping matches are counted is decided by `count_symbols` — confirm there.
const SP_LUA: SymbolPatterns = SymbolPatterns {
    functions: &["function ", "local function "],
    classes: &[],
    variables: &["local "],
    imports: &[],
};
1441
/// Prefix table used by the Nim scan configuration.
///
/// All routine-defining keywords (`proc`, `func`, `method`, `iterator`, `converter`,
/// `template`, `macro`) are listed under `functions`.
const SP_NIM: SymbolPatterns = SymbolPatterns {
    functions: &[
        "proc ",
        "func ",
        "method ",
        "iterator ",
        "converter ",
        "template ",
        "macro ",
    ],
    classes: &["type "],
    variables: &["var ", "let ", "const "],
    imports: &["import ", "from "],
};
1456
/// Prefix table used by the Objective-C scan configuration.
///
/// Method prefixes are written with a space between the sign and the parenthesis
/// (`- (`, `+ (`); the unspaced forms `-(` / `+(` are not in the table.
const SP_OBJECTIVEC: SymbolPatterns = SymbolPatterns {
    functions: &["- (", "+ ("],
    classes: &["@interface ", "@implementation ", "@protocol "],
    variables: &[],
    imports: &["#import ", "#include "],
};
1463
/// Prefix table used by the OCaml scan configuration.
///
/// Every `let` binding prefix is listed under `functions`; `variables` is empty, so value
/// bindings and function bindings are not distinguished by this table.
const SP_OCAML: SymbolPatterns = SymbolPatterns {
    functions: &["let ", "let rec "],
    classes: &["type ", "module ", "class "],
    variables: &[],
    imports: &["open "],
};
1470
/// Prefix table used by the Perl scan configuration.
/// `package ` is the only entry under `classes`.
const SP_PERL: SymbolPatterns = SymbolPatterns {
    functions: &["sub "],
    classes: &["package "],
    variables: &["my ", "our ", "local "],
    imports: &["use ", "require "],
};
1477
/// Prefix table named for Clojure.
///
/// Every pattern includes the opening parenthesis of the form, so a definition is only
/// matched when the form starts the (whitespace-stripped) line.
const SP_CLOJURE: SymbolPatterns = SymbolPatterns {
    functions: &["(defn ", "(defn- ", "(defmacro ", "(defmulti "],
    classes: &[
        "(defrecord ",
        "(defprotocol ",
        "(deftype ",
        "(definterface ",
    ],
    variables: &["(def ", "(defonce "],
    imports: &["(ns ", "(require "],
};
1489
/// Prefix table used by the Julia scan configuration.
///
/// Only the `function ` / `macro ` keyword forms are listed under `functions`, and
/// `const ` is the only variable prefix.
const SP_JULIA: SymbolPatterns = SymbolPatterns {
    functions: &["function ", "macro "],
    classes: &[
        "struct ",
        "mutable struct ",
        "abstract type ",
        "primitive type ",
    ],
    variables: &["const "],
    imports: &["import ", "using "],
};
1501
/// Prefix table named for Dart.
///
/// `functions` is empty. `mixin ` and `extension ` are grouped with the class forms, and
/// the variable prefixes are the declaration keywords `var`, `final`, `const`, `late`.
const SP_DART: SymbolPatterns = SymbolPatterns {
    functions: &[],
    classes: &["class ", "abstract class ", "mixin ", "extension ", "enum "],
    variables: &["var ", "final ", "const ", "late "],
    imports: &["import "],
};
1508
/// Prefix table used by the R scan configuration.
///
/// Only import-style calls are listed (`library(`, `source(`); functions, classes and
/// variables have no prefixes.
const SP_R: SymbolPatterns = SymbolPatterns {
    functions: &[],
    classes: &[],
    variables: &[],
    imports: &["library(", "source("],
};
1515
/// Prefix table used by the SQL scan configuration.
///
/// Each statement is listed in all-lowercase and all-uppercase spellings; mixed-case
/// spellings such as `Create Table ` are not in the table. Tables, views and schemas are
/// grouped under `classes`; `imports` is empty.
const SP_SQL: SymbolPatterns = SymbolPatterns {
    functions: &[
        "create function ",
        "create or replace function ",
        "create procedure ",
        "create or replace procedure ",
        "CREATE FUNCTION ",
        "CREATE OR REPLACE FUNCTION ",
        "CREATE PROCEDURE ",
        "CREATE OR REPLACE PROCEDURE ",
    ],
    classes: &[
        "create table ",
        "create view ",
        "create schema ",
        "CREATE TABLE ",
        "CREATE VIEW ",
        "CREATE SCHEMA ",
    ],
    variables: &["declare ", "DECLARE "],
    imports: &[],
};
1538
/// Prefix table named for assembly sources.
///
/// Only lines that *start* with `proc ` / `PROC ` are listed as functions, and include
/// directives are listed in three spellings (`include `, `INCLUDE `, `%include `).
const SP_ASSEMBLY: SymbolPatterns = SymbolPatterns {
    functions: &["proc ", "PROC "],
    classes: &[],
    variables: &[],
    imports: &["include ", "INCLUDE ", "%include "],
};
1545
/// Prefix table used by the Zig scan configuration.
///
/// `classes` and `imports` are empty, and only `var ` / `pub var ` are listed as variable
/// prefixes (no `const ` entry).
const SP_ZIG: SymbolPatterns = SymbolPatterns {
    functions: &[
        "fn ",
        "pub fn ",
        "export fn ",
        "inline fn ",
        "pub inline fn ",
    ],
    classes: &[],
    variables: &["var ", "pub var "],
    imports: &[],
};
1558
/// Lexical description of one language, consumed by `analyze_generic`.
#[derive(Debug, Clone)]
struct ScanConfig {
    /// Prefixes that start a line comment; scanning of a line stops at the first match
    /// found outside a string or block comment.
    line_comments: &'static [&'static str],
    /// `(open, close)` delimiters of block comments, or `None` when the language has none.
    block_comment: Option<(&'static str, &'static str)>,
    /// Whether `'` opens a string literal (closed by the next unescaped `'`).
    allow_single_quote_strings: bool,
    /// Whether `"` opens a string literal (closed by the next unescaped `"`).
    allow_double_quote_strings: bool,
    /// Whether `"""` and `'''` open triple-quoted string literals.
    allow_triple_quote_strings: bool,
    /// Whether `@"` opens a verbatim string in which `""` stands for a literal quote.
    allow_csharp_verbatim_strings: bool,
    /// Zero-based line indices that are counted as docstring comment lines without
    /// being scanned.
    skip_lines: HashSet<usize>,
    /// Prefix tables handed to `count_symbols` for lines that contain code.
    symbol_patterns: SymbolPatterns,
}
1570
/// Per-call IEEE 1045-1992 flags derived from AnalysisOptions plus per-language properties.
/// Private to this crate; constructed inside analyze_text.
#[derive(Debug, Clone, Copy)]
struct IeeeFlags {
    /// True for C, C++, and Objective-C — languages with a C preprocessor.
    /// When set, comment-free code lines whose trimmed text starts with `#` are also
    /// counted as compiler directive lines.
    has_preprocessor_directives: bool,
    /// Mirrors AnalysisOptions::blank_in_block_comment_as_comment.
    /// When false, blank lines inside a block comment keep their blank classification.
    blank_in_block_comment_as_comment: bool,
    /// Mirrors AnalysisOptions::collapse_continuation_lines.
    /// When set, a line ending in `\` is merged with the lines that follow it and the
    /// whole sequence is classified once.
    collapse_continuation_lines: bool,
}
1582
/// Kind of string literal the scanner is currently inside; carried across lines.
#[derive(Debug, Clone, Copy)]
enum StringState {
    /// Quoted string closed by the stored delimiter character; a backslash skips the
    /// character that follows it.
    Single(char),
    /// Triple-quoted string closed by the stored delimiter text; no escape handling.
    Triple(&'static str),
    /// C#-style verbatim string: `""` is skipped as an escaped quote, a lone `"` closes it.
    VerbatimDouble,
}
1589
/// What the scanner observed on one physical line (or on a merged continuation
/// sequence); `classify_line` turns these flags into a single line category.
#[derive(Debug, Default)]
struct LineFacts {
    /// The line contains code, including any part of a string literal.
    has_code: bool,
    /// A line-comment prefix was found outside strings and block comments.
    has_single_comment: bool,
    /// Part of the line lies inside a block comment, or a block comment opens on it.
    has_multi_comment: bool,
    /// The line is docstring text. `analyze_generic` itself never sets this flag.
    has_docstring: bool,
}
1597
1598fn analyze_generic(text: &str, config: ScanConfig, ieee: IeeeFlags) -> RawFileAnalysis {
1599    let normalized = if text.is_empty() {
1600        String::new()
1601    } else {
1602        text.replace("\r\n", "\n").replace('\r', "\n")
1603    };
1604
1605    let lines: Vec<&str> = if normalized.is_empty() {
1606        Vec::new()
1607    } else {
1608        normalized.split_terminator('\n').collect()
1609    };
1610
1611    let mut raw = RawLineCounts::default();
1612    let mut warnings = Vec::new();
1613
1614    let mut in_block_comment = false;
1615    let mut string_state: Option<StringState> = None;
1616    // IEEE continuation-line state: accumulates facts across a backslash-continued sequence.
1617    let mut pending_continuation: Option<LineFacts> = None;
1618
1619    for (line_idx, line) in lines.iter().enumerate() {
1620        raw.total_physical_lines += 1;
1621
1622        if config.skip_lines.contains(&line_idx) {
1623            raw.docstring_comment_lines += 1;
1624            continue;
1625        }
1626
1627        let mut facts = LineFacts::default();
1628        let trimmed = line.trim();
1629
1630        // IEEE 1045-1992: blank lines inside block comments are comment lines by default.
1631        // When blank_in_block_comment_as_comment is false, blank lines keep their blank
1632        // classification even while inside a block comment.
1633        if in_block_comment && (ieee.blank_in_block_comment_as_comment || !trimmed.is_empty()) {
1634            facts.has_multi_comment = true;
1635        }
1636
1637        let chars: Vec<char> = line.chars().collect();
1638        let mut i = 0usize;
1639        while i < chars.len() {
1640            if config.skip_lines.contains(&line_idx) {
1641                break;
1642            }
1643
1644            if let Some(state) = string_state {
1645                facts.has_code = true;
1646                match state {
1647                    StringState::Single(delim) => {
1648                        if chars[i] == '\\' {
1649                            i += 2;
1650                            continue;
1651                        }
1652                        if chars[i] == delim {
1653                            string_state = None;
1654                        }
1655                        i += 1;
1656                        continue;
1657                    }
1658                    StringState::Triple(delim) => {
1659                        if starts_with(&chars, i, delim) {
1660                            string_state = None;
1661                            i += delim.len();
1662                        } else {
1663                            i += 1;
1664                        }
1665                        continue;
1666                    }
1667                    StringState::VerbatimDouble => {
1668                        if starts_with(&chars, i, "\"\"") {
1669                            i += 2;
1670                            continue;
1671                        }
1672                        if chars[i] == '"' {
1673                            string_state = None;
1674                        }
1675                        i += 1;
1676                        continue;
1677                    }
1678                }
1679            }
1680
1681            if in_block_comment {
1682                facts.has_multi_comment = true;
1683                if let Some((_, close)) = config.block_comment {
1684                    if starts_with(&chars, i, close) {
1685                        in_block_comment = false;
1686                        i += close.len();
1687                    } else {
1688                        i += 1;
1689                    }
1690                    continue;
1691                }
1692            }
1693
1694            if chars[i].is_whitespace() {
1695                i += 1;
1696                continue;
1697            }
1698
1699            if config.allow_csharp_verbatim_strings && starts_with(&chars, i, "@\"") {
1700                facts.has_code = true;
1701                string_state = Some(StringState::VerbatimDouble);
1702                i += 2;
1703                continue;
1704            }
1705
1706            if config.allow_triple_quote_strings {
1707                if starts_with(&chars, i, "\"\"\"") {
1708                    facts.has_code = true;
1709                    string_state = Some(StringState::Triple("\"\"\""));
1710                    i += 3;
1711                    continue;
1712                }
1713                if starts_with(&chars, i, "'''") {
1714                    facts.has_code = true;
1715                    string_state = Some(StringState::Triple("'''"));
1716                    i += 3;
1717                    continue;
1718                }
1719            }
1720
1721            if config.allow_single_quote_strings && chars[i] == '\'' {
1722                facts.has_code = true;
1723                string_state = Some(StringState::Single('\''));
1724                i += 1;
1725                continue;
1726            }
1727
1728            if config.allow_double_quote_strings && chars[i] == '"' {
1729                facts.has_code = true;
1730                string_state = Some(StringState::Single('"'));
1731                i += 1;
1732                continue;
1733            }
1734
1735            if let Some((open, _)) = config.block_comment {
1736                if starts_with(&chars, i, open) {
1737                    facts.has_multi_comment = true;
1738                    in_block_comment = true;
1739                    i += open.len();
1740                    continue;
1741                }
1742            }
1743
1744            if let Some(prefix) = config
1745                .line_comments
1746                .iter()
1747                .find(|prefix| starts_with(&chars, i, prefix))
1748            {
1749                let _ = prefix;
1750                facts.has_single_comment = true;
1751                break;
1752            }
1753
1754            facts.has_code = true;
1755            i += 1;
1756        }
1757
1758        // IEEE 1045-1992 §4.2: track preprocessor/compiler directive lines (C/C++/ObjC).
1759        // A directive line is a pure code line (no comment on the same physical line) whose
1760        // trimmed content starts with '#'.
1761        if ieee.has_preprocessor_directives
1762            && facts.has_code
1763            && !facts.has_single_comment
1764            && !facts.has_multi_comment
1765            && trimmed.starts_with('#')
1766        {
1767            raw.compiler_directive_lines += 1;
1768        }
1769
1770        // IEEE 1045-1992 continuation-line handling.
1771        // A line is a continuation starter when it ends with '\' outside any comment or string.
1772        let is_continuation = ieee.collapse_continuation_lines
1773            && !in_block_comment
1774            && string_state.is_none()
1775            && trimmed.ends_with('\\');
1776
1777        if is_continuation {
1778            let pending = pending_continuation.get_or_insert_with(LineFacts::default);
1779            pending.has_code |= facts.has_code;
1780            pending.has_single_comment |= facts.has_single_comment;
1781            pending.has_multi_comment |= facts.has_multi_comment;
1782            pending.has_docstring |= facts.has_docstring;
1783            continue; // defer classification until the sequence ends
1784        }
1785
1786        // Merge any accumulated continuation facts into the final line.
1787        let emit = if let Some(pending) = pending_continuation.take() {
1788            LineFacts {
1789                has_code: pending.has_code | facts.has_code,
1790                has_single_comment: pending.has_single_comment | facts.has_single_comment,
1791                has_multi_comment: pending.has_multi_comment | facts.has_multi_comment,
1792                has_docstring: pending.has_docstring | facts.has_docstring,
1793            }
1794        } else {
1795            facts
1796        };
1797
1798        classify_line(&mut raw, &emit, trimmed);
1799
1800        if emit.has_code {
1801            let (f, c, v, i) = count_symbols(&config.symbol_patterns, trimmed);
1802            raw.functions += f;
1803            raw.classes += c;
1804            raw.variables += v;
1805            raw.imports += i;
1806        }
1807    }
1808
1809    // Flush any pending continuation that reaches end-of-file without a closing line.
1810    if let Some(pending) = pending_continuation.take() {
1811        classify_line(&mut raw, &pending, "");
1812    }
1813
1814    if in_block_comment {
1815        warnings.push("unclosed block comment detected; result is best effort".into());
1816    }
1817    if string_state.is_some() {
1818        warnings.push("unclosed string literal detected; result is best effort".into());
1819    }
1820
1821    RawFileAnalysis {
1822        raw,
1823        parse_mode: if warnings.is_empty() {
1824            ParseMode::Lexical
1825        } else {
1826            ParseMode::LexicalBestEffort
1827        },
1828        warnings,
1829    }
1830}
1831
1832fn classify_line(raw: &mut RawLineCounts, facts: &LineFacts, trimmed: &str) {
1833    if facts.has_docstring {
1834        raw.docstring_comment_lines += 1;
1835    } else if !facts.has_code
1836        && !facts.has_single_comment
1837        && !facts.has_multi_comment
1838        && trimmed.is_empty()
1839    {
1840        raw.blank_only_lines += 1;
1841    } else if facts.has_code && facts.has_single_comment {
1842        raw.mixed_code_single_comment_lines += 1;
1843    } else if facts.has_code && facts.has_multi_comment {
1844        raw.mixed_code_multi_comment_lines += 1;
1845    } else if facts.has_code {
1846        raw.code_only_lines += 1;
1847    } else if facts.has_single_comment {
1848        raw.single_comment_only_lines += 1;
1849    } else if facts.has_multi_comment {
1850        raw.multi_comment_only_lines += 1;
1851    } else if trimmed.is_empty() {
1852        raw.blank_only_lines += 1;
1853    } else {
1854        raw.skipped_unknown_lines += 1;
1855    }
1856}
1857
1858fn count_symbols(patterns: &SymbolPatterns, trimmed: &str) -> (u64, u64, u64, u64) {
1859    let hit = |pats: &[&str]| pats.iter().any(|p| trimmed.starts_with(p)) as u64;
1860    (
1861        hit(patterns.functions),
1862        hit(patterns.classes),
1863        hit(patterns.variables),
1864        hit(patterns.imports),
1865    )
1866}
1867
/// Returns `true` when the characters of `needle` appear in `chars` starting at `index`.
///
/// An `index` past the end of `chars` never matches (not even for an empty `needle`);
/// an empty `needle` matches at any in-range `index`, including `chars.len()`.
///
/// The comparison walks `needle` lazily instead of collecting it into a temporary
/// vector, so no allocation happens per call.
fn starts_with(chars: &[char], index: usize, needle: &str) -> bool {
    chars.get(index..).map_or(false, |tail| {
        let mut remaining = tail.iter();
        // Running out of `tail` before `needle` is exhausted yields `None`, i.e. no match.
        needle
            .chars()
            .all(|expected| remaining.next() == Some(&expected))
    })
}
1872
1873fn detect_python_docstring_lines(text: &str) -> HashSet<usize> {
1874    let normalized = if text.is_empty() {
1875        String::new()
1876    } else {
1877        text.replace("\r\n", "\n").replace('\r', "\n")
1878    };
1879
1880    let lines: Vec<&str> = if normalized.is_empty() {
1881        Vec::new()
1882    } else {
1883        normalized.split_terminator('\n').collect()
1884    };
1885
1886    #[derive(Debug, Clone)]
1887    struct PyContext {
1888        indent: usize,
1889        expect_docstring: bool,
1890    }
1891
1892    let mut docstring_lines = HashSet::new();
1893    let mut contexts = vec![PyContext {
1894        indent: 0,
1895        expect_docstring: true,
1896    }];
1897    let mut pending_block_indent: Option<usize> = None;
1898    let mut active_docstring: Option<(&'static str, usize)> = None;
1899
1900    for (idx, line) in lines.iter().enumerate() {
1901        let trimmed = line.trim();
1902        let indent = leading_indent(line);
1903
1904        if let Some((delim, start_line)) = active_docstring {
1905            docstring_lines.insert(idx);
1906            if closes_triple_docstring(trimmed, delim, idx == start_line) {
1907                active_docstring = None;
1908            }
1909            continue;
1910        }
1911
1912        if trimmed.is_empty() || trimmed.starts_with('#') {
1913            continue;
1914        }
1915
1916        while contexts.len() > 1 && indent < contexts.last().map(|c| c.indent).unwrap_or(0) {
1917            contexts.pop();
1918        }
1919
1920        if let Some(base_indent) = pending_block_indent {
1921            if indent > base_indent {
1922                contexts.push(PyContext {
1923                    indent,
1924                    expect_docstring: true,
1925                });
1926                pending_block_indent = None;
1927            } else if !trimmed.starts_with('@') {
1928                pending_block_indent = None;
1929            }
1930        }
1931
1932        if let Some(ctx) = contexts.last_mut() {
1933            if ctx.expect_docstring {
1934                if let Some(delim) = docstring_delimiter(trimmed) {
1935                    docstring_lines.insert(idx);
1936                    ctx.expect_docstring = false;
1937                    if !closes_triple_docstring(trimmed, delim, true) {
1938                        active_docstring = Some((delim, idx));
1939                    }
1940                    continue;
1941                }
1942                ctx.expect_docstring = false;
1943            }
1944        }
1945
1946        if is_python_block_header(trimmed) {
1947            pending_block_indent = Some(indent);
1948        }
1949    }
1950
1951    if let Some((_, start_line)) = active_docstring {
1952        for idx in start_line..lines.len() {
1953            docstring_lines.insert(idx);
1954        }
1955    }
1956
1957    docstring_lines
1958}
1959
/// Number of whitespace characters (not bytes) that precede the first non-whitespace
/// character of `line`. A line made up solely of whitespace yields its full character count.
fn leading_indent(line: &str) -> usize {
    line.chars()
        .position(|ch| !ch.is_whitespace())
        .unwrap_or_else(|| line.chars().count())
}
1963
/// Returns `true` when `trimmed` is a one-line `def`, `async def` or `class` header,
/// i.e. it starts with one of those keywords and its code ends with `:`.
///
/// A trailing `# comment` after the colon is ignored, so `def f():  # note` is still
/// recognised. Headers whose signature spans several physical lines are not recognised,
/// because the first line does not end with `:`.
fn is_python_block_header(trimmed: &str) -> bool {
    const HEADER_KEYWORDS: [&str; 3] = ["def ", "async def ", "class "];

    HEADER_KEYWORDS
        .iter()
        .any(|keyword| trimmed.starts_with(keyword))
        && strip_python_trailing_comment(trimmed)
            .trim_end()
            .ends_with(':')
}

/// Returns `line` without its trailing `#` comment, if it has one.
///
/// A `#` inside a single- or double-quoted string literal is not treated as a comment
/// start; a backslash inside a string escapes the next character.
fn strip_python_trailing_comment(line: &str) -> &str {
    let mut open_quote: Option<char> = None;
    let mut escaped = false;

    for (pos, ch) in line.char_indices() {
        if escaped {
            escaped = false;
            continue;
        }
        match (open_quote, ch) {
            (Some(_), '\\') => escaped = true,
            (Some(quote), c) if c == quote => open_quote = None,
            (Some(_), _) => {}
            (None, '"') | (None, '\'') => open_quote = Some(ch),
            (None, '#') => return &line[..pos],
            (None, _) => {}
        }
    }
    line
}
1970
/// Returns the triple-quote delimiter (`"""` or `'''`) that `trimmed` opens with, after
/// skipping any leading run of the string-prefix letters `r`, `u`, `b`, `f` (either
/// case). Returns `None` when the line does not open with a triple quote.
fn docstring_delimiter(trimmed: &str) -> Option<&'static str> {
    const TRIPLE_QUOTES: [&str; 2] = ["\"\"\"", "'''"];

    let after_prefix = trimmed.trim_start_matches(|ch: char| {
        matches!(ch, 'r' | 'R' | 'u' | 'U' | 'b' | 'B' | 'f' | 'F')
    });

    TRIPLE_QUOTES
        .iter()
        .copied()
        .find(|quote| after_prefix.starts_with(quote))
}
1992
/// Decides whether a triple-quoted string is closed on the line `trimmed`.
///
/// * `delim` — the triple-quote delimiter that opened the string.
/// * `same_line_as_start` — `true` when `trimmed` is the line that also contains the
///   opening delimiter; in that case two non-overlapping occurrences are required
///   (opener and closer), otherwise one is enough.
///
/// An empty `delim` can never close anything and yields `false`.
fn closes_triple_docstring(trimmed: &str, delim: &str, same_line_as_start: bool) -> bool {
    if delim.is_empty() {
        return false;
    }

    let required = if same_line_as_start { 2 } else { 1 };
    // `take` stops scanning as soon as enough occurrences have been seen.
    trimmed.matches(delim).take(required).count() == required
}
2007
/// Tree-sitter-backed adapters. Compiled only when the `tree-sitter` feature is enabled.
/// When parsing succeeds the result is used directly; on any failure the caller falls back
/// to the lexical state machine.
#[cfg(feature = "tree-sitter")]
pub mod ts {
    use tree_sitter::Node;

    use super::{ParseMode, RawFileAnalysis, RawLineCounts};

    /// Facts recorded for one physical row while walking the syntax tree.
    #[derive(Debug, Clone, Copy, Default)]
    struct RowMarks {
        /// A non-comment leaf node touches this row.
        has_code: bool,
        /// A comment node touches this row.
        has_comment: bool,
        /// At least one comment on this row starts with `/*` or `<#`.
        comment_is_block: bool,
        /// A docstring string literal touches this row.
        has_docstring: bool,
    }

    /// Inputs for [`visit`] plus the per-row marks it fills in.
    struct VisitCtx<'a> {
        source: &'a [u8],
        comment_kinds: &'a [&'a str],
        docstring_stmt_kind: Option<&'a str>,
        rows: &'a mut [RowMarks],
    }

    /// Applies `mark` to every row in `first..=last` that exists in `rows`.
    ///
    /// Rows past the end are ignored, so a node whose end position lies beyond the last
    /// collected line cannot cause an out-of-bounds access.
    fn mark_rows(rows: &mut [RowMarks], first: usize, last: usize, mark: impl Fn(&mut RowMarks)) {
        rows.iter_mut()
            .take(last.saturating_add(1))
            .skip(first)
            .for_each(mark);
    }

    /// Classify every line of `text` using a tree-sitter grammar.
    ///
    /// `comment_node_kinds` — node type names that represent comments in this grammar
    /// `docstring_stmt_kind` — optional parent node type whose direct `string` child is a docstring
    ///
    /// Returns `None` when the grammar cannot be loaded into the parser or parsing yields
    /// no tree. Only the line-classification counters of `RawLineCounts` are filled in;
    /// every other counter keeps its `Default` value.
    ///
    /// NOTE(review): blank rows are counted as blank before any other check, so blank rows
    /// inside a multi-line docstring or block comment are reported as blank lines here —
    /// confirm this matches what the lexical path reports for the same input.
    fn analyze_lines(
        text: &str,
        ts_language: tree_sitter::Language,
        comment_node_kinds: &[&str],
        docstring_stmt_kind: Option<&str>,
    ) -> Option<RawFileAnalysis> {
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&ts_language).ok()?;
        let tree = parser.parse(text, None)?;

        let lines: Vec<&str> = text.split_terminator('\n').collect();
        let mut rows = vec![RowMarks::default(); lines.len()];

        // Walk every node in the tree and record what each row contains.
        let mut ctx = VisitCtx {
            source: text.as_bytes(),
            comment_kinds: comment_node_kinds,
            docstring_stmt_kind,
            rows: &mut rows,
        };
        visit(tree.root_node(), &mut ctx);

        let mut raw = RawLineCounts::default();

        for (line, marks) in lines.iter().zip(rows.iter()) {
            raw.total_physical_lines += 1;

            if line.trim().is_empty() {
                raw.blank_only_lines += 1;
            } else if marks.has_docstring && !marks.has_code {
                raw.docstring_comment_lines += 1;
            } else if marks.has_comment {
                // Mixed vs. comment-only depends on code being present; single vs. multi
                // depends on what kind of comment is on the row.
                match (marks.has_code, marks.comment_is_block) {
                    (true, true) => raw.mixed_code_multi_comment_lines += 1,
                    (true, false) => raw.mixed_code_single_comment_lines += 1,
                    (false, true) => raw.multi_comment_only_lines += 1,
                    (false, false) => raw.single_comment_only_lines += 1,
                }
            } else {
                // Non-blank rows without any comment or docstring mark are treated as code.
                raw.code_only_lines += 1;
            }
        }

        Some(RawFileAnalysis {
            raw,
            parse_mode: ParseMode::TreeSitter,
            warnings: Vec::new(),
        })
    }

    /// Recursively walks `node` and records comment, docstring and code rows in `ctx`.
    ///
    /// Comment nodes and docstring statements are terminal for the walk: their children
    /// are not visited, so their contents are never marked as code.
    fn visit(node: Node, ctx: &mut VisitCtx<'_>) {
        let kind = node.kind();
        let first_row = node.start_position().row;
        let last_row = node.end_position().row;

        if ctx.comment_kinds.contains(&kind) {
            // The first two bytes decide the comment flavour; unreadable or too-short
            // text is treated as a single-line comment.
            let opener = node
                .utf8_text(ctx.source)
                .ok()
                .and_then(|comment| comment.get(..2))
                .unwrap_or("");
            let is_block = matches!(opener, "/*" | "<#");
            mark_rows(ctx.rows, first_row, last_row, |row| {
                row.has_comment = true;
                if is_block {
                    row.comment_is_block = true;
                }
            });
            return;
        }

        // Python docstring: expression_statement whose only named child is a string literal
        let docstring_child = ctx
            .docstring_stmt_kind
            .filter(|stmt_kind| kind == *stmt_kind && node.named_child_count() == 1)
            .and_then(|_| node.named_child(0))
            .filter(|child| child.kind() == "string");
        if let Some(child) = docstring_child {
            mark_rows(
                ctx.rows,
                child.start_position().row,
                child.end_position().row,
                |row| row.has_docstring = true,
            );
            return;
        }

        // Leaf non-comment node: mark as code.
        if node.child_count() == 0 && !node.is_extra() {
            mark_rows(ctx.rows, first_row, last_row, |row| row.has_code = true);
            return;
        }

        for i in 0..node.child_count() {
            if let Some(child) = node.child(i) {
                visit(child, ctx);
            }
        }
    }

    /// Parse C or C++ source with tree-sitter-c.
    pub fn analyze_c(text: &str) -> Option<RawFileAnalysis> {
        analyze_lines(text, tree_sitter_c::language(), &["comment"], None)
    }

    /// Parse Python source with tree-sitter-python.
    pub fn analyze_python(text: &str) -> Option<RawFileAnalysis> {
        analyze_lines(
            text,
            tree_sitter_python::language(),
            &["comment"],
            Some("expression_statement"),
        )
    }
}
2169
#[cfg(test)]
mod tests {
    use super::*;

    /// Module and function docstrings are counted separately from code and from
    /// code lines that carry a trailing comment.
    #[test]
    fn python_docstrings_are_separated() {
        // The fixture must be valid Python: the module docstring closes with exactly
        // three quotes.
        let input = r####""""module docs"""


def fn_a():
    """function docs"""
    value = 1  # trailing comment
    return value
"####;

        let result = analyze_text(Language::Python, input, AnalysisOptions::default());
        assert_eq!(result.raw.docstring_comment_lines, 2);
        assert_eq!(result.raw.mixed_code_single_comment_lines, 1);
        assert_eq!(result.raw.code_only_lines, 2);
    }

    /// A C line with code followed by `//` is mixed; a line holding only `/* … */`
    /// is a block-comment-only line.
    #[test]
    fn c_style_mixed_lines_are_captured() {
        let input = "int x = 1; // note\n/* block */\n";
        let result = analyze_text(Language::C, input, AnalysisOptions::default());
        assert_eq!(result.raw.mixed_code_single_comment_lines, 1);
        assert_eq!(result.raw.multi_comment_only_lines, 1);
    }

    /// An extension-less file is identified through its shebang line.
    #[test]
    fn detect_language_by_shebang() {
        let language = detect_language(
            Path::new("script"),
            Some("#!/usr/bin/env bash"),
            &BTreeMap::new(),
            true,
        );
        assert_eq!(language, Some(Language::Shell));
    }
}