1use std::collections::{BTreeMap, BTreeSet, HashSet};
5use std::path::Path;
6
7use serde::{Deserialize, Serialize};
8
9#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
10#[serde(rename_all = "snake_case")]
11pub enum Language {
12 C,
13 Cpp,
14 CSharp,
15 Go,
16 Java,
17 JavaScript,
18 Python,
19 Rust,
20 Shell,
21 PowerShell,
22 TypeScript,
23 Assembly,
25 Clojure,
26 Css,
27 Dart,
28 Dockerfile,
29 Elixir,
30 Erlang,
31 FSharp,
32 Groovy,
33 Haskell,
34 Html,
35 Julia,
36 Kotlin,
37 Lua,
38 Makefile,
39 Nim,
40 ObjectiveC,
41 Ocaml,
42 Perl,
43 Php,
44 R,
45 Ruby,
46 Scala,
47 Scss,
48 Sql,
49 Svelte,
50 Swift,
51 Vue,
52 Xml,
53 Zig,
54}
55
56impl Language {
57 #[must_use]
58 pub const fn display_name(&self) -> &'static str {
59 match self {
60 Self::C => "C",
61 Self::Cpp => "C++",
62 Self::CSharp => "C#",
63 Self::Go => "Go",
64 Self::Java => "Java",
65 Self::JavaScript => "JavaScript",
66 Self::Python => "Python",
67 Self::Rust => "Rust",
68 Self::Shell => "Shell",
69 Self::PowerShell => "PowerShell",
70 Self::TypeScript => "TypeScript",
71 Self::Assembly => "Assembly",
72 Self::Clojure => "Clojure",
73 Self::Css => "CSS",
74 Self::Dart => "Dart",
75 Self::Dockerfile => "Dockerfile",
76 Self::Elixir => "Elixir",
77 Self::Erlang => "Erlang",
78 Self::FSharp => "F#",
79 Self::Groovy => "Groovy",
80 Self::Haskell => "Haskell",
81 Self::Html => "HTML",
82 Self::Julia => "Julia",
83 Self::Kotlin => "Kotlin",
84 Self::Lua => "Lua",
85 Self::Makefile => "Makefile",
86 Self::Nim => "Nim",
87 Self::ObjectiveC => "Objective-C",
88 Self::Ocaml => "OCaml",
89 Self::Perl => "Perl",
90 Self::Php => "PHP",
91 Self::R => "R",
92 Self::Ruby => "Ruby",
93 Self::Scala => "Scala",
94 Self::Scss => "SCSS",
95 Self::Sql => "SQL",
96 Self::Svelte => "Svelte",
97 Self::Swift => "Swift",
98 Self::Vue => "Vue",
99 Self::Xml => "XML",
100 Self::Zig => "Zig",
101 }
102 }
103
104 #[must_use]
105 pub const fn as_slug(&self) -> &'static str {
106 match self {
107 Self::C => "c",
108 Self::Cpp => "cpp",
109 Self::CSharp => "csharp",
110 Self::Go => "go",
111 Self::Java => "java",
112 Self::JavaScript => "javascript",
113 Self::Python => "python",
114 Self::Rust => "rust",
115 Self::Shell => "shell",
116 Self::PowerShell => "powershell",
117 Self::TypeScript => "typescript",
118 Self::Assembly => "assembly",
119 Self::Clojure => "clojure",
120 Self::Css => "css",
121 Self::Dart => "dart",
122 Self::Dockerfile => "dockerfile",
123 Self::Elixir => "elixir",
124 Self::Erlang => "erlang",
125 Self::FSharp => "fsharp",
126 Self::Groovy => "groovy",
127 Self::Haskell => "haskell",
128 Self::Html => "html",
129 Self::Julia => "julia",
130 Self::Kotlin => "kotlin",
131 Self::Lua => "lua",
132 Self::Makefile => "makefile",
133 Self::Nim => "nim",
134 Self::ObjectiveC => "objectivec",
135 Self::Ocaml => "ocaml",
136 Self::Perl => "perl",
137 Self::Php => "php",
138 Self::R => "r",
139 Self::Ruby => "ruby",
140 Self::Scala => "scala",
141 Self::Scss => "scss",
142 Self::Sql => "sql",
143 Self::Svelte => "svelte",
144 Self::Swift => "swift",
145 Self::Vue => "vue",
146 Self::Xml => "xml",
147 Self::Zig => "zig",
148 }
149 }
150
151 #[must_use]
152 pub fn from_name(name: &str) -> Option<Self> {
153 match name.trim().to_ascii_lowercase().as_str() {
154 "c" => Some(Self::C),
155 "cpp" | "c++" | "cplusplus" => Some(Self::Cpp),
156 "csharp" | "c#" | "cs" => Some(Self::CSharp),
157 "go" | "golang" => Some(Self::Go),
158 "java" => Some(Self::Java),
159 "javascript" | "js" => Some(Self::JavaScript),
160 "python" | "py" => Some(Self::Python),
161 "rust" | "rs" => Some(Self::Rust),
162 "shell" | "sh" | "bash" => Some(Self::Shell),
163 "powershell" | "pwsh" | "ps" => Some(Self::PowerShell),
164 "typescript" | "ts" => Some(Self::TypeScript),
165 "assembly" | "asm" => Some(Self::Assembly),
166 "clojure" | "clj" => Some(Self::Clojure),
167 "css" => Some(Self::Css),
168 "dart" => Some(Self::Dart),
169 "dockerfile" | "docker" => Some(Self::Dockerfile),
170 "elixir" | "ex" => Some(Self::Elixir),
171 "erlang" | "erl" => Some(Self::Erlang),
172 "fsharp" | "f#" | "fs" => Some(Self::FSharp),
173 "groovy" => Some(Self::Groovy),
174 "haskell" | "hs" => Some(Self::Haskell),
175 "html" | "htm" => Some(Self::Html),
176 "julia" | "jl" => Some(Self::Julia),
177 "kotlin" | "kt" => Some(Self::Kotlin),
178 "lua" => Some(Self::Lua),
179 "makefile" | "make" | "mk" => Some(Self::Makefile),
180 "nim" => Some(Self::Nim),
181 "objectivec" | "objc" | "objective-c" => Some(Self::ObjectiveC),
182 "ocaml" | "ml" => Some(Self::Ocaml),
183 "perl" | "pl" => Some(Self::Perl),
184 "php" => Some(Self::Php),
185 "r" => Some(Self::R),
186 "ruby" | "rb" => Some(Self::Ruby),
187 "scala" => Some(Self::Scala),
188 "scss" | "sass" => Some(Self::Scss),
189 "sql" => Some(Self::Sql),
190 "svelte" => Some(Self::Svelte),
191 "swift" => Some(Self::Swift),
192 "vue" => Some(Self::Vue),
193 "xml" => Some(Self::Xml),
194 "zig" => Some(Self::Zig),
195 _ => None,
196 }
197 }
198}
199
200#[derive(Debug, Clone, Serialize, Deserialize, Default)]
201pub struct RawLineCounts {
202 pub total_physical_lines: u64,
203 pub blank_only_lines: u64,
204 pub code_only_lines: u64,
205 pub single_comment_only_lines: u64,
206 pub multi_comment_only_lines: u64,
207 pub mixed_code_single_comment_lines: u64,
208 pub mixed_code_multi_comment_lines: u64,
209 pub docstring_comment_lines: u64,
210 pub skipped_unknown_lines: u64,
211 #[serde(default)]
213 pub functions: u64,
214 #[serde(default)]
216 pub classes: u64,
217 #[serde(default)]
219 pub variables: u64,
220 #[serde(default)]
222 pub imports: u64,
223 #[serde(default)]
227 pub compiler_directive_lines: u64,
228}
229
230#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
231#[serde(rename_all = "snake_case")]
232pub enum ParseMode {
233 Lexical,
234 LexicalBestEffort,
235 TreeSitter,
236}
237
238#[derive(Debug, Clone, Serialize, Deserialize)]
239pub struct RawFileAnalysis {
240 pub raw: RawLineCounts,
241 pub parse_mode: ParseMode,
242 pub warnings: Vec<String>,
243}
244
/// Caller-selectable switches that `analyze_text` copies into the scanner
/// flags.
#[derive(Clone, Copy, Debug)]
pub struct AnalysisOptions {
    /// NOTE(review): presumably makes blank lines inside a block comment count
    /// as comment lines — confirm against the scanner.
    pub blank_in_block_comment_as_comment: bool,
    /// NOTE(review): presumably folds continuation lines into one logical
    /// line — confirm against the scanner.
    pub collapse_continuation_lines: bool,
}

impl Default for AnalysisOptions {
    /// Defaults: `blank_in_block_comment_as_comment` on,
    /// `collapse_continuation_lines` off.
    fn default() -> Self {
        // Written by hand because one of the defaults is `true`, which
        // `#[derive(Default)]` cannot express.
        Self {
            collapse_continuation_lines: false,
            blank_in_block_comment_as_comment: true,
        }
    }
}
267
268#[must_use]
269pub fn supported_languages() -> BTreeSet<Language> {
270 [
271 Language::Assembly,
272 Language::C,
273 Language::Clojure,
274 Language::Cpp,
275 Language::CSharp,
276 Language::Css,
277 Language::Dart,
278 Language::Dockerfile,
279 Language::Elixir,
280 Language::Erlang,
281 Language::FSharp,
282 Language::Go,
283 Language::Groovy,
284 Language::Haskell,
285 Language::Html,
286 Language::Java,
287 Language::JavaScript,
288 Language::Julia,
289 Language::Kotlin,
290 Language::Lua,
291 Language::Makefile,
292 Language::Nim,
293 Language::ObjectiveC,
294 Language::Ocaml,
295 Language::Perl,
296 Language::Php,
297 Language::PowerShell,
298 Language::Python,
299 Language::R,
300 Language::Ruby,
301 Language::Rust,
302 Language::Scala,
303 Language::Scss,
304 Language::Shell,
305 Language::Sql,
306 Language::Svelte,
307 Language::Swift,
308 Language::TypeScript,
309 Language::Vue,
310 Language::Xml,
311 Language::Zig,
312 ]
313 .into_iter()
314 .collect()
315}
316
317fn detect_by_shebang(line: &str) -> Option<Language> {
319 let lower = line.to_ascii_lowercase();
320 if !lower.starts_with("#!") {
321 return None;
322 }
323 if lower.contains("python") {
324 return Some(Language::Python);
325 }
326 if lower.contains("pwsh") || lower.contains("powershell") {
327 return Some(Language::PowerShell);
328 }
329 if lower.contains("bash")
330 || lower.contains("/sh")
331 || lower.contains("zsh")
332 || lower.contains("ksh")
333 {
334 return Some(Language::Shell);
335 }
336 if lower.contains("ruby") {
337 return Some(Language::Ruby);
338 }
339 if lower.contains("perl") {
340 return Some(Language::Perl);
341 }
342 if lower.contains("php") {
343 return Some(Language::Php);
344 }
345 if lower.contains("node") || lower.contains("nodejs") {
346 return Some(Language::JavaScript);
347 }
348 None
349}
350
351fn detect_by_extension(ext: &str) -> Option<Language> {
353 match ext {
354 "c" | "h" => Some(Language::C),
356 "cc" | "cp" | "cpp" | "cxx" | "hh" | "hpp" | "hxx" => Some(Language::Cpp),
357 "cs" => Some(Language::CSharp),
358 "go" => Some(Language::Go),
359 "java" => Some(Language::Java),
360 "js" | "mjs" | "cjs" => Some(Language::JavaScript),
361 "py" => Some(Language::Python),
362 "rs" => Some(Language::Rust),
363 "sh" | "bash" | "zsh" | "ksh" => Some(Language::Shell),
364 "ps1" | "psm1" | "psd1" => Some(Language::PowerShell),
365 "ts" | "mts" | "cts" => Some(Language::TypeScript),
366 "asm" | "s" => Some(Language::Assembly),
368 "clj" | "cljs" | "cljc" | "edn" => Some(Language::Clojure),
369 "css" => Some(Language::Css),
370 "dart" => Some(Language::Dart),
371 "ex" | "exs" => Some(Language::Elixir),
372 "erl" | "hrl" => Some(Language::Erlang),
373 "fs" | "fsi" | "fsx" => Some(Language::FSharp),
374 "groovy" | "gradle" => Some(Language::Groovy),
375 "hs" | "lhs" => Some(Language::Haskell),
376 "html" | "htm" | "xhtml" => Some(Language::Html),
377 "jl" => Some(Language::Julia),
378 "kt" | "kts" => Some(Language::Kotlin),
379 "lua" => Some(Language::Lua),
380 "mk" => Some(Language::Makefile),
381 "nim" | "nims" => Some(Language::Nim),
382 "m" | "mm" => Some(Language::ObjectiveC),
383 "ml" | "mli" => Some(Language::Ocaml),
384 "pl" | "pm" | "t" => Some(Language::Perl),
385 "php" | "php3" | "php4" | "php5" | "php7" | "phtml" => Some(Language::Php),
386 "r" => Some(Language::R),
387 "rb" | "rake" => Some(Language::Ruby),
388 "scala" | "sc" => Some(Language::Scala),
389 "scss" | "sass" => Some(Language::Scss),
390 "sql" => Some(Language::Sql),
391 "svelte" => Some(Language::Svelte),
392 "swift" => Some(Language::Swift),
393 "vue" => Some(Language::Vue),
394 "xml" | "xsd" | "xsl" | "xslt" | "svg" => Some(Language::Xml),
395 "zig" => Some(Language::Zig),
396 _ => None,
397 }
398}
399
400fn detect_by_filename(filename: &str, filename_lower: &str) -> Option<Language> {
402 if filename == "Dockerfile"
404 || filename.starts_with("Dockerfile.")
405 || filename_lower == "dockerfile"
406 {
407 return Some(Language::Dockerfile);
408 }
409 if matches!(
411 filename,
412 "Makefile" | "GNUmakefile" | "makefile" | "BSDmakefile"
413 ) {
414 return Some(Language::Makefile);
415 }
416 if matches!(
418 filename,
419 "Rakefile" | "Gemfile" | "Guardfile" | "Vagrantfile" | "Fastfile" | "Podfile"
420 ) {
421 return Some(Language::Ruby);
422 }
423 None
424}
425
426#[must_use]
427#[allow(clippy::too_many_lines)]
428pub fn detect_language(
429 path: &Path,
430 first_line: Option<&str>,
431 extension_overrides: &BTreeMap<String, String>,
432 shebang_detection: bool,
433) -> Option<Language> {
434 let extension = path
435 .extension()
436 .and_then(|ext| ext.to_str())
437 .map(str::to_ascii_lowercase);
438
439 if let Some(ext) = extension.as_ref() {
441 if let Some(override_name) = extension_overrides.get(ext.as_str()) {
442 if let Some(lang) = Language::from_name(override_name) {
443 return Some(lang);
444 }
445 }
446 }
447
448 let filename = path.file_name().and_then(|s| s.to_str()).unwrap_or("");
450 let filename_lower = filename.to_ascii_lowercase();
451
452 if let Some(lang) = detect_by_filename(filename, &filename_lower) {
453 return Some(lang);
454 }
455
456 if let Some(lang) = extension.as_deref().and_then(detect_by_extension) {
458 return Some(lang);
459 }
460
461 if shebang_detection {
463 if let Some(line) = first_line {
464 if let Some(lang) = detect_by_shebang(line) {
465 return Some(lang);
466 }
467 }
468 }
469
470 None
471}
472
473#[must_use]
474#[allow(clippy::too_many_lines)]
475pub fn analyze_text(language: Language, text: &str, options: AnalysisOptions) -> RawFileAnalysis {
476 let base = IeeeFlags {
478 has_preprocessor_directives: false,
479 blank_in_block_comment_as_comment: options.blank_in_block_comment_as_comment,
480 collapse_continuation_lines: options.collapse_continuation_lines,
481 };
482 let cpp = IeeeFlags {
485 has_preprocessor_directives: true,
486 ..base
487 };
488
489 match language {
490 Language::C => {
491 #[cfg(feature = "tree-sitter")]
492 if let Some(result) = ts::analyze_c(text) {
493 return result;
494 }
495 analyze_generic(
496 text,
497 ScanConfig {
498 line_comments: &["//"],
499 block_comment: Some(("/*", "*/")),
500 allow_single_quote_strings: true,
501 allow_double_quote_strings: true,
502 allow_triple_quote_strings: false,
503 allow_csharp_verbatim_strings: false,
504 skip_lines: HashSet::new(),
505 symbol_patterns: SP_C,
506 },
507 cpp,
508 )
509 }
510 Language::Cpp => {
511 #[cfg(feature = "tree-sitter")]
513 if let Some(result) = ts::analyze_c(text) {
514 return result;
515 }
516 analyze_generic(
517 text,
518 ScanConfig {
519 line_comments: &["//"],
520 block_comment: Some(("/*", "*/")),
521 allow_single_quote_strings: true,
522 allow_double_quote_strings: true,
523 allow_triple_quote_strings: false,
524 allow_csharp_verbatim_strings: false,
525 skip_lines: HashSet::new(),
526 symbol_patterns: SP_CPP,
527 },
528 cpp,
529 )
530 }
531 Language::CSharp => analyze_generic(
532 text,
533 ScanConfig {
534 line_comments: &["//"],
535 block_comment: Some(("/*", "*/")),
536 allow_single_quote_strings: true,
537 allow_double_quote_strings: true,
538 allow_triple_quote_strings: false,
539 allow_csharp_verbatim_strings: true,
540 skip_lines: HashSet::new(),
541 symbol_patterns: SP_CSHARP,
542 },
543 base,
544 ),
545 Language::Go => analyze_generic(
546 text,
547 ScanConfig {
548 line_comments: &["//"],
549 block_comment: Some(("/*", "*/")),
550 allow_single_quote_strings: true,
551 allow_double_quote_strings: true,
552 allow_triple_quote_strings: false,
553 allow_csharp_verbatim_strings: false,
554 skip_lines: HashSet::new(),
555 symbol_patterns: SP_GO,
556 },
557 base,
558 ),
559 Language::Java => analyze_generic(
560 text,
561 ScanConfig {
562 line_comments: &["//"],
563 block_comment: Some(("/*", "*/")),
564 allow_single_quote_strings: true,
565 allow_double_quote_strings: true,
566 allow_triple_quote_strings: false,
567 allow_csharp_verbatim_strings: false,
568 skip_lines: HashSet::new(),
569 symbol_patterns: SP_JAVA,
570 },
571 base,
572 ),
573 Language::JavaScript | Language::Svelte | Language::Vue => analyze_generic(
574 text,
575 ScanConfig {
576 line_comments: &["//"],
577 block_comment: Some(("/*", "*/")),
578 allow_single_quote_strings: true,
579 allow_double_quote_strings: true,
580 allow_triple_quote_strings: false,
581 allow_csharp_verbatim_strings: false,
582 skip_lines: HashSet::new(),
583 symbol_patterns: SP_JS,
584 },
585 base,
586 ),
587 Language::Rust => analyze_generic(
588 text,
589 ScanConfig {
590 line_comments: &["//"],
592 block_comment: Some(("/*", "*/")),
593 allow_single_quote_strings: false,
594 allow_double_quote_strings: true,
595 allow_triple_quote_strings: false,
596 allow_csharp_verbatim_strings: false,
597 skip_lines: HashSet::new(),
598 symbol_patterns: SP_RUST,
599 },
600 base,
601 ),
602 Language::Shell => analyze_generic(
603 text,
604 ScanConfig {
605 line_comments: &["#"],
606 block_comment: None,
607 allow_single_quote_strings: true,
608 allow_double_quote_strings: true,
609 allow_triple_quote_strings: false,
610 allow_csharp_verbatim_strings: false,
611 skip_lines: HashSet::new(),
612 symbol_patterns: SP_SHELL,
613 },
614 base,
615 ),
616 Language::PowerShell => analyze_generic(
617 text,
618 ScanConfig {
619 line_comments: &["#"],
620 block_comment: Some(("<#", "#>")),
621 allow_single_quote_strings: true,
622 allow_double_quote_strings: true,
623 allow_triple_quote_strings: false,
624 allow_csharp_verbatim_strings: false,
625 skip_lines: HashSet::new(),
626 symbol_patterns: SP_POWERSHELL,
627 },
628 base,
629 ),
630 Language::TypeScript => analyze_generic(
631 text,
632 ScanConfig {
633 line_comments: &["//"],
634 block_comment: Some(("/*", "*/")),
635 allow_single_quote_strings: true,
636 allow_double_quote_strings: true,
637 allow_triple_quote_strings: false,
638 allow_csharp_verbatim_strings: false,
639 skip_lines: HashSet::new(),
640 symbol_patterns: SP_TS,
641 },
642 base,
643 ),
644 Language::Python => {
645 #[cfg(feature = "tree-sitter")]
646 if let Some(result) = ts::analyze_python(text) {
647 return result;
648 }
649 let docstring_lines = detect_python_docstring_lines(text);
650 analyze_generic(
651 text,
652 ScanConfig {
653 line_comments: &["#"],
654 block_comment: None,
655 allow_single_quote_strings: true,
656 allow_double_quote_strings: true,
657 allow_triple_quote_strings: true,
658 allow_csharp_verbatim_strings: false,
659 skip_lines: docstring_lines,
660 symbol_patterns: SP_PYTHON,
661 },
662 base,
663 )
664 }
665 Language::Assembly => analyze_generic(
667 text,
668 ScanConfig {
669 line_comments: &[";"],
670 block_comment: None,
671 allow_single_quote_strings: false,
672 allow_double_quote_strings: false,
673 allow_triple_quote_strings: false,
674 allow_csharp_verbatim_strings: false,
675 skip_lines: HashSet::new(),
676 symbol_patterns: SP_ASSEMBLY,
677 },
678 base,
679 ),
680 Language::Clojure => analyze_generic(
681 text,
682 ScanConfig {
683 line_comments: &[";"],
684 block_comment: None,
685 allow_single_quote_strings: false,
686 allow_double_quote_strings: true,
687 allow_triple_quote_strings: false,
688 allow_csharp_verbatim_strings: false,
689 skip_lines: HashSet::new(),
690 symbol_patterns: SP_CLOJURE,
691 },
692 base,
693 ),
694 Language::Css => analyze_generic(
695 text,
696 ScanConfig {
697 line_comments: &[],
698 block_comment: Some(("/*", "*/")),
699 allow_single_quote_strings: true,
700 allow_double_quote_strings: true,
701 allow_triple_quote_strings: false,
702 allow_csharp_verbatim_strings: false,
703 skip_lines: HashSet::new(),
704 symbol_patterns: SP_NONE,
705 },
706 base,
707 ),
708 Language::Dart => analyze_generic(
709 text,
710 ScanConfig {
711 line_comments: &["//"],
712 block_comment: Some(("/*", "*/")),
713 allow_single_quote_strings: true,
714 allow_double_quote_strings: true,
715 allow_triple_quote_strings: false,
716 allow_csharp_verbatim_strings: false,
717 skip_lines: HashSet::new(),
718 symbol_patterns: SP_DART,
719 },
720 base,
721 ),
722 Language::Dockerfile | Language::Makefile => analyze_generic(
723 text,
724 ScanConfig {
725 line_comments: &["#"],
726 block_comment: None,
727 allow_single_quote_strings: false,
728 allow_double_quote_strings: false,
729 allow_triple_quote_strings: false,
730 allow_csharp_verbatim_strings: false,
731 skip_lines: HashSet::new(),
732 symbol_patterns: SP_NONE,
733 },
734 base,
735 ),
736 Language::Elixir => analyze_generic(
737 text,
738 ScanConfig {
739 line_comments: &["#"],
740 block_comment: None,
741 allow_single_quote_strings: true,
742 allow_double_quote_strings: true,
743 allow_triple_quote_strings: false,
744 allow_csharp_verbatim_strings: false,
745 skip_lines: HashSet::new(),
746 symbol_patterns: SP_ELIXIR,
747 },
748 base,
749 ),
750 Language::Erlang => analyze_generic(
751 text,
752 ScanConfig {
753 line_comments: &["%"],
754 block_comment: None,
755 allow_single_quote_strings: false,
756 allow_double_quote_strings: true,
757 allow_triple_quote_strings: false,
758 allow_csharp_verbatim_strings: false,
759 skip_lines: HashSet::new(),
760 symbol_patterns: SP_ERLANG,
761 },
762 base,
763 ),
764 Language::FSharp => analyze_generic(
765 text,
766 ScanConfig {
767 line_comments: &["//"],
768 block_comment: Some(("(*", "*)")),
769 allow_single_quote_strings: false,
770 allow_double_quote_strings: true,
771 allow_triple_quote_strings: false,
772 allow_csharp_verbatim_strings: false,
773 skip_lines: HashSet::new(),
774 symbol_patterns: SP_FSHARP,
775 },
776 base,
777 ),
778 Language::Groovy => analyze_generic(
779 text,
780 ScanConfig {
781 line_comments: &["//"],
782 block_comment: Some(("/*", "*/")),
783 allow_single_quote_strings: true,
784 allow_double_quote_strings: true,
785 allow_triple_quote_strings: false,
786 allow_csharp_verbatim_strings: false,
787 skip_lines: HashSet::new(),
788 symbol_patterns: SP_GROOVY,
789 },
790 base,
791 ),
792 Language::Haskell => analyze_generic(
793 text,
794 ScanConfig {
795 line_comments: &["--"],
796 block_comment: Some(("{-", "-}")),
797 allow_single_quote_strings: true,
798 allow_double_quote_strings: true,
799 allow_triple_quote_strings: false,
800 allow_csharp_verbatim_strings: false,
801 skip_lines: HashSet::new(),
802 symbol_patterns: SP_HASKELL,
803 },
804 base,
805 ),
806 Language::Html | Language::Xml => analyze_generic(
807 text,
808 ScanConfig {
809 line_comments: &[],
810 block_comment: Some(("<!--", "-->")),
811 allow_single_quote_strings: false,
812 allow_double_quote_strings: false,
813 allow_triple_quote_strings: false,
814 allow_csharp_verbatim_strings: false,
815 skip_lines: HashSet::new(),
816 symbol_patterns: SP_NONE,
817 },
818 base,
819 ),
820 Language::Julia => analyze_generic(
821 text,
822 ScanConfig {
823 line_comments: &["#"],
824 block_comment: Some(("#=", "=#")),
825 allow_single_quote_strings: false,
826 allow_double_quote_strings: true,
827 allow_triple_quote_strings: true,
828 allow_csharp_verbatim_strings: false,
829 skip_lines: HashSet::new(),
830 symbol_patterns: SP_JULIA,
831 },
832 base,
833 ),
834 Language::Kotlin => analyze_generic(
835 text,
836 ScanConfig {
837 line_comments: &["//"],
838 block_comment: Some(("/*", "*/")),
839 allow_single_quote_strings: true,
840 allow_double_quote_strings: true,
841 allow_triple_quote_strings: false,
842 allow_csharp_verbatim_strings: false,
843 skip_lines: HashSet::new(),
844 symbol_patterns: SP_KOTLIN,
845 },
846 base,
847 ),
848 Language::Lua => analyze_generic(
849 text,
850 ScanConfig {
851 line_comments: &["--"],
852 block_comment: Some(("--[[", "]]")),
853 allow_single_quote_strings: true,
854 allow_double_quote_strings: true,
855 allow_triple_quote_strings: false,
856 allow_csharp_verbatim_strings: false,
857 skip_lines: HashSet::new(),
858 symbol_patterns: SP_LUA,
859 },
860 base,
861 ),
862 Language::Nim => analyze_generic(
863 text,
864 ScanConfig {
865 line_comments: &["#"],
866 block_comment: Some(("#[", "]#")),
867 allow_single_quote_strings: true,
868 allow_double_quote_strings: true,
869 allow_triple_quote_strings: false,
870 allow_csharp_verbatim_strings: false,
871 skip_lines: HashSet::new(),
872 symbol_patterns: SP_NIM,
873 },
874 base,
875 ),
876 Language::ObjectiveC => analyze_generic(
877 text,
878 ScanConfig {
879 line_comments: &["//"],
880 block_comment: Some(("/*", "*/")),
881 allow_single_quote_strings: true,
882 allow_double_quote_strings: true,
883 allow_triple_quote_strings: false,
884 allow_csharp_verbatim_strings: false,
885 skip_lines: HashSet::new(),
886 symbol_patterns: SP_OBJECTIVEC,
887 },
888 cpp,
889 ),
890 Language::Ocaml => analyze_generic(
891 text,
892 ScanConfig {
893 line_comments: &[],
894 block_comment: Some(("(*", "*)")),
895 allow_single_quote_strings: false,
896 allow_double_quote_strings: true,
897 allow_triple_quote_strings: false,
898 allow_csharp_verbatim_strings: false,
899 skip_lines: HashSet::new(),
900 symbol_patterns: SP_OCAML,
901 },
902 base,
903 ),
904 Language::Perl => analyze_generic(
905 text,
906 ScanConfig {
907 line_comments: &["#"],
908 block_comment: None,
909 allow_single_quote_strings: true,
910 allow_double_quote_strings: true,
911 allow_triple_quote_strings: false,
912 allow_csharp_verbatim_strings: false,
913 skip_lines: HashSet::new(),
914 symbol_patterns: SP_PERL,
915 },
916 base,
917 ),
918 Language::Php => analyze_generic(
919 text,
920 ScanConfig {
921 line_comments: &["//", "#"],
922 block_comment: Some(("/*", "*/")),
923 allow_single_quote_strings: true,
924 allow_double_quote_strings: true,
925 allow_triple_quote_strings: false,
926 allow_csharp_verbatim_strings: false,
927 skip_lines: HashSet::new(),
928 symbol_patterns: SP_PHP,
929 },
930 base,
931 ),
932 Language::R => analyze_generic(
933 text,
934 ScanConfig {
935 line_comments: &["#"],
936 block_comment: None,
937 allow_single_quote_strings: true,
938 allow_double_quote_strings: true,
939 allow_triple_quote_strings: false,
940 allow_csharp_verbatim_strings: false,
941 skip_lines: HashSet::new(),
942 symbol_patterns: SP_R,
943 },
944 base,
945 ),
946 Language::Ruby => analyze_generic(
947 text,
948 ScanConfig {
949 line_comments: &["#"],
950 block_comment: None,
951 allow_single_quote_strings: true,
952 allow_double_quote_strings: true,
953 allow_triple_quote_strings: false,
954 allow_csharp_verbatim_strings: false,
955 skip_lines: HashSet::new(),
956 symbol_patterns: SP_RUBY,
957 },
958 base,
959 ),
960 Language::Scala => analyze_generic(
961 text,
962 ScanConfig {
963 line_comments: &["//"],
964 block_comment: Some(("/*", "*/")),
965 allow_single_quote_strings: true,
966 allow_double_quote_strings: true,
967 allow_triple_quote_strings: false,
968 allow_csharp_verbatim_strings: false,
969 skip_lines: HashSet::new(),
970 symbol_patterns: SP_SCALA,
971 },
972 base,
973 ),
974 Language::Scss => analyze_generic(
975 text,
976 ScanConfig {
977 line_comments: &["//"],
978 block_comment: Some(("/*", "*/")),
979 allow_single_quote_strings: true,
980 allow_double_quote_strings: true,
981 allow_triple_quote_strings: false,
982 allow_csharp_verbatim_strings: false,
983 skip_lines: HashSet::new(),
984 symbol_patterns: SP_NONE,
985 },
986 base,
987 ),
988 Language::Sql => analyze_generic(
989 text,
990 ScanConfig {
991 line_comments: &["--"],
992 block_comment: Some(("/*", "*/")),
993 allow_single_quote_strings: true,
994 allow_double_quote_strings: false,
995 allow_triple_quote_strings: false,
996 allow_csharp_verbatim_strings: false,
997 skip_lines: HashSet::new(),
998 symbol_patterns: SP_SQL,
999 },
1000 base,
1001 ),
1002 Language::Swift => analyze_generic(
1003 text,
1004 ScanConfig {
1005 line_comments: &["//"],
1006 block_comment: Some(("/*", "*/")),
1007 allow_single_quote_strings: false,
1008 allow_double_quote_strings: true,
1009 allow_triple_quote_strings: false,
1010 allow_csharp_verbatim_strings: false,
1011 skip_lines: HashSet::new(),
1012 symbol_patterns: SP_SWIFT,
1013 },
1014 base,
1015 ),
1016 Language::Zig => analyze_generic(
1017 text,
1018 ScanConfig {
1019 line_comments: &["//"],
1020 block_comment: None,
1021 allow_single_quote_strings: true,
1022 allow_double_quote_strings: true,
1023 allow_triple_quote_strings: false,
1024 allow_csharp_verbatim_strings: false,
1025 skip_lines: HashSet::new(),
1026 symbol_patterns: SP_ZIG,
1027 },
1028 base,
1029 ),
1030 }
1031}
1032
/// Static text patterns a language supplies to the scanner, one list per
/// symbol category.
///
/// How the patterns are compared against source lines is decided by the
/// scanner, which is outside this view.
#[derive(Clone, Copy, Debug)]
struct SymbolPatterns {
    /// Patterns counted towards the `functions` tally.
    functions: &'static [&'static str],
    /// Patterns counted towards the `classes` tally.
    classes: &'static [&'static str],
    /// Patterns counted towards the `variables` tally.
    variables: &'static [&'static str],
    /// Patterns counted towards the `imports` tally.
    imports: &'static [&'static str],
}

impl SymbolPatterns {
    /// Builds a table in which every category is empty.
    const fn none() -> Self {
        const EMPTY: &[&str] = &[];
        Self {
            functions: EMPTY,
            classes: EMPTY,
            variables: EMPTY,
            imports: EMPTY,
        }
    }
}

/// Shared empty table for languages that define no symbol patterns.
const SP_NONE: SymbolPatterns = SymbolPatterns::none();
1056
/// Symbol patterns for Rust, used by the `Language::Rust` arm of
/// `analyze_text`.
///
/// Entries are literal text fragments grouped by category; how the scanner
/// compares them with a source line is defined outside this view.
const SP_RUST: SymbolPatterns = SymbolPatterns {
    // `fn` items under the listed visibility / qualifier combinations.
    functions: &[
        "fn ",
        "pub fn ",
        "pub(crate) fn ",
        "pub(super) fn ",
        "async fn ",
        "pub async fn ",
        "pub(crate) async fn ",
        "unsafe fn ",
        "pub unsafe fn ",
        "pub(crate) unsafe fn ",
        "const fn ",
        "pub const fn ",
        "pub(crate) const fn ",
        "extern fn ",
        "pub extern fn ",
    ],
    // Type-like items: structs, enums, traits, impl blocks and type aliases.
    classes: &[
        "struct ",
        "pub struct ",
        "pub(crate) struct ",
        "enum ",
        "pub enum ",
        "pub(crate) enum ",
        "trait ",
        "pub trait ",
        "pub(crate) trait ",
        "impl ",
        "impl<",
        "type ",
        "pub type ",
        "pub(crate) type ",
    ],
    variables: &["let ", "let mut "],
    imports: &["use ", "pub use ", "pub(crate) use ", "extern crate "],
};
1094
/// Symbol patterns for Python, used by the `Language::Python` arm of
/// `analyze_text`. No variable patterns are defined.
const SP_PYTHON: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "async def "],
    classes: &["class "],
    variables: &[],
    imports: &["import ", "from "],
};
1101
/// Symbol patterns for JavaScript; `analyze_text` also applies them to Svelte
/// and Vue files.
const SP_JS: SymbolPatterns = SymbolPatterns {
    functions: &[
        "function ",
        "async function ",
        "export function ",
        "export async function ",
        "export default function ",
    ],
    classes: &["class ", "export class ", "export default class "],
    variables: &[
        "var ",
        "let ",
        "const ",
        "export var ",
        "export let ",
        "export const ",
    ],
    imports: &["import "],
};
1121
/// Symbol patterns for TypeScript, used by the `Language::TypeScript` arm of
/// `analyze_text`.
///
/// Functions, variables and imports mirror the JavaScript table; the class
/// list additionally covers `abstract`, `interface` and `declare` forms.
const SP_TS: SymbolPatterns = SymbolPatterns {
    functions: &[
        "function ",
        "async function ",
        "export function ",
        "export async function ",
        "export default function ",
    ],
    classes: &[
        "class ",
        "export class ",
        "export default class ",
        "abstract class ",
        "export abstract class ",
        "interface ",
        "export interface ",
        "declare class ",
        "declare interface ",
    ],
    variables: &[
        "var ",
        "let ",
        "const ",
        "export var ",
        "export let ",
        "export const ",
    ],
    imports: &["import "],
};
1151
/// Symbol patterns for Go, used by the `Language::Go` arm of `analyze_text`.
/// `type ` declarations are counted in the `classes` category.
const SP_GO: SymbolPatterns = SymbolPatterns {
    functions: &["func "],
    classes: &["type "],
    variables: &["var "],
    imports: &["import "],
};
1158
/// Symbol patterns for Java, used by the `Language::Java` arm of
/// `analyze_text`.
///
/// Only type declarations and imports are listed; no function or variable
/// patterns are defined.
const SP_JAVA: SymbolPatterns = SymbolPatterns {
    functions: &[],
    // Classes, interfaces, enums, records and annotation types (`@interface`).
    classes: &[
        "class ",
        "public class ",
        "private class ",
        "protected class ",
        "abstract class ",
        "final class ",
        "public abstract class ",
        "public final class ",
        "interface ",
        "public interface ",
        "enum ",
        "public enum ",
        "record ",
        "public record ",
        "@interface ",
    ],
    variables: &[],
    imports: &["import "],
};
1181
/// Symbol patterns for C#, used by the `Language::CSharp` arm of
/// `analyze_text`. No function patterns are defined.
const SP_CSHARP: SymbolPatterns = SymbolPatterns {
    functions: &[],
    // Classes, interfaces, enums, structs and records under common modifiers.
    classes: &[
        "class ",
        "public class ",
        "private class ",
        "protected class ",
        "internal class ",
        "abstract class ",
        "sealed class ",
        "static class ",
        "partial class ",
        "public abstract class ",
        "public sealed class ",
        "public static class ",
        "interface ",
        "public interface ",
        "internal interface ",
        "enum ",
        "public enum ",
        "struct ",
        "public struct ",
        "record ",
        "public record ",
    ],
    variables: &["var "],
    imports: &["using "],
};
1210
/// Symbol patterns for C, used by the `Language::C` arm of `analyze_text`.
///
/// Aggregate declarations are counted as `classes` and `#include ` lines as
/// imports; no function or variable patterns are defined.
const SP_C: SymbolPatterns = SymbolPatterns {
    functions: &[],
    classes: &[
        "struct ",
        "typedef struct ",
        "union ",
        "typedef union ",
        "typedef enum ",
    ],
    variables: &[],
    imports: &["#include "],
};
1223
/// Symbol patterns for C++, used by the `Language::Cpp` arm of `analyze_text`.
///
/// `namespace ` and `template ` are counted in the `classes` category alongside
/// `class ` and `struct `; no function or variable patterns are defined.
const SP_CPP: SymbolPatterns = SymbolPatterns {
    functions: &[],
    classes: &["class ", "struct ", "namespace ", "template "],
    variables: &[],
    imports: &["#include "],
};
1230
/// Symbol patterns for shell scripts, used by the `Language::Shell` arm of
/// `analyze_text`. No class patterns are defined.
const SP_SHELL: SymbolPatterns = SymbolPatterns {
    functions: &["function "],
    classes: &[],
    variables: &["declare ", "local ", "export "],
    // `. ` is the POSIX spelling of `source`.
    imports: &["source ", ". "],
};
1237
/// Symbol patterns for PowerShell, used by the `Language::PowerShell` arm of
/// `analyze_text`. No variable patterns are defined.
const SP_POWERSHELL: SymbolPatterns = SymbolPatterns {
    // NOTE(review): both capitalizations are listed, which suggests the scanner
    // compares case-sensitively — confirm against the matcher.
    functions: &["function ", "Function "],
    classes: &["class "],
    variables: &[],
    imports: &["Import-Module ", "using "],
};
1244
/// Symbol patterns for Kotlin, used by the `Language::Kotlin` arm of
/// `analyze_text`.
const SP_KOTLIN: SymbolPatterns = SymbolPatterns {
    functions: &[
        "fun ",
        "private fun ",
        "public fun ",
        "protected fun ",
        "internal fun ",
        "override fun ",
        "suspend fun ",
        "abstract fun ",
        "open fun ",
        "private suspend fun ",
        "public suspend fun ",
    ],
    // Note: "companion object" is the only entry without a trailing space.
    classes: &[
        "class ",
        "data class ",
        "sealed class ",
        "abstract class ",
        "open class ",
        "object ",
        "companion object",
        "interface ",
        "enum class ",
        "annotation class ",
    ],
    variables: &["val ", "var ", "private val ", "private var ", "const val "],
    imports: &["import "],
};
1274
/// Symbol prefixes for Swift sources.
///
/// Access/modifier combinations are enumerated explicitly; `protocol `,
/// `extension ` and `actor ` are tallied under `classes`.
const SP_SWIFT: SymbolPatterns = SymbolPatterns {
    functions: &[
        "func ",
        "private func ",
        "public func ",
        "internal func ",
        "override func ",
        "open func ",
        "static func ",
        "class func ",
        "mutating func ",
        "private static func ",
        "public static func ",
    ],
    classes: &[
        "class ",
        "struct ",
        "protocol ",
        "enum ",
        "extension ",
        "actor ",
        "public class ",
        "private class ",
        "open class ",
        "final class ",
        "public struct ",
        "private struct ",
        "public protocol ",
    ],
    variables: &[
        "var ",
        "let ",
        "private var ",
        "private let ",
        "static var ",
        "static let ",
    ],
    imports: &["import "],
};
1314
/// Symbol prefixes for Ruby sources.
///
/// `module ` is tallied under `classes`; no variable prefixes are listed.
const SP_RUBY: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "private def ", "protected def "],
    classes: &["class ", "module "],
    variables: &[],
    imports: &["require ", "require_relative "],
};
1321
/// Symbol prefixes for Scala sources.
///
/// `object ` and `trait ` are tallied under `classes`.
const SP_SCALA: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "private def ", "protected def ", "override def "],
    classes: &[
        "class ",
        "case class ",
        "abstract class ",
        "sealed class ",
        "object ",
        "trait ",
    ],
    variables: &["val ", "var ", "lazy val "],
    imports: &["import "],
};
1335
/// Symbol prefixes for PHP sources.
///
/// Visibility/`static` combinations for functions are enumerated explicitly;
/// no variable prefixes are listed.
const SP_PHP: SymbolPatterns = SymbolPatterns {
    functions: &[
        "function ",
        "public function ",
        "private function ",
        "protected function ",
        "static function ",
        "abstract function ",
        "final function ",
        "public static function ",
        "private static function ",
        "protected static function ",
    ],
    classes: &[
        "class ",
        "abstract class ",
        "final class ",
        "interface ",
        "trait ",
        "enum ",
    ],
    variables: &[],
    imports: &[
        "use ",
        "require ",
        "require_once ",
        "include ",
        "include_once ",
    ],
};
1366
/// Symbol prefixes for Elixir sources.
///
/// Macro and guard definitions are tallied under `functions`; module,
/// protocol and implementation openers under `classes`.
const SP_ELIXIR: SymbolPatterns = SymbolPatterns {
    functions: &[
        "def ",
        "defp ",
        "defmacro ",
        "defmacrop ",
        "defguard ",
        "defguardp ",
    ],
    classes: &["defmodule ", "defprotocol ", "defimpl "],
    variables: &[],
    imports: &["import ", "alias ", "use ", "require "],
};
1380
/// Symbol prefixes for Erlang sources.
///
/// Only module attributes are listed (`-module(`, `-import(`, `-include(`,
/// `-include_lib(`); no function or variable prefixes are defined.
const SP_ERLANG: SymbolPatterns = SymbolPatterns {
    functions: &[],
    classes: &["-module("],
    variables: &[],
    imports: &["-import(", "-include(", "-include_lib("],
};
1387
/// Symbol prefixes for F# sources.
///
/// Every `let ` binding is tallied as a function. Because `"let "` is also a
/// prefix of `"let mutable "`, a `let mutable` line matches both the
/// `functions` and the `variables` group.
const SP_FSHARP: SymbolPatterns = SymbolPatterns {
    functions: &[
        "let ",
        "let rec ",
        "member ",
        "override ",
        "abstract member ",
    ],
    classes: &["type "],
    variables: &["let mutable "],
    imports: &["open "],
};
1400
/// Symbol prefixes for Groovy sources.
///
/// Only `def`-style declarations are listed for functions; no variable
/// prefixes are defined.
const SP_GROOVY: SymbolPatterns = SymbolPatterns {
    functions: &["def ", "private def ", "public def ", "protected def "],
    classes: &["class ", "abstract class ", "interface ", "enum ", "trait "],
    variables: &[],
    imports: &["import "],
};
1407
/// Symbol prefixes for Haskell sources.
///
/// Type-level declarations (`class `, `data `, `newtype `, `type `) are
/// tallied under `classes`; no function or variable prefixes are listed.
const SP_HASKELL: SymbolPatterns = SymbolPatterns {
    functions: &[],
    classes: &["class ", "data ", "newtype ", "type "],
    variables: &[],
    imports: &["import "],
};
1414
/// Symbol prefixes for Lua sources.
///
/// `"local "` is also a prefix of `"local function "`, so a local function
/// line matches both the `functions` and the `variables` group.
const SP_LUA: SymbolPatterns = SymbolPatterns {
    functions: &["function ", "local function "],
    classes: &[],
    variables: &["local "],
    imports: &[],
};
1421
/// Symbol prefixes for Nim sources.
///
/// All routine kinds (`proc`, `func`, `method`, `iterator`, `converter`,
/// `template`, `macro`) are tallied under `functions`.
const SP_NIM: SymbolPatterns = SymbolPatterns {
    functions: &[
        "proc ",
        "func ",
        "method ",
        "iterator ",
        "converter ",
        "template ",
        "macro ",
    ],
    classes: &["type "],
    variables: &["var ", "let ", "const "],
    imports: &["import ", "from "],
};
1436
/// Symbol prefixes for Objective-C sources.
///
/// Method lines are recognised by their `- (` / `+ (` openers (with a space
/// before the parenthesis).
const SP_OBJECTIVEC: SymbolPatterns = SymbolPatterns {
    functions: &["- (", "+ ("],
    classes: &["@interface ", "@implementation ", "@protocol "],
    variables: &[],
    imports: &["#import ", "#include "],
};
1443
/// Symbol prefixes for OCaml sources.
///
/// Every `let ` binding is tallied as a function; `type `, `module ` and
/// `class ` are tallied under `classes`.
const SP_OCAML: SymbolPatterns = SymbolPatterns {
    functions: &["let ", "let rec "],
    classes: &["type ", "module ", "class "],
    variables: &[],
    imports: &["open "],
};
1450
/// Symbol prefixes for Perl sources.
///
/// `package ` is tallied under `classes`; `my `/`our `/`local ` under
/// `variables`.
const SP_PERL: SymbolPatterns = SymbolPatterns {
    functions: &["sub "],
    classes: &["package "],
    variables: &["my ", "our ", "local "],
    imports: &["use ", "require "],
};
1457
/// Symbol prefixes for Clojure sources.
///
/// Every pattern includes the opening parenthesis of the form, so only forms
/// that begin the (trimmed) line are recognised.
const SP_CLOJURE: SymbolPatterns = SymbolPatterns {
    functions: &["(defn ", "(defn- ", "(defmacro ", "(defmulti "],
    classes: &[
        "(defrecord ",
        "(defprotocol ",
        "(deftype ",
        "(definterface ",
    ],
    variables: &["(def ", "(defonce "],
    imports: &["(ns ", "(require "],
};
1469
/// Symbol prefixes for Julia sources.
///
/// Only the long `function ` form and `macro ` are listed for functions;
/// struct and abstract/primitive type declarations go under `classes`.
const SP_JULIA: SymbolPatterns = SymbolPatterns {
    functions: &["function ", "macro "],
    classes: &[
        "struct ",
        "mutable struct ",
        "abstract type ",
        "primitive type ",
    ],
    variables: &["const "],
    imports: &["import ", "using "],
};
1481
/// Symbol prefixes for Dart sources.
///
/// No function prefixes are listed; `mixin ` and `extension ` are tallied
/// under `classes`.
const SP_DART: SymbolPatterns = SymbolPatterns {
    functions: &[],
    classes: &["class ", "abstract class ", "mixin ", "extension ", "enum "],
    variables: &["var ", "final ", "const ", "late "],
    imports: &["import "],
};
1488
/// Symbol prefixes for R sources.
///
/// Only `library(` and `source(` calls are recognised (as imports); no
/// function, class or variable prefixes are listed.
const SP_R: SymbolPatterns = SymbolPatterns {
    functions: &[],
    classes: &[],
    variables: &[],
    imports: &["library(", "source("],
};
1495
/// Symbol prefixes for SQL sources.
///
/// Each statement is listed in all-lowercase and all-uppercase spelling;
/// mixed-case spellings such as `Create Table` are not listed. Tables, views
/// and schemas are tallied under `classes`.
const SP_SQL: SymbolPatterns = SymbolPatterns {
    functions: &[
        "create function ",
        "create or replace function ",
        "create procedure ",
        "create or replace procedure ",
        "CREATE FUNCTION ",
        "CREATE OR REPLACE FUNCTION ",
        "CREATE PROCEDURE ",
        "CREATE OR REPLACE PROCEDURE ",
    ],
    classes: &[
        "create table ",
        "create view ",
        "create schema ",
        "CREATE TABLE ",
        "CREATE VIEW ",
        "CREATE SCHEMA ",
    ],
    variables: &["declare ", "DECLARE "],
    imports: &[],
};
1518
/// Symbol prefixes for assembly sources.
///
/// Only lines that *begin* with `proc `/`PROC ` are tallied as functions;
/// `include `, `INCLUDE ` and `%include ` are tallied as imports.
const SP_ASSEMBLY: SymbolPatterns = SymbolPatterns {
    functions: &["proc ", "PROC "],
    classes: &[],
    variables: &[],
    imports: &["include ", "INCLUDE ", "%include "],
};
1525
/// Symbol prefixes for Zig sources.
///
/// Only `var ` bindings are listed under `variables`; no class or import
/// prefixes are defined.
const SP_ZIG: SymbolPatterns = SymbolPatterns {
    functions: &[
        "fn ",
        "pub fn ",
        "export fn ",
        "inline fn ",
        "pub inline fn ",
    ],
    classes: &[],
    variables: &["var ", "pub var "],
    imports: &[],
};
1538
/// Per-language settings that drive the generic lexical line scanner.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
struct ScanConfig {
    /// Prefixes that start a line comment; the scanner stops at the first one
    /// found outside a string or block comment.
    line_comments: &'static [&'static str],
    /// `(open, close)` delimiters of block comments, if the language has any.
    block_comment: Option<(&'static str, &'static str)>,
    /// Whether `'` opens a string literal.
    allow_single_quote_strings: bool,
    /// Whether `"` opens a string literal.
    allow_double_quote_strings: bool,
    /// Whether `"""` / `'''` open a triple-quoted string.
    allow_triple_quote_strings: bool,
    /// Whether `@"` opens a verbatim string in which `""` is an escaped quote.
    allow_csharp_verbatim_strings: bool,
    /// Zero-based line indices that are counted as docstring lines without
    /// being scanned at all.
    skip_lines: HashSet<usize>,
    /// Prefix tables handed to `count_symbols` for lines that contain code.
    symbol_patterns: SymbolPatterns,
}
1551
/// Switches that adjust how physical lines are counted.
#[derive(Debug, Clone, Copy)]
struct IeeeFlags {
    /// Count comment-free code lines starting with `#` as compiler directives.
    has_preprocessor_directives: bool,
    /// Treat blank lines inside an open block comment as comment lines.
    blank_in_block_comment_as_comment: bool,
    /// Merge lines ending in a backslash with the following line before
    /// classification.
    collapse_continuation_lines: bool,
}
1563
/// The kind of string literal the scanner is currently inside.
#[derive(Debug, Clone, Copy)]
enum StringState {
    /// Ordinary string closed by the given quote character; a backslash
    /// escapes the character that follows it.
    Single(char),
    /// Triple-quoted string closed by the given delimiter; backslashes are not
    /// treated specially.
    Triple(&'static str),
    /// Verbatim string opened by `@"`: `""` is an escaped quote and a lone `"`
    /// closes the literal.
    VerbatimDouble,
}
1570
/// What the scanner observed on one logical line.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Default)]
struct LineFacts {
    /// The line contains code (string literal content counts as code).
    has_code: bool,
    /// A line comment starts on this line.
    has_single_comment: bool,
    /// The line contains, opens, or lies inside a block comment.
    has_multi_comment: bool,
    /// The line is a docstring; when set, `classify_line` counts it as a
    /// docstring line regardless of the other flags.
    has_docstring: bool,
}
1579
1580fn process_string_char(
1584 state: StringState,
1585 chars: &[char],
1586 i: usize,
1587) -> (Option<StringState>, usize) {
1588 match state {
1589 StringState::Single(delim) => {
1590 if chars[i] == '\\' {
1591 return (Some(state), 2); }
1593 if chars[i] == delim {
1594 (None, 1)
1595 } else {
1596 (Some(state), 1)
1597 }
1598 }
1599 StringState::Triple(delim) => {
1600 if starts_with(chars, i, delim) {
1601 (None, delim.len())
1602 } else {
1603 (Some(state), 1)
1604 }
1605 }
1606 StringState::VerbatimDouble => {
1607 if starts_with(chars, i, "\"\"") {
1608 return (Some(state), 2); }
1610 if chars[i] == '"' {
1611 (None, 1)
1612 } else {
1613 (Some(state), 1)
1614 }
1615 }
1616 }
1617}
1618
1619fn process_block_comment_char(chars: &[char], i: usize, close: &str) -> (bool, usize) {
1623 if starts_with(chars, i, close) {
1624 (false, close.len())
1625 } else {
1626 (true, 1)
1627 }
1628}
1629
1630fn try_open_string(chars: &[char], i: usize, config: &ScanConfig) -> Option<(StringState, usize)> {
1634 if config.allow_csharp_verbatim_strings && starts_with(chars, i, "@\"") {
1635 return Some((StringState::VerbatimDouble, 2));
1636 }
1637 if config.allow_triple_quote_strings {
1638 if starts_with(chars, i, "\"\"\"") {
1639 return Some((StringState::Triple("\"\"\""), 3));
1640 }
1641 if starts_with(chars, i, "'''") {
1642 return Some((StringState::Triple("'''"), 3));
1643 }
1644 }
1645 if config.allow_single_quote_strings && chars[i] == '\'' {
1646 return Some((StringState::Single('\''), 1));
1647 }
1648 if config.allow_double_quote_strings && chars[i] == '"' {
1649 return Some((StringState::Single('"'), 1));
1650 }
1651 None
1652}
1653
1654fn step_through_block_comment(
1660 chars: &[char],
1661 i: usize,
1662 block_comment: Option<(&'static str, &'static str)>,
1663 in_block_comment: &mut bool,
1664) -> usize {
1665 if let Some((_, close)) = block_comment {
1666 let (still_in, advance) = process_block_comment_char(chars, i, close);
1667 *in_block_comment = still_in;
1668 return advance;
1669 }
1670 0
1671}
1672
1673fn try_open_block_comment(
1676 chars: &[char],
1677 i: usize,
1678 block_comment: Option<(&'static str, &'static str)>,
1679) -> Option<usize> {
1680 let (open, _) = block_comment?;
1681 starts_with(chars, i, open).then_some(open.len())
1682}
1683
/// Scans one physical line character by character and records what it
/// contains in `facts`.
///
/// `in_block_comment` and `string_state` are the lexer state carried over from
/// the previous line; they are updated in place and reflect the state at the
/// end of this line when the function returns.
///
/// The checks inside the loop are order-sensitive: an open string wins over an
/// open block comment, and outside both, string openers are tried before block
/// comment openers, which are tried before line comment prefixes.
fn scan_line(
    chars: &[char],
    config: &ScanConfig,
    facts: &mut LineFacts,
    in_block_comment: &mut bool,
    string_state: &mut Option<StringState>,
) {
    let mut i = 0usize;
    while i < chars.len() {
        if let Some(state) = *string_state {
            // Inside a string literal: its content counts as code.
            facts.has_code = true;
            let (new_state, advance) = process_string_char(state, chars, i);
            *string_state = new_state;
            i += advance;
            continue;
        }

        if *in_block_comment {
            // Inside a block comment: consume until the closing delimiter.
            facts.has_multi_comment = true;
            i += step_through_block_comment(chars, i, config.block_comment, in_block_comment);
            continue;
        }

        if chars[i].is_whitespace() {
            // Whitespace outside strings/comments carries no information.
            i += 1;
            continue;
        }

        if let Some((new_state, advance)) = try_open_string(chars, i, config) {
            // A string opener is itself code.
            facts.has_code = true;
            *string_state = Some(new_state);
            i += advance;
            continue;
        }

        if let Some(advance) = try_open_block_comment(chars, i, config.block_comment) {
            // Block comment opens here; the delimiter is skipped as a unit.
            facts.has_multi_comment = true;
            *in_block_comment = true;
            i += advance;
            continue;
        }

        if config
            .line_comments
            .iter()
            .any(|prefix| starts_with(chars, i, prefix))
        {
            // A line comment swallows the rest of the line.
            facts.has_single_comment = true;
            break;
        }

        // Anything else is an ordinary code character.
        facts.has_code = true;
        i += 1;
    }
}
1749
1750fn finalize_line_facts(
1755 facts: LineFacts,
1756 trimmed: &str,
1757 raw: &mut RawLineCounts,
1758 ieee: IeeeFlags,
1759 in_block_comment: bool,
1760 string_state: Option<StringState>,
1761 pending_continuation: &mut Option<LineFacts>,
1762) -> Option<LineFacts> {
1763 if ieee.has_preprocessor_directives
1767 && facts.has_code
1768 && !facts.has_single_comment
1769 && !facts.has_multi_comment
1770 && trimmed.starts_with('#')
1771 {
1772 raw.compiler_directive_lines += 1;
1773 }
1774
1775 let is_continuation = ieee.collapse_continuation_lines
1778 && !in_block_comment
1779 && string_state.is_none()
1780 && trimmed.ends_with('\\');
1781
1782 if is_continuation {
1783 let pending = pending_continuation.get_or_insert_with(LineFacts::default);
1784 pending.has_code |= facts.has_code;
1785 pending.has_single_comment |= facts.has_single_comment;
1786 pending.has_multi_comment |= facts.has_multi_comment;
1787 pending.has_docstring |= facts.has_docstring;
1788 return None; }
1790
1791 let emit = if let Some(pending) = pending_continuation.take() {
1793 LineFacts {
1794 has_code: pending.has_code | facts.has_code,
1795 has_single_comment: pending.has_single_comment | facts.has_single_comment,
1796 has_multi_comment: pending.has_multi_comment | facts.has_multi_comment,
1797 has_docstring: pending.has_docstring | facts.has_docstring,
1798 }
1799 } else {
1800 facts
1801 };
1802 Some(emit)
1803}
1804
1805#[allow(clippy::needless_pass_by_value)]
1810#[allow(clippy::too_many_arguments)]
1811fn process_physical_line(
1812 line: &str,
1813 line_idx: usize,
1814 config: &ScanConfig,
1815 raw: &mut RawLineCounts,
1816 in_block_comment: &mut bool,
1817 string_state: &mut Option<StringState>,
1818 pending_continuation: &mut Option<LineFacts>,
1819 ieee: IeeeFlags,
1820) {
1821 raw.total_physical_lines += 1;
1822
1823 if config.skip_lines.contains(&line_idx) {
1824 raw.docstring_comment_lines += 1;
1825 return;
1826 }
1827
1828 let trimmed = line.trim();
1829 let mut facts = LineFacts::default();
1830
1831 if *in_block_comment && (ieee.blank_in_block_comment_as_comment || !trimmed.is_empty()) {
1835 facts.has_multi_comment = true;
1836 }
1837
1838 let chars: Vec<char> = line.chars().collect();
1839 scan_line(&chars, config, &mut facts, in_block_comment, string_state);
1840
1841 let Some(emit) = finalize_line_facts(
1842 facts,
1843 trimmed,
1844 raw,
1845 ieee,
1846 *in_block_comment,
1847 *string_state,
1848 pending_continuation,
1849 ) else {
1850 return;
1851 };
1852
1853 classify_line(raw, &emit, trimmed);
1854
1855 if emit.has_code {
1856 let (f, c, v, i) = count_symbols(&config.symbol_patterns, trimmed);
1857 raw.functions += f;
1858 raw.classes += c;
1859 raw.variables += v;
1860 raw.imports += i;
1861 }
1862}
1863
1864#[allow(clippy::needless_pass_by_value)]
1865fn analyze_generic(text: &str, config: ScanConfig, ieee: IeeeFlags) -> RawFileAnalysis {
1866 let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
1867 let lines: Vec<&str> = normalized.split_terminator('\n').collect();
1868
1869 let mut raw = RawLineCounts::default();
1870 let mut warnings = Vec::new();
1871
1872 let mut in_block_comment = false;
1873 let mut string_state: Option<StringState> = None;
1874 let mut pending_continuation: Option<LineFacts> = None;
1876
1877 for (line_idx, line) in lines.iter().enumerate() {
1878 process_physical_line(
1879 line,
1880 line_idx,
1881 &config,
1882 &mut raw,
1883 &mut in_block_comment,
1884 &mut string_state,
1885 &mut pending_continuation,
1886 ieee,
1887 );
1888 }
1889
1890 if let Some(pending) = pending_continuation.take() {
1892 classify_line(&mut raw, &pending, "");
1893 }
1894
1895 if in_block_comment {
1896 warnings.push("unclosed block comment detected; result is best effort".into());
1897 }
1898 if string_state.is_some() {
1899 warnings.push("unclosed string literal detected; result is best effort".into());
1900 }
1901
1902 RawFileAnalysis {
1903 raw,
1904 parse_mode: if warnings.is_empty() {
1905 ParseMode::Lexical
1906 } else {
1907 ParseMode::LexicalBestEffort
1908 },
1909 warnings,
1910 }
1911}
1912
1913const fn classify_line(raw: &mut RawLineCounts, facts: &LineFacts, trimmed: &str) {
1914 if facts.has_docstring {
1915 raw.docstring_comment_lines += 1;
1916 } else if !facts.has_code
1917 && !facts.has_single_comment
1918 && !facts.has_multi_comment
1919 && trimmed.is_empty()
1920 {
1921 raw.blank_only_lines += 1;
1922 } else if facts.has_code && facts.has_single_comment {
1923 raw.mixed_code_single_comment_lines += 1;
1924 } else if facts.has_code && facts.has_multi_comment {
1925 raw.mixed_code_multi_comment_lines += 1;
1926 } else if facts.has_code {
1927 raw.code_only_lines += 1;
1928 } else if facts.has_single_comment {
1929 raw.single_comment_only_lines += 1;
1930 } else if facts.has_multi_comment {
1931 raw.multi_comment_only_lines += 1;
1932 } else if trimmed.is_empty() {
1933 raw.blank_only_lines += 1;
1934 } else {
1935 raw.skipped_unknown_lines += 1;
1936 }
1937}
1938
1939fn count_symbols(patterns: &SymbolPatterns, trimmed: &str) -> (u64, u64, u64, u64) {
1940 let hit = |pats: &[&str]| u64::from(pats.iter().any(|p| trimmed.starts_with(p)));
1941 (
1942 hit(patterns.functions),
1943 hit(patterns.classes),
1944 hit(patterns.variables),
1945 hit(patterns.imports),
1946 )
1947}
1948
/// Returns `true` when the characters of `needle` appear in `chars` starting
/// at position `index`.
///
/// An `index` past the end of `chars` yields `false`; an empty `needle`
/// matches at any position up to and including `chars.len()`.
///
/// The comparison walks both sequences in lockstep, so no temporary buffer is
/// allocated — this runs for every scanned character and every configured
/// delimiter — and an out-of-range `index` cannot overflow.
fn starts_with(chars: &[char], index: usize, needle: &str) -> bool {
    let Some(tail) = chars.get(index..) else {
        return false;
    };
    let mut remaining = tail.iter();
    // Running out of haystack before the needle is exhausted is a mismatch.
    needle
        .chars()
        .all(|expected| remaining.next() == Some(&expected))
}
1953
/// One entry of the indentation stack used while detecting Python docstrings.
#[derive(Debug, Clone)]
struct PyContext {
    /// Indentation (in whitespace characters) of the block's first line.
    indent: usize,
    /// `true` until the first significant line of the block has been examined;
    /// only that line may be recorded as the block's docstring.
    expect_docstring: bool,
}
1959
1960fn py_pop_outdented_contexts(contexts: &mut Vec<PyContext>, indent: usize) {
1962 while contexts.len() > 1 && indent < contexts.last().map_or(0, |c| c.indent) {
1963 contexts.pop();
1964 }
1965}
1966
1967fn py_handle_pending_indent(
1970 pending_block_indent: &mut Option<usize>,
1971 contexts: &mut Vec<PyContext>,
1972 indent: usize,
1973 trimmed: &str,
1974) {
1975 let Some(base_indent) = *pending_block_indent else {
1976 return;
1977 };
1978 if indent > base_indent {
1979 contexts.push(PyContext {
1980 indent,
1981 expect_docstring: true,
1982 });
1983 *pending_block_indent = None;
1984 } else if !trimmed.starts_with('@') {
1985 *pending_block_indent = None;
1986 }
1987}
1988
1989fn py_try_record_docstring(
1995 ctx: &mut PyContext,
1996 trimmed: &str,
1997 idx: usize,
1998 docstring_lines: &mut HashSet<usize>,
1999 active_docstring: &mut Option<(&'static str, usize)>,
2000) -> bool {
2001 if !ctx.expect_docstring {
2002 return false;
2003 }
2004 if let Some(delim) = docstring_delimiter(trimmed) {
2005 docstring_lines.insert(idx);
2006 ctx.expect_docstring = false;
2007 if !closes_triple_docstring(trimmed, delim, true) {
2008 *active_docstring = Some((delim, idx));
2009 }
2010 return true;
2011 }
2012 ctx.expect_docstring = false;
2013 false
2014}
2015
2016fn track_active_docstring(
2020 active_docstring: &mut Option<(&'static str, usize)>,
2021 docstring_lines: &mut HashSet<usize>,
2022 idx: usize,
2023 trimmed: &str,
2024) -> bool {
2025 let Some((delim, start_line)) = *active_docstring else {
2026 return false;
2027 };
2028 docstring_lines.insert(idx);
2029 if closes_triple_docstring(trimmed, delim, idx == start_line) {
2030 *active_docstring = None;
2031 }
2032 true
2033}
2034
2035fn try_record_docstring_if_context(
2038 contexts: &mut [PyContext],
2039 trimmed: &str,
2040 idx: usize,
2041 docstring_lines: &mut HashSet<usize>,
2042 active_docstring: &mut Option<(&'static str, usize)>,
2043) -> bool {
2044 let Some(ctx) = contexts.last_mut() else {
2045 return false;
2046 };
2047 py_try_record_docstring(ctx, trimmed, idx, docstring_lines, active_docstring)
2048}
2049
/// Marks every line from the start of a still-open docstring to the end of the
/// file as a docstring line.
///
/// Does nothing when `active_docstring` is `None` or when the recorded start
/// line is not below `num_lines`.
fn mark_unclosed_docstring_lines(
    active_docstring: Option<&(&'static str, usize)>,
    docstring_lines: &mut HashSet<usize>,
    num_lines: usize,
) {
    let Some(&(_, start_line)) = active_docstring else {
        return;
    };
    docstring_lines.extend(start_line..num_lines);
}
2062
2063fn detect_python_docstring_lines(text: &str) -> HashSet<usize> {
2064 let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
2065 let lines: Vec<&str> = normalized.split_terminator('\n').collect();
2066
2067 let mut docstring_lines = HashSet::new();
2068 let mut contexts = vec![PyContext {
2069 indent: 0,
2070 expect_docstring: true,
2071 }];
2072 let mut pending_block_indent: Option<usize> = None;
2073 let mut active_docstring: Option<(&'static str, usize)> = None;
2074
2075 for (idx, line) in lines.iter().enumerate() {
2076 let trimmed = line.trim();
2077 let indent = leading_indent(line);
2078
2079 if track_active_docstring(&mut active_docstring, &mut docstring_lines, idx, trimmed) {
2080 continue;
2081 }
2082
2083 if trimmed.is_empty() || trimmed.starts_with('#') {
2085 continue;
2086 }
2087
2088 py_pop_outdented_contexts(&mut contexts, indent);
2089 py_handle_pending_indent(&mut pending_block_indent, &mut contexts, indent, trimmed);
2090
2091 if try_record_docstring_if_context(
2092 &mut contexts,
2093 trimmed,
2094 idx,
2095 &mut docstring_lines,
2096 &mut active_docstring,
2097 ) {
2098 continue;
2099 }
2100
2101 if is_python_block_header(trimmed) {
2102 pending_block_indent = Some(indent);
2103 }
2104 }
2105
2106 mark_unclosed_docstring_lines(active_docstring.as_ref(), &mut docstring_lines, lines.len());
2107
2108 docstring_lines
2109}
2110
/// Counts the whitespace characters at the start of `line`.
///
/// Every whitespace character (including a tab) counts as one; a line made up
/// entirely of whitespace yields its full character count.
fn leading_indent(line: &str) -> usize {
    line.chars()
        .position(|c| !c.is_whitespace())
        .unwrap_or_else(|| line.chars().count())
}
2114
/// Reports whether a trimmed line is a single-line `def`, `async def` or
/// `class` header, i.e. starts with one of those keywords and ends with `:`.
fn is_python_block_header(trimmed: &str) -> bool {
    const HEADER_KEYWORDS: [&str; 3] = ["def ", "async def ", "class "];

    trimmed.ends_with(':')
        && HEADER_KEYWORDS
            .iter()
            .any(|keyword| trimmed.starts_with(keyword))
}
2121
/// Returns the triple-quote delimiter that opens `trimmed`, if any.
///
/// Any run of string-prefix letters (`r`, `u`, `b`, `f` in either case) at the
/// start of the line is skipped before looking for `"""` or `'''`.
fn docstring_delimiter(trimmed: &str) -> Option<&'static str> {
    let after_prefix = trimmed
        .trim_start_matches(|c: char| matches!(c, 'r' | 'R' | 'u' | 'U' | 'b' | 'B' | 'f' | 'F'));

    ["\"\"\"", "'''"]
        .into_iter()
        .find(|fence| after_prefix.starts_with(*fence))
}
2143
/// Reports whether a triple-quoted docstring is closed on this line.
///
/// Counts the non-overlapping occurrences of `delim` in `trimmed`. On the line
/// that opens the docstring (`same_line_as_start`) the first occurrence is the
/// opener, so at least two are required; on any later line one is enough.
///
/// An empty `delim` can never close anything and yields `false` (the previous
/// hand-rolled search loop never terminated for it).
fn closes_triple_docstring(trimmed: &str, delim: &str, same_line_as_start: bool) -> bool {
    if delim.is_empty() {
        return false;
    }

    let occurrences = trimmed.matches(delim).count();
    let required = if same_line_as_start { 2 } else { 1 };
    occurrences >= required
}
2158
/// Line classification backed by tree-sitter syntax trees.
///
/// Only compiled when the `tree-sitter` feature is enabled. Each analysis
/// parses the whole text, marks for every row whether it carries code,
/// comments or docstrings, and then tallies the rows into `RawLineCounts`.
#[cfg(feature = "tree-sitter")]
pub mod ts {
    use tree_sitter::Node;

    use super::{ParseMode, RawFileAnalysis, RawLineCounts};

    /// Parses `text` with the given grammar and classifies every line.
    ///
    /// * `comment_node_kinds` — node kinds treated as comments.
    /// * `docstring_stmt_kind` — statement kind that, when it wraps a lone
    ///   `string` node, is treated as a docstring; `None` disables docstring
    ///   detection.
    ///
    /// Returns `None` when the grammar cannot be loaded into the parser or
    /// the parser produces no tree.
    fn analyze_lines(
        text: &str,
        ts_language: &tree_sitter::Language,
        comment_node_kinds: &[&str],
        docstring_stmt_kind: Option<&str>,
    ) -> Option<RawFileAnalysis> {
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(ts_language).ok()?;
        let tree = parser.parse(text, None)?;

        // Lines are split on `\n` only; no line-ending normalisation happens
        // here, so indices line up with the rows reported by the tree.
        let lines: Vec<&str> = text.split_terminator('\n').collect();
        let n = lines.len();

        // One flag per line, filled in by the tree walk below.
        let mut has_code = vec![false; n];
        let mut has_comment = vec![false; n];
        let mut comment_is_block = vec![false; n];
        let mut has_docstring = vec![false; n];

        let mut ctx = VisitCtx {
            source: text.as_bytes(),
            comment_kinds: comment_node_kinds,
            docstring_stmt_kind,
            has_code: &mut has_code,
            has_comment: &mut has_comment,
            comment_is_block: &mut comment_is_block,
            has_docstring: &mut has_docstring,
        };
        visit(tree.root_node(), &mut ctx);

        let mut raw = RawLineCounts::default();
        classify_ts_lines(
            &lines,
            &has_code,
            &has_comment,
            &comment_is_block,
            &has_docstring,
            &mut raw,
        );

        Some(RawFileAnalysis {
            raw,
            parse_mode: ParseMode::TreeSitter,
            warnings: Vec::new(),
        })
    }

    /// Per-line flags collected from the syntax tree.
    #[allow(clippy::struct_excessive_bools)]
    #[derive(Clone, Copy)]
    struct TsLineFlags {
        /// A non-comment leaf node covers this line.
        has_code: bool,
        /// A comment node covers this line.
        has_comment: bool,
        /// A comment covering this line starts with `/*` or `<#`.
        comment_is_block: bool,
        /// A docstring node covers this line.
        has_docstring: bool,
    }

    /// Increments exactly one category counter in `raw` for a single line.
    ///
    /// Precedence: blank text, docstring without code, code mixed with a
    /// comment, comment only, and finally code only (which is also the
    /// fallback for non-blank lines with no flags set).
    const fn classify_ts_line(trimmed: &str, flags: TsLineFlags, raw: &mut RawLineCounts) {
        if trimmed.is_empty() {
            raw.blank_only_lines += 1;
        } else if flags.has_docstring && !flags.has_code {
            raw.docstring_comment_lines += 1;
        } else if flags.has_code && flags.has_comment {
            if flags.comment_is_block {
                raw.mixed_code_multi_comment_lines += 1;
            } else {
                raw.mixed_code_single_comment_lines += 1;
            }
        } else if flags.has_comment {
            if flags.comment_is_block {
                raw.multi_comment_only_lines += 1;
            } else {
                raw.single_comment_only_lines += 1;
            }
        } else {
            raw.code_only_lines += 1;
        }
    }

    /// Tallies every line into `raw`.
    ///
    /// The flag slices are indexed by line number and must be at least as
    /// long as `lines`.
    fn classify_ts_lines(
        lines: &[&str],
        has_code: &[bool],
        has_comment: &[bool],
        comment_is_block: &[bool],
        has_docstring: &[bool],
        raw: &mut RawLineCounts,
    ) {
        for i in 0..lines.len() {
            raw.total_physical_lines += 1;
            classify_ts_line(
                lines[i].trim(),
                TsLineFlags {
                    has_code: has_code[i],
                    has_comment: has_comment[i],
                    comment_is_block: comment_is_block[i],
                    has_docstring: has_docstring[i],
                },
                raw,
            );
        }
    }

    /// Shared state for the syntax-tree walk.
    struct VisitCtx<'a> {
        /// Source bytes, used to read node text.
        source: &'a [u8],
        /// Node kinds treated as comments.
        comment_kinds: &'a [&'a str],
        /// Statement kind that may wrap a docstring, if detection is enabled.
        docstring_stmt_kind: Option<&'a str>,
        /// Per-line "has code" flags.
        has_code: &'a mut Vec<bool>,
        /// Per-line "has comment" flags.
        has_comment: &'a mut Vec<bool>,
        /// Per-line "comment is a block comment" flags.
        comment_is_block: &'a mut Vec<bool>,
        /// Per-line "has docstring" flags.
        has_docstring: &'a mut Vec<bool>,
    }

    /// Marks every row spanned by a comment node.
    ///
    /// A comment whose text starts with `/*` or `<#` is additionally marked
    /// as a block comment on each of its rows.
    fn visit_comment_node(node: Node, ctx: &mut VisitCtx<'_>) {
        let start_row = node.start_position().row;
        let end_row = node.end_position().row;
        // Falls back to "" when the text is not valid UTF-8 or the first two
        // bytes do not end on a character boundary.
        let first_two = node
            .utf8_text(ctx.source)
            .unwrap_or("")
            .get(..2)
            .unwrap_or("");
        let is_block = first_two == "/*" || first_two == "<#";
        for row in start_row..=end_row {
            // Rows beyond the line table are ignored.
            if row < ctx.has_comment.len() {
                ctx.has_comment[row] = true;
                if is_block {
                    ctx.comment_is_block[row] = true;
                }
            }
        }
    }

    /// Marks the rows of a docstring and returns `true` if `node` is one.
    ///
    /// A node qualifies when docstring detection is enabled, its kind equals
    /// the configured statement kind, and its only named child is a `string`
    /// node. The position of the statement within its block is not checked.
    fn visit_maybe_docstring(node: Node, kind: &str, ctx: &mut VisitCtx<'_>) -> bool {
        let Some(stmt_kind) = ctx.docstring_stmt_kind else {
            return false;
        };
        if kind != stmt_kind || node.named_child_count() != 1 {
            return false;
        }
        let Some(child) = node.named_child(0) else {
            return false;
        };
        if child.kind() != "string" {
            return false;
        }
        let child_start = child.start_position().row;
        let child_end = child.end_position().row;
        for row in child_start..=child_end {
            if row < ctx.has_docstring.len() {
                ctx.has_docstring[row] = true;
            }
        }
        true
    }

    /// Marks every row spanned by a leaf node as containing code.
    fn visit_leaf_code(node: Node, ctx: &mut VisitCtx<'_>) {
        let start_row = node.start_position().row;
        let end_row = node.end_position().row;
        for row in start_row..=end_row {
            if row < ctx.has_code.len() {
                ctx.has_code[row] = true;
            }
        }
    }

    /// Recursively walks the tree and fills in the per-line flags.
    ///
    /// Comment nodes and docstring statements are handled as a unit and not
    /// descended into; childless nodes that are not "extra" count as code.
    #[allow(clippy::too_many_lines)]
    fn visit(node: Node, ctx: &mut VisitCtx<'_>) {
        let kind = node.kind();

        if ctx.comment_kinds.contains(&kind) {
            visit_comment_node(node, ctx);
            return;
        }

        if visit_maybe_docstring(node, kind, ctx) {
            return;
        }

        if node.child_count() == 0 && !node.is_extra() {
            visit_leaf_code(node, ctx);
            return;
        }

        for i in 0..node.child_count() {
            if let Some(child) = node.child(i) {
                visit(child, ctx);
            }
        }
    }

    /// Classifies the lines of a C source using the tree-sitter C grammar.
    ///
    /// Returns `None` if the grammar cannot be loaded or parsing yields no
    /// tree.
    #[must_use]
    pub fn analyze_c(text: &str) -> Option<RawFileAnalysis> {
        let lang: tree_sitter::Language = tree_sitter_c::LANGUAGE.into();
        analyze_lines(text, &lang, &["comment"], None)
    }

    /// Classifies the lines of a Python source using the tree-sitter Python
    /// grammar, treating `expression_statement` nodes that wrap a lone string
    /// as docstrings.
    ///
    /// Returns `None` if the grammar cannot be loaded or parsing yields no
    /// tree.
    #[must_use]
    pub fn analyze_python(text: &str) -> Option<RawFileAnalysis> {
        let lang: tree_sitter::Language = tree_sitter_python::LANGUAGE.into();
        analyze_lines(text, &lang, &["comment"], Some("expression_statement"))
    }
}
2388
#[cfg(test)]
mod tests {
    use super::*;

    /// Module and function docstrings are counted as docstring lines and kept
    /// apart from code and trailing comments.
    #[test]
    fn python_docstrings_are_separated() {
        let input = r#""""module docs"""


def fn_a():
    """function docs"""
    value = 1 # trailing comment
    return value
"#;

        let result = analyze_text(Language::Python, input, AnalysisOptions::default());
        // One module docstring line plus one function docstring line.
        assert_eq!(result.raw.docstring_comment_lines, 2);
        assert_eq!(result.raw.mixed_code_single_comment_lines, 1);
        // The `def` line and the `return` line.
        assert_eq!(result.raw.code_only_lines, 2);
    }

    /// A trailing `//` comment makes a mixed line; a standalone `/* */` line
    /// is a block-comment-only line.
    #[test]
    fn c_style_mixed_lines_are_captured() {
        let input = "int x = 1; // note\n/* block */\n";
        let result = analyze_text(Language::C, input, AnalysisOptions::default());
        assert_eq!(result.raw.mixed_code_single_comment_lines, 1);
        assert_eq!(result.raw.multi_comment_only_lines, 1);
    }

    /// A file without an extension is identified through its shebang line.
    #[test]
    fn detect_language_by_shebang() {
        let language = detect_language(
            Path::new("script"),
            Some("#!/usr/bin/env bash"),
            &BTreeMap::new(),
            true,
        );
        assert_eq!(language, Some(Language::Shell));
    }
}