1use serde::{Deserialize, Serialize};
7use std::fs::File;
8use std::io::Read;
9use std::path::{Path, PathBuf};
10use std::time::SystemTime;
11
12use crate::error::{Result, ScribeError};
13
/// File extensions that are treated as binary content.
///
/// Every entry is lowercase and carries a leading dot.
/// `FileInfo::detect_binary_by_extension` normalises its argument to that form
/// before looking it up here.
pub const BINARY_EXTENSIONS: &[&str] = &[
    // Images (SVG is listed here even though it is XML text).
    ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".svg", ".ico", ".tiff",
    // Office documents.
    ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
    // Archives and compressed files.
    ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar",
    // Audio and video.
    ".mp3", ".mp4", ".mov", ".avi", ".mkv", ".wav", ".ogg", ".flac",
    // Fonts.
    ".ttf", ".otf", ".eot", ".woff", ".woff2",
    // Compiled code and executables.
    ".so", ".dll", ".dylib", ".class", ".jar", ".exe", ".bin", ".app",
];
25
/// File extensions recognised as Markdown, lowercase and with a leading dot.
///
/// These are the dot-prefixed forms of the extensions that
/// `Language::from_extension` maps to `Language::Markdown`.
pub const MARKDOWN_EXTENSIONS: &[&str] = &[".md", ".markdown", ".mdown", ".mkd", ".mkdn"];
28
/// `application/*` MIME types that are accepted as text by exact match.
///
/// `FileInfo::is_textual_mime` compares the lowercased MIME type (with any
/// `;parameter` suffix removed) against this list before falling back to the
/// keyword search over `TEXTUAL_APPLICATION_KEYWORDS`.
const TEXTUAL_APPLICATION_MIME_TYPES: &[&str] = &[
    "application/json",
    "application/ld+json",
    "application/graphql",
    "application/javascript",
    "application/x-javascript",
    "application/typescript",
    "application/x-typescript",
    "application/xml",
    "application/xhtml+xml",
    "application/x-sh",
    "application/x-shellscript",
    "application/x-bash",
    "application/x-zsh",
    "application/x-python",
    "application/x-ruby",
    "application/x-perl",
    "application/x-php",
    "application/x-httpd-php",
    "application/x-toml",
    "application/toml",
    "application/x-yaml",
    "application/yaml",
    "application/x-sql",
    "application/sql",
    "application/x-rust",
    "application/x-go",
    "application/x-java",
    "application/x-scala",
    "application/x-kotlin",
    "application/x-swift",
    "application/x-dart",
    "application/x-haskell",
    "application/x-clojure",
    "application/x-ocaml",
    "application/x-lisp",
    "application/x-r",
    "application/x-matlab",
    "application/x-tex",
    // Listed so that this type is classified as text rather than binary.
    "application/x-empty",
];
71
/// Keywords that `FileInfo::is_textual_mime` searches for in an
/// `application/*` MIME type that is not listed in
/// `TEXTUAL_APPLICATION_MIME_TYPES`.
///
/// All entries are lowercase because the MIME type is lowercased before the
/// search. Each keyword appears exactly once; the list is only ever scanned
/// for "any match", so order carries no meaning.
const TEXTUAL_APPLICATION_KEYWORDS: &[&str] = &[
    // Structured-syntax suffixes and data formats.
    "+json",
    "+xml",
    "json",
    "xml",
    "yaml",
    "yml",
    "toml",
    "graphql",
    // Script and shell families.
    "javascript",
    "typescript",
    "ecmascript",
    "shellscript",
    "shell",
    "bash",
    "zsh",
    "sh",
    // Programming languages.
    "python",
    "ruby",
    "perl",
    "php",
    "rust",
    "go",
    "java",
    "scala",
    "kotlin",
    "swift",
    "dart",
    "haskell",
    "clojure",
    "ocaml",
    "lisp",
    "sql",
    "tex",
    "rscript",
    "matlab",
];
112
/// Outcome of deciding whether a file is included, together with the reason.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RenderDecision {
    /// `true` when the file should be included, `false` when it is excluded.
    pub include: bool,
    /// Reason for the decision. The strings `"ok"`, `"binary"`, `"too_large"`,
    /// `"ignored"` and `"empty"` are recognised by `reason_category`; any
    /// other text is categorised as `Other`.
    pub reason: String,
    /// Optional extra detail, set through `with_context`.
    pub context: Option<String>,
}
123
124impl RenderDecision {
125 pub fn include<S: Into<String>>(reason: S) -> Self {
127 Self {
128 include: true,
129 reason: reason.into(),
130 context: None,
131 }
132 }
133
134 pub fn exclude<S: Into<String>>(reason: S) -> Self {
136 Self {
137 include: false,
138 reason: reason.into(),
139 context: None,
140 }
141 }
142
143 pub fn with_context<S: Into<String>>(mut self, context: S) -> Self {
145 self.context = Some(context.into());
146 self
147 }
148
149 pub fn should_include(&self) -> bool {
151 self.include
152 }
153
154 pub fn reason_category(&self) -> RenderDecisionCategory {
156 match self.reason.as_str() {
157 "ok" => RenderDecisionCategory::Ok,
158 "binary" => RenderDecisionCategory::Binary,
159 "too_large" => RenderDecisionCategory::TooLarge,
160 "ignored" => RenderDecisionCategory::Ignored,
161 "empty" => RenderDecisionCategory::Empty,
162 _ => RenderDecisionCategory::Other,
163 }
164 }
165}
166
/// Coarse category of a [`RenderDecision`] reason, as produced by
/// `RenderDecision::reason_category`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum RenderDecisionCategory {
    /// Reason was `"ok"`.
    Ok,
    /// Reason was `"binary"`.
    Binary,
    /// Reason was `"too_large"`.
    TooLarge,
    /// Reason was `"ignored"`.
    Ignored,
    /// Reason was `"empty"`.
    Empty,
    /// Any reason string that is not one of the recognised codes.
    Other,
}
177
/// Language or data format of a file, normally derived from its extension
/// via `Language::from_extension`.
///
/// `PartialOrd`/`Ord` are derived, so the ordering of values follows the
/// declaration order below; reordering variants changes comparison results.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum Language {
    // Systems languages.
    Rust,
    C,
    Cpp,
    Go,
    Zig,

    // Web languages and stylesheets.
    JavaScript,
    TypeScript,
    HTML,
    CSS,
    SCSS,
    SASS,

    // General-purpose application languages.
    Python,
    Java,
    CSharp,
    Kotlin,
    Scala,
    Ruby,
    PHP,

    // Functional languages.
    Haskell,
    OCaml,
    FSharp,
    Erlang,
    Elixir,
    Clojure,

    // Data, configuration and markup formats.
    JSON,
    YAML,
    TOML,
    XML,
    Markdown,

    // Query languages.
    SQL,

    // Shell and command scripting.
    Bash,
    PowerShell,
    Batch,

    // Scientific and numerical computing.
    R,
    Julia,
    /// Never produced by `from_extension`, which maps `"m"` to `ObjectiveC`.
    Matlab,

    // Mobile and Apple-platform languages.
    Swift,
    ObjectiveC,
    Dart,

    /// Fallback for extensions that are not recognised.
    Unknown,
}
241
242impl Language {
243 pub fn from_extension(ext: &str) -> Self {
245 match ext.to_lowercase().as_str() {
246 "rs" => Language::Rust,
247 "c" | "h" => Language::C,
248 "cpp" | "cxx" | "cc" | "hpp" | "hxx" => Language::Cpp,
249 "go" => Language::Go,
250 "zig" => Language::Zig,
251 "js" | "mjs" | "cjs" => Language::JavaScript,
252 "ts" | "mts" | "cts" => Language::TypeScript,
253 "html" | "htm" => Language::HTML,
254 "css" => Language::CSS,
255 "scss" => Language::SCSS,
256 "sass" => Language::SASS,
257 "py" | "pyi" | "pyw" => Language::Python,
258 "java" => Language::Java,
259 "cs" => Language::CSharp,
260 "kt" | "kts" => Language::Kotlin,
261 "scala" | "sc" => Language::Scala,
262 "rb" => Language::Ruby,
263 "php" => Language::PHP,
264 "hs" | "lhs" => Language::Haskell,
265 "ml" | "mli" => Language::OCaml,
266 "fs" | "fsi" | "fsx" => Language::FSharp,
267 "erl" | "hrl" => Language::Erlang,
268 "ex" | "exs" => Language::Elixir,
269 "clj" | "cljs" | "cljc" => Language::Clojure,
270 "json" => Language::JSON,
271 "yaml" | "yml" => Language::YAML,
272 "toml" => Language::TOML,
273 "xml" => Language::XML,
274 "md" | "markdown" | "mdown" | "mkd" | "mkdn" => Language::Markdown,
275 "sql" => Language::SQL,
276 "sh" | "bash" => Language::Bash,
277 "ps1" | "psm1" | "psd1" => Language::PowerShell,
278 "bat" | "cmd" => Language::Batch,
279 "r" => Language::R,
280 "jl" => Language::Julia,
281 "swift" => Language::Swift,
282 "dart" => Language::Dart,
283 "m" | "mm" => Language::ObjectiveC,
286 _ => Language::Unknown,
287 }
288 }
289
290 pub fn is_documentation(&self) -> bool {
292 matches!(self, Language::Markdown | Language::HTML)
293 }
294
295 pub fn is_configuration(&self) -> bool {
297 matches!(
298 self,
299 Language::JSON | Language::YAML | Language::TOML | Language::XML
300 )
301 }
302
303 pub fn is_programming(&self) -> bool {
305 !matches!(
306 self,
307 Language::Markdown
308 | Language::HTML
309 | Language::JSON
310 | Language::YAML
311 | Language::TOML
312 | Language::XML
313 | Language::Unknown
314 )
315 }
316
317 pub fn display_name(&self) -> &'static str {
319 match self {
320 Language::Rust => "Rust",
321 Language::C => "C",
322 Language::Cpp => "C++",
323 Language::Go => "Go",
324 Language::Zig => "Zig",
325 Language::JavaScript => "JavaScript",
326 Language::TypeScript => "TypeScript",
327 Language::HTML => "HTML",
328 Language::CSS => "CSS",
329 Language::SCSS => "SCSS",
330 Language::SASS => "SASS",
331 Language::Python => "Python",
332 Language::Java => "Java",
333 Language::CSharp => "C#",
334 Language::Kotlin => "Kotlin",
335 Language::Scala => "Scala",
336 Language::Ruby => "Ruby",
337 Language::PHP => "PHP",
338 Language::Haskell => "Haskell",
339 Language::OCaml => "OCaml",
340 Language::FSharp => "F#",
341 Language::Erlang => "Erlang",
342 Language::Elixir => "Elixir",
343 Language::Clojure => "Clojure",
344 Language::JSON => "JSON",
345 Language::YAML => "YAML",
346 Language::TOML => "TOML",
347 Language::XML => "XML",
348 Language::Markdown => "Markdown",
349 Language::SQL => "SQL",
350 Language::Bash => "Bash",
351 Language::PowerShell => "PowerShell",
352 Language::Batch => "Batch",
353 Language::R => "R",
354 Language::Julia => "Julia",
355 Language::Matlab => "Matlab",
356 Language::Swift => "Swift",
357 Language::ObjectiveC => "Objective-C",
358 Language::Dart => "Dart",
359 Language::Bash => "Bash",
360 Language::Unknown => "Unknown",
361 }
362 }
363
364 pub fn extensions(&self) -> &'static [&'static str] {
366 match self {
367 Language::Rust => &["rs"],
368 Language::C => &["c", "h"],
369 Language::Cpp => &["cpp", "cxx", "cc", "hpp", "hxx"],
370 Language::Go => &["go"],
371 Language::Zig => &["zig"],
372 Language::JavaScript => &["js", "mjs", "cjs"],
373 Language::TypeScript => &["ts", "mts", "cts"],
374 Language::HTML => &["html", "htm"],
375 Language::CSS => &["css"],
376 Language::SCSS => &["scss"],
377 Language::SASS => &["sass"],
378 Language::Python => &["py", "pyi", "pyw"],
379 Language::Java => &["java"],
380 Language::CSharp => &["cs"],
381 Language::Kotlin => &["kt", "kts"],
382 Language::Scala => &["scala", "sc"],
383 Language::Ruby => &["rb"],
384 Language::PHP => &["php"],
385 Language::Haskell => &["hs", "lhs"],
386 Language::OCaml => &["ml", "mli"],
387 Language::FSharp => &["fs", "fsi", "fsx"],
388 Language::Erlang => &["erl", "hrl"],
389 Language::Elixir => &["ex", "exs"],
390 Language::Clojure => &["clj", "cljs", "cljc"],
391 Language::JSON => &["json"],
392 Language::YAML => &["yaml", "yml"],
393 Language::TOML => &["toml"],
394 Language::XML => &["xml"],
395 Language::Markdown => &["md", "markdown", "mdown", "mkd", "mkdn"],
396 Language::SQL => &["sql"],
397 Language::Bash => &["sh", "bash"],
398 Language::PowerShell => &["ps1", "psm1", "psd1"],
399 Language::Batch => &["bat", "cmd"],
400 Language::R => &["r"],
401 Language::Julia => &["jl"],
402 Language::Matlab => &["m"], Language::Swift => &["swift"],
404 Language::ObjectiveC => &["m", "mm"],
405 Language::Dart => &["dart"],
406 Language::Unknown => &[],
407 }
408 }
409}
410
/// Broad category of a file, as assigned by
/// `FileInfo::classify_file_type_with_binary`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FileType {
    /// Source code in a programming language.
    Source { language: Language },
    /// Documentation, with the format it is written in.
    Documentation { format: DocumentationFormat },
    /// Configuration data, with its serialisation format.
    Configuration { format: ConfigurationFormat },
    /// A file whose path matches `is_test_path`.
    Test { language: Language },
    /// Binary content.
    Binary,
    /// A file whose path marks it as generated or build output.
    Generated,
    /// Anything that fits none of the other categories.
    Unknown,
}
429
430impl FileType {
431 pub fn display_label(&self) -> &'static str {
432 match self {
433 FileType::Source { .. } => "Source",
434 FileType::Documentation { .. } => "Documentation",
435 FileType::Configuration { .. } => "Configuration",
436 FileType::Test { .. } => "Test",
437 FileType::Binary => "Binary",
438 FileType::Generated => "Generated",
439 FileType::Unknown => "Unknown",
440 }
441 }
442}
443
/// Format of a documentation file, carried by `FileType::Documentation`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DocumentationFormat {
    Markdown,
    Html,
    PlainText,
    /// reStructuredText (`.rst`).
    Rst,
    Asciidoc,
}
453
/// Format of a configuration file, carried by `FileType::Configuration`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ConfigurationFormat {
    Json,
    Yaml,
    Toml,
    Xml,
    /// INI files (`.ini`).
    Ini,
    /// Dotenv files (`.env`).
    Dotenv,
}
464
/// Metadata and, once loaded, the content of a single file.
///
/// `FileInfo::new` fills the fields derived from the path and file-system
/// metadata; `FileInfo::load_content` fills `content` and the counts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileInfo {
    /// Path used to read the file from disk.
    pub path: PathBuf,

    /// Path passed to `new` as `relative_path`; used for file-type classification.
    pub relative_path: String,

    /// File size in bytes, taken from the file-system metadata.
    pub size: u64,

    /// Last-modified time, or `None` when the metadata does not provide one.
    pub modified: Option<SystemTime>,

    /// Whether the file is included, and why.
    pub decision: RenderDecision,

    /// Category assigned from path, language, extension and binary flag.
    pub file_type: FileType,

    /// Language derived from the path's extension.
    pub language: Language,

    /// File content; `None` until `load_content` succeeds on a non-binary, included file.
    pub content: Option<String>,

    /// Token count for `content`; set together with `content`.
    pub token_estimate: Option<usize>,

    /// Number of lines in `content`; set together with `content`.
    pub line_count: Option<usize>,

    /// Number of characters (not bytes) in `content`; set together with `content`.
    pub char_count: Option<usize>,

    /// Result of binary detection at construction time.
    pub is_binary: bool,

    /// Git status of the file; `new` leaves this as `None`.
    pub git_status: Option<GitStatus>,

    /// Centrality score; `new` leaves this as `None`.
    pub centrality_score: Option<f64>,
}
510
/// Git status of a file, split into a working-tree part and an index part.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct GitStatus {
    /// Status recorded for the working tree.
    pub working_tree: GitFileStatus,
    /// Status recorded for the index.
    pub index: GitFileStatus,
}
519
/// State of a file on one side (working tree or index) of a [`GitStatus`].
// NOTE(review): the variants look like the states reported by `git status`;
// confirm against the code that constructs these values.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum GitFileStatus {
    Unmodified,
    Modified,
    Added,
    Deleted,
    Renamed,
    Copied,
    Unmerged,
    Untracked,
    Ignored,
}
533
534impl FileInfo {
535 pub fn new<P: AsRef<Path>>(
537 path: P,
538 relative_path: String,
539 decision: RenderDecision,
540 ) -> Result<Self> {
541 let path = path.as_ref();
542 let metadata = std::fs::metadata(path)
543 .map_err(|e| ScribeError::path_with_source("Failed to read file metadata", path, e))?;
544
545 let size = metadata.len();
546 let modified = metadata.modified().ok();
547
548 let extension = path.extension().and_then(|ext| ext.to_str()).unwrap_or("");
549
550 let language = Language::from_extension(extension);
551 let is_binary = Self::detect_binary_with_hint(path, extension);
552 let file_type =
553 Self::classify_file_type_with_binary(&relative_path, &language, extension, is_binary);
554
555 Ok(Self {
556 path: path.to_path_buf(),
557 relative_path,
558 size,
559 modified,
560 decision,
561 file_type,
562 language,
563 content: None,
564 token_estimate: None,
565 line_count: None,
566 char_count: None,
567 is_binary,
568 git_status: None,
569 centrality_score: None,
570 })
571 }
572
573 pub fn load_content(&mut self) -> Result<()> {
575 if self.is_binary || !self.decision.should_include() {
576 return Ok(());
577 }
578
579 let content = std::fs::read_to_string(&self.path).map_err(|e| {
580 ScribeError::analysis(format!("Failed to read file content: {}", e), &self.path)
581 })?;
582
583 let line_count = content.lines().count();
585 let char_count = content.chars().count();
586 let token_estimate = Self::estimate_tokens(&content);
587
588 self.content = Some(content);
589 self.line_count = Some(line_count);
590 self.char_count = Some(char_count);
591 self.token_estimate = Some(token_estimate);
592
593 Ok(())
594 }
595
596 pub fn estimate_tokens(content: &str) -> usize {
601 use crate::tokenization::{utils, TokenCounter};
602
603 match TokenCounter::global().count_tokens(content) {
605 Ok(tokens) => tokens,
606 Err(_) => {
607 utils::estimate_tokens_legacy(content)
609 }
610 }
611 }
612
613 pub fn estimate_tokens_with_path(content: &str, file_path: &std::path::Path) -> usize {
618 use crate::tokenization::TokenCounter;
619
620 match TokenCounter::global().estimate_file_tokens(content, file_path) {
622 Ok(tokens) => tokens,
623 Err(_) => Self::estimate_tokens(content), }
625 }
626
627 pub fn detect_binary(path: &Path) -> bool {
630 let extension = path.extension().and_then(|ext| ext.to_str()).unwrap_or("");
631 Self::detect_binary_with_hint(path, extension)
632 }
633
634 pub fn detect_binary_with_hint(path: &Path, extension: &str) -> bool {
637 if let Some(mime) = tree_magic_mini::from_filepath(path) {
638 if !Self::is_textual_mime(mime) {
639 return true;
640 }
641 return false;
642 }
643
644 if let Ok(mut file) = File::open(path) {
645 let mut buffer = [0u8; 8192];
646 if let Ok(read) = file.read(&mut buffer) {
647 if read == 0 {
648 return false;
649 }
650
651 let slice = &buffer[..read];
652 let mime = tree_magic_mini::from_u8(slice);
653 if !Self::is_textual_mime(mime) {
654 return true;
655 }
656
657 if slice.iter().any(|byte| *byte == 0) {
658 return true;
659 }
660 }
661 }
662
663 Self::detect_binary_by_extension(extension)
664 }
665
666 pub fn detect_binary_from_bytes(bytes: &[u8], extension: Option<&str>) -> bool {
668 if bytes.is_empty() {
669 return false;
670 }
671
672 let mime = tree_magic_mini::from_u8(bytes);
673 if !Self::is_textual_mime(mime) {
674 return true;
675 }
676
677 if bytes.iter().any(|byte| *byte == 0) {
678 return true;
679 }
680
681 extension
682 .map(Self::detect_binary_by_extension)
683 .unwrap_or(false)
684 }
685
686 pub fn detect_binary_by_extension(extension: &str) -> bool {
688 if extension.is_empty() {
689 return false;
690 }
691
692 let trimmed = extension.trim_start_matches('.');
693 let lower = trimmed.to_lowercase();
694 let prefixed = format!(".{}", lower);
695
696 BINARY_EXTENSIONS.contains(&prefixed.as_str())
697 }
698
699 #[inline]
700 fn is_textual_mime(mime: &str) -> bool {
701 let canonical = mime
702 .split(';')
703 .next()
704 .unwrap_or(mime)
705 .trim()
706 .to_ascii_lowercase();
707 let mime = canonical.as_str();
708
709 if mime.starts_with("text/") || mime.starts_with("inode/") || mime.starts_with("message/") {
710 return true;
711 }
712
713 if mime.starts_with("application/") {
714 if TEXTUAL_APPLICATION_MIME_TYPES.contains(&mime) {
715 return true;
716 }
717
718 if TEXTUAL_APPLICATION_KEYWORDS
719 .iter()
720 .any(|keyword| mime.contains(keyword))
721 {
722 return true;
723 }
724 }
725
726 false
727 }
728
729 pub fn classify_file_type(path: &str, language: &Language, extension: &str) -> FileType {
731 let is_binary = Self::detect_binary_by_extension(extension);
732 Self::classify_file_type_with_binary(path, language, extension, is_binary)
733 }
734
735 pub fn classify_file_type_with_binary(
737 path: &str,
738 language: &Language,
739 extension: &str,
740 is_binary: bool,
741 ) -> FileType {
742 let path_lower = path.to_lowercase();
743
744 if is_binary {
745 return FileType::Binary;
746 }
747
748 if is_test_path(Path::new(path)) {
750 return FileType::Test {
751 language: language.clone(),
752 };
753 }
754
755 if language.is_documentation() {
757 let format = match extension {
758 "md" | "markdown" => DocumentationFormat::Markdown,
759 "html" | "htm" => DocumentationFormat::Html,
760 "rst" => DocumentationFormat::Rst,
761 "txt" => DocumentationFormat::PlainText,
762 _ => DocumentationFormat::Markdown,
763 };
764 return FileType::Documentation { format };
765 }
766
767 if language.is_configuration() {
769 let format = match extension {
770 "json" => ConfigurationFormat::Json,
771 "yaml" | "yml" => ConfigurationFormat::Yaml,
772 "toml" => ConfigurationFormat::Toml,
773 "xml" => ConfigurationFormat::Xml,
774 "ini" => ConfigurationFormat::Ini,
775 "env" => ConfigurationFormat::Dotenv,
776 _ => ConfigurationFormat::Json,
777 };
778 return FileType::Configuration { format };
779 }
780
781 if path_lower.contains("generated")
783 || path_lower.contains("build")
784 || path_lower.contains("dist")
785 || path_lower.contains("target")
786 {
787 return FileType::Generated;
788 }
789
790 if language.is_programming() {
792 return FileType::Source {
793 language: language.clone(),
794 };
795 }
796
797 FileType::Unknown
798 }
799
800 pub fn human_size(&self) -> String {
802 bytes_to_human(self.size)
803 }
804
805 pub fn should_include(&self) -> bool {
807 self.decision.should_include()
808 }
809
810 pub fn file_name(&self) -> Option<&str> {
812 self.path.file_name()?.to_str()
813 }
814
815 pub fn file_stem(&self) -> Option<&str> {
817 self.path.file_stem()?.to_str()
818 }
819
820 pub fn extension(&self) -> Option<&str> {
822 self.path.extension()?.to_str()
823 }
824}
825
/// Formats a byte count with binary (IEC) units.
///
/// Values below 1024 are printed as whole bytes (`"512 B"`). Larger values are
/// scaled by powers of 1024 and printed with one decimal place
/// (`"1.5 KiB"`), up to `EiB`, which covers the whole `u64` range.
///
/// A value that would round up to `1024.0` of a unit is shown as `1.0` of the
/// next unit instead, so `1_048_575` bytes yields `"1.0 MiB"` rather than
/// `"1024.0 KiB"`.
pub fn bytes_to_human(bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB"];
    const THRESHOLD: f64 = 1024.0;

    if bytes < 1024 {
        return format!("{} {}", bytes, UNITS[0]);
    }

    let last_unit = UNITS.len() - 1;
    let mut size = bytes as f64;
    let mut unit_idx = 0;

    while size >= THRESHOLD && unit_idx < last_unit {
        size /= THRESHOLD;
        unit_idx += 1;
    }

    // `{:.1}` rounds to one decimal place, which can push a value just below
    // the threshold up to 1024.0; move to the next unit in that case.
    if unit_idx < last_unit && (size * 10.0).round() / 10.0 >= THRESHOLD {
        size /= THRESHOLD;
        unit_idx += 1;
    }

    format!("{:.1} {}", size, UNITS[unit_idx])
}
849
850pub fn detect_language_from_path(path: &Path) -> Language {
852 path.extension()
853 .and_then(|ext| ext.to_str())
854 .map(Language::from_extension)
855 .unwrap_or(Language::Unknown)
856}
857
858pub fn language_display_name(language: &Language) -> &'static str {
860 language.display_name()
861}
862
/// Heuristically decides whether `path` belongs to a test suite.
///
/// All comparisons are case-insensitive. A path is a test path when any of
/// the following holds:
/// - the file name starts with `output.`;
/// - a path component is a test directory name (`tests`, `__tests__`,
///   `e2e`, `spec`, ...);
/// - the file name starts with a test prefix such as `test_`;
/// - the file name, or the file name without its final extension, ends with
///   a test suffix such as `_test` or `_spec`;
/// - the file name contains `.test.` or `.spec.`;
/// - a language-specific naming convention matches (for example
///   `FooTest.java`).
pub fn is_test_path(path: &Path) -> bool {
    const TEST_DIR_MARKERS: &[&str] = &[
        "test",
        "tests",
        "testing",
        "__tests__",
        "integration-tests",
        "integration_test",
        "integrationtests",
        "e2e",
        "qa",
        "spec",
    ];
    const TEST_PREFIXES: &[&str] = &["test_", "spec_", "itest_", "integration_"];
    const TEST_SUFFIXES: &[&str] = &["_test", "_tests", "_spec", "_itest", "_integration", "_e2e"];

    let path_lower = path.to_string_lossy().to_lowercase();
    let file_name = path
        .file_name()
        .and_then(|s| s.to_str())
        .map(|s| s.to_lowercase())
        .unwrap_or_default();

    // NOTE(review): `output.*` files are deliberately reported as test paths;
    // presumably these are the tool's own output files. Confirm with callers.
    if file_name.starts_with("output.") {
        return true;
    }

    // Accept both separator styles so Windows-style paths are split as well.
    let segments: Vec<&str> = path_lower
        .split(|c| c == '/' || c == '\\')
        .filter(|segment| !segment.is_empty())
        .collect();

    if segments
        .iter()
        .any(|segment| TEST_DIR_MARKERS.contains(segment))
    {
        return true;
    }

    if TEST_PREFIXES
        .iter()
        .any(|prefix| file_name.starts_with(prefix))
    {
        return true;
    }

    // The suffix must be checked against the name without its extension:
    // "parser_spec.py" ends with ".py", not "_spec". The full name is still
    // checked so that extension-less files keep matching.
    let stem = match file_name.rfind('.') {
        Some(idx) => &file_name[..idx],
        None => file_name.as_str(),
    };
    if TEST_SUFFIXES
        .iter()
        .any(|suffix| file_name.ends_with(suffix) || stem.ends_with(suffix))
    {
        return true;
    }

    if file_name.contains(".test.") || file_name.contains(".spec.") {
        return true;
    }

    let ext = path
        .extension()
        .and_then(|s| s.to_str())
        .map(|s| s.to_lowercase())
        .unwrap_or_default();

    // Language-specific conventions that the generic rules above do not
    // cover, such as names without an underscore ("FooTest.java").
    match ext.as_str() {
        "rs" => file_name.ends_with("_test.rs") || segments.iter().any(|seg| *seg == "tests"),
        "py" => file_name.starts_with("test_") || file_name.ends_with("_test.py"),
        "go" => file_name.ends_with("_test.go"),
        "java" | "kt" => {
            file_name.ends_with("test.java")
                || file_name.ends_with("tests.java")
                || file_name.ends_with("test.kt")
                || file_name.ends_with("tests.kt")
        }
        "php" => file_name.ends_with("test.php"),
        "rb" => file_name.ends_with("_spec.rb") || file_name.ends_with("_test.rb"),
        "js" | "jsx" | "ts" | "tsx" => {
            file_name.contains(".test.")
                || file_name.contains(".spec.")
                || file_name.ends_with("_test.ts")
        }
        _ => false,
    }
}
947
948pub fn is_entrypoint_path(path: &Path, language: &Language) -> bool {
950 let path_lower = path.to_string_lossy().to_lowercase();
951 let file_name = path
952 .file_name()
953 .and_then(|s| s.to_str())
954 .map(|s| s.to_lowercase())
955 .unwrap_or_default();
956
957 match language {
958 Language::Rust => file_name == "main.rs" || file_name == "lib.rs",
959 Language::Python => {
960 file_name == "main.py"
961 || path_lower.contains("/__main__.py")
962 || path_lower.contains("/manage.py")
963 || file_name == "app.py"
964 || file_name == "__init__.py"
965 }
966 Language::JavaScript | Language::TypeScript => {
967 file_name == "index.js"
968 || file_name == "index.ts"
969 || path_lower.contains("/app.js")
970 || path_lower.contains("/server.js")
971 }
972 Language::Go => file_name == "main.go",
973 Language::Java => file_name == "main.java" || path_lower.contains("/main.java"),
974 _ => file_name.starts_with("main.") || file_name.starts_with("index."),
975 }
976}
977
/// Unit tests for language detection, binary detection, file-type
/// classification, size formatting, token estimation and render decisions.
#[cfg(test)]
mod tests {
    use super::*;

    /// Known extensions map to their language; unknown ones to `Unknown`.
    #[test]
    fn test_language_detection() {
        assert_eq!(Language::from_extension("rs"), Language::Rust);
        assert_eq!(Language::from_extension("py"), Language::Python);
        assert_eq!(Language::from_extension("js"), Language::JavaScript);
        assert_eq!(Language::from_extension("unknown"), Language::Unknown);
    }

    /// Extension-only binary detection accepts dot-less extensions.
    #[test]
    fn test_binary_detection() {
        assert!(FileInfo::detect_binary_by_extension("png"));
        assert!(FileInfo::detect_binary_by_extension("exe"));
        assert!(!FileInfo::detect_binary_by_extension("rs"));
        assert!(!FileInfo::detect_binary_by_extension("py"));
    }

    /// Content-based detection on real temporary files that have no
    /// meaningful extension.
    #[test]
    fn test_detect_binary_magic_on_files() {
        use std::io::Write;
        use tempfile::NamedTempFile;

        // Plain source text must not be reported as binary.
        let mut text_file = NamedTempFile::new().unwrap();
        writeln!(text_file, "fn main() {{ println!(\"hi\"); }}").unwrap();

        assert!(!FileInfo::detect_binary(text_file.path()));

        // A payload containing NUL and non-ASCII bytes must be reported as binary.
        let mut binary_file = NamedTempFile::new().unwrap();
        binary_file
            .write_all(&[0u8, 159, 146, 150, 0, 1, 2])
            .unwrap();

        assert!(FileInfo::detect_binary(binary_file.path()));
    }

    /// Content-based detection on in-memory buffers.
    #[test]
    fn test_detect_binary_from_bytes() {
        let text_bytes = b"#!/usr/bin/env python3\nprint('hello')\n";
        assert!(!FileInfo::detect_binary_from_bytes(text_bytes, Some("py")));

        let binary_bytes = [0u8, 255, 1, 2, 3, 4, 5];
        assert!(FileInfo::detect_binary_from_bytes(&binary_bytes, None));
    }

    /// Source files classify as `Source`, and the language predicates agree.
    #[test]
    fn test_file_type_classification() {
        let rust_lang = Language::Rust;
        let py_lang = Language::Python;
        let md_lang = Language::Markdown;

        assert!(matches!(
            FileInfo::classify_file_type("src/lib.rs", &rust_lang, "rs"),
            FileType::Source { .. }
        ));

        // The same file nested under a `scribe-rs/` directory is still source.
        assert!(matches!(
            FileInfo::classify_file_type("scribe-rs/src/lib.rs", &rust_lang, "rs"),
            FileType::Source { .. }
        ));

        assert!(matches!(
            FileInfo::classify_file_type("script.py", &py_lang, "py"),
            FileType::Source { .. }
        ));

        assert!(rust_lang.is_programming());
        assert!(py_lang.is_programming());
        assert!(!md_lang.is_programming());
    }

    /// End-to-end check: extension -> language -> file type, for source,
    /// unknown and documentation files.
    #[test]
    fn test_integration_file_classification() {
        let rust_lang = Language::from_extension("rs");
        assert_eq!(rust_lang, Language::Rust);
        assert!(rust_lang.is_programming());

        let rust_file_type = FileInfo::classify_file_type("src/lib.rs", &rust_lang, "rs");
        assert!(matches!(rust_file_type, FileType::Source { .. }));

        let py_lang = Language::from_extension("py");
        assert_eq!(py_lang, Language::Python);
        assert!(py_lang.is_programming());

        let py_file_type = FileInfo::classify_file_type("script.py", &py_lang, "py");
        assert!(matches!(py_file_type, FileType::Source { .. }));

        // An unrecognised extension is neither a language nor a file type.
        let unknown_lang = Language::from_extension("xyz");
        assert_eq!(unknown_lang, Language::Unknown);
        assert!(!unknown_lang.is_programming());

        let unknown_file_type = FileInfo::classify_file_type("file.xyz", &unknown_lang, "xyz");
        assert!(matches!(unknown_file_type, FileType::Unknown));

        // Markdown is documentation, not programming.
        let md_lang = Language::from_extension("md");
        assert_eq!(md_lang, Language::Markdown);
        assert!(!md_lang.is_programming());

        let md_file_type = FileInfo::classify_file_type("README.md", &md_lang, "md");
        assert!(matches!(md_file_type, FileType::Documentation { .. }));
    }

    /// Bytes are printed whole; larger units use one decimal place.
    #[test]
    fn test_bytes_to_human() {
        assert_eq!(bytes_to_human(0), "0 B");
        assert_eq!(bytes_to_human(512), "512 B");
        assert_eq!(bytes_to_human(1024), "1.0 KiB");
        assert_eq!(bytes_to_human(1536), "1.5 KiB");
        assert_eq!(bytes_to_human(1048576), "1.0 MiB");
    }

    /// Only bounds are asserted, so the test does not depend on the exact
    /// token count.
    #[test]
    fn test_token_estimation() {
        let content = "Hello world, this is a test.";
        let tokens = FileInfo::estimate_tokens(content);
        assert!(tokens > 0);
        assert!(tokens < 20);
    }

    /// Constructors set the include flag; the category comes from the reason.
    #[test]
    fn test_render_decision() {
        // "valid file" is not a recognised reason code, so it maps to `Other`.
        let include = RenderDecision::include("valid file");
        assert!(include.should_include());
        assert_eq!(include.reason_category(), RenderDecisionCategory::Other);

        let exclude = RenderDecision::exclude("binary").with_context("detected by extension");
        assert!(!exclude.should_include());
        assert_eq!(exclude.reason_category(), RenderDecisionCategory::Binary);
        assert!(exclude.context.is_some());
    }
}