scribe_core/
file.rs

1//! File-related types and utilities.
2//!
3//! Provides comprehensive file metadata structures, language detection,
4//! and file classification utilities for the Scribe analysis pipeline.
5
6use serde::{Deserialize, Serialize};
7use std::fs::File;
8use std::io::Read;
9use std::path::{Path, PathBuf};
10use std::time::SystemTime;
11
12use crate::error::{Result, ScribeError};
13
/// File extensions (lowercase, with a leading dot) whose content is treated as
/// binary and is therefore normally kept out of text analysis.
pub const BINARY_EXTENSIONS: &[&str] = &[
    // Images
    ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".svg", ".ico", ".tiff",
    // Documents
    ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
    // Archives
    ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar",
    // Media
    ".mp3", ".mp4", ".mov", ".avi", ".mkv", ".wav", ".ogg", ".flac",
    // Fonts
    ".ttf", ".otf", ".eot", ".woff", ".woff2",
    // Executables and libraries
    ".so", ".dll", ".dylib", ".class", ".jar", ".exe", ".bin", ".app",
];
25
/// Extensions (lowercase, with a leading dot) recognised as Markdown documents.
pub const MARKDOWN_EXTENSIONS: &[&str] = &[
    ".md",
    ".markdown",
    ".mdown",
    ".mkd",
    ".mkdn",
];
28
/// Exact `application/*` MIME types that are accepted as plain text even
/// though their top-level type is not `text/`.
const TEXTUAL_APPLICATION_MIME_TYPES: &[&str] = &[
    // Structured data and web scripting
    "application/json", "application/ld+json", "application/graphql",
    "application/javascript", "application/x-javascript",
    "application/typescript", "application/x-typescript",
    "application/xml", "application/xhtml+xml",
    // Shell scripts
    "application/x-sh", "application/x-shellscript",
    "application/x-bash", "application/x-zsh",
    // Scripting languages
    "application/x-python", "application/x-ruby", "application/x-perl",
    "application/x-php", "application/x-httpd-php",
    // Configuration and query formats
    "application/x-toml", "application/toml",
    "application/x-yaml", "application/yaml",
    "application/x-sql", "application/sql",
    // Compiled / general-purpose languages
    "application/x-rust", "application/x-go", "application/x-java",
    "application/x-scala", "application/x-kotlin", "application/x-swift",
    "application/x-dart", "application/x-haskell", "application/x-clojure",
    "application/x-ocaml", "application/x-lisp", "application/x-r",
    "application/x-matlab", "application/x-tex",
    // Zero-length files
    "application/x-empty",
];
71
/// Keywords inside MIME subtypes that indicate textual content even if the
/// top-level type is `application/*`.
///
/// Consulted by `FileInfo::is_textual_mime` after the exact-match table
/// `TEXTUAL_APPLICATION_MIME_TYPES` has been checked. Each keyword is listed
/// exactly once.
const TEXTUAL_APPLICATION_KEYWORDS: &[&str] = &[
    // Structured-syntax suffixes and data formats
    "+json",
    "+xml",
    "json",
    "xml",
    "yaml",
    "yml",
    "toml",
    "graphql",
    // Web scripting
    "javascript",
    "typescript",
    "ecmascript",
    // Shells
    "shellscript",
    "shell",
    "bash",
    "zsh",
    "sh",
    // Programming languages
    "python",
    "ruby",
    "perl",
    "php",
    "rust",
    "go",
    "java",
    "scala",
    "kotlin",
    "swift",
    "dart",
    "haskell",
    "clojure",
    "ocaml",
    "lisp",
    "sql",
    "tex",
    "rscript",
    "matlab",
];
112
/// Decision about whether to include a file in analysis
///
/// Built with [`RenderDecision::include`] or [`RenderDecision::exclude`];
/// extra detail can be attached with [`RenderDecision::with_context`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RenderDecision {
    /// Whether to include the file in analysis
    pub include: bool,
    /// Human-readable reason for the decision.
    ///
    /// The exact strings `"ok"`, `"binary"`, `"too_large"`, `"ignored"` and
    /// `"empty"` are mapped to dedicated categories by
    /// [`RenderDecision::reason_category`]; anything else maps to `Other`.
    pub reason: String,
    /// Optional additional context (`None` until `with_context` is called)
    pub context: Option<String>,
}
123
124impl RenderDecision {
125    /// Create a decision to include the file
126    pub fn include<S: Into<String>>(reason: S) -> Self {
127        Self {
128            include: true,
129            reason: reason.into(),
130            context: None,
131        }
132    }
133
134    /// Create a decision to exclude the file
135    pub fn exclude<S: Into<String>>(reason: S) -> Self {
136        Self {
137            include: false,
138            reason: reason.into(),
139            context: None,
140        }
141    }
142
143    /// Add context to the decision
144    pub fn with_context<S: Into<String>>(mut self, context: S) -> Self {
145        self.context = Some(context.into());
146        self
147    }
148
149    /// Check if the file should be included
150    pub fn should_include(&self) -> bool {
151        self.include
152    }
153
154    /// Get the reason as a standard category
155    pub fn reason_category(&self) -> RenderDecisionCategory {
156        match self.reason.as_str() {
157            "ok" => RenderDecisionCategory::Ok,
158            "binary" => RenderDecisionCategory::Binary,
159            "too_large" => RenderDecisionCategory::TooLarge,
160            "ignored" => RenderDecisionCategory::Ignored,
161            "empty" => RenderDecisionCategory::Empty,
162            _ => RenderDecisionCategory::Other,
163        }
164    }
165}
166
/// Standard categories for render decisions
///
/// Returned by [`RenderDecision::reason_category`], which maps the decision's
/// reason string onto one of these variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum RenderDecisionCategory {
    /// Reason string `"ok"`
    Ok,
    /// Reason string `"binary"`
    Binary,
    /// Reason string `"too_large"`
    TooLarge,
    /// Reason string `"ignored"`
    Ignored,
    /// Reason string `"empty"`
    Empty,
    /// Any reason string not listed above
    Other,
}
177
/// Programming language classification
///
/// Values are normally obtained from [`Language::from_extension`]; extensions
/// it does not recognise map to [`Language::Unknown`].
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum Language {
    // Systems languages
    Rust,
    C,
    Cpp,
    Go,
    Zig,

    // Web languages
    JavaScript,
    TypeScript,
    HTML,
    CSS,
    SCSS,
    SASS,

    // Backend languages
    Python,
    Java,
    CSharp,
    Kotlin,
    Scala,
    Ruby,
    PHP,

    // Functional languages
    Haskell,
    OCaml,
    FSharp,
    Erlang,
    Elixir,
    Clojure,

    // Configuration and markup
    JSON,
    YAML,
    TOML,
    XML,
    Markdown,

    // Database
    SQL,

    // Shell and scripts
    Bash,
    PowerShell,
    Batch,

    // Data science
    R,
    Julia,
    /// Never returned by `Language::from_extension`: the ambiguous `m`
    /// extension is resolved to `ObjectiveC` there.
    Matlab,

    // Mobile
    Swift,
    /// Chosen by `Language::from_extension` for both `m` and `mm`.
    ObjectiveC,
    Dart,

    // Other
    /// Fallback for extensions that are not recognised.
    Unknown,
}
241
242impl Language {
243    /// Detect language from file extension
244    pub fn from_extension(ext: &str) -> Self {
245        match ext.to_lowercase().as_str() {
246            "rs" => Language::Rust,
247            "c" | "h" => Language::C,
248            "cpp" | "cxx" | "cc" | "hpp" | "hxx" => Language::Cpp,
249            "go" => Language::Go,
250            "zig" => Language::Zig,
251            "js" | "mjs" | "cjs" => Language::JavaScript,
252            "ts" | "mts" | "cts" => Language::TypeScript,
253            "html" | "htm" => Language::HTML,
254            "css" => Language::CSS,
255            "scss" => Language::SCSS,
256            "sass" => Language::SASS,
257            "py" | "pyi" | "pyw" => Language::Python,
258            "java" => Language::Java,
259            "cs" => Language::CSharp,
260            "kt" | "kts" => Language::Kotlin,
261            "scala" | "sc" => Language::Scala,
262            "rb" => Language::Ruby,
263            "php" => Language::PHP,
264            "hs" | "lhs" => Language::Haskell,
265            "ml" | "mli" => Language::OCaml,
266            "fs" | "fsi" | "fsx" => Language::FSharp,
267            "erl" | "hrl" => Language::Erlang,
268            "ex" | "exs" => Language::Elixir,
269            "clj" | "cljs" | "cljc" => Language::Clojure,
270            "json" => Language::JSON,
271            "yaml" | "yml" => Language::YAML,
272            "toml" => Language::TOML,
273            "xml" => Language::XML,
274            "md" | "markdown" | "mdown" | "mkd" | "mkdn" => Language::Markdown,
275            "sql" => Language::SQL,
276            "sh" | "bash" => Language::Bash,
277            "ps1" | "psm1" | "psd1" => Language::PowerShell,
278            "bat" | "cmd" => Language::Batch,
279            "r" => Language::R,
280            "jl" => Language::Julia,
281            "swift" => Language::Swift,
282            "dart" => Language::Dart,
283            // Handle ambiguous .m extension - could be Matlab or Objective-C
284            // Default to Objective-C as it's more common in modern development
285            "m" | "mm" => Language::ObjectiveC,
286            _ => Language::Unknown,
287        }
288    }
289
290    /// Check if this language is typically used for documentation
291    pub fn is_documentation(&self) -> bool {
292        matches!(self, Language::Markdown | Language::HTML)
293    }
294
295    /// Check if this language is typically used for configuration
296    pub fn is_configuration(&self) -> bool {
297        matches!(
298            self,
299            Language::JSON | Language::YAML | Language::TOML | Language::XML
300        )
301    }
302
303    /// Check if this is a programming language (not markup/config)
304    pub fn is_programming(&self) -> bool {
305        !matches!(
306            self,
307            Language::Markdown
308                | Language::HTML
309                | Language::JSON
310                | Language::YAML
311                | Language::TOML
312                | Language::XML
313                | Language::Unknown
314        )
315    }
316
317    /// Display name used for user-facing messaging
318    pub fn display_name(&self) -> &'static str {
319        match self {
320            Language::Rust => "Rust",
321            Language::C => "C",
322            Language::Cpp => "C++",
323            Language::Go => "Go",
324            Language::Zig => "Zig",
325            Language::JavaScript => "JavaScript",
326            Language::TypeScript => "TypeScript",
327            Language::HTML => "HTML",
328            Language::CSS => "CSS",
329            Language::SCSS => "SCSS",
330            Language::SASS => "SASS",
331            Language::Python => "Python",
332            Language::Java => "Java",
333            Language::CSharp => "C#",
334            Language::Kotlin => "Kotlin",
335            Language::Scala => "Scala",
336            Language::Ruby => "Ruby",
337            Language::PHP => "PHP",
338            Language::Haskell => "Haskell",
339            Language::OCaml => "OCaml",
340            Language::FSharp => "F#",
341            Language::Erlang => "Erlang",
342            Language::Elixir => "Elixir",
343            Language::Clojure => "Clojure",
344            Language::JSON => "JSON",
345            Language::YAML => "YAML",
346            Language::TOML => "TOML",
347            Language::XML => "XML",
348            Language::Markdown => "Markdown",
349            Language::SQL => "SQL",
350            Language::Bash => "Bash",
351            Language::PowerShell => "PowerShell",
352            Language::Batch => "Batch",
353            Language::R => "R",
354            Language::Julia => "Julia",
355            Language::Matlab => "Matlab",
356            Language::Swift => "Swift",
357            Language::ObjectiveC => "Objective-C",
358            Language::Dart => "Dart",
359            Language::Bash => "Bash",
360            Language::Unknown => "Unknown",
361        }
362    }
363
364    /// Get the typical file extensions for this language
365    pub fn extensions(&self) -> &'static [&'static str] {
366        match self {
367            Language::Rust => &["rs"],
368            Language::C => &["c", "h"],
369            Language::Cpp => &["cpp", "cxx", "cc", "hpp", "hxx"],
370            Language::Go => &["go"],
371            Language::Zig => &["zig"],
372            Language::JavaScript => &["js", "mjs", "cjs"],
373            Language::TypeScript => &["ts", "mts", "cts"],
374            Language::HTML => &["html", "htm"],
375            Language::CSS => &["css"],
376            Language::SCSS => &["scss"],
377            Language::SASS => &["sass"],
378            Language::Python => &["py", "pyi", "pyw"],
379            Language::Java => &["java"],
380            Language::CSharp => &["cs"],
381            Language::Kotlin => &["kt", "kts"],
382            Language::Scala => &["scala", "sc"],
383            Language::Ruby => &["rb"],
384            Language::PHP => &["php"],
385            Language::Haskell => &["hs", "lhs"],
386            Language::OCaml => &["ml", "mli"],
387            Language::FSharp => &["fs", "fsi", "fsx"],
388            Language::Erlang => &["erl", "hrl"],
389            Language::Elixir => &["ex", "exs"],
390            Language::Clojure => &["clj", "cljs", "cljc"],
391            Language::JSON => &["json"],
392            Language::YAML => &["yaml", "yml"],
393            Language::TOML => &["toml"],
394            Language::XML => &["xml"],
395            Language::Markdown => &["md", "markdown", "mdown", "mkd", "mkdn"],
396            Language::SQL => &["sql"],
397            Language::Bash => &["sh", "bash"],
398            Language::PowerShell => &["ps1", "psm1", "psd1"],
399            Language::Batch => &["bat", "cmd"],
400            Language::R => &["r"],
401            Language::Julia => &["jl"],
402            Language::Matlab => &["m"], // Note: .m conflicts with Objective-C
403            Language::Swift => &["swift"],
404            Language::ObjectiveC => &["m", "mm"],
405            Language::Dart => &["dart"],
406            Language::Unknown => &[],
407        }
408    }
409}
410
/// File type classification for analysis purposes
///
/// Produced by `FileInfo::classify_file_type` and
/// `FileInfo::classify_file_type_with_binary`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FileType {
    /// Source code files (languages for which `Language::is_programming` holds)
    Source { language: Language },
    /// Documentation files (languages for which `Language::is_documentation` holds)
    Documentation { format: DocumentationFormat },
    /// Configuration files (languages for which `Language::is_configuration` holds)
    Configuration { format: ConfigurationFormat },
    /// Test files (paths accepted by `is_test_path`)
    Test { language: Language },
    /// Binary files that should be excluded
    Binary,
    /// Generated or built files
    Generated,
    /// Unknown or unclassified
    Unknown,
}
429
430impl FileType {
431    pub fn display_label(&self) -> &'static str {
432        match self {
433            FileType::Source { .. } => "Source",
434            FileType::Documentation { .. } => "Documentation",
435            FileType::Configuration { .. } => "Configuration",
436            FileType::Test { .. } => "Test",
437            FileType::Binary => "Binary",
438            FileType::Generated => "Generated",
439            FileType::Unknown => "Unknown",
440        }
441    }
442}
443
/// Documentation format classification
///
/// Payload of `FileType::Documentation`; chosen from the file extension by
/// `FileInfo::classify_file_type_with_binary`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DocumentationFormat {
    /// Markdown; also the fallback when the extension is not matched
    Markdown,
    /// `html` / `htm`
    Html,
    /// `txt`
    PlainText,
    /// `rst`
    Rst,
    // NOTE(review): no visible code path in this file produces `Asciidoc`.
    Asciidoc,
}
453
/// Configuration format classification
///
/// Payload of `FileType::Configuration`; chosen from the file extension by
/// `FileInfo::classify_file_type_with_binary`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ConfigurationFormat {
    /// `json`; also the fallback when the extension is not matched
    Json,
    /// `yaml` / `yml`
    Yaml,
    /// `toml`
    Toml,
    /// `xml`
    Xml,
    /// `ini`
    Ini,
    /// `env`
    Dotenv,
}
464
/// Comprehensive file metadata structure
///
/// Created by `FileInfo::new`, which fills in the file-system and
/// classification fields; the content-derived fields stay `None` until
/// `FileInfo::load_content` succeeds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileInfo {
    /// Absolute path to the file on disk
    pub path: PathBuf,

    /// Path relative to repository root (forward slash separated)
    pub relative_path: String,

    /// File size in bytes
    pub size: u64,

    /// File modification time (`None` if the metadata query for it failed)
    pub modified: Option<SystemTime>,

    /// Analysis decision (include/exclude)
    pub decision: RenderDecision,

    /// Detected file type
    pub file_type: FileType,

    /// Detected programming language
    pub language: Language,

    /// File content (loaded on demand by `load_content`)
    pub content: Option<String>,

    /// Estimated token count for LLM processing (set by `load_content`)
    pub token_estimate: Option<usize>,

    /// Line count (if text file; set by `load_content`)
    pub line_count: Option<usize>,

    /// Character count (if text file; set by `load_content`)
    pub char_count: Option<usize>,

    /// Whether the file is likely binary
    pub is_binary: bool,

    /// Git status information (if available); `new` leaves this as `None`
    pub git_status: Option<GitStatus>,

    /// PageRank centrality score (0.0-1.0, higher means more important);
    /// `new` leaves this as `None`
    pub centrality_score: Option<f64>,
}
510
/// Git status information for a file
///
/// Holds one [`GitFileStatus`] for the working tree and one for the index.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct GitStatus {
    /// Working tree status
    pub working_tree: GitFileStatus,
    /// Index/staging area status
    pub index: GitFileStatus,
}
519
/// Git file status
///
/// Used for both the `working_tree` and the `index` field of [`GitStatus`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum GitFileStatus {
    Unmodified,
    Modified,
    Added,
    Deleted,
    Renamed,
    Copied,
    Unmerged,
    Untracked,
    Ignored,
}
533
534impl FileInfo {
535    /// Create a new FileInfo from a path
536    pub fn new<P: AsRef<Path>>(
537        path: P,
538        relative_path: String,
539        decision: RenderDecision,
540    ) -> Result<Self> {
541        let path = path.as_ref();
542        let metadata = std::fs::metadata(path)
543            .map_err(|e| ScribeError::path_with_source("Failed to read file metadata", path, e))?;
544
545        let size = metadata.len();
546        let modified = metadata.modified().ok();
547
548        let extension = path.extension().and_then(|ext| ext.to_str()).unwrap_or("");
549
550        let language = Language::from_extension(extension);
551        let is_binary = Self::detect_binary_with_hint(path, extension);
552        let file_type =
553            Self::classify_file_type_with_binary(&relative_path, &language, extension, is_binary);
554
555        Ok(Self {
556            path: path.to_path_buf(),
557            relative_path,
558            size,
559            modified,
560            decision,
561            file_type,
562            language,
563            content: None,
564            token_estimate: None,
565            line_count: None,
566            char_count: None,
567            is_binary,
568            git_status: None,
569            centrality_score: None,
570        })
571    }
572
573    /// Load file content and compute statistics
574    pub fn load_content(&mut self) -> Result<()> {
575        if self.is_binary || !self.decision.should_include() {
576            return Ok(());
577        }
578
579        let content = std::fs::read_to_string(&self.path).map_err(|e| {
580            ScribeError::analysis(format!("Failed to read file content: {}", e), &self.path)
581        })?;
582
583        // Compute statistics
584        let line_count = content.lines().count();
585        let char_count = content.chars().count();
586        let token_estimate = Self::estimate_tokens(&content);
587
588        self.content = Some(content);
589        self.line_count = Some(line_count);
590        self.char_count = Some(char_count);
591        self.token_estimate = Some(token_estimate);
592
593        Ok(())
594    }
595
596    /// Estimate token count for LLM processing using tiktoken
597    ///
598    /// This method uses the shared global TokenCounter instance for optimal performance.
599    /// If tiktoken fails, it falls back to the legacy character-based estimation.
600    pub fn estimate_tokens(content: &str) -> usize {
601        use crate::tokenization::{utils, TokenCounter};
602
603        // Use the shared global instance for optimal performance
604        match TokenCounter::global().count_tokens(content) {
605            Ok(tokens) => tokens,
606            Err(_) => {
607                // Fall back to legacy estimation if tiktoken fails
608                utils::estimate_tokens_legacy(content)
609            }
610        }
611    }
612
613    /// Estimate token count for LLM processing with file context
614    ///
615    /// This method uses the file path to apply language-specific multipliers
616    /// for more accurate token estimation.
617    pub fn estimate_tokens_with_path(content: &str, file_path: &std::path::Path) -> usize {
618        use crate::tokenization::TokenCounter;
619
620        // Use the shared global instance for optimal performance
621        match TokenCounter::global().estimate_file_tokens(content, file_path) {
622            Ok(tokens) => tokens,
623            Err(_) => Self::estimate_tokens(content), // Fall back to basic estimation
624        }
625    }
626
627    /// Detect whether a file is binary using libmagic-compatible signatures with
628    /// sensible fallbacks for small or unknown files.
629    pub fn detect_binary(path: &Path) -> bool {
630        let extension = path.extension().and_then(|ext| ext.to_str()).unwrap_or("");
631        Self::detect_binary_with_hint(path, extension)
632    }
633
634    /// Detect whether a file is binary, allowing the caller to provide an
635    /// extension hint for fallback heuristics.
636    pub fn detect_binary_with_hint(path: &Path, extension: &str) -> bool {
637        if let Some(mime) = tree_magic_mini::from_filepath(path) {
638            if !Self::is_textual_mime(mime) {
639                return true;
640            }
641            return false;
642        }
643
644        if let Ok(mut file) = File::open(path) {
645            let mut buffer = [0u8; 8192];
646            if let Ok(read) = file.read(&mut buffer) {
647                if read == 0 {
648                    return false;
649                }
650
651                let slice = &buffer[..read];
652                let mime = tree_magic_mini::from_u8(slice);
653                if !Self::is_textual_mime(mime) {
654                    return true;
655                }
656
657                if slice.iter().any(|byte| *byte == 0) {
658                    return true;
659                }
660            }
661        }
662
663        Self::detect_binary_by_extension(extension)
664    }
665
666    /// Detect whether in-memory content represents a binary file.
667    pub fn detect_binary_from_bytes(bytes: &[u8], extension: Option<&str>) -> bool {
668        if bytes.is_empty() {
669            return false;
670        }
671
672        let mime = tree_magic_mini::from_u8(bytes);
673        if !Self::is_textual_mime(mime) {
674            return true;
675        }
676
677        if bytes.iter().any(|byte| *byte == 0) {
678            return true;
679        }
680
681        extension
682            .map(Self::detect_binary_by_extension)
683            .unwrap_or(false)
684    }
685
686    /// Check if file extension indicates binary content (fallback heuristic).
687    pub fn detect_binary_by_extension(extension: &str) -> bool {
688        if extension.is_empty() {
689            return false;
690        }
691
692        let trimmed = extension.trim_start_matches('.');
693        let lower = trimmed.to_lowercase();
694        let prefixed = format!(".{}", lower);
695
696        BINARY_EXTENSIONS.contains(&prefixed.as_str())
697    }
698
699    #[inline]
700    fn is_textual_mime(mime: &str) -> bool {
701        let canonical = mime
702            .split(';')
703            .next()
704            .unwrap_or(mime)
705            .trim()
706            .to_ascii_lowercase();
707        let mime = canonical.as_str();
708
709        if mime.starts_with("text/") || mime.starts_with("inode/") || mime.starts_with("message/") {
710            return true;
711        }
712
713        if mime.starts_with("application/") {
714            if TEXTUAL_APPLICATION_MIME_TYPES.contains(&mime) {
715                return true;
716            }
717
718            if TEXTUAL_APPLICATION_KEYWORDS
719                .iter()
720                .any(|keyword| mime.contains(keyword))
721            {
722                return true;
723            }
724        }
725
726        false
727    }
728
729    /// Classify file type based on path and language
730    pub fn classify_file_type(path: &str, language: &Language, extension: &str) -> FileType {
731        let is_binary = Self::detect_binary_by_extension(extension);
732        Self::classify_file_type_with_binary(path, language, extension, is_binary)
733    }
734
735    /// Classify file type when the binary state is already known.
736    pub fn classify_file_type_with_binary(
737        path: &str,
738        language: &Language,
739        extension: &str,
740        is_binary: bool,
741    ) -> FileType {
742        let path_lower = path.to_lowercase();
743
744        if is_binary {
745            return FileType::Binary;
746        }
747
748        // Test files
749        if is_test_path(Path::new(path)) {
750            return FileType::Test {
751                language: language.clone(),
752            };
753        }
754
755        // Documentation
756        if language.is_documentation() {
757            let format = match extension {
758                "md" | "markdown" => DocumentationFormat::Markdown,
759                "html" | "htm" => DocumentationFormat::Html,
760                "rst" => DocumentationFormat::Rst,
761                "txt" => DocumentationFormat::PlainText,
762                _ => DocumentationFormat::Markdown,
763            };
764            return FileType::Documentation { format };
765        }
766
767        // Configuration
768        if language.is_configuration() {
769            let format = match extension {
770                "json" => ConfigurationFormat::Json,
771                "yaml" | "yml" => ConfigurationFormat::Yaml,
772                "toml" => ConfigurationFormat::Toml,
773                "xml" => ConfigurationFormat::Xml,
774                "ini" => ConfigurationFormat::Ini,
775                "env" => ConfigurationFormat::Dotenv,
776                _ => ConfigurationFormat::Json,
777            };
778            return FileType::Configuration { format };
779        }
780
781        // Generated files (common patterns)
782        if path_lower.contains("generated")
783            || path_lower.contains("build")
784            || path_lower.contains("dist")
785            || path_lower.contains("target")
786        {
787            return FileType::Generated;
788        }
789
790        // Source code
791        if language.is_programming() {
792            return FileType::Source {
793                language: language.clone(),
794            };
795        }
796
797        FileType::Unknown
798    }
799
    /// Get human-readable size
    ///
    /// Formats `self.size` with [`bytes_to_human`].
    pub fn human_size(&self) -> String {
        bytes_to_human(self.size)
    }
804
    /// Check if file should be included in analysis
    ///
    /// Forwards to `RenderDecision::should_include` on `self.decision`.
    pub fn should_include(&self) -> bool {
        self.decision.should_include()
    }
809
810    /// Get file name (last component of path)
811    pub fn file_name(&self) -> Option<&str> {
812        self.path.file_name()?.to_str()
813    }
814
815    /// Get file stem (name without extension)
816    pub fn file_stem(&self) -> Option<&str> {
817        self.path.file_stem()?.to_str()
818    }
819
820    /// Get file extension
821    pub fn extension(&self) -> Option<&str> {
822        self.path.extension()?.to_str()
823    }
824}
825
/// Render a byte count using binary (1024-based) units.
///
/// Counts below 1024 are printed exactly, e.g. `"512 B"`. Larger counts are
/// scaled to the biggest fitting unit up to TiB and printed with one decimal
/// place, e.g. `"1.5 KiB"`.
pub fn bytes_to_human(bytes: u64) -> String {
    const UNITS: [&str; 5] = ["B", "KiB", "MiB", "GiB", "TiB"];
    const STEP: f64 = 1024.0;

    // Plain byte counts keep their integer formatting.
    if bytes < 1024 {
        return format!("{} {}", bytes, UNITS[0]);
    }

    let mut scaled = bytes as f64;
    let mut unit = UNITS[0];
    for candidate in &UNITS[1..] {
        if scaled < STEP {
            break;
        }
        scaled /= STEP;
        unit = candidate;
    }

    format!("{:.1} {}", scaled, unit)
}
849
850/// Detect language from a file path based on extension or special names
851pub fn detect_language_from_path(path: &Path) -> Language {
852    path.extension()
853        .and_then(|ext| ext.to_str())
854        .map(Language::from_extension)
855        .unwrap_or(Language::Unknown)
856}
857
/// Convenience helper returning the human-friendly language name
///
/// Free-function form of [`Language::display_name`].
pub fn language_display_name(language: &Language) -> &'static str {
    language.display_name()
}
862
/// Heuristic test-file detection based on path segments and naming conventions.
///
/// All comparisons are case-insensitive. A path is considered a test when any
/// of the following holds:
///
/// * the file name starts with `output.`;
/// * any path segment (split on `/` or `\`) is a well-known test directory
///   name such as `tests`, `__tests__`, `e2e` or `spec`;
/// * the file name starts with `test_`, `spec_`, `itest_` or `integration_`;
/// * the file stem (name without its final extension) ends with `_test`,
///   `_tests`, `_spec`, `_itest`, `_integration` or `_e2e`;
/// * the file name contains `.test.` or `.spec.`;
/// * the file is Java/Kotlin with a stem ending in `test`/`tests`
///   (`FooTest.java`), or PHP with a stem ending in `test`.
///
/// The suffix rule is applied to the stem, not the full file name, so that
/// it also fires for files that have an extension (`parser_e2e.py`,
/// `widget_spec.c`).
pub fn is_test_path(path: &Path) -> bool {
    const TEST_DIR_MARKERS: &[&str] = &[
        "test",
        "tests",
        "testing",
        "__tests__",
        "integration-tests",
        "integration_test",
        "integrationtests",
        "e2e",
        "qa",
        "spec",
    ];
    const TEST_PREFIXES: &[&str] = &["test_", "spec_", "itest_", "integration_"];
    const TEST_SUFFIXES: &[&str] = &["_test", "_tests", "_spec", "_itest", "_integration", "_e2e"];

    let path_lower = path.to_string_lossy().to_lowercase();
    let file_name = path
        .file_name()
        .and_then(|s| s.to_str())
        .map(|s| s.to_lowercase())
        .unwrap_or_default();

    if file_name.starts_with("output.") {
        return true;
    }

    // Segments include the file name itself, so a file literally called
    // `test` or `spec` matches as well.
    let in_test_dir = path_lower
        .split(|c| c == '/' || c == '\\')
        .filter(|segment| !segment.is_empty())
        .any(|segment| TEST_DIR_MARKERS.contains(&segment));
    if in_test_dir {
        return true;
    }

    if TEST_PREFIXES
        .iter()
        .any(|prefix| file_name.starts_with(prefix))
    {
        return true;
    }

    let stem = path
        .file_stem()
        .and_then(|s| s.to_str())
        .map(|s| s.to_lowercase())
        .unwrap_or_default();

    // Covers `foo_test.rs`, `foo_test.go`, `foo_spec.rb`, `foo_test.ts`, ...
    // as well as extension-less names such as `foo_test`.
    if TEST_SUFFIXES.iter().any(|suffix| stem.ends_with(suffix)) {
        return true;
    }

    if file_name.contains(".test.") || file_name.contains(".spec.") {
        return true;
    }

    let ext = path
        .extension()
        .and_then(|s| s.to_str())
        .map(|s| s.to_lowercase())
        .unwrap_or_default();

    // Conventions that put `Test` directly after the class name, without an
    // underscore. Other languages are fully handled by the rules above.
    match ext.as_str() {
        "java" | "kt" => stem.ends_with("test") || stem.ends_with("tests"),
        "php" => stem.ends_with("test"),
        _ => false,
    }
}
947
948/// Heuristic entrypoint detection based on common file names per language
949pub fn is_entrypoint_path(path: &Path, language: &Language) -> bool {
950    let path_lower = path.to_string_lossy().to_lowercase();
951    let file_name = path
952        .file_name()
953        .and_then(|s| s.to_str())
954        .map(|s| s.to_lowercase())
955        .unwrap_or_default();
956
957    match language {
958        Language::Rust => file_name == "main.rs" || file_name == "lib.rs",
959        Language::Python => {
960            file_name == "main.py"
961                || path_lower.contains("/__main__.py")
962                || path_lower.contains("/manage.py")
963                || file_name == "app.py"
964                || file_name == "__init__.py"
965        }
966        Language::JavaScript | Language::TypeScript => {
967            file_name == "index.js"
968                || file_name == "index.ts"
969                || path_lower.contains("/app.js")
970                || path_lower.contains("/server.js")
971        }
972        Language::Go => file_name == "main.go",
973        Language::Java => file_name == "main.java" || path_lower.contains("/main.java"),
974        _ => file_name.starts_with("main.") || file_name.starts_with("index."),
975    }
976}
977
#[cfg(test)]
mod tests {
    use super::*;

    /// Extensions resolve to the expected `Language` variant, and an
    /// unrecognised extension resolves to `Language::Unknown`.
    #[test]
    fn test_language_detection() {
        let cases = [
            ("rs", Language::Rust),
            ("py", Language::Python),
            ("js", Language::JavaScript),
            ("unknown", Language::Unknown),
        ];

        for (extension, expected) in cases {
            assert_eq!(Language::from_extension(extension), expected);
        }
    }

    /// Extension-only binary detection flags image/executable extensions and
    /// leaves source extensions alone.
    #[test]
    fn test_binary_detection() {
        for binary_ext in ["png", "exe"] {
            assert!(FileInfo::detect_binary_by_extension(binary_ext));
        }

        for text_ext in ["rs", "py"] {
            assert!(!FileInfo::detect_binary_by_extension(text_ext));
        }
    }

    /// Content-based detection on real temporary files: a line of source text
    /// is not binary, a byte blob containing NULs is.
    #[test]
    fn test_detect_binary_magic_on_files() {
        use std::io::Write;
        use tempfile::NamedTempFile;

        // Plain source text on disk.
        let mut source_like = NamedTempFile::new().unwrap();
        writeln!(source_like, "fn main() {{ println!(\"hi\"); }}").unwrap();
        assert!(!FileInfo::detect_binary(source_like.path()));

        // Raw bytes on disk.
        let raw_bytes: [u8; 7] = [0, 159, 146, 150, 0, 1, 2];
        let mut blob = NamedTempFile::new().unwrap();
        blob.write_all(&raw_bytes).unwrap();
        assert!(FileInfo::detect_binary(blob.path()));
    }

    /// In-memory detection: a shebang script is text, a NUL-prefixed buffer
    /// without an extension hint is binary.
    #[test]
    fn test_detect_binary_from_bytes() {
        let script = b"#!/usr/bin/env python3\nprint('hello')\n";
        assert!(!FileInfo::detect_binary_from_bytes(script, Some("py")));

        let blob = [0u8, 255, 1, 2, 3, 4, 5];
        assert!(FileInfo::detect_binary_from_bytes(&blob, None));
    }

    /// Programming-language files classify as `FileType::Source`, and
    /// `is_programming` separates code languages from Markdown.
    #[test]
    fn test_file_type_classification() {
        let (rust, python, markdown) = (Language::Rust, Language::Python, Language::Markdown);

        // Rust sources, with and without a leading crate directory.
        for rust_path in ["src/lib.rs", "scribe-rs/src/lib.rs"] {
            let file_type = FileInfo::classify_file_type(rust_path, &rust, "rs");
            assert!(matches!(file_type, FileType::Source { .. }));
        }

        // Python source.
        let script_type = FileInfo::classify_file_type("script.py", &python, "py");
        assert!(matches!(script_type, FileType::Source { .. }));

        // Language-level predicate.
        assert!(rust.is_programming());
        assert!(python.is_programming());
        assert!(!markdown.is_programming());
    }

    /// End-to-end check of the extension -> language -> file type pipeline.
    #[test]
    fn test_integration_file_classification() {
        // Rust
        let rust = Language::from_extension("rs");
        assert_eq!(rust, Language::Rust);
        assert!(rust.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("src/lib.rs", &rust, "rs"),
            FileType::Source { .. }
        ));

        // Python
        let python = Language::from_extension("py");
        assert_eq!(python, Language::Python);
        assert!(python.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("script.py", &python, "py"),
            FileType::Source { .. }
        ));

        // An unknown extension must not be promoted to Source.
        let unknown = Language::from_extension("xyz");
        assert_eq!(unknown, Language::Unknown);
        assert!(!unknown.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("file.xyz", &unknown, "xyz"),
            FileType::Unknown
        ));

        // Markdown is documentation, not code.
        let markdown = Language::from_extension("md");
        assert_eq!(markdown, Language::Markdown);
        assert!(!markdown.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("README.md", &markdown, "md"),
            FileType::Documentation { .. }
        ));
    }

    /// Byte counts render with binary (KiB/MiB) units and one decimal place
    /// above the plain-byte range.
    #[test]
    fn test_bytes_to_human() {
        let cases = [
            (0, "0 B"),
            (512, "512 B"),
            (1024, "1.0 KiB"),
            (1536, "1.5 KiB"),
            (1048576, "1.0 MiB"),
        ];

        for (byte_count, rendered) in cases {
            assert_eq!(bytes_to_human(byte_count), rendered);
        }
    }

    /// A short sentence yields a positive, small token estimate.
    #[test]
    fn test_token_estimation() {
        let sample = "Hello world, this is a test.";
        let estimate = FileInfo::estimate_tokens(sample);

        assert!(estimate > 0);
        // Upper bound keeps the estimate in a plausible range for this sentence.
        assert!(estimate < 20);
    }

    /// Include/exclude decisions report the right inclusion flag, reason
    /// category, and optional context.
    #[test]
    fn test_render_decision() {
        let kept = RenderDecision::include("valid file");
        assert!(kept.should_include());
        assert_eq!(kept.reason_category(), RenderDecisionCategory::Other);

        let dropped = RenderDecision::exclude("binary").with_context("detected by extension");
        assert!(!dropped.should_include());
        assert_eq!(dropped.reason_category(), RenderDecisionCategory::Binary);
        assert!(dropped.context.is_some());
    }
}