scribe_core/
file.rs

1//! File-related types and utilities.
2//!
3//! Provides comprehensive file metadata structures, language detection,
4//! and file classification utilities for the Scribe analysis pipeline.
5
6use std::path::{Path, PathBuf};
7use std::time::SystemTime;
8use serde::{Deserialize, Serialize};
9
10use crate::error::{Result, ScribeError};
11
/// Extensions (lowercase, with the leading dot) that mark a file as binary.
///
/// Files carrying one of these extensions are normally left out of text
/// analysis.
pub const BINARY_EXTENSIONS: &[&str] = &[
    // Image formats
    ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".svg", ".ico", ".tiff",
    // Office and portable documents
    ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
    // Archives and compressed streams
    ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar",
    // Audio and video
    ".mp3", ".mp4", ".mov", ".avi", ".mkv", ".wav", ".ogg", ".flac",
    // Font files
    ".ttf", ".otf", ".eot", ".woff", ".woff2",
    // Executables, bytecode and shared libraries
    ".so", ".dll", ".dylib", ".class", ".jar", ".exe", ".bin", ".app",
];
27
/// Extensions (lowercase, with the leading dot) recognised as Markdown files.
pub const MARKDOWN_EXTENSIONS: &[&str] = &[
    ".md",
    ".markdown",
    ".mdown",
    ".mkd",
    ".mkdn",
];
30
31/// Decision about whether to include a file in analysis
32#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
33pub struct RenderDecision {
34    /// Whether to include the file in analysis
35    pub include: bool,
36    /// Human-readable reason for the decision
37    pub reason: String,
38    /// Optional additional context
39    pub context: Option<String>,
40}
41
42impl RenderDecision {
43    /// Create a decision to include the file
44    pub fn include<S: Into<String>>(reason: S) -> Self {
45        Self {
46            include: true,
47            reason: reason.into(),
48            context: None,
49        }
50    }
51
52    /// Create a decision to exclude the file
53    pub fn exclude<S: Into<String>>(reason: S) -> Self {
54        Self {
55            include: false,
56            reason: reason.into(),
57            context: None,
58        }
59    }
60
61    /// Add context to the decision
62    pub fn with_context<S: Into<String>>(mut self, context: S) -> Self {
63        self.context = Some(context.into());
64        self
65    }
66
67    /// Check if the file should be included
68    pub fn should_include(&self) -> bool {
69        self.include
70    }
71
72    /// Get the reason as a standard category
73    pub fn reason_category(&self) -> RenderDecisionCategory {
74        match self.reason.as_str() {
75            "ok" => RenderDecisionCategory::Ok,
76            "binary" => RenderDecisionCategory::Binary,
77            "too_large" => RenderDecisionCategory::TooLarge,
78            "ignored" => RenderDecisionCategory::Ignored,
79            "empty" => RenderDecisionCategory::Empty,
80            _ => RenderDecisionCategory::Other,
81        }
82    }
83}
84
85/// Standard categories for render decisions
86#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
87pub enum RenderDecisionCategory {
88    Ok,
89    Binary,
90    TooLarge,
91    Ignored,
92    Empty,
93    Other,
94}
95
96/// Programming language classification
97#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
98pub enum Language {
99    // Systems languages
100    Rust,
101    C,
102    Cpp,
103    Go,
104    Zig,
105    
106    // Web languages
107    JavaScript,
108    TypeScript,
109    HTML,
110    CSS,
111    SCSS,
112    SASS,
113    
114    // Backend languages
115    Python,
116    Java,
117    CSharp,
118    Kotlin,
119    Scala,
120    Ruby,
121    PHP,
122    
123    // Functional languages
124    Haskell,
125    OCaml,
126    FSharp,
127    Erlang,
128    Elixir,
129    Clojure,
130    
131    // Configuration and markup
132    JSON,
133    YAML,
134    TOML,
135    XML,
136    Markdown,
137    
138    // Database
139    SQL,
140    
141    // Shell and scripts
142    Bash,
143    PowerShell,
144    Batch,
145    
146    // Data science
147    R,
148    Julia,
149    Matlab,
150    
151    // Mobile
152    Swift,
153    ObjectiveC,
154    Dart,
155    
156    // Other
157    Unknown,
158}
159
160impl Language {
161    /// Detect language from file extension
162    pub fn from_extension(ext: &str) -> Self {
163        match ext.to_lowercase().as_str() {
164            "rs" => Language::Rust,
165            "c" | "h" => Language::C,
166            "cpp" | "cxx" | "cc" | "hpp" | "hxx" => Language::Cpp,
167            "go" => Language::Go,
168            "zig" => Language::Zig,
169            "js" | "mjs" | "cjs" => Language::JavaScript,
170            "ts" | "mts" | "cts" => Language::TypeScript,
171            "html" | "htm" => Language::HTML,
172            "css" => Language::CSS,
173            "scss" => Language::SCSS,
174            "sass" => Language::SASS,
175            "py" | "pyi" | "pyw" => Language::Python,
176            "java" => Language::Java,
177            "cs" => Language::CSharp,
178            "kt" | "kts" => Language::Kotlin,
179            "scala" | "sc" => Language::Scala,
180            "rb" => Language::Ruby,
181            "php" => Language::PHP,
182            "hs" | "lhs" => Language::Haskell,
183            "ml" | "mli" => Language::OCaml,
184            "fs" | "fsi" | "fsx" => Language::FSharp,
185            "erl" | "hrl" => Language::Erlang,
186            "ex" | "exs" => Language::Elixir,
187            "clj" | "cljs" | "cljc" => Language::Clojure,
188            "json" => Language::JSON,
189            "yaml" | "yml" => Language::YAML,
190            "toml" => Language::TOML,
191            "xml" => Language::XML,
192            "md" | "markdown" | "mdown" | "mkd" | "mkdn" => Language::Markdown,
193            "sql" => Language::SQL,
194            "sh" | "bash" => Language::Bash,
195            "ps1" | "psm1" | "psd1" => Language::PowerShell,
196            "bat" | "cmd" => Language::Batch,
197            "r" => Language::R,
198            "jl" => Language::Julia,
199            "swift" => Language::Swift,
200            "dart" => Language::Dart,
201            // Handle ambiguous .m extension - could be Matlab or Objective-C
202            // Default to Objective-C as it's more common in modern development
203            "m" | "mm" => Language::ObjectiveC,
204            _ => Language::Unknown,
205        }
206    }
207
208    /// Check if this language is typically used for documentation
209    pub fn is_documentation(&self) -> bool {
210        matches!(self, Language::Markdown | Language::HTML)
211    }
212
213    /// Check if this language is typically used for configuration
214    pub fn is_configuration(&self) -> bool {
215        matches!(
216            self,
217            Language::JSON | Language::YAML | Language::TOML | Language::XML
218        )
219    }
220
221    /// Check if this is a programming language (not markup/config)
222    pub fn is_programming(&self) -> bool {
223        !matches!(
224            self,
225            Language::Markdown
226                | Language::HTML
227                | Language::JSON
228                | Language::YAML
229                | Language::TOML
230                | Language::XML
231                | Language::Unknown
232        )
233    }
234
235    /// Get the typical file extensions for this language
236    pub fn extensions(&self) -> &'static [&'static str] {
237        match self {
238            Language::Rust => &["rs"],
239            Language::C => &["c", "h"],
240            Language::Cpp => &["cpp", "cxx", "cc", "hpp", "hxx"],
241            Language::Go => &["go"],
242            Language::Zig => &["zig"],
243            Language::JavaScript => &["js", "mjs", "cjs"],
244            Language::TypeScript => &["ts", "mts", "cts"],
245            Language::HTML => &["html", "htm"],
246            Language::CSS => &["css"],
247            Language::SCSS => &["scss"],
248            Language::SASS => &["sass"],
249            Language::Python => &["py", "pyi", "pyw"],
250            Language::Java => &["java"],
251            Language::CSharp => &["cs"],
252            Language::Kotlin => &["kt", "kts"],
253            Language::Scala => &["scala", "sc"],
254            Language::Ruby => &["rb"],
255            Language::PHP => &["php"],
256            Language::Haskell => &["hs", "lhs"],
257            Language::OCaml => &["ml", "mli"],
258            Language::FSharp => &["fs", "fsi", "fsx"],
259            Language::Erlang => &["erl", "hrl"],
260            Language::Elixir => &["ex", "exs"],
261            Language::Clojure => &["clj", "cljs", "cljc"],
262            Language::JSON => &["json"],
263            Language::YAML => &["yaml", "yml"],
264            Language::TOML => &["toml"],
265            Language::XML => &["xml"],
266            Language::Markdown => &["md", "markdown", "mdown", "mkd", "mkdn"],
267            Language::SQL => &["sql"],
268            Language::Bash => &["sh", "bash"],
269            Language::PowerShell => &["ps1", "psm1", "psd1"],
270            Language::Batch => &["bat", "cmd"],
271            Language::R => &["r"],
272            Language::Julia => &["jl"],
273            Language::Matlab => &["m"], // Note: .m conflicts with Objective-C
274            Language::Swift => &["swift"],
275            Language::ObjectiveC => &["m", "mm"],
276            Language::Dart => &["dart"],
277            Language::Unknown => &[],
278        }
279    }
280}
281
282/// File type classification for analysis purposes
283#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
284pub enum FileType {
285    /// Source code files
286    Source { language: Language },
287    /// Documentation files
288    Documentation { format: DocumentationFormat },
289    /// Configuration files
290    Configuration { format: ConfigurationFormat },
291    /// Test files
292    Test { language: Language },
293    /// Binary files that should be excluded
294    Binary,
295    /// Generated or built files
296    Generated,
297    /// Unknown or unclassified
298    Unknown,
299}
300
301/// Documentation format classification
302#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
303pub enum DocumentationFormat {
304    Markdown,
305    Html,
306    PlainText,
307    Rst,
308    Asciidoc,
309}
310
311/// Configuration format classification
312#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
313pub enum ConfigurationFormat {
314    Json,
315    Yaml,
316    Toml,
317    Xml,
318    Ini,
319    Dotenv,
320}
321
322/// Comprehensive file metadata structure
323#[derive(Debug, Clone, Serialize, Deserialize)]
324pub struct FileInfo {
325    /// Absolute path to the file on disk
326    pub path: PathBuf,
327    
328    /// Path relative to repository root (forward slash separated)
329    pub relative_path: String,
330    
331    /// File size in bytes
332    pub size: u64,
333    
334    /// File modification time
335    pub modified: Option<SystemTime>,
336    
337    /// Analysis decision (include/exclude)
338    pub decision: RenderDecision,
339    
340    /// Detected file type
341    pub file_type: FileType,
342    
343    /// Detected programming language
344    pub language: Language,
345    
346    /// File content (loaded on demand)
347    pub content: Option<String>,
348    
349    /// Estimated token count for LLM processing
350    pub token_estimate: Option<usize>,
351    
352    /// Line count (if text file)
353    pub line_count: Option<usize>,
354    
355    /// Character count (if text file)
356    pub char_count: Option<usize>,
357    
358    /// Whether the file is likely binary
359    pub is_binary: bool,
360    
361    /// Git status information (if available)
362    pub git_status: Option<GitStatus>,
363}
364
365/// Git status information for a file
366#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
367pub struct GitStatus {
368    /// Working tree status
369    pub working_tree: GitFileStatus,
370    /// Index/staging area status
371    pub index: GitFileStatus,
372}
373
374/// Git file status
375#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
376pub enum GitFileStatus {
377    Unmodified,
378    Modified,
379    Added,
380    Deleted,
381    Renamed,
382    Copied,
383    Unmerged,
384    Untracked,
385    Ignored,
386}
387
388impl FileInfo {
389    /// Create a new FileInfo from a path
390    pub fn new<P: AsRef<Path>>(
391        path: P,
392        relative_path: String,
393        decision: RenderDecision,
394    ) -> Result<Self> {
395        let path = path.as_ref();
396        let metadata = std::fs::metadata(path)
397            .map_err(|e| ScribeError::path_with_source("Failed to read file metadata", path, e))?;
398
399        let size = metadata.len();
400        let modified = metadata.modified().ok();
401        
402        let extension = path
403            .extension()
404            .and_then(|ext| ext.to_str())
405            .unwrap_or("");
406        
407        let language = Language::from_extension(extension);
408        let is_binary = Self::detect_binary_by_extension(extension);
409        let file_type = Self::classify_file_type(&relative_path, &language, extension);
410
411        Ok(Self {
412            path: path.to_path_buf(),
413            relative_path,
414            size,
415            modified,
416            decision,
417            file_type,
418            language,
419            content: None,
420            token_estimate: None,
421            line_count: None,
422            char_count: None,
423            is_binary,
424            git_status: None,
425        })
426    }
427
428    /// Load file content and compute statistics
429    pub fn load_content(&mut self) -> Result<()> {
430        if self.is_binary || !self.decision.should_include() {
431            return Ok(());
432        }
433
434        let content = std::fs::read_to_string(&self.path)
435            .map_err(|e| ScribeError::analysis(
436                format!("Failed to read file content: {}", e), 
437                &self.path
438            ))?;
439
440        // Compute statistics
441        let line_count = content.lines().count();
442        let char_count = content.chars().count();
443        let token_estimate = Self::estimate_tokens(&content);
444
445        self.content = Some(content);
446        self.line_count = Some(line_count);
447        self.char_count = Some(char_count);
448        self.token_estimate = Some(token_estimate);
449
450        Ok(())
451    }
452
453    /// Estimate token count for LLM processing (rough approximation)
454    pub fn estimate_tokens(content: &str) -> usize {
455        // Rough estimate: ~4 characters per token for English text
456        // This varies significantly by content type and tokenizer
457        (content.chars().count() as f64 / 4.0).ceil() as usize
458    }
459
460    /// Check if file extension indicates binary content
461    pub fn detect_binary_by_extension(extension: &str) -> bool {
462        BINARY_EXTENSIONS.contains(&format!(".{}", extension.to_lowercase()).as_str())
463    }
464
465    /// Classify file type based on path and language
466    pub fn classify_file_type(path: &str, language: &Language, extension: &str) -> FileType {
467        let path_lower = path.to_lowercase();
468        
469        // Test files
470        if path_lower.contains("test") || path_lower.contains("spec") {
471            return FileType::Test { 
472                language: language.clone() 
473            };
474        }
475
476        // Documentation
477        if language.is_documentation() {
478            let format = match extension {
479                "md" | "markdown" => DocumentationFormat::Markdown,
480                "html" | "htm" => DocumentationFormat::Html,
481                "rst" => DocumentationFormat::Rst,
482                "txt" => DocumentationFormat::PlainText,
483                _ => DocumentationFormat::Markdown,
484            };
485            return FileType::Documentation { format };
486        }
487
488        // Configuration
489        if language.is_configuration() {
490            let format = match extension {
491                "json" => ConfigurationFormat::Json,
492                "yaml" | "yml" => ConfigurationFormat::Yaml,
493                "toml" => ConfigurationFormat::Toml,
494                "xml" => ConfigurationFormat::Xml,
495                "ini" => ConfigurationFormat::Ini,
496                "env" => ConfigurationFormat::Dotenv,
497                _ => ConfigurationFormat::Json,
498            };
499            return FileType::Configuration { format };
500        }
501
502        // Binary files
503        if Self::detect_binary_by_extension(extension) {
504            return FileType::Binary;
505        }
506
507        // Generated files (common patterns)
508        if path_lower.contains("generated") || 
509           path_lower.contains("build") ||
510           path_lower.contains("dist") ||
511           path_lower.contains("target") {
512            return FileType::Generated;
513        }
514
515        // Source code
516        if language.is_programming() {
517            return FileType::Source { 
518                language: language.clone() 
519            };
520        }
521
522        FileType::Unknown
523    }
524
525    /// Get human-readable size
526    pub fn human_size(&self) -> String {
527        bytes_to_human(self.size)
528    }
529
530    /// Check if file should be included in analysis
531    pub fn should_include(&self) -> bool {
532        self.decision.should_include()
533    }
534
535    /// Get file name (last component of path)
536    pub fn file_name(&self) -> Option<&str> {
537        self.path.file_name()?.to_str()
538    }
539
540    /// Get file stem (name without extension)
541    pub fn file_stem(&self) -> Option<&str> {
542        self.path.file_stem()?.to_str()
543    }
544
545    /// Get file extension
546    pub fn extension(&self) -> Option<&str> {
547        self.path.extension()?.to_str()
548    }
549}
550
/// Convert a byte count to a human-readable string using binary units.
///
/// Values below 1024 are printed exactly (`"512 B"`). Larger values are scaled
/// to KiB, MiB, GiB or TiB and printed with one decimal place
/// (`"1.5 KiB"`). TiB is the largest unit, so very large inputs keep growing
/// in TiB.
///
/// A value that would round up to `1024.0` of a unit is promoted to the next
/// unit, so 1048575 bytes prints as `"1.0 MiB"` rather than `"1024.0 KiB"`.
pub fn bytes_to_human(bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "KiB", "MiB", "GiB", "TiB"];
    const THRESHOLD: f64 = 1024.0;

    // Plain byte counts are shown as exact integers (this also covers 0).
    if bytes < 1024 {
        return format!("{} {}", bytes, UNITS[0]);
    }

    let last_unit = UNITS.len() - 1;
    // Precision loss in the cast is irrelevant at one decimal place.
    let mut size = bytes as f64;
    let mut unit_idx = 0;

    while size >= THRESHOLD && unit_idx < last_unit {
        size /= THRESHOLD;
        unit_idx += 1;
    }

    // One-decimal rounding can lift e.g. 1023.96 to "1024.0"; roll such values
    // over into the next unit instead.
    if unit_idx < last_unit && (size * 10.0).round() >= THRESHOLD * 10.0 {
        size /= THRESHOLD;
        unit_idx += 1;
    }

    format!("{:.1} {}", size, UNITS[unit_idx])
}
574
#[cfg(test)]
mod tests {
    use super::*;

    /// Extension lookup maps known extensions, ignores case, and falls back
    /// to `Unknown`.
    #[test]
    fn test_language_detection() {
        assert_eq!(Language::from_extension("rs"), Language::Rust);
        assert_eq!(Language::from_extension("py"), Language::Python);
        assert_eq!(Language::from_extension("js"), Language::JavaScript);
        assert_eq!(Language::from_extension("unknown"), Language::Unknown);

        // Lookup is case-insensitive.
        assert_eq!(Language::from_extension("RS"), Language::Rust);
        // The ambiguous `.m` extension resolves to Objective-C.
        assert_eq!(Language::from_extension("m"), Language::ObjectiveC);
    }

    /// Binary detection is driven purely by the extension table.
    #[test]
    fn test_binary_detection() {
        assert!(FileInfo::detect_binary_by_extension("png"));
        assert!(FileInfo::detect_binary_by_extension("exe"));
        assert!(FileInfo::detect_binary_by_extension("PNG"));
        assert!(!FileInfo::detect_binary_by_extension("rs"));
        assert!(!FileInfo::detect_binary_by_extension("py"));
        assert!(!FileInfo::detect_binary_by_extension(""));
    }

    /// Classification returns the exact variant, including its payload.
    #[test]
    fn test_file_type_classification() {
        assert_eq!(
            FileInfo::classify_file_type("src/lib.rs", &Language::Rust, "rs"),
            FileType::Source { language: Language::Rust }
        );

        assert_eq!(
            FileInfo::classify_file_type("src/test_lib.rs", &Language::Rust, "rs"),
            FileType::Test { language: Language::Rust }
        );

        assert_eq!(
            FileInfo::classify_file_type("README.md", &Language::Markdown, "md"),
            FileType::Documentation { format: DocumentationFormat::Markdown }
        );

        assert_eq!(
            FileInfo::classify_file_type("Cargo.toml", &Language::TOML, "toml"),
            FileType::Configuration { format: ConfigurationFormat::Toml }
        );
    }

    #[test]
    fn test_bytes_to_human() {
        assert_eq!(bytes_to_human(0), "0 B");
        assert_eq!(bytes_to_human(512), "512 B");
        assert_eq!(bytes_to_human(1023), "1023 B");
        assert_eq!(bytes_to_human(1024), "1.0 KiB");
        assert_eq!(bytes_to_human(1536), "1.5 KiB");
        assert_eq!(bytes_to_human(1048576), "1.0 MiB");
    }

    /// The estimate is the character count divided by four, rounded up.
    #[test]
    fn test_token_estimation() {
        // 28 characters -> 7 tokens.
        assert_eq!(FileInfo::estimate_tokens("Hello world, this is a test."), 7);
        assert_eq!(FileInfo::estimate_tokens("abcde"), 2);
        assert_eq!(FileInfo::estimate_tokens(""), 0);
    }

    #[test]
    fn test_render_decision() {
        let include = RenderDecision::include("valid file");
        assert!(include.should_include());
        assert_eq!(include.reason_category(), RenderDecisionCategory::Other);
        assert!(include.context.is_none());

        let exclude = RenderDecision::exclude("binary")
            .with_context("detected by extension");
        assert!(!exclude.should_include());
        assert_eq!(exclude.reason_category(), RenderDecisionCategory::Binary);
        assert_eq!(exclude.context.as_deref(), Some("detected by extension"));

        assert_eq!(
            RenderDecision::include("ok").reason_category(),
            RenderDecisionCategory::Ok
        );
    }
}
646}