scribe_core/
file.rs

//! File-related types and utilities.
//!
//! Provides comprehensive file metadata structures, language detection,
//! and file classification utilities for the Scribe analysis pipeline.
5
6use std::path::{Path, PathBuf};
7use std::time::SystemTime;
8use serde::{Deserialize, Serialize};
9
10use crate::error::{Result, ScribeError};
11
/// Dot-prefixed, lowercase extensions of files treated as binary content.
///
/// Files with one of these extensions are typically left out of text
/// analysis. The grouping below is for readability only.
pub const BINARY_EXTENSIONS: &[&str] = &[
    // Images
    ".png", ".jpg", ".jpeg", ".gif", ".webp",
    ".bmp", ".svg", ".ico", ".tiff",
    // Documents
    ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
    // Archives
    ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar",
    // Media
    ".mp3", ".mp4", ".mov", ".avi",
    ".mkv", ".wav", ".ogg", ".flac",
    // Fonts
    ".ttf", ".otf", ".eot", ".woff", ".woff2",
    // Executables and libraries
    ".so", ".dll", ".dylib", ".class",
    ".jar", ".exe", ".bin", ".app",
];
27
/// Dot-prefixed extensions recognised as Markdown files.
pub const MARKDOWN_EXTENSIONS: &[&str] = &[
    ".md",
    ".markdown",
    ".mdown",
    ".mkd",
    ".mkdn",
];
30
31/// Decision about whether to include a file in analysis
32#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
33pub struct RenderDecision {
34    /// Whether to include the file in analysis
35    pub include: bool,
36    /// Human-readable reason for the decision
37    pub reason: String,
38    /// Optional additional context
39    pub context: Option<String>,
40}
41
42impl RenderDecision {
43    /// Create a decision to include the file
44    pub fn include<S: Into<String>>(reason: S) -> Self {
45        Self {
46            include: true,
47            reason: reason.into(),
48            context: None,
49        }
50    }
51
52    /// Create a decision to exclude the file
53    pub fn exclude<S: Into<String>>(reason: S) -> Self {
54        Self {
55            include: false,
56            reason: reason.into(),
57            context: None,
58        }
59    }
60
61    /// Add context to the decision
62    pub fn with_context<S: Into<String>>(mut self, context: S) -> Self {
63        self.context = Some(context.into());
64        self
65    }
66
67    /// Check if the file should be included
68    pub fn should_include(&self) -> bool {
69        self.include
70    }
71
72    /// Get the reason as a standard category
73    pub fn reason_category(&self) -> RenderDecisionCategory {
74        match self.reason.as_str() {
75            "ok" => RenderDecisionCategory::Ok,
76            "binary" => RenderDecisionCategory::Binary,
77            "too_large" => RenderDecisionCategory::TooLarge,
78            "ignored" => RenderDecisionCategory::Ignored,
79            "empty" => RenderDecisionCategory::Empty,
80            _ => RenderDecisionCategory::Other,
81        }
82    }
83}
84
85/// Standard categories for render decisions
86#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
87pub enum RenderDecisionCategory {
88    Ok,
89    Binary,
90    TooLarge,
91    Ignored,
92    Empty,
93    Other,
94}
95
96/// Programming language classification
97#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
98pub enum Language {
99    // Systems languages
100    Rust,
101    C,
102    Cpp,
103    Go,
104    Zig,
105    
106    // Web languages
107    JavaScript,
108    TypeScript,
109    HTML,
110    CSS,
111    SCSS,
112    SASS,
113    
114    // Backend languages
115    Python,
116    Java,
117    CSharp,
118    Kotlin,
119    Scala,
120    Ruby,
121    PHP,
122    
123    // Functional languages
124    Haskell,
125    OCaml,
126    FSharp,
127    Erlang,
128    Elixir,
129    Clojure,
130    
131    // Configuration and markup
132    JSON,
133    YAML,
134    TOML,
135    XML,
136    Markdown,
137    
138    // Database
139    SQL,
140    
141    // Shell and scripts
142    Bash,
143    PowerShell,
144    Batch,
145    
146    // Data science
147    R,
148    Julia,
149    Matlab,
150    
151    // Mobile
152    Swift,
153    ObjectiveC,
154    Dart,
155    
156    // Other
157    Unknown,
158}
159
160impl Language {
161    /// Detect language from file extension
162    pub fn from_extension(ext: &str) -> Self {
163        match ext.to_lowercase().as_str() {
164            "rs" => Language::Rust,
165            "c" | "h" => Language::C,
166            "cpp" | "cxx" | "cc" | "hpp" | "hxx" => Language::Cpp,
167            "go" => Language::Go,
168            "zig" => Language::Zig,
169            "js" | "mjs" | "cjs" => Language::JavaScript,
170            "ts" | "mts" | "cts" => Language::TypeScript,
171            "html" | "htm" => Language::HTML,
172            "css" => Language::CSS,
173            "scss" => Language::SCSS,
174            "sass" => Language::SASS,
175            "py" | "pyi" | "pyw" => Language::Python,
176            "java" => Language::Java,
177            "cs" => Language::CSharp,
178            "kt" | "kts" => Language::Kotlin,
179            "scala" | "sc" => Language::Scala,
180            "rb" => Language::Ruby,
181            "php" => Language::PHP,
182            "hs" | "lhs" => Language::Haskell,
183            "ml" | "mli" => Language::OCaml,
184            "fs" | "fsi" | "fsx" => Language::FSharp,
185            "erl" | "hrl" => Language::Erlang,
186            "ex" | "exs" => Language::Elixir,
187            "clj" | "cljs" | "cljc" => Language::Clojure,
188            "json" => Language::JSON,
189            "yaml" | "yml" => Language::YAML,
190            "toml" => Language::TOML,
191            "xml" => Language::XML,
192            "md" | "markdown" | "mdown" | "mkd" | "mkdn" => Language::Markdown,
193            "sql" => Language::SQL,
194            "sh" | "bash" => Language::Bash,
195            "ps1" | "psm1" | "psd1" => Language::PowerShell,
196            "bat" | "cmd" => Language::Batch,
197            "r" => Language::R,
198            "jl" => Language::Julia,
199            "swift" => Language::Swift,
200            "dart" => Language::Dart,
201            // Handle ambiguous .m extension - could be Matlab or Objective-C
202            // Default to Objective-C as it's more common in modern development
203            "m" | "mm" => Language::ObjectiveC,
204            _ => Language::Unknown,
205        }
206    }
207
208    /// Check if this language is typically used for documentation
209    pub fn is_documentation(&self) -> bool {
210        matches!(self, Language::Markdown | Language::HTML)
211    }
212
213    /// Check if this language is typically used for configuration
214    pub fn is_configuration(&self) -> bool {
215        matches!(
216            self,
217            Language::JSON | Language::YAML | Language::TOML | Language::XML
218        )
219    }
220
221    /// Check if this is a programming language (not markup/config)
222    pub fn is_programming(&self) -> bool {
223        !matches!(
224            self,
225            Language::Markdown
226                | Language::HTML
227                | Language::JSON
228                | Language::YAML
229                | Language::TOML
230                | Language::XML
231                | Language::Unknown
232        )
233    }
234
235    /// Get the typical file extensions for this language
236    pub fn extensions(&self) -> &'static [&'static str] {
237        match self {
238            Language::Rust => &["rs"],
239            Language::C => &["c", "h"],
240            Language::Cpp => &["cpp", "cxx", "cc", "hpp", "hxx"],
241            Language::Go => &["go"],
242            Language::Zig => &["zig"],
243            Language::JavaScript => &["js", "mjs", "cjs"],
244            Language::TypeScript => &["ts", "mts", "cts"],
245            Language::HTML => &["html", "htm"],
246            Language::CSS => &["css"],
247            Language::SCSS => &["scss"],
248            Language::SASS => &["sass"],
249            Language::Python => &["py", "pyi", "pyw"],
250            Language::Java => &["java"],
251            Language::CSharp => &["cs"],
252            Language::Kotlin => &["kt", "kts"],
253            Language::Scala => &["scala", "sc"],
254            Language::Ruby => &["rb"],
255            Language::PHP => &["php"],
256            Language::Haskell => &["hs", "lhs"],
257            Language::OCaml => &["ml", "mli"],
258            Language::FSharp => &["fs", "fsi", "fsx"],
259            Language::Erlang => &["erl", "hrl"],
260            Language::Elixir => &["ex", "exs"],
261            Language::Clojure => &["clj", "cljs", "cljc"],
262            Language::JSON => &["json"],
263            Language::YAML => &["yaml", "yml"],
264            Language::TOML => &["toml"],
265            Language::XML => &["xml"],
266            Language::Markdown => &["md", "markdown", "mdown", "mkd", "mkdn"],
267            Language::SQL => &["sql"],
268            Language::Bash => &["sh", "bash"],
269            Language::PowerShell => &["ps1", "psm1", "psd1"],
270            Language::Batch => &["bat", "cmd"],
271            Language::R => &["r"],
272            Language::Julia => &["jl"],
273            Language::Matlab => &["m"], // Note: .m conflicts with Objective-C
274            Language::Swift => &["swift"],
275            Language::ObjectiveC => &["m", "mm"],
276            Language::Dart => &["dart"],
277            Language::Unknown => &[],
278        }
279    }
280}
281
282/// File type classification for analysis purposes
283#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
284pub enum FileType {
285    /// Source code files
286    Source { language: Language },
287    /// Documentation files
288    Documentation { format: DocumentationFormat },
289    /// Configuration files
290    Configuration { format: ConfigurationFormat },
291    /// Test files
292    Test { language: Language },
293    /// Binary files that should be excluded
294    Binary,
295    /// Generated or built files
296    Generated,
297    /// Unknown or unclassified
298    Unknown,
299}
300
301/// Documentation format classification
302#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
303pub enum DocumentationFormat {
304    Markdown,
305    Html,
306    PlainText,
307    Rst,
308    Asciidoc,
309}
310
311/// Configuration format classification
312#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
313pub enum ConfigurationFormat {
314    Json,
315    Yaml,
316    Toml,
317    Xml,
318    Ini,
319    Dotenv,
320}
321
322/// Comprehensive file metadata structure
323#[derive(Debug, Clone, Serialize, Deserialize)]
324pub struct FileInfo {
325    /// Absolute path to the file on disk
326    pub path: PathBuf,
327    
328    /// Path relative to repository root (forward slash separated)
329    pub relative_path: String,
330    
331    /// File size in bytes
332    pub size: u64,
333    
334    /// File modification time
335    pub modified: Option<SystemTime>,
336    
337    /// Analysis decision (include/exclude)
338    pub decision: RenderDecision,
339    
340    /// Detected file type
341    pub file_type: FileType,
342    
343    /// Detected programming language
344    pub language: Language,
345    
346    /// File content (loaded on demand)
347    pub content: Option<String>,
348    
349    /// Estimated token count for LLM processing
350    pub token_estimate: Option<usize>,
351    
352    /// Line count (if text file)
353    pub line_count: Option<usize>,
354    
355    /// Character count (if text file)
356    pub char_count: Option<usize>,
357    
358    /// Whether the file is likely binary
359    pub is_binary: bool,
360    
361    /// Git status information (if available)
362    pub git_status: Option<GitStatus>,
363    
364    /// PageRank centrality score (0.0-1.0, higher means more important)
365    pub centrality_score: Option<f64>,
366}
367
368/// Git status information for a file
369#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
370pub struct GitStatus {
371    /// Working tree status
372    pub working_tree: GitFileStatus,
373    /// Index/staging area status
374    pub index: GitFileStatus,
375}
376
377/// Git file status
378#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
379pub enum GitFileStatus {
380    Unmodified,
381    Modified,
382    Added,
383    Deleted,
384    Renamed,
385    Copied,
386    Unmerged,
387    Untracked,
388    Ignored,
389}
390
391impl FileInfo {
392    /// Create a new FileInfo from a path
393    pub fn new<P: AsRef<Path>>(
394        path: P,
395        relative_path: String,
396        decision: RenderDecision,
397    ) -> Result<Self> {
398        let path = path.as_ref();
399        let metadata = std::fs::metadata(path)
400            .map_err(|e| ScribeError::path_with_source("Failed to read file metadata", path, e))?;
401
402        let size = metadata.len();
403        let modified = metadata.modified().ok();
404        
405        let extension = path
406            .extension()
407            .and_then(|ext| ext.to_str())
408            .unwrap_or("");
409        
410        let language = Language::from_extension(extension);
411        let is_binary = Self::detect_binary_by_extension(extension);
412        let file_type = Self::classify_file_type(&relative_path, &language, extension);
413
414        Ok(Self {
415            path: path.to_path_buf(),
416            relative_path,
417            size,
418            modified,
419            decision,
420            file_type,
421            language,
422            content: None,
423            token_estimate: None,
424            line_count: None,
425            char_count: None,
426            is_binary,
427            git_status: None,
428            centrality_score: None,
429        })
430    }
431
432    /// Load file content and compute statistics
433    pub fn load_content(&mut self) -> Result<()> {
434        if self.is_binary || !self.decision.should_include() {
435            return Ok(());
436        }
437
438        let content = std::fs::read_to_string(&self.path)
439            .map_err(|e| ScribeError::analysis(
440                format!("Failed to read file content: {}", e), 
441                &self.path
442            ))?;
443
444        // Compute statistics
445        let line_count = content.lines().count();
446        let char_count = content.chars().count();
447        let token_estimate = Self::estimate_tokens(&content);
448
449        self.content = Some(content);
450        self.line_count = Some(line_count);
451        self.char_count = Some(char_count);
452        self.token_estimate = Some(token_estimate);
453
454        Ok(())
455    }
456
457    /// Estimate token count for LLM processing using tiktoken
458    /// 
459    /// This method uses the shared global TokenCounter instance for optimal performance.
460    /// If tiktoken fails, it falls back to the legacy character-based estimation.
461    pub fn estimate_tokens(content: &str) -> usize {
462        use crate::tokenization::{TokenCounter, utils};
463        
464        // Use the shared global instance for optimal performance
465        match TokenCounter::global().count_tokens(content) {
466            Ok(tokens) => tokens,
467            Err(_) => {
468                // Fall back to legacy estimation if tiktoken fails
469                utils::estimate_tokens_legacy(content)
470            }
471        }
472    }
473
474    /// Estimate token count for LLM processing with file context
475    /// 
476    /// This method uses the file path to apply language-specific multipliers
477    /// for more accurate token estimation.
478    pub fn estimate_tokens_with_path(content: &str, file_path: &std::path::Path) -> usize {
479        use crate::tokenization::TokenCounter;
480        
481        // Use the shared global instance for optimal performance
482        match TokenCounter::global().estimate_file_tokens(content, file_path) {
483            Ok(tokens) => tokens,
484            Err(_) => Self::estimate_tokens(content), // Fall back to basic estimation
485        }
486    }
487
488    /// Check if file extension indicates binary content
489    pub fn detect_binary_by_extension(extension: &str) -> bool {
490        BINARY_EXTENSIONS.contains(&format!(".{}", extension.to_lowercase()).as_str())
491    }
492
493    /// Classify file type based on path and language
494    pub fn classify_file_type(path: &str, language: &Language, extension: &str) -> FileType {
495        let path_lower = path.to_lowercase();
496        
497        // Test files
498        if path_lower.contains("test") || path_lower.contains("spec") {
499            return FileType::Test { 
500                language: language.clone() 
501            };
502        }
503
504        // Documentation
505        if language.is_documentation() {
506            let format = match extension {
507                "md" | "markdown" => DocumentationFormat::Markdown,
508                "html" | "htm" => DocumentationFormat::Html,
509                "rst" => DocumentationFormat::Rst,
510                "txt" => DocumentationFormat::PlainText,
511                _ => DocumentationFormat::Markdown,
512            };
513            return FileType::Documentation { format };
514        }
515
516        // Configuration
517        if language.is_configuration() {
518            let format = match extension {
519                "json" => ConfigurationFormat::Json,
520                "yaml" | "yml" => ConfigurationFormat::Yaml,
521                "toml" => ConfigurationFormat::Toml,
522                "xml" => ConfigurationFormat::Xml,
523                "ini" => ConfigurationFormat::Ini,
524                "env" => ConfigurationFormat::Dotenv,
525                _ => ConfigurationFormat::Json,
526            };
527            return FileType::Configuration { format };
528        }
529
530        // Binary files
531        if Self::detect_binary_by_extension(extension) {
532            return FileType::Binary;
533        }
534
535        // Generated files (common patterns)
536        if path_lower.contains("generated") || 
537           path_lower.contains("build") ||
538           path_lower.contains("dist") ||
539           path_lower.contains("target") {
540            return FileType::Generated;
541        }
542
543        // Source code
544        if language.is_programming() {
545            return FileType::Source { 
546                language: language.clone() 
547            };
548        }
549
550        FileType::Unknown
551    }
552
553    /// Get human-readable size
554    pub fn human_size(&self) -> String {
555        bytes_to_human(self.size)
556    }
557
558    /// Check if file should be included in analysis
559    pub fn should_include(&self) -> bool {
560        self.decision.should_include()
561    }
562
563    /// Get file name (last component of path)
564    pub fn file_name(&self) -> Option<&str> {
565        self.path.file_name()?.to_str()
566    }
567
568    /// Get file stem (name without extension)
569    pub fn file_stem(&self) -> Option<&str> {
570        self.path.file_stem()?.to_str()
571    }
572
573    /// Get file extension
574    pub fn extension(&self) -> Option<&str> {
575        self.path.extension()?.to_str()
576    }
577}
578
/// Formats a byte count using binary units (`B`, `KiB`, `MiB`, `GiB`, `TiB`).
///
/// Values below 1 KiB are printed as exact integers (`"512 B"`); larger
/// values are printed with one decimal place (`"1.5 KiB"`). `TiB` is the
/// largest unit, so bigger values are expressed as large `TiB` counts.
///
/// When rounding to one decimal would display `1024.0` of a unit, the value
/// is shown as `1.0` of the next unit instead (1 048 575 bytes is
/// `"1.0 MiB"`, not `"1024.0 KiB"`).
pub fn bytes_to_human(bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "KiB", "MiB", "GiB", "TiB"];
    const THRESHOLD: f64 = 1024.0;

    let last_unit = UNITS.len() - 1;
    let mut size = bytes as f64;
    let mut unit_idx = 0;

    while size >= THRESHOLD && unit_idx < last_unit {
        size /= THRESHOLD;
        unit_idx += 1;
    }

    if unit_idx == 0 {
        // Plain bytes (including zero) are printed without a fraction.
        return format!("{} {}", bytes, UNITS[unit_idx]);
    }

    let mut rendered = format!("{:.1}", size);
    // `size` is below 1024 here (unless already at the last unit), but
    // rounding to one decimal can still carry it up to "1024.0". That is
    // exactly 1.0 of the next unit, so promote it when a next unit exists.
    if rendered == "1024.0" && unit_idx < last_unit {
        rendered = String::from("1.0");
        unit_idx += 1;
    }

    format!("{} {}", rendered, UNITS[unit_idx])
}
602
#[cfg(test)]
mod tests {
    use super::*;

    /// Known extensions map to their language; anything else is `Unknown`.
    #[test]
    fn test_language_detection() {
        let expectations = [
            ("rs", Language::Rust),
            ("py", Language::Python),
            ("js", Language::JavaScript),
            ("unknown", Language::Unknown),
        ];

        for (ext, expected) in expectations.iter() {
            assert_eq!(&Language::from_extension(ext), expected);
        }
    }

    /// Binary detection accepts listed extensions and rejects source ones.
    #[test]
    fn test_binary_detection() {
        for ext in ["png", "exe"].iter() {
            assert!(FileInfo::detect_binary_by_extension(ext));
        }
        for ext in ["rs", "py"].iter() {
            assert!(!FileInfo::detect_binary_by_extension(ext));
        }
    }

    /// Ordinary source paths are classified as `Source`.
    #[test]
    fn test_file_type_classification() {
        let rust = Language::Rust;
        let python = Language::Python;
        let markdown = Language::Markdown;

        let source_cases = [
            ("src/lib.rs", &rust, "rs"),
            ("scribe-rs/src/lib.rs", &rust, "rs"),
            ("script.py", &python, "py"),
        ];
        for (path, language, ext) in source_cases.iter() {
            let classified = FileInfo::classify_file_type(path, language, ext);
            assert!(matches!(classified, FileType::Source { .. }));
        }

        // `is_programming` separates code from markup.
        assert!(rust.is_programming());
        assert!(python.is_programming());
        assert!(!markdown.is_programming());
    }

    /// Full pipeline: extension -> language -> file type.
    #[test]
    fn test_integration_file_classification() {
        // Rust
        let rust = Language::from_extension("rs");
        assert_eq!(rust, Language::Rust);
        assert!(rust.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("src/lib.rs", &rust, "rs"),
            FileType::Source { .. }
        ));

        // Python
        let python = Language::from_extension("py");
        assert_eq!(python, Language::Python);
        assert!(python.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("script.py", &python, "py"),
            FileType::Source { .. }
        ));

        // An unknown language must not be reported as source code.
        let unknown = Language::from_extension("xyz");
        assert_eq!(unknown, Language::Unknown);
        assert!(!unknown.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("file.xyz", &unknown, "xyz"),
            FileType::Unknown
        ));

        // Markdown
        let markdown = Language::from_extension("md");
        assert_eq!(markdown, Language::Markdown);
        assert!(!markdown.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("README.md", &markdown, "md"),
            FileType::Documentation { .. }
        ));
    }

    /// Human-readable sizes use binary units with one decimal place.
    #[test]
    fn test_bytes_to_human() {
        let expectations: [(u64, &str); 5] = [
            (0, "0 B"),
            (512, "512 B"),
            (1024, "1.0 KiB"),
            (1536, "1.5 KiB"),
            (1048576, "1.0 MiB"),
        ];

        for (bytes, expected) in expectations.iter() {
            assert_eq!(bytes_to_human(*bytes), *expected);
        }
    }

    /// A short sentence yields a small, non-zero token estimate.
    #[test]
    fn test_token_estimation() {
        let tokens = FileInfo::estimate_tokens("Hello world, this is a test.");
        assert!(tokens > 0);
        assert!(tokens < 20); // Should be a reasonable estimate
    }

    /// Constructors, context and category mapping of `RenderDecision`.
    #[test]
    fn test_render_decision() {
        let included = RenderDecision::include("valid file");
        assert!(included.should_include());
        assert_eq!(included.reason_category(), RenderDecisionCategory::Other);

        let excluded = RenderDecision::exclude("binary").with_context("detected by extension");
        assert!(!excluded.should_include());
        assert_eq!(excluded.reason_category(), RenderDecisionCategory::Binary);
        assert!(excluded.context.is_some());
    }
}