scribe_core/
file.rs

1//! File-related types and utilities.
2//!
3//! Provides comprehensive file metadata structures, language detection,
4//! and file classification utilities for the Scribe analysis pipeline.
5
6use serde::{Deserialize, Serialize};
7use std::path::{Path, PathBuf};
8use std::time::SystemTime;
9
10use crate::error::{Result, ScribeError};
11
/// Dot-prefixed, lowercase file extensions treated as binary content and
/// therefore normally left out of text analysis.
pub const BINARY_EXTENSIONS: &[&str] = &[
    // Images
    ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".svg", ".ico", ".tiff",
    // Documents
    ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
    // Archives
    ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar",
    // Media
    ".mp3", ".mp4", ".mov", ".avi", ".mkv", ".wav", ".ogg", ".flac",
    // Fonts
    ".ttf", ".otf", ".eot", ".woff", ".woff2",
    // Executables and libraries
    ".so", ".dll", ".dylib", ".class", ".jar", ".exe", ".bin", ".app",
];
23
/// Dot-prefixed file extensions recognised as Markdown.
pub const MARKDOWN_EXTENSIONS: &[&str] = &[
    // Common spellings
    ".md",
    ".markdown",
    // Less common variants
    ".mdown",
    ".mkd",
    ".mkdn",
];
26
/// Decision about whether to include a file in analysis.
///
/// Values are normally built with `RenderDecision::include` or
/// `RenderDecision::exclude` and optionally annotated through
/// `RenderDecision::with_context`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RenderDecision {
    /// Whether to include the file in analysis
    pub include: bool,
    /// Human-readable reason for the decision.
    ///
    /// The exact strings `"ok"`, `"binary"`, `"too_large"`, `"ignored"` and
    /// `"empty"` are recognised by `reason_category`; any other text is
    /// reported as `RenderDecisionCategory::Other`.
    pub reason: String,
    /// Optional additional context (`None` unless set via `with_context`)
    pub context: Option<String>,
}
37
38impl RenderDecision {
39    /// Create a decision to include the file
40    pub fn include<S: Into<String>>(reason: S) -> Self {
41        Self {
42            include: true,
43            reason: reason.into(),
44            context: None,
45        }
46    }
47
48    /// Create a decision to exclude the file
49    pub fn exclude<S: Into<String>>(reason: S) -> Self {
50        Self {
51            include: false,
52            reason: reason.into(),
53            context: None,
54        }
55    }
56
57    /// Add context to the decision
58    pub fn with_context<S: Into<String>>(mut self, context: S) -> Self {
59        self.context = Some(context.into());
60        self
61    }
62
63    /// Check if the file should be included
64    pub fn should_include(&self) -> bool {
65        self.include
66    }
67
68    /// Get the reason as a standard category
69    pub fn reason_category(&self) -> RenderDecisionCategory {
70        match self.reason.as_str() {
71            "ok" => RenderDecisionCategory::Ok,
72            "binary" => RenderDecisionCategory::Binary,
73            "too_large" => RenderDecisionCategory::TooLarge,
74            "ignored" => RenderDecisionCategory::Ignored,
75            "empty" => RenderDecisionCategory::Empty,
76            _ => RenderDecisionCategory::Other,
77        }
78    }
79}
80
/// Standard categories for render decisions.
///
/// Produced by `RenderDecision::reason_category`, which maps the decision's
/// reason string onto one of these variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum RenderDecisionCategory {
    /// Reason string was `"ok"`
    Ok,
    /// Reason string was `"binary"`
    Binary,
    /// Reason string was `"too_large"`
    TooLarge,
    /// Reason string was `"ignored"`
    Ignored,
    /// Reason string was `"empty"`
    Empty,
    /// Any reason string not listed above
    Other,
}
91
/// Programming language classification.
///
/// Values are usually obtained from `Language::from_extension`. The derived
/// `PartialOrd`/`Ord` follow declaration order, so reordering variants changes
/// how languages compare and sort.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum Language {
    // Systems languages
    Rust,
    C,
    Cpp,
    Go,
    Zig,

    // Web languages
    JavaScript,
    TypeScript,
    HTML,
    CSS,
    SCSS,
    SASS,

    // Backend languages
    Python,
    Java,
    CSharp,
    Kotlin,
    Scala,
    Ruby,
    PHP,

    // Functional languages
    Haskell,
    OCaml,
    FSharp,
    Erlang,
    Elixir,
    Clojure,

    // Configuration and markup
    JSON,
    YAML,
    TOML,
    XML,
    Markdown,

    // Database
    SQL,

    // Shell and scripts
    Bash,
    PowerShell,
    Batch,

    // Data science
    R,
    Julia,
    // Never returned by `from_extension`: the ambiguous `.m` extension is
    // resolved to `ObjectiveC` there.
    Matlab,

    // Mobile
    Swift,
    ObjectiveC,
    Dart,

    // Other
    /// Fallback for extensions that `from_extension` does not recognise
    Unknown,
}
155
156impl Language {
157    /// Detect language from file extension
158    pub fn from_extension(ext: &str) -> Self {
159        match ext.to_lowercase().as_str() {
160            "rs" => Language::Rust,
161            "c" | "h" => Language::C,
162            "cpp" | "cxx" | "cc" | "hpp" | "hxx" => Language::Cpp,
163            "go" => Language::Go,
164            "zig" => Language::Zig,
165            "js" | "mjs" | "cjs" => Language::JavaScript,
166            "ts" | "mts" | "cts" => Language::TypeScript,
167            "html" | "htm" => Language::HTML,
168            "css" => Language::CSS,
169            "scss" => Language::SCSS,
170            "sass" => Language::SASS,
171            "py" | "pyi" | "pyw" => Language::Python,
172            "java" => Language::Java,
173            "cs" => Language::CSharp,
174            "kt" | "kts" => Language::Kotlin,
175            "scala" | "sc" => Language::Scala,
176            "rb" => Language::Ruby,
177            "php" => Language::PHP,
178            "hs" | "lhs" => Language::Haskell,
179            "ml" | "mli" => Language::OCaml,
180            "fs" | "fsi" | "fsx" => Language::FSharp,
181            "erl" | "hrl" => Language::Erlang,
182            "ex" | "exs" => Language::Elixir,
183            "clj" | "cljs" | "cljc" => Language::Clojure,
184            "json" => Language::JSON,
185            "yaml" | "yml" => Language::YAML,
186            "toml" => Language::TOML,
187            "xml" => Language::XML,
188            "md" | "markdown" | "mdown" | "mkd" | "mkdn" => Language::Markdown,
189            "sql" => Language::SQL,
190            "sh" | "bash" => Language::Bash,
191            "ps1" | "psm1" | "psd1" => Language::PowerShell,
192            "bat" | "cmd" => Language::Batch,
193            "r" => Language::R,
194            "jl" => Language::Julia,
195            "swift" => Language::Swift,
196            "dart" => Language::Dart,
197            // Handle ambiguous .m extension - could be Matlab or Objective-C
198            // Default to Objective-C as it's more common in modern development
199            "m" | "mm" => Language::ObjectiveC,
200            _ => Language::Unknown,
201        }
202    }
203
204    /// Check if this language is typically used for documentation
205    pub fn is_documentation(&self) -> bool {
206        matches!(self, Language::Markdown | Language::HTML)
207    }
208
209    /// Check if this language is typically used for configuration
210    pub fn is_configuration(&self) -> bool {
211        matches!(
212            self,
213            Language::JSON | Language::YAML | Language::TOML | Language::XML
214        )
215    }
216
217    /// Check if this is a programming language (not markup/config)
218    pub fn is_programming(&self) -> bool {
219        !matches!(
220            self,
221            Language::Markdown
222                | Language::HTML
223                | Language::JSON
224                | Language::YAML
225                | Language::TOML
226                | Language::XML
227                | Language::Unknown
228        )
229    }
230
231    /// Get the typical file extensions for this language
232    pub fn extensions(&self) -> &'static [&'static str] {
233        match self {
234            Language::Rust => &["rs"],
235            Language::C => &["c", "h"],
236            Language::Cpp => &["cpp", "cxx", "cc", "hpp", "hxx"],
237            Language::Go => &["go"],
238            Language::Zig => &["zig"],
239            Language::JavaScript => &["js", "mjs", "cjs"],
240            Language::TypeScript => &["ts", "mts", "cts"],
241            Language::HTML => &["html", "htm"],
242            Language::CSS => &["css"],
243            Language::SCSS => &["scss"],
244            Language::SASS => &["sass"],
245            Language::Python => &["py", "pyi", "pyw"],
246            Language::Java => &["java"],
247            Language::CSharp => &["cs"],
248            Language::Kotlin => &["kt", "kts"],
249            Language::Scala => &["scala", "sc"],
250            Language::Ruby => &["rb"],
251            Language::PHP => &["php"],
252            Language::Haskell => &["hs", "lhs"],
253            Language::OCaml => &["ml", "mli"],
254            Language::FSharp => &["fs", "fsi", "fsx"],
255            Language::Erlang => &["erl", "hrl"],
256            Language::Elixir => &["ex", "exs"],
257            Language::Clojure => &["clj", "cljs", "cljc"],
258            Language::JSON => &["json"],
259            Language::YAML => &["yaml", "yml"],
260            Language::TOML => &["toml"],
261            Language::XML => &["xml"],
262            Language::Markdown => &["md", "markdown", "mdown", "mkd", "mkdn"],
263            Language::SQL => &["sql"],
264            Language::Bash => &["sh", "bash"],
265            Language::PowerShell => &["ps1", "psm1", "psd1"],
266            Language::Batch => &["bat", "cmd"],
267            Language::R => &["r"],
268            Language::Julia => &["jl"],
269            Language::Matlab => &["m"], // Note: .m conflicts with Objective-C
270            Language::Swift => &["swift"],
271            Language::ObjectiveC => &["m", "mm"],
272            Language::Dart => &["dart"],
273            Language::Unknown => &[],
274        }
275    }
276}
277
/// File type classification for analysis purposes.
///
/// Produced by `FileInfo::classify_file_type`, which checks the categories in
/// this order: test, documentation, configuration, binary, generated, source,
/// and finally `Unknown`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FileType {
    /// Source code files
    Source { language: Language },
    /// Documentation files
    Documentation { format: DocumentationFormat },
    /// Configuration files
    Configuration { format: ConfigurationFormat },
    /// Test files
    Test { language: Language },
    /// Binary files that should be excluded
    Binary,
    /// Generated or built files
    Generated,
    /// Unknown or unclassified
    Unknown,
}
296
/// Documentation format classification.
///
/// Carried by `FileType::Documentation`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DocumentationFormat {
    /// Selected for the `md` / `markdown` extensions
    Markdown,
    /// Selected for the `html` / `htm` extensions
    Html,
    /// Selected for the `txt` extension
    PlainText,
    /// Selected for the `rst` extension
    Rst,
    /// Presumably AsciiDoc; no extension is mapped to it in
    /// `FileInfo::classify_file_type` (TODO confirm intended usage)
    Asciidoc,
}
306
/// Configuration format classification.
///
/// Carried by `FileType::Configuration`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ConfigurationFormat {
    /// Selected for the `json` extension
    Json,
    /// Selected for the `yaml` / `yml` extensions
    Yaml,
    /// Selected for the `toml` extension
    Toml,
    /// Selected for the `xml` extension
    Xml,
    /// Selected for the `ini` extension
    Ini,
    /// Selected for the `env` extension
    Dotenv,
}
317
/// Comprehensive file metadata structure.
///
/// `FileInfo::new` fills in the filesystem- and extension-derived fields;
/// the content statistics stay `None` until `FileInfo::load_content` runs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileInfo {
    /// Absolute path to the file on disk
    pub path: PathBuf,

    /// Path relative to repository root (forward slash separated)
    pub relative_path: String,

    /// File size in bytes
    pub size: u64,

    /// File modification time (`None` when it could not be read)
    pub modified: Option<SystemTime>,

    /// Analysis decision (include/exclude)
    pub decision: RenderDecision,

    /// Detected file type
    pub file_type: FileType,

    /// Detected programming language
    pub language: Language,

    /// File content (loaded on demand by `load_content`; skipped for binary
    /// or excluded files)
    pub content: Option<String>,

    /// Estimated token count for LLM processing
    pub token_estimate: Option<usize>,

    /// Line count (if text file)
    pub line_count: Option<usize>,

    /// Character count (if text file)
    pub char_count: Option<usize>,

    /// Whether the file is likely binary (decided from the extension in `new`)
    pub is_binary: bool,

    /// Git status information (if available); `new` leaves this as `None`
    pub git_status: Option<GitStatus>,

    /// PageRank centrality score (0.0-1.0, higher means more important);
    /// `new` leaves this as `None`
    pub centrality_score: Option<f64>,
}
363
/// Git status information for a file.
///
/// Tracks the working tree and the index separately, each as a
/// `GitFileStatus`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct GitStatus {
    /// Working tree status
    pub working_tree: GitFileStatus,
    /// Index/staging area status
    pub index: GitFileStatus,
}
372
/// Git file status.
///
/// Used for both halves of `GitStatus`.
// NOTE(review): the variant names appear to mirror the states reported by
// `git status`; the code that populates them is not in this file, so the exact
// mapping should be confirmed there.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum GitFileStatus {
    Unmodified,
    Modified,
    Added,
    Deleted,
    Renamed,
    Copied,
    Unmerged,
    Untracked,
    Ignored,
}
386
387impl FileInfo {
388    /// Create a new FileInfo from a path
389    pub fn new<P: AsRef<Path>>(
390        path: P,
391        relative_path: String,
392        decision: RenderDecision,
393    ) -> Result<Self> {
394        let path = path.as_ref();
395        let metadata = std::fs::metadata(path)
396            .map_err(|e| ScribeError::path_with_source("Failed to read file metadata", path, e))?;
397
398        let size = metadata.len();
399        let modified = metadata.modified().ok();
400
401        let extension = path.extension().and_then(|ext| ext.to_str()).unwrap_or("");
402
403        let language = Language::from_extension(extension);
404        let is_binary = Self::detect_binary_by_extension(extension);
405        let file_type = Self::classify_file_type(&relative_path, &language, extension);
406
407        Ok(Self {
408            path: path.to_path_buf(),
409            relative_path,
410            size,
411            modified,
412            decision,
413            file_type,
414            language,
415            content: None,
416            token_estimate: None,
417            line_count: None,
418            char_count: None,
419            is_binary,
420            git_status: None,
421            centrality_score: None,
422        })
423    }
424
425    /// Load file content and compute statistics
426    pub fn load_content(&mut self) -> Result<()> {
427        if self.is_binary || !self.decision.should_include() {
428            return Ok(());
429        }
430
431        let content = std::fs::read_to_string(&self.path).map_err(|e| {
432            ScribeError::analysis(format!("Failed to read file content: {}", e), &self.path)
433        })?;
434
435        // Compute statistics
436        let line_count = content.lines().count();
437        let char_count = content.chars().count();
438        let token_estimate = Self::estimate_tokens(&content);
439
440        self.content = Some(content);
441        self.line_count = Some(line_count);
442        self.char_count = Some(char_count);
443        self.token_estimate = Some(token_estimate);
444
445        Ok(())
446    }
447
448    /// Estimate token count for LLM processing using tiktoken
449    ///
450    /// This method uses the shared global TokenCounter instance for optimal performance.
451    /// If tiktoken fails, it falls back to the legacy character-based estimation.
452    pub fn estimate_tokens(content: &str) -> usize {
453        use crate::tokenization::{utils, TokenCounter};
454
455        // Use the shared global instance for optimal performance
456        match TokenCounter::global().count_tokens(content) {
457            Ok(tokens) => tokens,
458            Err(_) => {
459                // Fall back to legacy estimation if tiktoken fails
460                utils::estimate_tokens_legacy(content)
461            }
462        }
463    }
464
465    /// Estimate token count for LLM processing with file context
466    ///
467    /// This method uses the file path to apply language-specific multipliers
468    /// for more accurate token estimation.
469    pub fn estimate_tokens_with_path(content: &str, file_path: &std::path::Path) -> usize {
470        use crate::tokenization::TokenCounter;
471
472        // Use the shared global instance for optimal performance
473        match TokenCounter::global().estimate_file_tokens(content, file_path) {
474            Ok(tokens) => tokens,
475            Err(_) => Self::estimate_tokens(content), // Fall back to basic estimation
476        }
477    }
478
479    /// Check if file extension indicates binary content
480    pub fn detect_binary_by_extension(extension: &str) -> bool {
481        BINARY_EXTENSIONS.contains(&format!(".{}", extension.to_lowercase()).as_str())
482    }
483
484    /// Classify file type based on path and language
485    pub fn classify_file_type(path: &str, language: &Language, extension: &str) -> FileType {
486        let path_lower = path.to_lowercase();
487
488        // Test files
489        if path_lower.contains("test") || path_lower.contains("spec") {
490            return FileType::Test {
491                language: language.clone(),
492            };
493        }
494
495        // Documentation
496        if language.is_documentation() {
497            let format = match extension {
498                "md" | "markdown" => DocumentationFormat::Markdown,
499                "html" | "htm" => DocumentationFormat::Html,
500                "rst" => DocumentationFormat::Rst,
501                "txt" => DocumentationFormat::PlainText,
502                _ => DocumentationFormat::Markdown,
503            };
504            return FileType::Documentation { format };
505        }
506
507        // Configuration
508        if language.is_configuration() {
509            let format = match extension {
510                "json" => ConfigurationFormat::Json,
511                "yaml" | "yml" => ConfigurationFormat::Yaml,
512                "toml" => ConfigurationFormat::Toml,
513                "xml" => ConfigurationFormat::Xml,
514                "ini" => ConfigurationFormat::Ini,
515                "env" => ConfigurationFormat::Dotenv,
516                _ => ConfigurationFormat::Json,
517            };
518            return FileType::Configuration { format };
519        }
520
521        // Binary files
522        if Self::detect_binary_by_extension(extension) {
523            return FileType::Binary;
524        }
525
526        // Generated files (common patterns)
527        if path_lower.contains("generated")
528            || path_lower.contains("build")
529            || path_lower.contains("dist")
530            || path_lower.contains("target")
531        {
532            return FileType::Generated;
533        }
534
535        // Source code
536        if language.is_programming() {
537            return FileType::Source {
538                language: language.clone(),
539            };
540        }
541
542        FileType::Unknown
543    }
544
545    /// Get human-readable size
546    pub fn human_size(&self) -> String {
547        bytes_to_human(self.size)
548    }
549
550    /// Check if file should be included in analysis
551    pub fn should_include(&self) -> bool {
552        self.decision.should_include()
553    }
554
555    /// Get file name (last component of path)
556    pub fn file_name(&self) -> Option<&str> {
557        self.path.file_name()?.to_str()
558    }
559
560    /// Get file stem (name without extension)
561    pub fn file_stem(&self) -> Option<&str> {
562        self.path.file_stem()?.to_str()
563    }
564
565    /// Get file extension
566    pub fn extension(&self) -> Option<&str> {
567        self.path.extension()?.to_str()
568    }
569}
570
/// Convert a byte count to a human-readable string using binary (1024-based)
/// units.
///
/// Counts below 1 KiB are printed exactly (`"512 B"`); larger counts are
/// scaled and printed with one decimal place (`"1.5 KiB"`). `TiB` is the
/// largest unit, so bigger values are still expressed in TiB.
///
/// A value that would round up to `1024.0` of a unit is shown as `1.0` of the
/// next unit instead (1048575 bytes is `"1.0 MiB"`, not `"1024.0 KiB"`).
pub fn bytes_to_human(bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "KiB", "MiB", "GiB", "TiB"];
    const THRESHOLD: f64 = 1024.0;

    let mut size = bytes as f64;
    let mut unit_idx = 0;

    while size >= THRESHOLD && unit_idx < UNITS.len() - 1 {
        size /= THRESHOLD;
        unit_idx += 1;
    }

    // Plain byte counts are shown as exact integers (this also covers zero).
    if unit_idx == 0 {
        return format!("{} {}", bytes, UNITS[unit_idx]);
    }

    let mut rendered = format!("{:.1}", size);

    // `size` is below the threshold here, but one-decimal rounding can still
    // produce "1024.0"; promote that to the next unit when one exists.
    if rendered == "1024.0" && unit_idx < UNITS.len() - 1 {
        rendered = String::from("1.0");
        unit_idx += 1;
    }

    format!("{} {}", rendered, UNITS[unit_idx])
}
594
#[cfg(test)]
mod tests {
    use super::*;

    /// Known extensions map to their language; anything else is `Unknown`.
    #[test]
    fn test_language_detection() {
        let cases = [
            ("rs", Language::Rust),
            ("py", Language::Python),
            ("js", Language::JavaScript),
            ("unknown", Language::Unknown),
        ];

        for (ext, expected) in &cases {
            assert_eq!(Language::from_extension(ext), *expected);
        }
    }

    /// Binary extensions are detected; source extensions are not.
    #[test]
    fn test_binary_detection() {
        for ext in &["png", "exe"] {
            assert!(FileInfo::detect_binary_by_extension(ext));
        }
        for ext in &["rs", "py"] {
            assert!(!FileInfo::detect_binary_by_extension(ext));
        }
    }

    /// Ordinary source paths are classified as `Source`.
    #[test]
    fn test_file_type_classification() {
        let rust_lang = Language::Rust;
        let py_lang = Language::Python;
        let md_lang = Language::Markdown;

        // Rust and Python source files
        let source_cases = [
            ("src/lib.rs", &rust_lang, "rs"),
            ("scribe-rs/src/lib.rs", &rust_lang, "rs"),
            ("script.py", &py_lang, "py"),
        ];
        for (path, language, ext) in &source_cases {
            let classified = FileInfo::classify_file_type(path, language, ext);
            assert!(matches!(classified, FileType::Source { .. }));
        }

        // is_programming distinguishes code from markup
        assert!(rust_lang.is_programming());
        assert!(py_lang.is_programming());
        assert!(!md_lang.is_programming());
    }

    /// Full pipeline: extension -> language -> file type.
    #[test]
    fn test_integration_file_classification() {
        // Rust files
        let rust_lang = Language::from_extension("rs");
        assert_eq!(rust_lang, Language::Rust);
        assert!(rust_lang.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("src/lib.rs", &rust_lang, "rs"),
            FileType::Source { .. }
        ));

        // Python files
        let py_lang = Language::from_extension("py");
        assert_eq!(py_lang, Language::Python);
        assert!(py_lang.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("script.py", &py_lang, "py"),
            FileType::Source { .. }
        ));

        // An unknown language must not be treated as source
        let unknown_lang = Language::from_extension("xyz");
        assert_eq!(unknown_lang, Language::Unknown);
        assert!(!unknown_lang.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("file.xyz", &unknown_lang, "xyz"),
            FileType::Unknown
        ));

        // Markdown files are documentation
        let md_lang = Language::from_extension("md");
        assert_eq!(md_lang, Language::Markdown);
        assert!(!md_lang.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("README.md", &md_lang, "md"),
            FileType::Documentation { .. }
        ));
    }

    /// Byte counts are rendered with binary units.
    #[test]
    fn test_bytes_to_human() {
        let cases: [(u64, &str); 5] = [
            (0, "0 B"),
            (512, "512 B"),
            (1024, "1.0 KiB"),
            (1536, "1.5 KiB"),
            (1048576, "1.0 MiB"),
        ];

        for (bytes, expected) in &cases {
            assert_eq!(bytes_to_human(*bytes), *expected);
        }
    }

    /// A short sentence yields a small, non-zero token estimate.
    #[test]
    fn test_token_estimation() {
        let tokens = FileInfo::estimate_tokens("Hello world, this is a test.");
        assert!(tokens > 0);
        assert!(tokens < 20); // Should be reasonable estimate
    }

    /// Constructors, context and category mapping of `RenderDecision`.
    #[test]
    fn test_render_decision() {
        let included = RenderDecision::include("valid file");
        assert!(included.should_include());
        assert_eq!(included.reason_category(), RenderDecisionCategory::Other);

        let excluded = RenderDecision::exclude("binary").with_context("detected by extension");
        assert!(!excluded.should_include());
        assert_eq!(excluded.reason_category(), RenderDecisionCategory::Binary);
        assert!(excluded.context.is_some());
    }
}