1use std::path::{Path, PathBuf};
7use std::time::SystemTime;
8use serde::{Deserialize, Serialize};
9
10use crate::error::{Result, ScribeError};
11
/// File extensions (lowercase, with leading dot) that are treated as binary.
///
/// `FileInfo::detect_binary_by_extension` checks membership in this list after
/// lowercasing the candidate extension and prefixing it with a dot.
pub const BINARY_EXTENSIONS: &[&str] = &[
    // Images
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".webp",
    ".bmp",
    ".svg",
    ".ico",
    ".tiff",
    // Office documents
    ".pdf",
    ".doc",
    ".docx",
    ".ppt",
    ".pptx",
    ".xls",
    ".xlsx",
    // Archives
    ".zip",
    ".tar",
    ".gz",
    ".bz2",
    ".xz",
    ".7z",
    ".rar",
    // Audio and video
    ".mp3",
    ".mp4",
    ".mov",
    ".avi",
    ".mkv",
    ".wav",
    ".ogg",
    ".flac",
    // Fonts
    ".ttf",
    ".otf",
    ".eot",
    ".woff",
    ".woff2",
    // Compiled code, libraries and executables
    ".so",
    ".dll",
    ".dylib",
    ".class",
    ".jar",
    ".exe",
    ".bin",
    ".app",
];
27
/// File extensions (lowercase, with leading dot) recognised as Markdown.
pub const MARKDOWN_EXTENSIONS: &[&str] = &[
    ".md",
    ".markdown",
    ".mdown",
    ".mkd",
    ".mkdn",
];
30
31#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
33pub struct RenderDecision {
34 pub include: bool,
36 pub reason: String,
38 pub context: Option<String>,
40}
41
42impl RenderDecision {
43 pub fn include<S: Into<String>>(reason: S) -> Self {
45 Self {
46 include: true,
47 reason: reason.into(),
48 context: None,
49 }
50 }
51
52 pub fn exclude<S: Into<String>>(reason: S) -> Self {
54 Self {
55 include: false,
56 reason: reason.into(),
57 context: None,
58 }
59 }
60
61 pub fn with_context<S: Into<String>>(mut self, context: S) -> Self {
63 self.context = Some(context.into());
64 self
65 }
66
67 pub fn should_include(&self) -> bool {
69 self.include
70 }
71
72 pub fn reason_category(&self) -> RenderDecisionCategory {
74 match self.reason.as_str() {
75 "ok" => RenderDecisionCategory::Ok,
76 "binary" => RenderDecisionCategory::Binary,
77 "too_large" => RenderDecisionCategory::TooLarge,
78 "ignored" => RenderDecisionCategory::Ignored,
79 "empty" => RenderDecisionCategory::Empty,
80 _ => RenderDecisionCategory::Other,
81 }
82 }
83}
84
85#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
87pub enum RenderDecisionCategory {
88 Ok,
89 Binary,
90 TooLarge,
91 Ignored,
92 Empty,
93 Other,
94}
95
96#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
98pub enum Language {
99 Rust,
101 C,
102 Cpp,
103 Go,
104 Zig,
105
106 JavaScript,
108 TypeScript,
109 HTML,
110 CSS,
111 SCSS,
112 SASS,
113
114 Python,
116 Java,
117 CSharp,
118 Kotlin,
119 Scala,
120 Ruby,
121 PHP,
122
123 Haskell,
125 OCaml,
126 FSharp,
127 Erlang,
128 Elixir,
129 Clojure,
130
131 JSON,
133 YAML,
134 TOML,
135 XML,
136 Markdown,
137
138 SQL,
140
141 Bash,
143 PowerShell,
144 Batch,
145
146 R,
148 Julia,
149 Matlab,
150
151 Swift,
153 ObjectiveC,
154 Dart,
155
156 Unknown,
158}
159
160impl Language {
161 pub fn from_extension(ext: &str) -> Self {
163 match ext.to_lowercase().as_str() {
164 "rs" => Language::Rust,
165 "c" | "h" => Language::C,
166 "cpp" | "cxx" | "cc" | "hpp" | "hxx" => Language::Cpp,
167 "go" => Language::Go,
168 "zig" => Language::Zig,
169 "js" | "mjs" | "cjs" => Language::JavaScript,
170 "ts" | "mts" | "cts" => Language::TypeScript,
171 "html" | "htm" => Language::HTML,
172 "css" => Language::CSS,
173 "scss" => Language::SCSS,
174 "sass" => Language::SASS,
175 "py" | "pyi" | "pyw" => Language::Python,
176 "java" => Language::Java,
177 "cs" => Language::CSharp,
178 "kt" | "kts" => Language::Kotlin,
179 "scala" | "sc" => Language::Scala,
180 "rb" => Language::Ruby,
181 "php" => Language::PHP,
182 "hs" | "lhs" => Language::Haskell,
183 "ml" | "mli" => Language::OCaml,
184 "fs" | "fsi" | "fsx" => Language::FSharp,
185 "erl" | "hrl" => Language::Erlang,
186 "ex" | "exs" => Language::Elixir,
187 "clj" | "cljs" | "cljc" => Language::Clojure,
188 "json" => Language::JSON,
189 "yaml" | "yml" => Language::YAML,
190 "toml" => Language::TOML,
191 "xml" => Language::XML,
192 "md" | "markdown" | "mdown" | "mkd" | "mkdn" => Language::Markdown,
193 "sql" => Language::SQL,
194 "sh" | "bash" => Language::Bash,
195 "ps1" | "psm1" | "psd1" => Language::PowerShell,
196 "bat" | "cmd" => Language::Batch,
197 "r" => Language::R,
198 "jl" => Language::Julia,
199 "swift" => Language::Swift,
200 "dart" => Language::Dart,
201 "m" | "mm" => Language::ObjectiveC,
204 _ => Language::Unknown,
205 }
206 }
207
208 pub fn is_documentation(&self) -> bool {
210 matches!(self, Language::Markdown | Language::HTML)
211 }
212
213 pub fn is_configuration(&self) -> bool {
215 matches!(
216 self,
217 Language::JSON | Language::YAML | Language::TOML | Language::XML
218 )
219 }
220
221 pub fn is_programming(&self) -> bool {
223 !matches!(
224 self,
225 Language::Markdown
226 | Language::HTML
227 | Language::JSON
228 | Language::YAML
229 | Language::TOML
230 | Language::XML
231 | Language::Unknown
232 )
233 }
234
235 pub fn extensions(&self) -> &'static [&'static str] {
237 match self {
238 Language::Rust => &["rs"],
239 Language::C => &["c", "h"],
240 Language::Cpp => &["cpp", "cxx", "cc", "hpp", "hxx"],
241 Language::Go => &["go"],
242 Language::Zig => &["zig"],
243 Language::JavaScript => &["js", "mjs", "cjs"],
244 Language::TypeScript => &["ts", "mts", "cts"],
245 Language::HTML => &["html", "htm"],
246 Language::CSS => &["css"],
247 Language::SCSS => &["scss"],
248 Language::SASS => &["sass"],
249 Language::Python => &["py", "pyi", "pyw"],
250 Language::Java => &["java"],
251 Language::CSharp => &["cs"],
252 Language::Kotlin => &["kt", "kts"],
253 Language::Scala => &["scala", "sc"],
254 Language::Ruby => &["rb"],
255 Language::PHP => &["php"],
256 Language::Haskell => &["hs", "lhs"],
257 Language::OCaml => &["ml", "mli"],
258 Language::FSharp => &["fs", "fsi", "fsx"],
259 Language::Erlang => &["erl", "hrl"],
260 Language::Elixir => &["ex", "exs"],
261 Language::Clojure => &["clj", "cljs", "cljc"],
262 Language::JSON => &["json"],
263 Language::YAML => &["yaml", "yml"],
264 Language::TOML => &["toml"],
265 Language::XML => &["xml"],
266 Language::Markdown => &["md", "markdown", "mdown", "mkd", "mkdn"],
267 Language::SQL => &["sql"],
268 Language::Bash => &["sh", "bash"],
269 Language::PowerShell => &["ps1", "psm1", "psd1"],
270 Language::Batch => &["bat", "cmd"],
271 Language::R => &["r"],
272 Language::Julia => &["jl"],
273 Language::Matlab => &["m"], Language::Swift => &["swift"],
275 Language::ObjectiveC => &["m", "mm"],
276 Language::Dart => &["dart"],
277 Language::Unknown => &[],
278 }
279 }
280}
281
282#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
284pub enum FileType {
285 Source { language: Language },
287 Documentation { format: DocumentationFormat },
289 Configuration { format: ConfigurationFormat },
291 Test { language: Language },
293 Binary,
295 Generated,
297 Unknown,
299}
300
301#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
303pub enum DocumentationFormat {
304 Markdown,
305 Html,
306 PlainText,
307 Rst,
308 Asciidoc,
309}
310
311#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
313pub enum ConfigurationFormat {
314 Json,
315 Yaml,
316 Toml,
317 Xml,
318 Ini,
319 Dotenv,
320}
321
322#[derive(Debug, Clone, Serialize, Deserialize)]
324pub struct FileInfo {
325 pub path: PathBuf,
327
328 pub relative_path: String,
330
331 pub size: u64,
333
334 pub modified: Option<SystemTime>,
336
337 pub decision: RenderDecision,
339
340 pub file_type: FileType,
342
343 pub language: Language,
345
346 pub content: Option<String>,
348
349 pub token_estimate: Option<usize>,
351
352 pub line_count: Option<usize>,
354
355 pub char_count: Option<usize>,
357
358 pub is_binary: bool,
360
361 pub git_status: Option<GitStatus>,
363}
364
365#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
367pub struct GitStatus {
368 pub working_tree: GitFileStatus,
370 pub index: GitFileStatus,
372}
373
374#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
376pub enum GitFileStatus {
377 Unmodified,
378 Modified,
379 Added,
380 Deleted,
381 Renamed,
382 Copied,
383 Unmerged,
384 Untracked,
385 Ignored,
386}
387
388impl FileInfo {
389 pub fn new<P: AsRef<Path>>(
391 path: P,
392 relative_path: String,
393 decision: RenderDecision,
394 ) -> Result<Self> {
395 let path = path.as_ref();
396 let metadata = std::fs::metadata(path)
397 .map_err(|e| ScribeError::path_with_source("Failed to read file metadata", path, e))?;
398
399 let size = metadata.len();
400 let modified = metadata.modified().ok();
401
402 let extension = path
403 .extension()
404 .and_then(|ext| ext.to_str())
405 .unwrap_or("");
406
407 let language = Language::from_extension(extension);
408 let is_binary = Self::detect_binary_by_extension(extension);
409 let file_type = Self::classify_file_type(&relative_path, &language, extension);
410
411 Ok(Self {
412 path: path.to_path_buf(),
413 relative_path,
414 size,
415 modified,
416 decision,
417 file_type,
418 language,
419 content: None,
420 token_estimate: None,
421 line_count: None,
422 char_count: None,
423 is_binary,
424 git_status: None,
425 })
426 }
427
428 pub fn load_content(&mut self) -> Result<()> {
430 if self.is_binary || !self.decision.should_include() {
431 return Ok(());
432 }
433
434 let content = std::fs::read_to_string(&self.path)
435 .map_err(|e| ScribeError::analysis(
436 format!("Failed to read file content: {}", e),
437 &self.path
438 ))?;
439
440 let line_count = content.lines().count();
442 let char_count = content.chars().count();
443 let token_estimate = Self::estimate_tokens(&content);
444
445 self.content = Some(content);
446 self.line_count = Some(line_count);
447 self.char_count = Some(char_count);
448 self.token_estimate = Some(token_estimate);
449
450 Ok(())
451 }
452
453 pub fn estimate_tokens(content: &str) -> usize {
455 (content.chars().count() as f64 / 4.0).ceil() as usize
458 }
459
460 pub fn detect_binary_by_extension(extension: &str) -> bool {
462 BINARY_EXTENSIONS.contains(&format!(".{}", extension.to_lowercase()).as_str())
463 }
464
465 pub fn classify_file_type(path: &str, language: &Language, extension: &str) -> FileType {
467 let path_lower = path.to_lowercase();
468
469 if path_lower.contains("test") || path_lower.contains("spec") {
471 return FileType::Test {
472 language: language.clone()
473 };
474 }
475
476 if language.is_documentation() {
478 let format = match extension {
479 "md" | "markdown" => DocumentationFormat::Markdown,
480 "html" | "htm" => DocumentationFormat::Html,
481 "rst" => DocumentationFormat::Rst,
482 "txt" => DocumentationFormat::PlainText,
483 _ => DocumentationFormat::Markdown,
484 };
485 return FileType::Documentation { format };
486 }
487
488 if language.is_configuration() {
490 let format = match extension {
491 "json" => ConfigurationFormat::Json,
492 "yaml" | "yml" => ConfigurationFormat::Yaml,
493 "toml" => ConfigurationFormat::Toml,
494 "xml" => ConfigurationFormat::Xml,
495 "ini" => ConfigurationFormat::Ini,
496 "env" => ConfigurationFormat::Dotenv,
497 _ => ConfigurationFormat::Json,
498 };
499 return FileType::Configuration { format };
500 }
501
502 if Self::detect_binary_by_extension(extension) {
504 return FileType::Binary;
505 }
506
507 if path_lower.contains("generated") ||
509 path_lower.contains("build") ||
510 path_lower.contains("dist") ||
511 path_lower.contains("target") {
512 return FileType::Generated;
513 }
514
515 if language.is_programming() {
517 return FileType::Source {
518 language: language.clone()
519 };
520 }
521
522 FileType::Unknown
523 }
524
525 pub fn human_size(&self) -> String {
527 bytes_to_human(self.size)
528 }
529
530 pub fn should_include(&self) -> bool {
532 self.decision.should_include()
533 }
534
535 pub fn file_name(&self) -> Option<&str> {
537 self.path.file_name()?.to_str()
538 }
539
540 pub fn file_stem(&self) -> Option<&str> {
542 self.path.file_stem()?.to_str()
543 }
544
545 pub fn extension(&self) -> Option<&str> {
547 self.path.extension()?.to_str()
548 }
549}
550
/// Formats a byte count using binary (1024-based) units.
///
/// Values below 1 KiB are printed as an exact integer (`"512 B"`); larger
/// values are scaled and printed with one decimal place (`"1.5 KiB"`). Units
/// run from `B` to `EiB`, which covers the full `u64` range.
pub fn bytes_to_human(bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB"];
    const THRESHOLD: f64 = 1024.0;

    // The cast loses precision above 2^53, which cannot affect a value that
    // is printed with a single decimal place.
    let mut size = bytes as f64;
    let mut unit_idx = 0;

    while size >= THRESHOLD && unit_idx < UNITS.len() - 1 {
        size /= THRESHOLD;
        unit_idx += 1;
    }

    if unit_idx == 0 {
        // Plain bytes (including zero): print the exact integer.
        return format!("{} {}", bytes, UNITS[unit_idx]);
    }

    let mut rendered = format!("{:.1}", size);

    // A value just below the next unit (e.g. 1023.999 KiB) rounds up to
    // "1024.0" when formatted. Carry it into the next unit so the output is
    // "1.0 MiB" rather than "1024.0 KiB".
    if rendered == "1024.0" && unit_idx < UNITS.len() - 1 {
        unit_idx += 1;
        rendered = String::from("1.0");
    }

    format!("{} {}", rendered, UNITS[unit_idx])
}
574
#[cfg(test)]
mod tests {
    use super::*;

    /// Known extensions resolve to their language; anything else is `Unknown`.
    #[test]
    fn test_language_detection() {
        let expectations = [
            ("rs", Language::Rust),
            ("py", Language::Python),
            ("js", Language::JavaScript),
            ("unknown", Language::Unknown),
        ];

        for (ext, expected) in &expectations {
            assert_eq!(&Language::from_extension(ext), expected);
        }
    }

    /// Binary detection is driven purely by the extension list.
    #[test]
    fn test_binary_detection() {
        for ext in &["png", "exe"] {
            assert!(FileInfo::detect_binary_by_extension(ext));
        }
        for ext in &["rs", "py"] {
            assert!(!FileInfo::detect_binary_by_extension(ext));
        }
    }

    /// Source, test and documentation files land in the matching variant.
    #[test]
    fn test_file_type_classification() {
        let source = FileInfo::classify_file_type("src/lib.rs", &Language::Rust, "rs");
        assert!(matches!(source, FileType::Source { .. }));

        let test = FileInfo::classify_file_type("src/test_lib.rs", &Language::Rust, "rs");
        assert!(matches!(test, FileType::Test { .. }));

        let docs = FileInfo::classify_file_type("README.md", &Language::Markdown, "md");
        assert!(matches!(docs, FileType::Documentation { .. }));
    }

    /// Byte counts are rendered with binary units and one decimal place.
    #[test]
    fn test_bytes_to_human() {
        let expectations: [(u64, &str); 5] = [
            (0, "0 B"),
            (512, "512 B"),
            (1024, "1.0 KiB"),
            (1536, "1.5 KiB"),
            (1048576, "1.0 MiB"),
        ];

        for (bytes, expected) in &expectations {
            assert_eq!(bytes_to_human(*bytes), *expected);
        }
    }

    /// A short sentence yields a small, non-zero token estimate.
    #[test]
    fn test_token_estimation() {
        let estimate = FileInfo::estimate_tokens("Hello world, this is a test.");
        assert!((1..20).contains(&estimate));
    }

    /// Constructors set the include flag; the reason string drives the category.
    #[test]
    fn test_render_decision() {
        let included = RenderDecision::include("valid file");
        assert!(included.should_include());
        assert_eq!(included.reason_category(), RenderDecisionCategory::Other);

        let excluded = RenderDecision::exclude("binary").with_context("detected by extension");
        assert!(!excluded.should_include());
        assert_eq!(excluded.reason_category(), RenderDecisionCategory::Binary);
        assert!(excluded.context.is_some());
    }
}