1use serde::{Deserialize, Serialize};
7use std::path::{Path, PathBuf};
8use std::time::SystemTime;
9
10use crate::error::{Result, ScribeError};
11
/// File extensions (lower-case, including the leading dot) that are treated as binary.
///
/// The list is consulted by `FileInfo::detect_binary_by_extension`. Entries are grouped by
/// kind purely for readability; order carries no meaning for lookups.
pub const BINARY_EXTENSIONS: &[&str] = &[
    // Images
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".webp",
    ".bmp",
    ".svg",
    ".ico",
    ".tiff",
    // Office documents
    ".pdf",
    ".doc",
    ".docx",
    ".ppt",
    ".pptx",
    ".xls",
    ".xlsx",
    // Archives
    ".zip",
    ".tar",
    ".gz",
    ".bz2",
    ".xz",
    ".7z",
    ".rar",
    // Audio and video
    ".mp3",
    ".mp4",
    ".mov",
    ".avi",
    ".mkv",
    ".wav",
    ".ogg",
    ".flac",
    // Fonts
    ".ttf",
    ".otf",
    ".eot",
    ".woff",
    ".woff2",
    // Compiled artifacts and executables
    ".so",
    ".dll",
    ".dylib",
    ".class",
    ".jar",
    ".exe",
    ".bin",
    ".app",
];
23
/// File extensions (lower-case, including the leading dot) recognized as Markdown.
pub const MARKDOWN_EXTENSIONS: &[&str] = &[
    ".md",
    ".markdown",
    ".mdown",
    ".mkd",
    ".mkdn",
];
26
/// Decision on whether a file is included, together with the reason for that decision.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RenderDecision {
    /// `true` when the file should be included.
    pub include: bool,
    /// Short reason label. The exact labels `"ok"`, `"binary"`, `"too_large"`, `"ignored"`
    /// and `"empty"` map to dedicated categories in `reason_category`; any other text is
    /// categorized as `Other`.
    pub reason: String,
    /// Optional additional detail attached via `with_context`.
    pub context: Option<String>,
}
37
38impl RenderDecision {
39 pub fn include<S: Into<String>>(reason: S) -> Self {
41 Self {
42 include: true,
43 reason: reason.into(),
44 context: None,
45 }
46 }
47
48 pub fn exclude<S: Into<String>>(reason: S) -> Self {
50 Self {
51 include: false,
52 reason: reason.into(),
53 context: None,
54 }
55 }
56
57 pub fn with_context<S: Into<String>>(mut self, context: S) -> Self {
59 self.context = Some(context.into());
60 self
61 }
62
63 pub fn should_include(&self) -> bool {
65 self.include
66 }
67
68 pub fn reason_category(&self) -> RenderDecisionCategory {
70 match self.reason.as_str() {
71 "ok" => RenderDecisionCategory::Ok,
72 "binary" => RenderDecisionCategory::Binary,
73 "too_large" => RenderDecisionCategory::TooLarge,
74 "ignored" => RenderDecisionCategory::Ignored,
75 "empty" => RenderDecisionCategory::Empty,
76 _ => RenderDecisionCategory::Other,
77 }
78 }
79}
80
/// Coarse category of a `RenderDecision` reason, as returned by
/// `RenderDecision::reason_category`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum RenderDecisionCategory {
    /// Reason label `"ok"`.
    Ok,
    /// Reason label `"binary"`.
    Binary,
    /// Reason label `"too_large"`.
    TooLarge,
    /// Reason label `"ignored"`.
    Ignored,
    /// Reason label `"empty"`.
    Empty,
    /// Any reason that is not one of the known labels.
    Other,
}
91
/// Languages and file formats that can be recognized from a file extension.
///
/// `PartialOrd`/`Ord` are derived, so the declaration order of the variants defines how
/// values compare; reordering variants changes that ordering.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum Language {
    // Systems languages
    Rust,
    C,
    Cpp,
    Go,
    Zig,

    // Web languages and stylesheets
    JavaScript,
    TypeScript,
    HTML,
    CSS,
    SCSS,
    SASS,

    // General-purpose application languages
    Python,
    Java,
    CSharp,
    Kotlin,
    Scala,
    Ruby,
    PHP,

    // Functional languages
    Haskell,
    OCaml,
    FSharp,
    Erlang,
    Elixir,
    Clojure,

    // Data and markup formats
    JSON,
    YAML,
    TOML,
    XML,
    Markdown,

    // Query languages
    SQL,

    // Shell and scripting
    Bash,
    PowerShell,
    Batch,

    // Scientific computing
    R,
    Julia,
    /// Never produced by `Language::from_extension`: the `m` extension it shares with
    /// Objective-C is resolved to `ObjectiveC` there.
    Matlab,

    // Apple / mobile development
    Swift,
    ObjectiveC,
    Dart,

    /// Extension not recognized.
    Unknown,
}
155
156impl Language {
157 pub fn from_extension(ext: &str) -> Self {
159 match ext.to_lowercase().as_str() {
160 "rs" => Language::Rust,
161 "c" | "h" => Language::C,
162 "cpp" | "cxx" | "cc" | "hpp" | "hxx" => Language::Cpp,
163 "go" => Language::Go,
164 "zig" => Language::Zig,
165 "js" | "mjs" | "cjs" => Language::JavaScript,
166 "ts" | "mts" | "cts" => Language::TypeScript,
167 "html" | "htm" => Language::HTML,
168 "css" => Language::CSS,
169 "scss" => Language::SCSS,
170 "sass" => Language::SASS,
171 "py" | "pyi" | "pyw" => Language::Python,
172 "java" => Language::Java,
173 "cs" => Language::CSharp,
174 "kt" | "kts" => Language::Kotlin,
175 "scala" | "sc" => Language::Scala,
176 "rb" => Language::Ruby,
177 "php" => Language::PHP,
178 "hs" | "lhs" => Language::Haskell,
179 "ml" | "mli" => Language::OCaml,
180 "fs" | "fsi" | "fsx" => Language::FSharp,
181 "erl" | "hrl" => Language::Erlang,
182 "ex" | "exs" => Language::Elixir,
183 "clj" | "cljs" | "cljc" => Language::Clojure,
184 "json" => Language::JSON,
185 "yaml" | "yml" => Language::YAML,
186 "toml" => Language::TOML,
187 "xml" => Language::XML,
188 "md" | "markdown" | "mdown" | "mkd" | "mkdn" => Language::Markdown,
189 "sql" => Language::SQL,
190 "sh" | "bash" => Language::Bash,
191 "ps1" | "psm1" | "psd1" => Language::PowerShell,
192 "bat" | "cmd" => Language::Batch,
193 "r" => Language::R,
194 "jl" => Language::Julia,
195 "swift" => Language::Swift,
196 "dart" => Language::Dart,
197 "m" | "mm" => Language::ObjectiveC,
200 _ => Language::Unknown,
201 }
202 }
203
204 pub fn is_documentation(&self) -> bool {
206 matches!(self, Language::Markdown | Language::HTML)
207 }
208
209 pub fn is_configuration(&self) -> bool {
211 matches!(
212 self,
213 Language::JSON | Language::YAML | Language::TOML | Language::XML
214 )
215 }
216
217 pub fn is_programming(&self) -> bool {
219 !matches!(
220 self,
221 Language::Markdown
222 | Language::HTML
223 | Language::JSON
224 | Language::YAML
225 | Language::TOML
226 | Language::XML
227 | Language::Unknown
228 )
229 }
230
231 pub fn extensions(&self) -> &'static [&'static str] {
233 match self {
234 Language::Rust => &["rs"],
235 Language::C => &["c", "h"],
236 Language::Cpp => &["cpp", "cxx", "cc", "hpp", "hxx"],
237 Language::Go => &["go"],
238 Language::Zig => &["zig"],
239 Language::JavaScript => &["js", "mjs", "cjs"],
240 Language::TypeScript => &["ts", "mts", "cts"],
241 Language::HTML => &["html", "htm"],
242 Language::CSS => &["css"],
243 Language::SCSS => &["scss"],
244 Language::SASS => &["sass"],
245 Language::Python => &["py", "pyi", "pyw"],
246 Language::Java => &["java"],
247 Language::CSharp => &["cs"],
248 Language::Kotlin => &["kt", "kts"],
249 Language::Scala => &["scala", "sc"],
250 Language::Ruby => &["rb"],
251 Language::PHP => &["php"],
252 Language::Haskell => &["hs", "lhs"],
253 Language::OCaml => &["ml", "mli"],
254 Language::FSharp => &["fs", "fsi", "fsx"],
255 Language::Erlang => &["erl", "hrl"],
256 Language::Elixir => &["ex", "exs"],
257 Language::Clojure => &["clj", "cljs", "cljc"],
258 Language::JSON => &["json"],
259 Language::YAML => &["yaml", "yml"],
260 Language::TOML => &["toml"],
261 Language::XML => &["xml"],
262 Language::Markdown => &["md", "markdown", "mdown", "mkd", "mkdn"],
263 Language::SQL => &["sql"],
264 Language::Bash => &["sh", "bash"],
265 Language::PowerShell => &["ps1", "psm1", "psd1"],
266 Language::Batch => &["bat", "cmd"],
267 Language::R => &["r"],
268 Language::Julia => &["jl"],
269 Language::Matlab => &["m"], Language::Swift => &["swift"],
271 Language::ObjectiveC => &["m", "mm"],
272 Language::Dart => &["dart"],
273 Language::Unknown => &[],
274 }
275 }
276}
277
/// Role assigned to a file by `FileInfo::classify_file_type`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FileType {
    /// Source code written in `language`.
    Source { language: Language },
    /// Documentation in the given format.
    Documentation { format: DocumentationFormat },
    /// Configuration data in the given format.
    Configuration { format: ConfigurationFormat },
    /// Test code; `language` is whatever language was detected for the file.
    Test { language: Language },
    /// Binary content.
    Binary,
    /// Generated or build output.
    Generated,
    /// None of the other categories applied.
    Unknown,
}
296
/// Format of a documentation file (payload of `FileType::Documentation`).
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum DocumentationFormat {
    Markdown,
    Html,
    PlainText,
    /// reStructuredText (`rst` extension).
    Rst,
    // NOTE(review): no classification code in this file produces `Asciidoc`.
    Asciidoc,
}
306
/// Format of a configuration file (payload of `FileType::Configuration`).
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ConfigurationFormat {
    Json,
    Yaml,
    Toml,
    Xml,
    /// `ini` extension.
    Ini,
    /// `env` extension.
    Dotenv,
}
317
/// Metadata, classification and (optionally) loaded content for a single file.
///
/// `FileInfo::new` fills the metadata and classification fields; the content-derived
/// fields stay `None` until `FileInfo::load_content` is called.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileInfo {
    /// Path used to read the file's metadata and content.
    pub path: PathBuf,

    /// Caller-supplied relative path; also the input to file-type classification.
    pub relative_path: String,

    /// File size in bytes, from filesystem metadata.
    pub size: u64,

    /// Last modification time, or `None` when the metadata does not provide one.
    pub modified: Option<SystemTime>,

    /// Inclusion decision supplied by the caller.
    pub decision: RenderDecision,

    /// Role of the file as determined by `FileInfo::classify_file_type`.
    pub file_type: FileType,

    /// Language detected from the file extension.
    pub language: Language,

    /// File text; set by `load_content` for included, non-binary files.
    pub content: Option<String>,

    /// Token estimate for `content`; set together with it.
    pub token_estimate: Option<usize>,

    /// Number of lines in `content` (as counted by `str::lines`).
    pub line_count: Option<usize>,

    /// Number of `char`s (Unicode scalar values, not bytes) in `content`.
    pub char_count: Option<usize>,

    /// `true` when the extension is listed in `BINARY_EXTENSIONS`; content is not inspected.
    pub is_binary: bool,

    /// Git status of the file; `new` leaves this `None`.
    pub git_status: Option<GitStatus>,

    /// Centrality score; `new` leaves this `None`.
    // NOTE(review): how the score is computed and its scale are not visible in this file.
    pub centrality_score: Option<f64>,
}
363
/// Git status of a file, tracked separately for the working tree and the index.
// NOTE(review): nothing in this file constructs a `GitStatus`; the field meanings below are
// taken from the field names — confirm against the code that populates them.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct GitStatus {
    /// Status of the file in the working tree.
    pub working_tree: GitFileStatus,
    /// Status of the file in the index.
    pub index: GitFileStatus,
}
372
/// Per-file status value used for each side of a `GitStatus`.
// NOTE(review): variant names appear to mirror git's own status categories; the mapping
// from git output to these variants is not visible in this file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum GitFileStatus {
    Unmodified,
    Modified,
    Added,
    Deleted,
    Renamed,
    Copied,
    Unmerged,
    Untracked,
    Ignored,
}
386
387impl FileInfo {
388 pub fn new<P: AsRef<Path>>(
390 path: P,
391 relative_path: String,
392 decision: RenderDecision,
393 ) -> Result<Self> {
394 let path = path.as_ref();
395 let metadata = std::fs::metadata(path)
396 .map_err(|e| ScribeError::path_with_source("Failed to read file metadata", path, e))?;
397
398 let size = metadata.len();
399 let modified = metadata.modified().ok();
400
401 let extension = path.extension().and_then(|ext| ext.to_str()).unwrap_or("");
402
403 let language = Language::from_extension(extension);
404 let is_binary = Self::detect_binary_by_extension(extension);
405 let file_type = Self::classify_file_type(&relative_path, &language, extension);
406
407 Ok(Self {
408 path: path.to_path_buf(),
409 relative_path,
410 size,
411 modified,
412 decision,
413 file_type,
414 language,
415 content: None,
416 token_estimate: None,
417 line_count: None,
418 char_count: None,
419 is_binary,
420 git_status: None,
421 centrality_score: None,
422 })
423 }
424
425 pub fn load_content(&mut self) -> Result<()> {
427 if self.is_binary || !self.decision.should_include() {
428 return Ok(());
429 }
430
431 let content = std::fs::read_to_string(&self.path).map_err(|e| {
432 ScribeError::analysis(format!("Failed to read file content: {}", e), &self.path)
433 })?;
434
435 let line_count = content.lines().count();
437 let char_count = content.chars().count();
438 let token_estimate = Self::estimate_tokens(&content);
439
440 self.content = Some(content);
441 self.line_count = Some(line_count);
442 self.char_count = Some(char_count);
443 self.token_estimate = Some(token_estimate);
444
445 Ok(())
446 }
447
448 pub fn estimate_tokens(content: &str) -> usize {
453 use crate::tokenization::{utils, TokenCounter};
454
455 match TokenCounter::global().count_tokens(content) {
457 Ok(tokens) => tokens,
458 Err(_) => {
459 utils::estimate_tokens_legacy(content)
461 }
462 }
463 }
464
465 pub fn estimate_tokens_with_path(content: &str, file_path: &std::path::Path) -> usize {
470 use crate::tokenization::TokenCounter;
471
472 match TokenCounter::global().estimate_file_tokens(content, file_path) {
474 Ok(tokens) => tokens,
475 Err(_) => Self::estimate_tokens(content), }
477 }
478
479 pub fn detect_binary_by_extension(extension: &str) -> bool {
481 BINARY_EXTENSIONS.contains(&format!(".{}", extension.to_lowercase()).as_str())
482 }
483
484 pub fn classify_file_type(path: &str, language: &Language, extension: &str) -> FileType {
486 let path_lower = path.to_lowercase();
487
488 if path_lower.contains("test") || path_lower.contains("spec") {
490 return FileType::Test {
491 language: language.clone(),
492 };
493 }
494
495 if language.is_documentation() {
497 let format = match extension {
498 "md" | "markdown" => DocumentationFormat::Markdown,
499 "html" | "htm" => DocumentationFormat::Html,
500 "rst" => DocumentationFormat::Rst,
501 "txt" => DocumentationFormat::PlainText,
502 _ => DocumentationFormat::Markdown,
503 };
504 return FileType::Documentation { format };
505 }
506
507 if language.is_configuration() {
509 let format = match extension {
510 "json" => ConfigurationFormat::Json,
511 "yaml" | "yml" => ConfigurationFormat::Yaml,
512 "toml" => ConfigurationFormat::Toml,
513 "xml" => ConfigurationFormat::Xml,
514 "ini" => ConfigurationFormat::Ini,
515 "env" => ConfigurationFormat::Dotenv,
516 _ => ConfigurationFormat::Json,
517 };
518 return FileType::Configuration { format };
519 }
520
521 if Self::detect_binary_by_extension(extension) {
523 return FileType::Binary;
524 }
525
526 if path_lower.contains("generated")
528 || path_lower.contains("build")
529 || path_lower.contains("dist")
530 || path_lower.contains("target")
531 {
532 return FileType::Generated;
533 }
534
535 if language.is_programming() {
537 return FileType::Source {
538 language: language.clone(),
539 };
540 }
541
542 FileType::Unknown
543 }
544
545 pub fn human_size(&self) -> String {
547 bytes_to_human(self.size)
548 }
549
550 pub fn should_include(&self) -> bool {
552 self.decision.should_include()
553 }
554
555 pub fn file_name(&self) -> Option<&str> {
557 self.path.file_name()?.to_str()
558 }
559
560 pub fn file_stem(&self) -> Option<&str> {
562 self.path.file_stem()?.to_str()
563 }
564
565 pub fn extension(&self) -> Option<&str> {
567 self.path.extension()?.to_str()
568 }
569}
570
/// Formats a byte count using binary units (`B`, `KiB`, `MiB`, `GiB`, `TiB`).
///
/// Values below 1024 are printed as an exact integer number of bytes; larger values are
/// printed with one decimal place in the largest unit that keeps the number below 1024.
/// `TiB` is the largest unit, so very large inputs are shown as 1024 TiB or more.
///
/// A value that would round up to `1024.0` in one unit is shown as `1.0` of the next
/// unit instead (e.g. 1_048_575 bytes is `"1.0 MiB"`, not `"1024.0 KiB"`).
pub fn bytes_to_human(bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "KiB", "MiB", "GiB", "TiB"];
    const THRESHOLD: f64 = 1024.0;

    let last_unit = UNITS.len() - 1;

    // The u64 -> f64 conversion may lose low-order bits for huge values; that is far
    // below the one-decimal precision shown.
    let mut size = bytes as f64;
    let mut unit_idx = 0;
    while size >= THRESHOLD && unit_idx < last_unit {
        size /= THRESHOLD;
        unit_idx += 1;
    }

    if unit_idx == 0 {
        // Plain bytes are printed exactly, without a fractional part.
        return format!("{} {}", bytes, UNITS[unit_idx]);
    }

    let mut rendered = format!("{:.1}", size);
    // `size` is below 1024 here (unless already at the last unit), but rounding to one
    // decimal can still print "1024.0". Checking the rendered text keeps this in sync
    // with the formatter's own rounding.
    if rendered == "1024.0" && unit_idx < last_unit {
        rendered = String::from("1.0");
        unit_idx += 1;
    }

    format!("{} {}", rendered, UNITS[unit_idx])
}
594
#[cfg(test)]
mod tests {
    use super::*;

    /// Known extensions resolve to their language; anything else is `Unknown`.
    #[test]
    fn test_language_detection() {
        let expectations = [
            ("rs", Language::Rust),
            ("py", Language::Python),
            ("js", Language::JavaScript),
            ("unknown", Language::Unknown),
        ];

        for (ext, expected) in &expectations {
            assert_eq!(&Language::from_extension(ext), expected);
        }
    }

    /// Binary detection is driven purely by the extension.
    #[test]
    fn test_binary_detection() {
        for ext in ["png", "exe"].iter() {
            assert!(FileInfo::detect_binary_by_extension(ext));
        }
        for ext in ["rs", "py"].iter() {
            assert!(!FileInfo::detect_binary_by_extension(ext));
        }
    }

    /// Ordinary source paths are classified as `Source`.
    #[test]
    fn test_file_type_classification() {
        let rust_lang = Language::Rust;
        let py_lang = Language::Python;
        let md_lang = Language::Markdown;

        let source_cases = [
            ("src/lib.rs", &rust_lang, "rs"),
            ("scribe-rs/src/lib.rs", &rust_lang, "rs"),
            ("script.py", &py_lang, "py"),
        ];
        for (path, language, ext) in &source_cases {
            let classified = FileInfo::classify_file_type(path, language, ext);
            assert!(matches!(classified, FileType::Source { .. }));
        }

        assert!(rust_lang.is_programming());
        assert!(py_lang.is_programming());
        assert!(!md_lang.is_programming());
    }

    /// Extension detection and path classification agree end to end.
    #[test]
    fn test_integration_file_classification() {
        // Detect the language from the extension, then classify the path with it.
        let detect = |path: &str, ext: &str| {
            let language = Language::from_extension(ext);
            let file_type = FileInfo::classify_file_type(path, &language, ext);
            (language, file_type)
        };

        let (rust_lang, rust_file_type) = detect("src/lib.rs", "rs");
        assert_eq!(rust_lang, Language::Rust);
        assert!(rust_lang.is_programming());
        assert!(matches!(rust_file_type, FileType::Source { .. }));

        let (py_lang, py_file_type) = detect("script.py", "py");
        assert_eq!(py_lang, Language::Python);
        assert!(py_lang.is_programming());
        assert!(matches!(py_file_type, FileType::Source { .. }));

        let (unknown_lang, unknown_file_type) = detect("file.xyz", "xyz");
        assert_eq!(unknown_lang, Language::Unknown);
        assert!(!unknown_lang.is_programming());
        assert!(matches!(unknown_file_type, FileType::Unknown));

        let (md_lang, md_file_type) = detect("README.md", "md");
        assert_eq!(md_lang, Language::Markdown);
        assert!(!md_lang.is_programming());
        assert!(matches!(md_file_type, FileType::Documentation { .. }));
    }

    /// Sizes are rendered with binary units and one decimal above the byte range.
    #[test]
    fn test_bytes_to_human() {
        let expectations: [(u64, &str); 5] = [
            (0, "0 B"),
            (512, "512 B"),
            (1024, "1.0 KiB"),
            (1536, "1.5 KiB"),
            (1048576, "1.0 MiB"),
        ];

        for (bytes, expected) in &expectations {
            assert_eq!(bytes_to_human(*bytes), *expected);
        }
    }

    /// A short sentence yields a small, non-zero token estimate.
    #[test]
    fn test_token_estimation() {
        let tokens = FileInfo::estimate_tokens("Hello world, this is a test.");
        assert!((1..20).contains(&tokens));
    }

    /// Constructors, context attachment and reason categorization.
    #[test]
    fn test_render_decision() {
        let included = RenderDecision::include("valid file");
        assert!(included.should_include());
        assert_eq!(included.reason_category(), RenderDecisionCategory::Other);

        let excluded = RenderDecision::exclude("binary").with_context("detected by extension");
        assert!(!excluded.should_include());
        assert_eq!(excluded.reason_category(), RenderDecisionCategory::Binary);
        assert!(excluded.context.is_some());
    }
}