1use std::path::{Path, PathBuf};
7use std::time::SystemTime;
8use serde::{Deserialize, Serialize};
9
10use crate::error::{Result, ScribeError};
11
/// Lower-case file extensions (including the leading dot) that are treated as
/// binary content.
pub const BINARY_EXTENSIONS: &[&str] = &[
    // Images
    ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".svg", ".ico", ".tiff",
    // Office documents
    ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
    // Archives
    ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar",
    // Audio and video
    ".mp3", ".mp4", ".mov", ".avi", ".mkv", ".wav", ".ogg", ".flac",
    // Fonts
    ".ttf", ".otf", ".eot", ".woff", ".woff2",
    // Compiled artifacts and executables
    ".so", ".dll", ".dylib", ".class", ".jar", ".exe", ".bin", ".app",
];
27
/// Lower-case file extensions (including the leading dot) recognised as Markdown.
pub const MARKDOWN_EXTENSIONS: &[&str] = &[
    ".md",
    ".markdown",
    ".mdown",
    ".mkd",
    ".mkdn",
];
30
31#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
33pub struct RenderDecision {
34 pub include: bool,
36 pub reason: String,
38 pub context: Option<String>,
40}
41
42impl RenderDecision {
43 pub fn include<S: Into<String>>(reason: S) -> Self {
45 Self {
46 include: true,
47 reason: reason.into(),
48 context: None,
49 }
50 }
51
52 pub fn exclude<S: Into<String>>(reason: S) -> Self {
54 Self {
55 include: false,
56 reason: reason.into(),
57 context: None,
58 }
59 }
60
61 pub fn with_context<S: Into<String>>(mut self, context: S) -> Self {
63 self.context = Some(context.into());
64 self
65 }
66
67 pub fn should_include(&self) -> bool {
69 self.include
70 }
71
72 pub fn reason_category(&self) -> RenderDecisionCategory {
74 match self.reason.as_str() {
75 "ok" => RenderDecisionCategory::Ok,
76 "binary" => RenderDecisionCategory::Binary,
77 "too_large" => RenderDecisionCategory::TooLarge,
78 "ignored" => RenderDecisionCategory::Ignored,
79 "empty" => RenderDecisionCategory::Empty,
80 _ => RenderDecisionCategory::Other,
81 }
82 }
83}
84
85#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
87pub enum RenderDecisionCategory {
88 Ok,
89 Binary,
90 TooLarge,
91 Ignored,
92 Empty,
93 Other,
94}
95
96#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
98pub enum Language {
99 Rust,
101 C,
102 Cpp,
103 Go,
104 Zig,
105
106 JavaScript,
108 TypeScript,
109 HTML,
110 CSS,
111 SCSS,
112 SASS,
113
114 Python,
116 Java,
117 CSharp,
118 Kotlin,
119 Scala,
120 Ruby,
121 PHP,
122
123 Haskell,
125 OCaml,
126 FSharp,
127 Erlang,
128 Elixir,
129 Clojure,
130
131 JSON,
133 YAML,
134 TOML,
135 XML,
136 Markdown,
137
138 SQL,
140
141 Bash,
143 PowerShell,
144 Batch,
145
146 R,
148 Julia,
149 Matlab,
150
151 Swift,
153 ObjectiveC,
154 Dart,
155
156 Unknown,
158}
159
160impl Language {
161 pub fn from_extension(ext: &str) -> Self {
163 match ext.to_lowercase().as_str() {
164 "rs" => Language::Rust,
165 "c" | "h" => Language::C,
166 "cpp" | "cxx" | "cc" | "hpp" | "hxx" => Language::Cpp,
167 "go" => Language::Go,
168 "zig" => Language::Zig,
169 "js" | "mjs" | "cjs" => Language::JavaScript,
170 "ts" | "mts" | "cts" => Language::TypeScript,
171 "html" | "htm" => Language::HTML,
172 "css" => Language::CSS,
173 "scss" => Language::SCSS,
174 "sass" => Language::SASS,
175 "py" | "pyi" | "pyw" => Language::Python,
176 "java" => Language::Java,
177 "cs" => Language::CSharp,
178 "kt" | "kts" => Language::Kotlin,
179 "scala" | "sc" => Language::Scala,
180 "rb" => Language::Ruby,
181 "php" => Language::PHP,
182 "hs" | "lhs" => Language::Haskell,
183 "ml" | "mli" => Language::OCaml,
184 "fs" | "fsi" | "fsx" => Language::FSharp,
185 "erl" | "hrl" => Language::Erlang,
186 "ex" | "exs" => Language::Elixir,
187 "clj" | "cljs" | "cljc" => Language::Clojure,
188 "json" => Language::JSON,
189 "yaml" | "yml" => Language::YAML,
190 "toml" => Language::TOML,
191 "xml" => Language::XML,
192 "md" | "markdown" | "mdown" | "mkd" | "mkdn" => Language::Markdown,
193 "sql" => Language::SQL,
194 "sh" | "bash" => Language::Bash,
195 "ps1" | "psm1" | "psd1" => Language::PowerShell,
196 "bat" | "cmd" => Language::Batch,
197 "r" => Language::R,
198 "jl" => Language::Julia,
199 "swift" => Language::Swift,
200 "dart" => Language::Dart,
201 "m" | "mm" => Language::ObjectiveC,
204 _ => Language::Unknown,
205 }
206 }
207
208 pub fn is_documentation(&self) -> bool {
210 matches!(self, Language::Markdown | Language::HTML)
211 }
212
213 pub fn is_configuration(&self) -> bool {
215 matches!(
216 self,
217 Language::JSON | Language::YAML | Language::TOML | Language::XML
218 )
219 }
220
221 pub fn is_programming(&self) -> bool {
223 !matches!(
224 self,
225 Language::Markdown
226 | Language::HTML
227 | Language::JSON
228 | Language::YAML
229 | Language::TOML
230 | Language::XML
231 | Language::Unknown
232 )
233 }
234
235 pub fn extensions(&self) -> &'static [&'static str] {
237 match self {
238 Language::Rust => &["rs"],
239 Language::C => &["c", "h"],
240 Language::Cpp => &["cpp", "cxx", "cc", "hpp", "hxx"],
241 Language::Go => &["go"],
242 Language::Zig => &["zig"],
243 Language::JavaScript => &["js", "mjs", "cjs"],
244 Language::TypeScript => &["ts", "mts", "cts"],
245 Language::HTML => &["html", "htm"],
246 Language::CSS => &["css"],
247 Language::SCSS => &["scss"],
248 Language::SASS => &["sass"],
249 Language::Python => &["py", "pyi", "pyw"],
250 Language::Java => &["java"],
251 Language::CSharp => &["cs"],
252 Language::Kotlin => &["kt", "kts"],
253 Language::Scala => &["scala", "sc"],
254 Language::Ruby => &["rb"],
255 Language::PHP => &["php"],
256 Language::Haskell => &["hs", "lhs"],
257 Language::OCaml => &["ml", "mli"],
258 Language::FSharp => &["fs", "fsi", "fsx"],
259 Language::Erlang => &["erl", "hrl"],
260 Language::Elixir => &["ex", "exs"],
261 Language::Clojure => &["clj", "cljs", "cljc"],
262 Language::JSON => &["json"],
263 Language::YAML => &["yaml", "yml"],
264 Language::TOML => &["toml"],
265 Language::XML => &["xml"],
266 Language::Markdown => &["md", "markdown", "mdown", "mkd", "mkdn"],
267 Language::SQL => &["sql"],
268 Language::Bash => &["sh", "bash"],
269 Language::PowerShell => &["ps1", "psm1", "psd1"],
270 Language::Batch => &["bat", "cmd"],
271 Language::R => &["r"],
272 Language::Julia => &["jl"],
273 Language::Matlab => &["m"], Language::Swift => &["swift"],
275 Language::ObjectiveC => &["m", "mm"],
276 Language::Dart => &["dart"],
277 Language::Unknown => &[],
278 }
279 }
280}
281
282#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
284pub enum FileType {
285 Source { language: Language },
287 Documentation { format: DocumentationFormat },
289 Configuration { format: ConfigurationFormat },
291 Test { language: Language },
293 Binary,
295 Generated,
297 Unknown,
299}
300
301#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
303pub enum DocumentationFormat {
304 Markdown,
305 Html,
306 PlainText,
307 Rst,
308 Asciidoc,
309}
310
311#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
313pub enum ConfigurationFormat {
314 Json,
315 Yaml,
316 Toml,
317 Xml,
318 Ini,
319 Dotenv,
320}
321
322#[derive(Debug, Clone, Serialize, Deserialize)]
324pub struct FileInfo {
325 pub path: PathBuf,
327
328 pub relative_path: String,
330
331 pub size: u64,
333
334 pub modified: Option<SystemTime>,
336
337 pub decision: RenderDecision,
339
340 pub file_type: FileType,
342
343 pub language: Language,
345
346 pub content: Option<String>,
348
349 pub token_estimate: Option<usize>,
351
352 pub line_count: Option<usize>,
354
355 pub char_count: Option<usize>,
357
358 pub is_binary: bool,
360
361 pub git_status: Option<GitStatus>,
363
364 pub centrality_score: Option<f64>,
366}
367
368#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
370pub struct GitStatus {
371 pub working_tree: GitFileStatus,
373 pub index: GitFileStatus,
375}
376
377#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
379pub enum GitFileStatus {
380 Unmodified,
381 Modified,
382 Added,
383 Deleted,
384 Renamed,
385 Copied,
386 Unmerged,
387 Untracked,
388 Ignored,
389}
390
391impl FileInfo {
392 pub fn new<P: AsRef<Path>>(
394 path: P,
395 relative_path: String,
396 decision: RenderDecision,
397 ) -> Result<Self> {
398 let path = path.as_ref();
399 let metadata = std::fs::metadata(path)
400 .map_err(|e| ScribeError::path_with_source("Failed to read file metadata", path, e))?;
401
402 let size = metadata.len();
403 let modified = metadata.modified().ok();
404
405 let extension = path
406 .extension()
407 .and_then(|ext| ext.to_str())
408 .unwrap_or("");
409
410 let language = Language::from_extension(extension);
411 let is_binary = Self::detect_binary_by_extension(extension);
412 let file_type = Self::classify_file_type(&relative_path, &language, extension);
413
414 Ok(Self {
415 path: path.to_path_buf(),
416 relative_path,
417 size,
418 modified,
419 decision,
420 file_type,
421 language,
422 content: None,
423 token_estimate: None,
424 line_count: None,
425 char_count: None,
426 is_binary,
427 git_status: None,
428 centrality_score: None,
429 })
430 }
431
432 pub fn load_content(&mut self) -> Result<()> {
434 if self.is_binary || !self.decision.should_include() {
435 return Ok(());
436 }
437
438 let content = std::fs::read_to_string(&self.path)
439 .map_err(|e| ScribeError::analysis(
440 format!("Failed to read file content: {}", e),
441 &self.path
442 ))?;
443
444 let line_count = content.lines().count();
446 let char_count = content.chars().count();
447 let token_estimate = Self::estimate_tokens(&content);
448
449 self.content = Some(content);
450 self.line_count = Some(line_count);
451 self.char_count = Some(char_count);
452 self.token_estimate = Some(token_estimate);
453
454 Ok(())
455 }
456
457 pub fn estimate_tokens(content: &str) -> usize {
462 use crate::tokenization::{TokenCounter, utils};
463
464 match TokenCounter::global().count_tokens(content) {
466 Ok(tokens) => tokens,
467 Err(_) => {
468 utils::estimate_tokens_legacy(content)
470 }
471 }
472 }
473
474 pub fn estimate_tokens_with_path(content: &str, file_path: &std::path::Path) -> usize {
479 use crate::tokenization::TokenCounter;
480
481 match TokenCounter::global().estimate_file_tokens(content, file_path) {
483 Ok(tokens) => tokens,
484 Err(_) => Self::estimate_tokens(content), }
486 }
487
488 pub fn detect_binary_by_extension(extension: &str) -> bool {
490 BINARY_EXTENSIONS.contains(&format!(".{}", extension.to_lowercase()).as_str())
491 }
492
493 pub fn classify_file_type(path: &str, language: &Language, extension: &str) -> FileType {
495 let path_lower = path.to_lowercase();
496
497 if path_lower.contains("test") || path_lower.contains("spec") {
499 return FileType::Test {
500 language: language.clone()
501 };
502 }
503
504 if language.is_documentation() {
506 let format = match extension {
507 "md" | "markdown" => DocumentationFormat::Markdown,
508 "html" | "htm" => DocumentationFormat::Html,
509 "rst" => DocumentationFormat::Rst,
510 "txt" => DocumentationFormat::PlainText,
511 _ => DocumentationFormat::Markdown,
512 };
513 return FileType::Documentation { format };
514 }
515
516 if language.is_configuration() {
518 let format = match extension {
519 "json" => ConfigurationFormat::Json,
520 "yaml" | "yml" => ConfigurationFormat::Yaml,
521 "toml" => ConfigurationFormat::Toml,
522 "xml" => ConfigurationFormat::Xml,
523 "ini" => ConfigurationFormat::Ini,
524 "env" => ConfigurationFormat::Dotenv,
525 _ => ConfigurationFormat::Json,
526 };
527 return FileType::Configuration { format };
528 }
529
530 if Self::detect_binary_by_extension(extension) {
532 return FileType::Binary;
533 }
534
535 if path_lower.contains("generated") ||
537 path_lower.contains("build") ||
538 path_lower.contains("dist") ||
539 path_lower.contains("target") {
540 return FileType::Generated;
541 }
542
543 if language.is_programming() {
545 return FileType::Source {
546 language: language.clone()
547 };
548 }
549
550 FileType::Unknown
551 }
552
553 pub fn human_size(&self) -> String {
555 bytes_to_human(self.size)
556 }
557
558 pub fn should_include(&self) -> bool {
560 self.decision.should_include()
561 }
562
563 pub fn file_name(&self) -> Option<&str> {
565 self.path.file_name()?.to_str()
566 }
567
568 pub fn file_stem(&self) -> Option<&str> {
570 self.path.file_stem()?.to_str()
571 }
572
573 pub fn extension(&self) -> Option<&str> {
575 self.path.extension()?.to_str()
576 }
577}
578
/// Formats a byte count using binary units (`B`, `KiB`, `MiB`, `GiB`, `TiB`).
///
/// Counts below 1024 are printed as whole bytes (`"512 B"`); larger counts are
/// printed with one decimal place (`"1.5 KiB"`). `TiB` is the largest unit, so
/// values beyond it keep growing in `TiB`.
///
/// A value that would round up to `1024.0` of a unit is promoted to the next
/// unit instead, so 1_048_575 bytes is shown as `"1.0 MiB"` rather than
/// `"1024.0 KiB"`.
pub fn bytes_to_human(bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "KiB", "MiB", "GiB", "TiB"];
    const THRESHOLD: f64 = 1024.0;
    // Smallest value that `{:.1}` would print as "1024.0". Promoting at this
    // point keeps the displayed number below 1024 for every unit but the last.
    // For whole byte counts (unit index 0) this is equivalent to `>= 1024`.
    const ROLLOVER: f64 = 1023.95;

    let mut size = bytes as f64;
    let mut unit_idx = 0;

    while size >= ROLLOVER && unit_idx < UNITS.len() - 1 {
        size /= THRESHOLD;
        unit_idx += 1;
    }

    if unit_idx == 0 {
        // Print the exact integer for plain bytes, with no decimal.
        format!("{} {}", bytes, UNITS[unit_idx])
    } else {
        format!("{:.1} {}", size, UNITS[unit_idx])
    }
}
602
#[cfg(test)]
mod tests {
    use super::*;

    /// Extension lookup returns the expected language, or `Unknown`.
    #[test]
    fn test_language_detection() {
        let expectations = [
            ("rs", Language::Rust),
            ("py", Language::Python),
            ("js", Language::JavaScript),
            ("unknown", Language::Unknown),
        ];

        for (ext, expected) in &expectations {
            assert_eq!(&Language::from_extension(ext), expected, "extension {:?}", ext);
        }
    }

    /// Binary detection is driven purely by the extension list.
    #[test]
    fn test_binary_detection() {
        for ext in &["png", "exe"] {
            assert!(FileInfo::detect_binary_by_extension(ext), "{:?} should be binary", ext);
        }
        for ext in &["rs", "py"] {
            assert!(!FileInfo::detect_binary_by_extension(ext), "{:?} should be text", ext);
        }
    }

    /// Ordinary source paths classify as `Source`.
    #[test]
    fn test_file_type_classification() {
        let source_cases = [
            ("src/lib.rs", Language::Rust, "rs"),
            // A directory named after the project must not affect the result.
            ("scribe-rs/src/lib.rs", Language::Rust, "rs"),
            ("script.py", Language::Python, "py"),
        ];

        for (path, language, ext) in &source_cases {
            let file_type = FileInfo::classify_file_type(path, language, ext);
            assert!(
                matches!(file_type, FileType::Source { .. }),
                "{:?} should classify as source",
                path
            );
        }

        assert!(Language::Rust.is_programming());
        assert!(Language::Python.is_programming());
        assert!(!Language::Markdown.is_programming());
    }

    /// Detection and classification agree when chained together.
    #[test]
    fn test_integration_file_classification() {
        // Rust source
        let rust = Language::from_extension("rs");
        assert_eq!(rust, Language::Rust);
        assert!(rust.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("src/lib.rs", &rust, "rs"),
            FileType::Source { .. }
        ));

        // Python source
        let python = Language::from_extension("py");
        assert_eq!(python, Language::Python);
        assert!(python.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("script.py", &python, "py"),
            FileType::Source { .. }
        ));

        // Unrecognised extension
        let unknown = Language::from_extension("xyz");
        assert_eq!(unknown, Language::Unknown);
        assert!(!unknown.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("file.xyz", &unknown, "xyz"),
            FileType::Unknown
        ));

        // Markdown documentation
        let markdown = Language::from_extension("md");
        assert_eq!(markdown, Language::Markdown);
        assert!(!markdown.is_programming());
        assert!(matches!(
            FileInfo::classify_file_type("README.md", &markdown, "md"),
            FileType::Documentation { .. }
        ));
    }

    /// Human-readable sizes use binary units with one decimal place.
    #[test]
    fn test_bytes_to_human() {
        let expectations: [(u64, &str); 5] = [
            (0, "0 B"),
            (512, "512 B"),
            (1024, "1.0 KiB"),
            (1536, "1.5 KiB"),
            (1048576, "1.0 MiB"),
        ];

        for (bytes, expected) in &expectations {
            assert_eq!(bytes_to_human(*bytes), *expected);
        }
    }

    /// A short sentence yields a small, non-zero token count.
    #[test]
    fn test_token_estimation() {
        let sample = "Hello world, this is a test.";
        let estimate = FileInfo::estimate_tokens(sample);

        assert!(estimate > 0);
        assert!(estimate < 20);
    }

    /// Constructors, context attachment and reason categorisation.
    #[test]
    fn test_render_decision() {
        let included = RenderDecision::include("valid file");
        assert!(included.should_include());
        assert_eq!(included.reason_category(), RenderDecisionCategory::Other);

        let excluded = RenderDecision::exclude("binary").with_context("detected by extension");
        assert!(!excluded.should_include());
        assert_eq!(excluded.reason_category(), RenderDecisionCategory::Binary);
        assert!(excluded.context.is_some());
    }
}