scribe_scanner/
metadata.rs

1//! File metadata extraction and analysis.
2//!
3//! This module provides comprehensive file metadata extraction including
4//! size statistics, timestamps, permissions, and file system attributes.
5
6use scribe_core::{Result, ScribeError, bytes_to_human};
7use std::path::{Path, PathBuf};
8use std::time::{SystemTime, UNIX_EPOCH};
9use std::fs;
10use std::cell::RefCell;
11use serde::{Serialize, Deserialize};
12
13/// Comprehensive file metadata information
14#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct FileMetadata {
16    pub path: PathBuf,
17    pub size: u64,
18    pub size_human: String,
19    pub created: Option<u64>,
20    pub modified: Option<u64>,
21    pub accessed: Option<u64>,
22    pub readonly: bool,
23    pub hidden: bool,
24    pub executable: bool,
25    pub symlink: bool,
26    pub symlink_target: Option<PathBuf>,
27    pub permissions: u32,
28    pub file_type: FileSystemType,
29    pub inode: Option<u64>,
30    pub links: Option<u64>,
31    pub uid: Option<u32>,
32    pub gid: Option<u32>,
33}
34
35/// File system type classification
36#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
37pub enum FileSystemType {
38    RegularFile,
39    SymbolicLink,
40    Directory,
41    FIFO,
42    Socket,
43    CharacterDevice,
44    BlockDevice,
45    Unknown,
46}
47
48/// Size-related statistics for collections of files
49#[derive(Debug, Clone, Default, Serialize, Deserialize)]
50pub struct SizeStats {
51    pub total_size: u64,
52    pub total_size_human: String,
53    pub file_count: usize,
54    pub average_size: u64,
55    pub median_size: u64,
56    pub min_size: u64,
57    pub max_size: u64,
58    pub size_distribution: SizeDistribution,
59}
60
61/// Distribution of file sizes
62#[derive(Debug, Clone, Default, Serialize, Deserialize)]
63pub struct SizeDistribution {
64    pub tiny: usize,    // < 1KB
65    pub small: usize,   // 1KB - 10KB
66    pub medium: usize,  // 10KB - 100KB
67    pub large: usize,   // 100KB - 1MB
68    pub huge: usize,    // > 1MB
69}
70
71/// Metadata extractor with caching and optimization
72pub struct MetadataExtractor {
73    cache: RefCell<std::collections::HashMap<PathBuf, FileMetadata>>,
74    cache_enabled: bool,
75}
76
77impl Default for FileMetadata {
78    fn default() -> Self {
79        Self {
80            path: PathBuf::new(),
81            size: 0,
82            size_human: "0 B".to_string(),
83            created: None,
84            modified: None,
85            accessed: None,
86            readonly: false,
87            hidden: false,
88            executable: false,
89            symlink: false,
90            symlink_target: None,
91            permissions: 0,
92            file_type: FileSystemType::Unknown,
93            inode: None,
94            links: None,
95            uid: None,
96            gid: None,
97        }
98    }
99}
100
101impl MetadataExtractor {
102    /// Create a new metadata extractor
103    pub fn new() -> Self {
104        Self {
105            cache: RefCell::new(std::collections::HashMap::new()),
106            cache_enabled: true,
107        }
108    }
109
110    /// Create a metadata extractor without caching
111    pub fn without_cache() -> Self {
112        Self {
113            cache: RefCell::new(std::collections::HashMap::new()),
114            cache_enabled: false,
115        }
116    }
117
118    /// Extract comprehensive metadata for a file
119    pub async fn extract_metadata(&self, path: &Path) -> Result<FileMetadata> {
120        // Check cache first if enabled
121        if self.cache_enabled {
122            if let Some(cached) = self.cache.borrow().get(path) {
123                return Ok(cached.clone());
124            }
125        }
126
127        let metadata = self.extract_metadata_uncached(path).await?;
128
129        // Cache the result if caching is enabled
130        if self.cache_enabled {
131            self.cache.borrow_mut().insert(path.to_path_buf(), metadata.clone());
132        }
133
134        Ok(metadata)
135    }
136
137    /// Extract metadata without caching
138    async fn extract_metadata_uncached(&self, path: &Path) -> Result<FileMetadata> {
139        let std_metadata = fs::symlink_metadata(path)
140            .map_err(|e| ScribeError::io(format!("Failed to read metadata for {}: {}", path.display(), e), e))?;
141
142        let size = std_metadata.len();
143        let size_human = bytes_to_human(size);
144
145        // Extract timestamps
146        let created = system_time_to_timestamp(std_metadata.created().ok());
147        let modified = system_time_to_timestamp(std_metadata.modified().ok());
148        let accessed = system_time_to_timestamp(std_metadata.accessed().ok());
149
150        // Determine file type
151        let file_type = classify_file_type(&std_metadata);
152
153        // Check if it's a symlink and get target
154        let (symlink, symlink_target) = if std_metadata.file_type().is_symlink() {
155            let target = fs::read_link(path).ok();
156            (true, target)
157        } else {
158            (false, None)
159        };
160
161        // Platform-specific metadata extraction
162        let (permissions, readonly, hidden, executable, inode, links, uid, gid) = 
163            extract_platform_metadata(path, &std_metadata)?;
164
165        Ok(FileMetadata {
166            path: path.to_path_buf(),
167            size,
168            size_human,
169            created,
170            modified,
171            accessed,
172            readonly,
173            hidden,
174            executable,
175            symlink,
176            symlink_target,
177            permissions,
178            file_type,
179            inode,
180            links,
181            uid,
182            gid,
183        })
184    }
185
186    /// Extract metadata for multiple files in parallel
187    pub async fn extract_metadata_batch(&self, paths: &[PathBuf]) -> Vec<Result<FileMetadata>> {
188        // For now, process sequentially to avoid async closure issues
189        // In a future version, this could use async map with proper lifetime handling
190        let mut results = Vec::with_capacity(paths.len());
191        for path in paths {
192            results.push(self.extract_metadata(path).await);
193        }
194        results
195    }
196
197    /// Calculate size statistics for a collection of files
198    pub fn calculate_size_stats(&self, files: &[FileMetadata]) -> SizeStats {
199        if files.is_empty() {
200            return SizeStats::default();
201        }
202
203        let mut sizes: Vec<u64> = files.iter().map(|f| f.size).collect();
204        sizes.sort_unstable();
205
206        let total_size = sizes.iter().sum();
207        let file_count = files.len();
208        let average_size = total_size / file_count as u64;
209        let median_size = if file_count % 2 == 0 {
210            (sizes[file_count / 2 - 1] + sizes[file_count / 2]) / 2
211        } else {
212            sizes[file_count / 2]
213        };
214
215        let min_size = sizes[0];
216        let max_size = sizes[sizes.len() - 1];
217
218        // Calculate size distribution
219        let mut distribution = SizeDistribution::default();
220        for &size in &sizes {
221            match size {
222                0..=1024 => distribution.tiny += 1,
223                1025..=10240 => distribution.small += 1,
224                10241..=102400 => distribution.medium += 1,
225                102401..=1048576 => distribution.large += 1,
226                _ => distribution.huge += 1,
227            }
228        }
229
230        SizeStats {
231            total_size,
232            total_size_human: bytes_to_human(total_size),
233            file_count,
234            average_size,
235            median_size,
236            min_size,
237            max_size,
238            size_distribution: distribution,
239        }
240    }
241
242    /// Clear the metadata cache
243    pub fn clear_cache(&self) {
244        self.cache.borrow_mut().clear();
245    }
246
247    /// Get cache statistics
248    pub fn cache_stats(&self) -> (usize, usize) {
249        let cache = self.cache.borrow();
250        (cache.len(), cache.capacity())
251    }
252
253    /// Check if a file is likely to be a text file based on metadata
254    pub fn is_likely_text_file(&self, metadata: &FileMetadata) -> bool {
255        // Skip very large files that are unlikely to be source code
256        if metadata.size > 10 * 1024 * 1024 { // 10MB
257            return false;
258        }
259
260        // Skip binary file types
261        matches!(metadata.file_type, 
262            FileSystemType::RegularFile | FileSystemType::SymbolicLink)
263    }
264
265    /// Check if a file has been modified recently
266    pub fn is_recently_modified(&self, metadata: &FileMetadata, hours: u64) -> bool {
267        if let Some(modified) = metadata.modified {
268            let now = SystemTime::now()
269                .duration_since(UNIX_EPOCH)
270                .unwrap()
271                .as_secs();
272            let threshold = hours * 3600;
273            
274            now.saturating_sub(modified) <= threshold
275        } else {
276            false
277        }
278    }
279}
280
281impl Default for MetadataExtractor {
282    fn default() -> Self {
283        Self::new()
284    }
285}
286
/// Converts an optional `SystemTime` into whole seconds since the Unix epoch.
///
/// Returns `None` when no time is given or when the time lies before the epoch.
fn system_time_to_timestamp(time: Option<SystemTime>) -> Option<u64> {
    let since_epoch = time?.duration_since(UNIX_EPOCH).ok()?;
    Some(since_epoch.as_secs())
}
292
293/// Classify file type from metadata
294fn classify_file_type(metadata: &fs::Metadata) -> FileSystemType {
295    let file_type = metadata.file_type();
296    
297    if file_type.is_file() {
298        FileSystemType::RegularFile
299    } else if file_type.is_dir() {
300        FileSystemType::Directory
301    } else if file_type.is_symlink() {
302        FileSystemType::SymbolicLink
303    } else {
304        // Platform-specific special file types
305        #[cfg(unix)]
306        {
307            use std::os::unix::fs::FileTypeExt;
308            if file_type.is_fifo() {
309                return FileSystemType::FIFO;
310            } else if file_type.is_socket() {
311                return FileSystemType::Socket;
312            } else if file_type.is_char_device() {
313                return FileSystemType::CharacterDevice;
314            } else if file_type.is_block_device() {
315                return FileSystemType::BlockDevice;
316            }
317        }
318        
319        FileSystemType::Unknown
320    }
321}
322
/// Extracts the platform-specific part of a file's metadata on Unix.
///
/// Returns, in order: `(permissions, readonly, hidden, executable, inode,
/// links, uid, gid)`.
///
/// * `permissions` — the raw mode bits.
/// * `readonly` — as reported by `Permissions::readonly`.
/// * `hidden` — `true` when the file name starts with `.`; `false` when the
///   path has no file name or the name is not valid UTF-8.
/// * `executable` — `true` when any execute bit (user, group or other) is set.
/// * `inode`, `links`, `uid`, `gid` — always `Some` on Unix.
///
/// This implementation never returns an error; the `Result` keeps the
/// signature identical across platforms.
#[cfg(unix)]
fn extract_platform_metadata(path: &Path, metadata: &fs::Metadata) -> Result<(u32, bool, bool, bool, Option<u64>, Option<u64>, Option<u32>, Option<u32>)> {
    use std::os::unix::fs::{MetadataExt, PermissionsExt};

    let file_permissions = metadata.permissions();
    let permissions = file_permissions.mode();

    // `readonly()` already answers the question; it must not be negated.
    let readonly = file_permissions.readonly();

    // Unix convention: dot-files are hidden.
    let hidden = path.file_name()
        .and_then(|name| name.to_str())
        .map_or(false, |name| name.starts_with('.'));

    // Any of the three execute bits makes the file executable for someone.
    let executable = permissions & 0o111 != 0;

    let inode = Some(metadata.ino());
    let links = Some(metadata.nlink());
    let uid = Some(metadata.uid());
    let gid = Some(metadata.gid());

    Ok((permissions, readonly, hidden, executable, inode, links, uid, gid))
}
346
/// Extracts the platform-specific part of a file's metadata on Windows.
///
/// Returns, in order: `(permissions, readonly, hidden, executable, inode,
/// links, uid, gid)`. `permissions` is always `0` and the last four values
/// are always `None`, since they are not populated on this platform.
#[cfg(windows)]
fn extract_platform_metadata(path: &Path, metadata: &fs::Metadata) -> Result<(u32, bool, bool, bool, Option<u64>, Option<u64>, Option<u32>, Option<u32>)> {
    use std::os::windows::fs::MetadataExt;

    // Attribute bit that marks a file as hidden.
    const HIDDEN_ATTRIBUTE: u32 = 0x2;
    // Extensions treated as executable, compared case-insensitively.
    const EXECUTABLE_EXTENSIONS: [&str; 5] = ["exe", "bat", "cmd", "com", "scr"];

    let hidden = metadata.file_attributes() & HIDDEN_ATTRIBUTE != 0;

    let executable = match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => EXECUTABLE_EXTENSIONS.contains(&ext.to_lowercase().as_str()),
        None => false,
    };

    Ok((
        0, // no Unix-style mode bits on Windows
        metadata.permissions().readonly(),
        hidden,
        executable,
        None, // inode
        None, // hard link count
        None, // uid
        None, // gid
    ))
}
373
374impl SizeStats {
375    /// Create size statistics from a slice of file sizes
376    pub fn from_sizes(sizes: &[u64]) -> Self {
377        let mut extractor = MetadataExtractor::new();
378        let fake_metadata: Vec<FileMetadata> = sizes.iter()
379            .enumerate()
380            .map(|(i, &size)| FileMetadata {
381                path: PathBuf::from(format!("file_{}", i)),
382                size,
383                size_human: bytes_to_human(size),
384                ..Default::default()
385            })
386            .collect();
387        
388        extractor.calculate_size_stats(&fake_metadata)
389    }
390
391    /// Get a human-readable summary of the size statistics
392    pub fn summary(&self) -> String {
393        format!(
394            "Files: {}, Total: {}, Avg: {}, Range: {} - {}",
395            self.file_count,
396            self.total_size_human,
397            bytes_to_human(self.average_size),
398            bytes_to_human(self.min_size),
399            bytes_to_human(self.max_size)
400        )
401    }
402
403    /// Get distribution summary
404    pub fn distribution_summary(&self) -> String {
405        format!(
406            "Tiny: {}, Small: {}, Medium: {}, Large: {}, Huge: {}",
407            self.size_distribution.tiny,
408            self.size_distribution.small,
409            self.size_distribution.medium,
410            self.size_distribution.large,
411            self.size_distribution.huge
412        )
413    }
414}
415
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;
    use std::fs;

    /// A freshly written file reports its path, size, type and a modification time.
    #[tokio::test]
    async fn test_metadata_extraction() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.txt");

        let content = "Hello, world! This is a test file.";
        fs::write(&test_file, content).unwrap();

        let extractor = MetadataExtractor::new();
        let metadata = extractor.extract_metadata(&test_file).await.unwrap();

        assert_eq!(metadata.path, test_file);
        assert_eq!(metadata.size, content.len() as u64);
        assert!(!metadata.size_human.is_empty());
        assert!(metadata.modified.is_some());
        assert_eq!(metadata.file_type, FileSystemType::RegularFile);
        assert!(!metadata.symlink);
    }

    /// A symbolic link is flagged as such and its target is recorded (Unix only).
    #[tokio::test]
    async fn test_symlink_detection() {
        let temp_dir = TempDir::new().unwrap();
        let original_file = temp_dir.path().join("original.txt");
        let symlink_file = temp_dir.path().join("link.txt");

        fs::write(&original_file, "original content").unwrap();

        #[cfg(unix)]
        {
            std::os::unix::fs::symlink(&original_file, &symlink_file).unwrap();

            let extractor = MetadataExtractor::new();
            let metadata = extractor.extract_metadata(&symlink_file).await.unwrap();

            assert!(metadata.symlink);
            assert_eq!(metadata.symlink_target, Some(original_file));
        }
    }

    /// Batch extraction returns one successful result per input path.
    #[tokio::test]
    async fn test_batch_metadata_extraction() {
        let temp_dir = TempDir::new().unwrap();
        let mut file_paths = Vec::new();

        // Create multiple test files
        for i in 0..5 {
            let file_path = temp_dir.path().join(format!("test_{}.txt", i));
            fs::write(&file_path, format!("Content for file {}", i)).unwrap();
            file_paths.push(file_path);
        }

        let extractor = MetadataExtractor::new();
        let results = extractor.extract_metadata_batch(&file_paths).await;

        assert_eq!(results.len(), 5);
        for result in results {
            assert!(result.is_ok());
            let metadata = result.unwrap();
            assert_eq!(metadata.file_type, FileSystemType::RegularFile);
            assert!(metadata.size > 0);
        }
    }

    /// Totals, extremes and bucket counts are derived from real files on disk.
    #[tokio::test]
    async fn test_size_statistics() {
        let temp_dir = TempDir::new().unwrap();
        let extractor = MetadataExtractor::new();
        let mut files = Vec::new();

        // Create files of different sizes
        let sizes = [100, 500, 1500, 5000, 50000];
        for (i, &size) in sizes.iter().enumerate() {
            let file_path = temp_dir.path().join(format!("test_{}.txt", i));
            let content = "x".repeat(size);
            fs::write(&file_path, content).unwrap();

            let metadata = extractor.extract_metadata(&file_path).await.unwrap();
            files.push(metadata);
        }

        let stats = extractor.calculate_size_stats(&files);

        assert_eq!(stats.file_count, 5);
        assert_eq!(stats.total_size, sizes.iter().sum::<usize>() as u64);
        assert_eq!(stats.min_size, 100);
        assert_eq!(stats.max_size, 50000);

        // Check distribution
        assert_eq!(stats.size_distribution.tiny, 2);    // 100, 500 bytes (both <= 1024)
        assert_eq!(stats.size_distribution.small, 2);   // 1500, 5000 bytes (1025-10240)
        assert_eq!(stats.size_distribution.medium, 1);  // 50000 bytes (10241-102400)
        assert_eq!(stats.size_distribution.large, 0);   // none
        assert_eq!(stats.size_distribution.huge, 0);
    }

    /// Statistics built from raw sizes report the expected aggregates.
    #[test]
    fn test_size_stats_from_sizes() {
        let sizes = [1000, 2000, 3000, 4000, 5000];
        let stats = SizeStats::from_sizes(&sizes);

        assert_eq!(stats.file_count, 5);
        assert_eq!(stats.total_size, 15000);
        assert_eq!(stats.average_size, 3000);
        assert_eq!(stats.median_size, 3000);
        assert_eq!(stats.min_size, 1000);
        assert_eq!(stats.max_size, 5000);
    }

    /// One size per bucket lands in exactly that bucket.
    #[test]
    fn test_size_distribution() {
        let sizes = [
            500,      // tiny
            5000,     // small
            50000,    // medium
            500000,   // large
            5000000,  // huge
        ];
        let stats = SizeStats::from_sizes(&sizes);

        assert_eq!(stats.size_distribution.tiny, 1);
        assert_eq!(stats.size_distribution.small, 1);
        assert_eq!(stats.size_distribution.medium, 1);
        assert_eq!(stats.size_distribution.large, 1);
        assert_eq!(stats.size_distribution.huge, 1);
    }

    /// Extraction populates the cache and `clear_cache` empties it again.
    #[tokio::test]
    async fn test_cache_functionality() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.txt");
        fs::write(&test_file, "test content").unwrap();

        let extractor = MetadataExtractor::new();

        // First extraction should cache the result
        let metadata1 = extractor.extract_metadata(&test_file).await.unwrap();
        let (cache_size, _) = extractor.cache_stats();
        assert_eq!(cache_size, 1);

        // Second extraction should use cache
        let metadata2 = extractor.extract_metadata(&test_file).await.unwrap();
        assert_eq!(metadata1.size, metadata2.size);
        assert_eq!(metadata1.modified, metadata2.modified);

        // Clear cache and verify
        extractor.clear_cache();
        let (cache_size, _) = extractor.cache_stats();
        assert_eq!(cache_size, 0);
    }

    /// A file written moments ago counts as recently modified.
    #[tokio::test]
    async fn test_recently_modified() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.txt");
        fs::write(&test_file, "test content").unwrap();

        let extractor = MetadataExtractor::new();
        let metadata = extractor.extract_metadata(&test_file).await.unwrap();

        // File should be recently modified (within 1 hour)
        assert!(extractor.is_recently_modified(&metadata, 1));

        // File should definitely be modified within 24 hours
        assert!(extractor.is_recently_modified(&metadata, 24));
    }

    /// Directories, regular files and (on Unix) symbolic links are classified correctly.
    #[test]
    fn test_file_type_classification() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("plain.txt");
        fs::write(&file_path, "content").unwrap();

        let dir_metadata = fs::symlink_metadata(temp_dir.path()).unwrap();
        assert_eq!(classify_file_type(&dir_metadata), FileSystemType::Directory);

        let file_metadata = fs::symlink_metadata(&file_path).unwrap();
        assert_eq!(classify_file_type(&file_metadata), FileSystemType::RegularFile);

        #[cfg(unix)]
        {
            let link_path = temp_dir.path().join("link.txt");
            std::os::unix::fs::symlink(&file_path, &link_path).unwrap();

            // `symlink_metadata` describes the link itself rather than its target.
            let link_metadata = fs::symlink_metadata(&link_path).unwrap();
            assert_eq!(classify_file_type(&link_metadata), FileSystemType::SymbolicLink);
        }
    }
}