scribe_scanner/
metadata.rs

1//! File metadata extraction and analysis.
2//!
3//! This module provides comprehensive file metadata extraction including
4//! size statistics, timestamps, permissions, and file system attributes.
5
6use dashmap::DashMap;
7use scribe_core::{bytes_to_human, Result, ScribeError};
8use serde::{Deserialize, Serialize};
9use std::fs;
10use std::path::{Path, PathBuf};
11use std::time::{SystemTime, UNIX_EPOCH};
12
/// Comprehensive file metadata information
///
/// One record describes a single file-system entry. The extractor in this
/// module fills it from `symlink_metadata`, so a symbolic link is described
/// as the link itself rather than as the file it points to.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileMetadata {
    /// Path the metadata was requested for, stored as given by the caller.
    pub path: PathBuf,
    /// Size in bytes as reported by the file system metadata.
    pub size: u64,
    /// `size` rendered by `bytes_to_human`.
    pub size_human: String,
    /// Creation time in whole seconds since the Unix epoch; `None` when the
    /// platform does not report it or the time lies before the epoch.
    pub created: Option<u64>,
    /// Last modification time in whole seconds since the Unix epoch; `None`
    /// when unavailable or before the epoch.
    pub modified: Option<u64>,
    /// Last access time in whole seconds since the Unix epoch; `None` when
    /// unavailable or before the epoch.
    pub accessed: Option<u64>,
    /// Read-only flag derived from the entry's permissions.
    pub readonly: bool,
    /// Hidden flag: on Unix the file name starts with `.`; on Windows the
    /// hidden file attribute bit is set.
    pub hidden: bool,
    /// Executable flag: on Unix any execute permission bit is set; on Windows
    /// the extension is one of `exe`, `bat`, `cmd`, `com`, `scr`.
    pub executable: bool,
    /// `true` when the entry itself is a symbolic link.
    pub symlink: bool,
    /// Link target for symbolic links; `None` for other entries or when the
    /// link could not be read.
    pub symlink_target: Option<PathBuf>,
    /// On Unix the value returned by `PermissionsExt::mode()`; always `0` on
    /// Windows.
    pub permissions: u32,
    /// Classification of the entry (regular file, directory, ...).
    pub file_type: FileSystemType,
    /// Inode number (Unix); `None` on Windows.
    pub inode: Option<u64>,
    /// Hard link count (Unix); `None` on Windows.
    pub links: Option<u64>,
    /// Owning user id (Unix); `None` on Windows.
    pub uid: Option<u32>,
    /// Owning group id (Unix); `None` on Windows.
    pub gid: Option<u32>,
}
34
/// File system type classification
///
/// Produced by `classify_file_type` from the entry's `std::fs::FileType`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum FileSystemType {
    /// Ordinary file.
    RegularFile,
    /// Symbolic link (the link itself, not its target).
    SymbolicLink,
    /// Directory.
    Directory,
    /// Named pipe (only detected on Unix).
    FIFO,
    /// Socket (only detected on Unix).
    Socket,
    /// Character device (only detected on Unix).
    CharacterDevice,
    /// Block device (only detected on Unix).
    BlockDevice,
    /// Anything that matched none of the checks above; also the value used by
    /// `FileMetadata::default()`.
    Unknown,
}
47
/// Size-related statistics for collections of files
///
/// All sizes are in bytes. The `Default` value (all zeros, empty string) is
/// what `MetadataExtractor::calculate_size_stats` returns for an empty input.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SizeStats {
    /// Sum of all file sizes.
    pub total_size: u64,
    /// `total_size` rendered by `bytes_to_human`.
    pub total_size_human: String,
    /// Number of files the statistics were computed from.
    pub file_count: usize,
    /// Integer (truncating) mean: `total_size / file_count`.
    pub average_size: u64,
    /// Median size; for an even count, the truncated mean of the two middle
    /// values.
    pub median_size: u64,
    /// Smallest file size.
    pub min_size: u64,
    /// Largest file size.
    pub max_size: u64,
    /// Count of files per size bucket.
    pub size_distribution: SizeDistribution,
}
60
/// Distribution of file sizes
///
/// Bucket boundaries are inclusive upper bounds, matching the ranges used in
/// `MetadataExtractor::calculate_size_stats`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SizeDistribution {
    pub tiny: usize,   // 0 - 1024 bytes (up to and including 1 KiB)
    pub small: usize,  // 1025 - 10240 bytes (up to and including 10 KiB)
    pub medium: usize, // 10241 - 102400 bytes (up to and including 100 KiB)
    pub large: usize,  // 102401 - 1048576 bytes (up to and including 1 MiB)
    pub huge: usize,   // more than 1048576 bytes (> 1 MiB)
}
70
/// Metadata extractor with caching and optimization
///
/// Results of `extract_metadata` are memoized per path when caching is
/// enabled; entries stay in the map until `clear_cache` is called.
pub struct MetadataExtractor {
    /// Previously extracted metadata, keyed by the path it was requested for.
    cache: DashMap<PathBuf, FileMetadata>,
    /// When `false`, the cache is neither consulted nor populated.
    cache_enabled: bool,
}
76
77impl Default for FileMetadata {
78    fn default() -> Self {
79        Self {
80            path: PathBuf::new(),
81            size: 0,
82            size_human: "0 B".to_string(),
83            created: None,
84            modified: None,
85            accessed: None,
86            readonly: false,
87            hidden: false,
88            executable: false,
89            symlink: false,
90            symlink_target: None,
91            permissions: 0,
92            file_type: FileSystemType::Unknown,
93            inode: None,
94            links: None,
95            uid: None,
96            gid: None,
97        }
98    }
99}
100
101impl MetadataExtractor {
102    /// Create a new metadata extractor
103    pub fn new() -> Self {
104        Self {
105            cache: DashMap::new(),
106            cache_enabled: true,
107        }
108    }
109
110    /// Create a metadata extractor without caching
111    pub fn without_cache() -> Self {
112        Self {
113            cache: DashMap::new(),
114            cache_enabled: false,
115        }
116    }
117
118    /// Extract comprehensive metadata for a file
119    pub async fn extract_metadata(&self, path: &Path) -> Result<FileMetadata> {
120        // Check cache first if enabled
121        if self.cache_enabled {
122            if let Some(cached) = self.cache.get(path) {
123                return Ok(cached.clone());
124            }
125        }
126
127        let metadata = self.extract_metadata_uncached(path).await?;
128
129        // Cache the result if caching is enabled
130        if self.cache_enabled {
131            self.cache.insert(path.to_path_buf(), metadata.clone());
132        }
133
134        Ok(metadata)
135    }
136
137    /// Extract metadata without caching
138    async fn extract_metadata_uncached(&self, path: &Path) -> Result<FileMetadata> {
139        let std_metadata = tokio::fs::symlink_metadata(path).await.map_err(|e| {
140            ScribeError::io(
141                format!("Failed to read metadata for {}: {}", path.display(), e),
142                e,
143            )
144        })?;
145
146        let size = std_metadata.len();
147        let size_human = bytes_to_human(size);
148
149        // Extract timestamps
150        let created = system_time_to_timestamp(std_metadata.created().ok());
151        let modified = system_time_to_timestamp(std_metadata.modified().ok());
152        let accessed = system_time_to_timestamp(std_metadata.accessed().ok());
153
154        // Determine file type
155        let file_type = classify_file_type(&std_metadata);
156
157        // Check if it's a symlink and get target
158        let (symlink, symlink_target) = if std_metadata.file_type().is_symlink() {
159            let target = tokio::fs::read_link(path).await.ok();
160            (true, target)
161        } else {
162            (false, None)
163        };
164
165        // Platform-specific metadata extraction
166        let (permissions, readonly, hidden, executable, inode, links, uid, gid) =
167            extract_platform_metadata(path, &std_metadata)?;
168
169        Ok(FileMetadata {
170            path: path.to_path_buf(),
171            size,
172            size_human,
173            created,
174            modified,
175            accessed,
176            readonly,
177            hidden,
178            executable,
179            symlink,
180            symlink_target,
181            permissions,
182            file_type,
183            inode,
184            links,
185            uid,
186            gid,
187        })
188    }
189
190    /// Extract metadata for multiple files in parallel
191    pub async fn extract_metadata_batch(&self, paths: &[PathBuf]) -> Vec<Result<FileMetadata>> {
192        // For now, process sequentially to avoid async closure issues
193        // In a future version, this could use async map with proper lifetime handling
194        let mut results = Vec::with_capacity(paths.len());
195        for path in paths {
196            results.push(self.extract_metadata(path).await);
197        }
198        results
199    }
200
201    /// Calculate size statistics for a collection of files
202    pub fn calculate_size_stats(&self, files: &[FileMetadata]) -> SizeStats {
203        if files.is_empty() {
204            return SizeStats::default();
205        }
206
207        let mut sizes: Vec<u64> = files.iter().map(|f| f.size).collect();
208        sizes.sort_unstable();
209
210        let total_size = sizes.iter().sum();
211        let file_count = files.len();
212        let average_size = total_size / file_count as u64;
213        let median_size = if file_count % 2 == 0 {
214            (sizes[file_count / 2 - 1] + sizes[file_count / 2]) / 2
215        } else {
216            sizes[file_count / 2]
217        };
218
219        let min_size = sizes[0];
220        let max_size = sizes[sizes.len() - 1];
221
222        // Calculate size distribution
223        let mut distribution = SizeDistribution::default();
224        for &size in &sizes {
225            match size {
226                0..=1024 => distribution.tiny += 1,
227                1025..=10240 => distribution.small += 1,
228                10241..=102400 => distribution.medium += 1,
229                102401..=1048576 => distribution.large += 1,
230                _ => distribution.huge += 1,
231            }
232        }
233
234        SizeStats {
235            total_size,
236            total_size_human: bytes_to_human(total_size),
237            file_count,
238            average_size,
239            median_size,
240            min_size,
241            max_size,
242            size_distribution: distribution,
243        }
244    }
245
246    /// Clear the metadata cache
247    pub fn clear_cache(&self) {
248        self.cache.clear();
249    }
250
251    /// Get cache statistics
252    pub fn cache_stats(&self) -> (usize, usize) {
253        (self.cache.len(), self.cache.capacity())
254    }
255
256    /// Check if a file is likely to be a text file based on metadata
257    pub fn is_likely_text_file(&self, metadata: &FileMetadata) -> bool {
258        // Skip very large files that are unlikely to be source code
259        if metadata.size > 10 * 1024 * 1024 {
260            // 10MB
261            return false;
262        }
263
264        // Skip binary file types
265        matches!(
266            metadata.file_type,
267            FileSystemType::RegularFile | FileSystemType::SymbolicLink
268        )
269    }
270
271    /// Check if a file has been modified recently
272    pub fn is_recently_modified(&self, metadata: &FileMetadata, hours: u64) -> bool {
273        if let Some(modified) = metadata.modified {
274            let now = SystemTime::now()
275                .duration_since(UNIX_EPOCH)
276                .unwrap()
277                .as_secs();
278            let threshold = hours * 3600;
279
280            now.saturating_sub(modified) <= threshold
281        } else {
282            false
283        }
284    }
285}
286
287impl Default for MetadataExtractor {
288    fn default() -> Self {
289        Self::new()
290    }
291}
292
/// Convert an optional `SystemTime` into whole seconds since the Unix epoch.
///
/// Yields `None` when no time was supplied or when the time lies before the
/// epoch. Sub-second precision is discarded.
fn system_time_to_timestamp(time: Option<SystemTime>) -> Option<u64> {
    let instant = time?;
    match instant.duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => Some(since_epoch.as_secs()),
        Err(_) => None,
    }
}
298
299/// Classify file type from metadata
300fn classify_file_type(metadata: &fs::Metadata) -> FileSystemType {
301    let file_type = metadata.file_type();
302
303    if file_type.is_file() {
304        FileSystemType::RegularFile
305    } else if file_type.is_dir() {
306        FileSystemType::Directory
307    } else if file_type.is_symlink() {
308        FileSystemType::SymbolicLink
309    } else {
310        // Platform-specific special file types
311        #[cfg(unix)]
312        {
313            use std::os::unix::fs::FileTypeExt;
314            if file_type.is_fifo() {
315                return FileSystemType::FIFO;
316            } else if file_type.is_socket() {
317                return FileSystemType::Socket;
318            } else if file_type.is_char_device() {
319                return FileSystemType::CharacterDevice;
320            } else if file_type.is_block_device() {
321                return FileSystemType::BlockDevice;
322            }
323        }
324
325        FileSystemType::Unknown
326    }
327}
328
/// Extract platform-specific metadata (Unix).
///
/// Returns, in order: `(permissions, readonly, hidden, executable, inode,
/// links, uid, gid)`.
///
/// * `permissions` - the value of `PermissionsExt::mode()`.
/// * `readonly` - `Permissions::readonly()`, i.e. no write bit is set.
/// * `hidden` - the final path component starts with `.`; `false` when the
///   path has no file name or the name is not valid UTF-8.
/// * `executable` - any of the user/group/other execute bits is set.
/// * `inode`, `links`, `uid`, `gid` - always `Some` on Unix.
///
/// This implementation never returns `Err`; the `Result` mirrors the
/// signature of the Windows variant.
#[cfg(unix)]
fn extract_platform_metadata(
    path: &Path,
    metadata: &fs::Metadata,
) -> Result<(
    u32,
    bool,
    bool,
    bool,
    Option<u64>,
    Option<u64>,
    Option<u32>,
    Option<u32>,
)> {
    use std::os::unix::fs::{MetadataExt, PermissionsExt};

    let file_permissions = metadata.permissions();
    let permissions = file_permissions.mode();
    // Report the flag as-is. It was previously negated, which marked every
    // writable file as read-only and every read-only file as writable.
    let readonly = file_permissions.readonly();

    // Check if file is hidden (starts with .)
    let hidden = path
        .file_name()
        .and_then(|name| name.to_str())
        .map_or(false, |name| name.starts_with('.'));

    // Check if file is executable (any of the u/g/o execute bits)
    let executable = permissions & 0o111 != 0;

    let inode = Some(metadata.ino());
    let links = Some(metadata.nlink());
    let uid = Some(metadata.uid());
    let gid = Some(metadata.gid());

    Ok((
        permissions,
        readonly,
        hidden,
        executable,
        inode,
        links,
        uid,
        gid,
    ))
}
374
/// Extract platform-specific metadata for Windows
///
/// Returns, in order: `(permissions, readonly, hidden, executable, inode,
/// links, uid, gid)`. `permissions` is always `0` and the last four values
/// are always `None`, because this implementation does not map them to any
/// Windows concept.
#[cfg(windows)]
fn extract_platform_metadata(
    path: &Path,
    metadata: &fs::Metadata,
) -> Result<(
    u32,
    bool,
    bool,
    bool,
    Option<u64>,
    Option<u64>,
    Option<u32>,
    Option<u32>,
)> {
    use std::os::windows::fs::MetadataExt;

    // Bit tested in the file attributes to detect hidden files.
    const HIDDEN_ATTRIBUTE: u32 = 0x2;
    // Extensions treated as executable, compared case-insensitively.
    const EXECUTABLE_EXTENSIONS: [&str; 5] = ["exe", "bat", "cmd", "com", "scr"];

    let readonly = metadata.permissions().readonly();
    let hidden = (metadata.file_attributes() & HIDDEN_ATTRIBUTE) != 0;

    // No extension, or a non-UTF-8 extension, means "not executable".
    let executable = match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => {
            let lowered = ext.to_lowercase();
            EXECUTABLE_EXTENSIONS.contains(&lowered.as_str())
        }
        None => false,
    };

    // Windows doesn't have Unix-style permissions, and no inode / link count /
    // uid / gid equivalents are reported here.
    Ok((0, readonly, hidden, executable, None, None, None, None))
}
426
427impl SizeStats {
428    /// Create size statistics from a slice of file sizes
429    pub fn from_sizes(sizes: &[u64]) -> Self {
430        let mut extractor = MetadataExtractor::new();
431        let fake_metadata: Vec<FileMetadata> = sizes
432            .iter()
433            .enumerate()
434            .map(|(i, &size)| FileMetadata {
435                path: PathBuf::from(format!("file_{}", i)),
436                size,
437                size_human: bytes_to_human(size),
438                ..Default::default()
439            })
440            .collect();
441
442        extractor.calculate_size_stats(&fake_metadata)
443    }
444
445    /// Get a human-readable summary of the size statistics
446    pub fn summary(&self) -> String {
447        format!(
448            "Files: {}, Total: {}, Avg: {}, Range: {} - {}",
449            self.file_count,
450            self.total_size_human,
451            bytes_to_human(self.average_size),
452            bytes_to_human(self.min_size),
453            bytes_to_human(self.max_size)
454        )
455    }
456
457    /// Get distribution summary
458    pub fn distribution_summary(&self) -> String {
459        format!(
460            "Tiny: {}, Small: {}, Medium: {}, Large: {}, Huge: {}",
461            self.size_distribution.tiny,
462            self.size_distribution.small,
463            self.size_distribution.medium,
464            self.size_distribution.large,
465            self.size_distribution.huge
466        )
467    }
468}
469
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// A freshly written regular file yields matching path, size and type.
    #[tokio::test]
    async fn test_metadata_extraction() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.txt");

        let content = "Hello, world! This is a test file.";
        fs::write(&test_file, content).unwrap();

        let extractor = MetadataExtractor::new();
        let metadata = extractor.extract_metadata(&test_file).await.unwrap();

        assert_eq!(metadata.path, test_file);
        assert_eq!(metadata.size, content.len() as u64);
        assert!(!metadata.size_human.is_empty());
        assert!(metadata.modified.is_some());
        assert_eq!(metadata.file_type, FileSystemType::RegularFile);
        assert!(!metadata.symlink);
    }

    /// Symbolic links are reported as links together with their target.
    /// Only compiled on Unix, where creating a symlink needs no privileges.
    #[cfg(unix)]
    #[tokio::test]
    async fn test_symlink_detection() {
        let temp_dir = TempDir::new().unwrap();
        let original_file = temp_dir.path().join("original.txt");
        let symlink_file = temp_dir.path().join("link.txt");

        fs::write(&original_file, "original content").unwrap();
        std::os::unix::fs::symlink(&original_file, &symlink_file).unwrap();

        let extractor = MetadataExtractor::new();
        let metadata = extractor.extract_metadata(&symlink_file).await.unwrap();

        assert!(metadata.symlink);
        assert_eq!(metadata.file_type, FileSystemType::SymbolicLink);
        assert_eq!(metadata.symlink_target, Some(original_file));
    }

    /// Batch extraction returns one successful result per input path.
    #[tokio::test]
    async fn test_batch_metadata_extraction() {
        let temp_dir = TempDir::new().unwrap();
        let mut file_paths = Vec::new();

        // Create multiple test files
        for i in 0..5 {
            let file_path = temp_dir.path().join(format!("test_{}.txt", i));
            fs::write(&file_path, format!("Content for file {}", i)).unwrap();
            file_paths.push(file_path);
        }

        let extractor = MetadataExtractor::new();
        let results = extractor.extract_metadata_batch(&file_paths).await;

        assert_eq!(results.len(), 5);
        for result in results {
            assert!(result.is_ok());
            let metadata = result.unwrap();
            assert_eq!(metadata.file_type, FileSystemType::RegularFile);
            assert!(metadata.size > 0);
        }
    }

    /// Statistics computed from real files match the sizes written to disk.
    #[tokio::test]
    async fn test_size_statistics() {
        let temp_dir = TempDir::new().unwrap();
        let extractor = MetadataExtractor::new();
        let mut files = Vec::new();

        // Create files of different sizes
        let sizes = [100, 500, 1500, 5000, 50000];
        for (i, &size) in sizes.iter().enumerate() {
            let file_path = temp_dir.path().join(format!("test_{}.txt", i));
            let content = "x".repeat(size);
            fs::write(&file_path, content).unwrap();

            let metadata = extractor.extract_metadata(&file_path).await.unwrap();
            files.push(metadata);
        }

        let stats = extractor.calculate_size_stats(&files);

        assert_eq!(stats.file_count, 5);
        assert_eq!(stats.total_size, sizes.iter().sum::<usize>() as u64);
        assert_eq!(stats.min_size, 100);
        assert_eq!(stats.max_size, 50000);

        // Check distribution
        assert_eq!(stats.size_distribution.tiny, 2); // 100, 500 bytes (both <= 1024)
        assert_eq!(stats.size_distribution.small, 2); // 1500, 5000 bytes (1025-10240)
        assert_eq!(stats.size_distribution.medium, 1); // 50000 bytes (10241-102400)
        assert_eq!(stats.size_distribution.large, 0); // none
        assert_eq!(stats.size_distribution.huge, 0);
    }

    /// Aggregates computed from raw sizes.
    #[test]
    fn test_size_stats_from_sizes() {
        let sizes = [1000, 2000, 3000, 4000, 5000];
        let stats = SizeStats::from_sizes(&sizes);

        assert_eq!(stats.file_count, 5);
        assert_eq!(stats.total_size, 15000);
        assert_eq!(stats.average_size, 3000);
        assert_eq!(stats.median_size, 3000);
        assert_eq!(stats.min_size, 1000);
        assert_eq!(stats.max_size, 5000);
    }

    /// One representative size per bucket.
    #[test]
    fn test_size_distribution() {
        let sizes = [
            500,     // tiny
            5000,    // small
            50000,   // medium
            500000,  // large
            5000000, // huge
        ];
        let stats = SizeStats::from_sizes(&sizes);

        assert_eq!(stats.size_distribution.tiny, 1);
        assert_eq!(stats.size_distribution.small, 1);
        assert_eq!(stats.size_distribution.medium, 1);
        assert_eq!(stats.size_distribution.large, 1);
        assert_eq!(stats.size_distribution.huge, 1);
    }

    /// The cache is populated on first use and emptied by `clear_cache`.
    #[tokio::test]
    async fn test_cache_functionality() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.txt");
        fs::write(&test_file, "test content").unwrap();

        let extractor = MetadataExtractor::new();

        // First extraction should cache the result
        let metadata1 = extractor.extract_metadata(&test_file).await.unwrap();
        let (cache_size, _) = extractor.cache_stats();
        assert_eq!(cache_size, 1);

        // Second extraction should use cache
        let metadata2 = extractor.extract_metadata(&test_file).await.unwrap();
        assert_eq!(metadata1.size, metadata2.size);
        assert_eq!(metadata1.modified, metadata2.modified);

        // Clear cache and verify
        extractor.clear_cache();
        let (cache_size, _) = extractor.cache_stats();
        assert_eq!(cache_size, 0);
    }

    /// A file written moments ago counts as recently modified.
    #[tokio::test]
    async fn test_recently_modified() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.txt");
        fs::write(&test_file, "test content").unwrap();

        let extractor = MetadataExtractor::new();
        let metadata = extractor.extract_metadata(&test_file).await.unwrap();

        // File should be recently modified (within 1 hour)
        assert!(extractor.is_recently_modified(&metadata, 1));

        // File should definitely be modified within 24 hours
        assert!(extractor.is_recently_modified(&metadata, 24));
    }

    /// `classify_file_type` distinguishes regular files from directories.
    #[test]
    fn test_file_type_classification() {
        let temp_dir = TempDir::new().unwrap();
        let test_file = temp_dir.path().join("test.txt");
        fs::write(&test_file, "test content").unwrap();

        let file_metadata = fs::symlink_metadata(&test_file).unwrap();
        assert_eq!(
            classify_file_type(&file_metadata),
            FileSystemType::RegularFile
        );

        let dir_metadata = fs::symlink_metadata(temp_dir.path()).unwrap();
        assert_eq!(classify_file_type(&dir_metadata), FileSystemType::Directory);
    }
}
654}