subx_cli/core/matcher/
discovery.rs

1//! Media file discovery utilities.
2//!
3//! This module provides `FileDiscovery` to scan directories,
4//! classify media files (video and subtitle), and collect metadata needed for matching.
5//!
6//! # Examples
7//!
8//! ```rust,ignore
9//! use subx_cli::core::matcher::discovery::FileDiscovery;
10//! let disco = FileDiscovery::new();
11//! let files = disco.scan_directory("./path".as_ref(), true).unwrap();
12//! ```
13
14use std::collections::hash_map::DefaultHasher;
15use std::hash::{Hash, Hasher};
16use std::path::{Path, PathBuf};
17use walkdir::WalkDir;
18
19use crate::Result;
20
21/// Media file record representing a discovered file.
22///
23/// Contains metadata about a media file discovered during the scanning process,
24/// including its path, type classification, and basic file properties.
25#[derive(Debug, Clone)]
26pub struct MediaFile {
27    /// Unique identifier for this media file (deterministic hash)
28    pub id: String,
29    /// Full path to the media file
30    pub path: PathBuf,
31    /// Classification of the file (Video or Subtitle)
32    pub file_type: MediaFileType,
33    /// File size in bytes
34    pub size: u64,
35    /// Complete filename with extension (e.g., "movie.mkv")
36    pub name: String,
37    /// File extension (without the dot)
38    pub extension: String,
39    /// Relative path from scan root for recursive matching
40    pub relative_path: String,
41}
/// Builds the identifier string for a media file from its path and size.
///
/// The path is canonicalized when possible (if canonicalization fails, the
/// path is used exactly as given), rendered as a lossy UTF-8 string, and
/// hashed together with `file_size`. The result is `"file_"` followed by the
/// 64-bit hash formatted as 16 lowercase hexadecimal digits.
pub fn generate_file_id(path: &std::path::Path, file_size: u64) -> String {
    // Resolve to an absolute path first so that the same file reached through
    // different scanning methods produces the same ID.
    let resolved = match path.canonicalize() {
        Ok(canonical) => canonical,
        Err(_) => path.to_path_buf(),
    };
    let lossy = resolved.to_string_lossy();
    let path_text: &str = &lossy;

    let mut state = DefaultHasher::new();
    path_text.hash(&mut state);
    file_size.hash(&mut state);
    let digest = state.finish();

    format!("file_{:016x}", digest)
}
54
// Unit tests: FileDiscovery directory scanning and file classification
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Populates `dir` with empty fixture files:
    /// two videos, one subtitle and one text file at the top level, plus a
    /// `season1` subdirectory holding one video and one subtitle.
    ///
    /// Panics if any fixture cannot be created, so a broken fixture is
    /// reported directly instead of showing up later as a wrong file count.
    fn create_test_files(dir: &std::path::Path) {
        let season = dir.join("season1");
        fs::create_dir_all(&season).expect("failed to create season1 directory");

        let fixtures = [
            dir.join("video1.mp4"),
            dir.join("video2.mkv"),
            dir.join("subtitle1.srt"),
            season.join("episode1.mp4"),
            season.join("episode1.srt"),
            dir.join("note.txt"),
        ];
        for fixture in &fixtures {
            fs::write(fixture, b"").expect("failed to create fixture file");
        }
    }

    /// Returns `(video_count, subtitle_count)` for the given scan result.
    fn count_by_type(files: &[MediaFile]) -> (usize, usize) {
        let videos = files
            .iter()
            .filter(|f| matches!(f.file_type, MediaFileType::Video))
            .count();
        let subtitles = files
            .iter()
            .filter(|f| matches!(f.file_type, MediaFileType::Subtitle))
            .count();
        (videos, subtitles)
    }

    #[test]
    fn test_file_discovery_non_recursive() {
        let temp = TempDir::new().unwrap();
        create_test_files(temp.path());
        let disco = FileDiscovery::new();
        let files = disco.scan_directory(temp.path(), false).unwrap();
        let (vids, subs) = count_by_type(&files);
        assert_eq!(vids, 2);
        assert_eq!(subs, 1);
        // Files inside season1 must not be picked up without recursion.
        assert!(!files.iter().any(|f| f.relative_path.contains("episode1")));
    }

    #[test]
    fn test_file_discovery_recursive() {
        let temp = TempDir::new().unwrap();
        create_test_files(temp.path());
        let disco = FileDiscovery::new();
        let files = disco.scan_directory(temp.path(), true).unwrap();
        let (vids, subs) = count_by_type(&files);
        assert_eq!(vids, 3);
        assert_eq!(subs, 2);
        assert!(files.iter().any(|f| f.relative_path.contains("episode1")));
    }

    #[test]
    fn test_file_classification_and_extensions() {
        let temp = TempDir::new().unwrap();
        let v = temp.path().join("t.mp4");
        fs::write(&v, b"").unwrap();
        let s = temp.path().join("t.srt");
        fs::write(&s, b"").unwrap();
        let x = temp.path().join("t.txt");
        fs::write(&x, b"").unwrap();
        let disco = FileDiscovery::new();
        let vf = disco.classify_file(&v, temp.path()).unwrap().unwrap();
        assert!(matches!(vf.file_type, MediaFileType::Video));
        assert_eq!(vf.name, "t.mp4");
        let sf = disco.classify_file(&s, temp.path()).unwrap().unwrap();
        assert!(matches!(sf.file_type, MediaFileType::Subtitle));
        assert_eq!(sf.name, "t.srt");
        // Unrecognized extensions are not media files.
        let none = disco.classify_file(&x, temp.path()).unwrap();
        assert!(none.is_none());
        assert!(disco.video_extensions.contains(&"mp4".to_string()));
        assert!(disco.subtitle_extensions.contains(&"srt".to_string()));
    }

    #[test]
    fn test_empty_and_nonexistent_directory() {
        let temp = TempDir::new().unwrap();
        let disco = FileDiscovery::new();
        let files = disco.scan_directory(temp.path(), false).unwrap();
        assert!(files.is_empty());
        let res = disco.scan_directory(std::path::Path::new("/nonexistent/path"), false);
        assert!(res.is_err());
    }
}
143
// Unit tests covering ID generation and the fields populated on MediaFile
#[cfg(test)]
mod id_tests {
    use super::*;
    use std::fs;
    use std::path::Path;
    use tempfile::TempDir;

    /// Expected ID length: the `file_` prefix (5) plus 16 hex digits.
    const ID_LEN: usize = 21;

    #[test]
    fn test_media_file_structure_with_unique_id() {
        let workspace = TempDir::new().unwrap();
        let mkv = workspace.path().join("[Test][01].mkv");
        fs::write(&mkv, b"dummy content").unwrap();

        let scanned = FileDiscovery::new()
            .scan_directory(workspace.path(), false)
            .unwrap();

        let found = scanned
            .iter()
            .find(|candidate| matches!(candidate.file_type, MediaFileType::Video))
            .unwrap();

        // ID shape
        assert!(!found.id.is_empty());
        assert!(found.id.starts_with("file_"));
        assert_eq!(found.id.len(), ID_LEN);

        // Descriptive fields
        assert_eq!(found.name, "[Test][01].mkv");
        assert_eq!(found.extension, "mkv");
        assert_eq!(found.relative_path, "[Test][01].mkv");
    }

    #[test]
    fn test_deterministic_id_generation() {
        let first = Path::new("test/file.mkv");
        let same_as_first = Path::new("test/file.mkv");
        let other = Path::new("test/file2.mkv");

        let base_id = generate_file_id(first, 1000);

        // Same path and size: same ID.
        let repeat_id = generate_file_id(same_as_first, 1000);
        assert_eq!(base_id, repeat_id);

        // Different path: different ID.
        let other_path_id = generate_file_id(other, 1000);
        assert_ne!(base_id, other_path_id);

        // Different size: different ID.
        let other_size_id = generate_file_id(first, 2000);
        assert_ne!(base_id, other_size_id);

        assert!(base_id.starts_with("file_"));
        assert_eq!(base_id.len(), ID_LEN);
    }

    #[test]
    fn test_recursive_mode_with_unique_ids() {
        let workspace = TempDir::new().unwrap();
        let season = workspace.path().join("season1");
        fs::create_dir_all(&season).unwrap();

        let top_level = workspace.path().join("movie.mkv");
        let nested = season.join("episode1.mkv");
        fs::write(&top_level, b"content1").unwrap();
        fs::write(&nested, b"content2").unwrap();

        let scanned = FileDiscovery::new()
            .scan_directory(workspace.path(), true)
            .unwrap();

        let by_name = |wanted: &str| scanned.iter().find(|f| f.name == wanted).unwrap();
        let root_video = by_name("movie.mkv");
        let sub_video = by_name("episode1.mkv");

        assert_ne!(root_video.id, sub_video.id);
        assert_eq!(root_video.relative_path, "movie.mkv");
        assert_eq!(sub_video.relative_path, "season1/episode1.mkv");
    }

    #[test]
    fn test_hash_generation_basic() {
        let generated = generate_file_id(Path::new("test/file.mkv"), 1000);
        assert!(generated.starts_with("file_"));
        assert_eq!(generated.len(), ID_LEN);
    }
}
226
227impl Default for FileDiscovery {
228    fn default() -> Self {
229        Self::new()
230    }
231}
232
/// Enumeration of supported media file types.
///
/// Classifies discovered files into their primary categories for
/// processing by the subtitle matching system.
///
/// Implements `PartialEq`/`Eq`, so variants can be compared directly with
/// `==` in addition to pattern matching.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MediaFileType {
    /// Video file (e.g., .mp4, .mkv, .avi)
    Video,
    /// Subtitle file (e.g., .srt, .ass, .vtt)
    Subtitle,
}
244
/// File discovery engine for scanning and classifying media files.
///
/// Holds the extension lists used to decide whether a file is a video or a
/// subtitle. Extensions are compared against the lowercased extension of each
/// candidate file, so entries are expected to be lowercase without the dot.
#[derive(Debug, Clone)]
pub struct FileDiscovery {
    /// Extensions classified as video files.
    video_extensions: Vec<String>,
    /// Extensions classified as subtitle files.
    subtitle_extensions: Vec<String>,
}
250
251impl FileDiscovery {
252    /// Creates a new `FileDiscovery` with default video and subtitle extensions.
253    pub fn new() -> Self {
254        Self {
255            video_extensions: vec![
256                "mp4".to_string(),
257                "mkv".to_string(),
258                "avi".to_string(),
259                "mov".to_string(),
260                "wmv".to_string(),
261                "flv".to_string(),
262                "m4v".to_string(),
263                "webm".to_string(),
264            ],
265            subtitle_extensions: vec![
266                "srt".to_string(),
267                "ass".to_string(),
268                "vtt".to_string(),
269                "sub".to_string(),
270                "ssa".to_string(),
271                "idx".to_string(),
272            ],
273        }
274    }
275
276    /// Scans the given directory and returns all media files found.
277    ///
278    /// # Arguments
279    ///
280    /// * `path` - The root directory to scan.
281    /// * `recursive` - Whether to scan subdirectories recursively.
282    pub fn scan_directory(&self, root_path: &Path, recursive: bool) -> Result<Vec<MediaFile>> {
283        let mut files = Vec::new();
284
285        let walker = if recursive {
286            WalkDir::new(root_path).into_iter()
287        } else {
288            WalkDir::new(root_path).max_depth(1).into_iter()
289        };
290
291        for entry in walker {
292            let entry = entry?;
293            let path = entry.path();
294
295            if path.is_file() {
296                if let Some(media_file) = self.classify_file(path, root_path)? {
297                    files.push(media_file);
298                }
299            }
300        }
301
302        Ok(files)
303    }
304
305    /// Creates MediaFile objects from a list of file paths.
306    ///
307    /// This method processes each file path individually, creating MediaFile objects
308    /// with consistent IDs that match those generated by scan_directory.
309    ///
310    /// # Arguments
311    ///
312    /// * `file_paths` - A slice of file paths to process
313    ///
314    /// # Returns
315    ///
316    /// A vector of `MediaFile` objects for valid media files, or an error if file access fails.
317    pub fn scan_file_list(&self, file_paths: &[PathBuf]) -> Result<Vec<MediaFile>> {
318        let mut media_files = Vec::new();
319
320        for path in file_paths {
321            if !path.exists() {
322                continue; // Skip non-existent files
323            }
324
325            if !path.is_file() {
326                continue; // Skip directories
327            }
328
329            if let Some(extension) = path.extension().and_then(|e| e.to_str()) {
330                let extension_lower = extension.to_lowercase();
331
332                // Check if it's a video or subtitle file
333                let file_type = if self.video_extensions.contains(&extension_lower) {
334                    MediaFileType::Video
335                } else if self.subtitle_extensions.contains(&extension_lower) {
336                    MediaFileType::Subtitle
337                } else {
338                    continue; // Skip non-media files
339                };
340
341                if let Ok(metadata) = path.metadata() {
342                    let name = path
343                        .file_name()
344                        .and_then(|n| n.to_str())
345                        .unwrap_or("")
346                        .to_string();
347
348                    // For file list scanning, use filename as relative path
349                    // This maintains compatibility with existing display logic
350                    let relative_path = name.clone();
351
352                    let media_file = MediaFile {
353                        id: generate_file_id(path, metadata.len()),
354                        path: path.clone(),
355                        file_type,
356                        size: metadata.len(),
357                        name,
358                        extension: extension_lower,
359                        relative_path,
360                    };
361                    media_files.push(media_file);
362                }
363            }
364        }
365
366        Ok(media_files)
367    }
368
369    /// Classifies a file by its extension and gathers its metadata.
370    ///
371    /// Returns `Some(MediaFile)` if the file is a recognized media type,
372    /// or `None` otherwise.
373    fn classify_file(&self, path: &Path, scan_root: &Path) -> Result<Option<MediaFile>> {
374        let extension = path
375            .extension()
376            .and_then(|ext| ext.to_str())
377            .map(|s| s.to_lowercase())
378            .unwrap_or_default();
379
380        let file_type = if self.video_extensions.contains(&extension) {
381            MediaFileType::Video
382        } else if self.subtitle_extensions.contains(&extension) {
383            MediaFileType::Subtitle
384        } else {
385            return Ok(None);
386        };
387
388        let metadata = std::fs::metadata(path)?;
389        // Complete filename with extension
390        let name = path
391            .file_name()
392            .and_then(|n| n.to_str())
393            .unwrap_or_default()
394            .to_string();
395
396        // Compute relative path with normalized separators
397        let relative_path = path
398            .strip_prefix(scan_root)
399            .unwrap_or(path)
400            .to_string_lossy()
401            .replace('\\', "/"); // Normalize to Unix-style separators for consistency
402
403        // Generate unique ID based on absolute path and file size
404        let id = generate_file_id(path, metadata.len());
405
406        Ok(Some(MediaFile {
407            id,
408            path: path.to_path_buf(),
409            file_type,
410            size: metadata.len(),
411            name,
412            extension,
413            relative_path,
414        }))
415    }
416}