subx_cli/core/matcher/
discovery.rs

1//! Media file discovery utilities.
2//!
3//! This module provides `FileDiscovery` to scan directories,
4//! classify media files (video and subtitle), and collect metadata needed for matching.
5//!
6//! # Examples
7//!
8//! ```rust,ignore
9//! use subx_cli::core::matcher::discovery::FileDiscovery;
10//! let disco = FileDiscovery::new();
11//! let files = disco.scan_directory("./path".as_ref(), true).unwrap();
12//! ```
13
14use std::collections::hash_map::DefaultHasher;
15use std::hash::{Hash, Hasher};
16use std::path::{Path, PathBuf};
17use walkdir::WalkDir;
18
19use crate::Result;
20
21/// Media file record representing a discovered file.
22///
23/// Contains metadata about a media file discovered during the scanning process,
24/// including its path, type classification, and basic file properties.
25#[derive(Debug, Clone)]
26pub struct MediaFile {
27    /// Unique identifier for this media file (deterministic hash)
28    pub id: String,
29    /// Full path to the media file
30    pub path: PathBuf,
31    /// Classification of the file (Video or Subtitle)
32    pub file_type: MediaFileType,
33    /// File size in bytes
34    pub size: u64,
35    /// Complete filename with extension (e.g., "movie.mkv")
36    pub name: String,
37    /// File extension (without the dot)
38    pub extension: String,
39    /// Relative path from scan root for recursive matching
40    pub relative_path: String,
41}
/// Builds the identifier assigned to a discovered media file.
///
/// The relative path and then the file size are fed into a single
/// `DefaultHasher`; the resulting 64-bit digest is rendered as 16 zero-padded
/// lowercase hex digits behind a `file_` prefix, so every ID is 21 characters
/// long. Identical inputs always produce the same ID within one build.
///
/// NOTE(review): the standard library does not specify `DefaultHasher`'s
/// algorithm across Rust releases — confirm before persisting these IDs.
fn generate_file_id(relative_path: &str, file_size: u64) -> String {
    let mut state = DefaultHasher::new();
    // Order matters: path first, size second.
    Hash::hash(relative_path, &mut state);
    Hash::hash(&file_size, &mut state);
    let digest = state.finish();
    format!("file_{:016x}", digest)
}
52
// Unit tests: FileDiscovery directory scanning and file classification
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Creates an empty file at `path`.
    ///
    /// Panics on failure so that a broken fixture is reported where it
    /// happens instead of surfacing later as a confusing count mismatch.
    fn touch(path: &std::path::Path) {
        fs::write(path, b"")
            .unwrap_or_else(|e| panic!("failed to create {}: {}", path.display(), e));
    }

    /// Populates `dir` with two videos, one subtitle and one non-media file at
    /// the top level, plus one video and one subtitle inside `season1/`.
    fn create_test_files(dir: &std::path::Path) {
        for name in &["video1.mp4", "video2.mkv", "subtitle1.srt", "note.txt"] {
            touch(&dir.join(name));
        }
        let sub = dir.join("season1");
        fs::create_dir_all(&sub).unwrap();
        for name in &["episode1.mp4", "episode1.srt"] {
            touch(&sub.join(name));
        }
    }

    /// Returns `(video_count, subtitle_count)` for the discovered files.
    fn count_by_type(files: &[MediaFile]) -> (usize, usize) {
        let vids = files
            .iter()
            .filter(|f| matches!(f.file_type, MediaFileType::Video))
            .count();
        let subs = files
            .iter()
            .filter(|f| matches!(f.file_type, MediaFileType::Subtitle))
            .count();
        (vids, subs)
    }

    #[test]
    fn test_file_discovery_non_recursive() {
        let temp = TempDir::new().unwrap();
        create_test_files(temp.path());
        let disco = FileDiscovery::new();
        let files = disco.scan_directory(temp.path(), false).unwrap();
        let (vids, subs) = count_by_type(&files);
        assert_eq!(vids, 2);
        assert_eq!(subs, 1);
        // Nothing from `season1/` may leak into a non-recursive scan.
        assert!(!files.iter().any(|f| f.relative_path.contains("episode1")));
    }

    #[test]
    fn test_file_discovery_recursive() {
        let temp = TempDir::new().unwrap();
        create_test_files(temp.path());
        let disco = FileDiscovery::new();
        let files = disco.scan_directory(temp.path(), true).unwrap();
        let (vids, subs) = count_by_type(&files);
        assert_eq!(vids, 3);
        assert_eq!(subs, 2);
        assert!(files.iter().any(|f| f.relative_path.contains("episode1")));
    }

    #[test]
    fn test_file_classification_and_extensions() {
        let temp = TempDir::new().unwrap();
        let v = temp.path().join("t.mp4");
        fs::write(&v, b"").unwrap();
        let s = temp.path().join("t.srt");
        fs::write(&s, b"").unwrap();
        let x = temp.path().join("t.txt");
        fs::write(&x, b"").unwrap();
        let disco = FileDiscovery::new();
        let vf = disco.classify_file(&v, temp.path()).unwrap().unwrap();
        assert!(matches!(vf.file_type, MediaFileType::Video));
        assert_eq!(vf.name, "t.mp4");
        let sf = disco.classify_file(&s, temp.path()).unwrap().unwrap();
        assert!(matches!(sf.file_type, MediaFileType::Subtitle));
        assert_eq!(sf.name, "t.srt");
        // Unrecognized extensions are skipped, not reported as errors.
        let none = disco.classify_file(&x, temp.path()).unwrap();
        assert!(none.is_none());
        assert!(disco.video_extensions.contains(&"mp4".to_string()));
        assert!(disco.subtitle_extensions.contains(&"srt".to_string()));
    }

    #[test]
    fn test_empty_and_nonexistent_directory() {
        let temp = TempDir::new().unwrap();
        let disco = FileDiscovery::new();
        let files = disco.scan_directory(temp.path(), false).unwrap();
        assert!(files.is_empty());
        let res = disco.scan_directory(std::path::Path::new("/nonexistent/path"), false);
        assert!(res.is_err());
    }
}
141
// Unit tests covering ID generation and the fields populated on `MediaFile`
#[cfg(test)]
mod id_tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Checks the `file_` prefix and the fixed 21-character length
    /// (5 prefix characters + 16 hex digits).
    fn assert_id_format(id: &str) {
        assert!(id.starts_with("file_"));
        assert_eq!(id.len(), 21);
    }

    #[test]
    fn test_media_file_structure_with_unique_id() {
        let workspace = TempDir::new().unwrap();
        let target = workspace.path().join("[Test][01].mkv");
        fs::write(&target, b"dummy content").unwrap();

        let discovered = FileDiscovery::new()
            .scan_directory(workspace.path(), false)
            .unwrap();

        let video = discovered
            .iter()
            .find(|candidate| matches!(candidate.file_type, MediaFileType::Video))
            .unwrap();

        assert!(!video.id.is_empty());
        assert_id_format(&video.id);

        assert_eq!(video.name, "[Test][01].mkv");
        assert_eq!(video.extension, "mkv");
        assert_eq!(video.relative_path, "[Test][01].mkv");
    }

    #[test]
    fn test_deterministic_id_generation() {
        let baseline = generate_file_id("test/file.mkv", 1000);

        // Same inputs, same ID.
        let repeated = generate_file_id("test/file.mkv", 1000);
        assert_eq!(baseline, repeated);

        // A different path changes the ID.
        let other_path = generate_file_id("test/file2.mkv", 1000);
        assert_ne!(baseline, other_path);

        // A different size changes the ID.
        let other_size = generate_file_id("test/file.mkv", 2000);
        assert_ne!(baseline, other_size);

        assert_id_format(&baseline);
    }

    #[test]
    fn test_recursive_mode_with_unique_ids() {
        let workspace = TempDir::new().unwrap();
        let nested = workspace.path().join("season1");
        fs::create_dir_all(&nested).unwrap();

        fs::write(workspace.path().join("movie.mkv"), b"content1").unwrap();
        fs::write(nested.join("episode1.mkv"), b"content2").unwrap();

        let discovered = FileDiscovery::new()
            .scan_directory(workspace.path(), true)
            .unwrap();

        let by_name = |wanted: &str| discovered.iter().find(|f| f.name == wanted).unwrap();
        let at_root = by_name("movie.mkv");
        let in_subdir = by_name("episode1.mkv");

        assert_ne!(at_root.id, in_subdir.id);
        assert_eq!(at_root.relative_path, "movie.mkv");
        assert_eq!(in_subdir.relative_path, "season1/episode1.mkv");
    }

    #[test]
    fn test_hash_generation_basic() {
        assert_id_format(&generate_file_id("test/file.mkv", 1000));
    }
}
217
218impl Default for FileDiscovery {
219    fn default() -> Self {
220        Self::new()
221    }
222}
223
/// Enumeration of supported media file types.
///
/// Classifies discovered files into their primary categories for
/// processing by the subtitle matching system.
///
/// The type is comparable and hashable, so values can be tested with `==`
/// or used as keys in hash-based collections.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum MediaFileType {
    /// Video file (e.g., .mp4, .mkv, .avi)
    Video,
    /// Subtitle file (e.g., .srt, .ass, .vtt)
    Subtitle,
}
235
/// File discovery engine for scanning and classifying media files.
///
/// Holds the two extension lists used to decide whether a file is a video,
/// a subtitle, or neither.
#[derive(Debug, Clone)]
pub struct FileDiscovery {
    /// Extensions (lowercase, without the dot) classified as video files.
    video_extensions: Vec<String>,
    /// Extensions (lowercase, without the dot) classified as subtitle files.
    subtitle_extensions: Vec<String>,
}
241
242impl FileDiscovery {
243    /// Creates a new `FileDiscovery` with default video and subtitle extensions.
244    pub fn new() -> Self {
245        Self {
246            video_extensions: vec![
247                "mp4".to_string(),
248                "mkv".to_string(),
249                "avi".to_string(),
250                "mov".to_string(),
251                "wmv".to_string(),
252                "flv".to_string(),
253                "m4v".to_string(),
254                "webm".to_string(),
255            ],
256            subtitle_extensions: vec![
257                "srt".to_string(),
258                "ass".to_string(),
259                "vtt".to_string(),
260                "sub".to_string(),
261                "ssa".to_string(),
262                "idx".to_string(),
263            ],
264        }
265    }
266
267    /// Scans the given directory and returns all media files found.
268    ///
269    /// # Arguments
270    ///
271    /// * `path` - The root directory to scan.
272    /// * `recursive` - Whether to scan subdirectories recursively.
273    pub fn scan_directory(&self, root_path: &Path, recursive: bool) -> Result<Vec<MediaFile>> {
274        let mut files = Vec::new();
275
276        let walker = if recursive {
277            WalkDir::new(root_path).into_iter()
278        } else {
279            WalkDir::new(root_path).max_depth(1).into_iter()
280        };
281
282        for entry in walker {
283            let entry = entry?;
284            let path = entry.path();
285
286            if path.is_file() {
287                if let Some(media_file) = self.classify_file(path, root_path)? {
288                    files.push(media_file);
289                }
290            }
291        }
292
293        Ok(files)
294    }
295
296    /// Classifies a file by its extension and gathers its metadata.
297    ///
298    /// Returns `Some(MediaFile)` if the file is a recognized media type,
299    /// or `None` otherwise.
300    fn classify_file(&self, path: &Path, scan_root: &Path) -> Result<Option<MediaFile>> {
301        let extension = path
302            .extension()
303            .and_then(|ext| ext.to_str())
304            .map(|s| s.to_lowercase())
305            .unwrap_or_default();
306
307        let file_type = if self.video_extensions.contains(&extension) {
308            MediaFileType::Video
309        } else if self.subtitle_extensions.contains(&extension) {
310            MediaFileType::Subtitle
311        } else {
312            return Ok(None);
313        };
314
315        let metadata = std::fs::metadata(path)?;
316        // Complete filename with extension
317        let name = path
318            .file_name()
319            .and_then(|n| n.to_str())
320            .unwrap_or_default()
321            .to_string();
322
323        // Compute relative path with normalized separators
324        let relative_path = path
325            .strip_prefix(scan_root)
326            .unwrap_or(path)
327            .to_string_lossy()
328            .replace('\\', "/"); // Normalize to Unix-style separators for consistency
329
330        // Generate unique ID based on relative path and file size
331        let id = generate_file_id(&relative_path, metadata.len());
332
333        Ok(Some(MediaFile {
334            id,
335            path: path.to_path_buf(),
336            file_type,
337            size: metadata.len(),
338            name,
339            extension,
340            relative_path,
341        }))
342    }
343}