Skip to main content

subx_cli/core/matcher/
discovery.rs

1//! Media file discovery utilities.
2//!
3//! This module provides `FileDiscovery` to scan directories,
4//! classify media files (video and subtitle), and collect metadata needed for matching.
5//!
6//! # Examples
7//!
8//! ```rust,ignore
9//! use subx_cli::core::matcher::discovery::FileDiscovery;
10//! let disco = FileDiscovery::new();
11//! let files = disco.scan_directory("./path".as_ref(), true).unwrap();
12//! ```
13
14use std::path::{Path, PathBuf};
15use walkdir::WalkDir;
16
17use crate::Result;
18use crate::core::uuidv7::Uuidv7Generator;
19
20/// Media file record representing a discovered file.
21///
22/// Contains metadata about a media file discovered during the scanning process,
23/// including its path, type classification, and basic file properties.
24#[derive(Debug, Clone)]
25pub struct MediaFile {
26    /// Unique identifier for this media file (`file_<uuid-v7-hyphenated>`)
27    pub id: String,
28    /// Full path to the media file
29    pub path: PathBuf,
30    /// Classification of the file (Video or Subtitle)
31    pub file_type: MediaFileType,
32    /// File size in bytes
33    pub size: u64,
34    /// Complete filename with extension (e.g., "movie.mkv")
35    pub name: String,
36    /// File extension (without the dot)
37    pub extension: String,
38    /// Relative path from scan root for recursive matching
39    pub relative_path: String,
40}
41/// Generate a unique UUIDv7-based identifier for a discovered media file.
42///
43/// The returned string has the form `file_<uuid-v7-hyphenated>` (length 41)
44/// and embeds a `unix_time_ts` strictly greater than that of every previous
45/// ID produced by the same generator instance. Callers that need monotonic
46/// ordering across an entire scan SHALL share a single
47/// [`Uuidv7Generator`] across all calls in the scan.
48pub fn generate_file_id(generator: &mut Uuidv7Generator) -> String {
49    format!("file_{}", generator.next_id().hyphenated())
50}
51
// Unit tests: FileDiscovery file matching logic
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Creates an empty file at `path`.
    ///
    /// Panics when the file cannot be written so that a broken fixture fails
    /// the test at the point of the problem instead of surfacing later as a
    /// wrong file count.
    fn touch(path: &std::path::Path) {
        fs::write(path, b"").unwrap();
    }

    /// Builds the shared fixture tree:
    ///
    /// ```text
    /// <dir>/video1.mp4
    /// <dir>/video2.mkv
    /// <dir>/subtitle1.srt
    /// <dir>/note.txt
    /// <dir>/season1/episode1.mp4
    /// <dir>/season1/episode1.srt
    /// ```
    fn create_test_files(dir: &std::path::Path) {
        touch(&dir.join("video1.mp4"));
        touch(&dir.join("video2.mkv"));
        touch(&dir.join("subtitle1.srt"));
        let sub = dir.join("season1");
        fs::create_dir_all(&sub).unwrap();
        touch(&sub.join("episode1.mp4"));
        touch(&sub.join("episode1.srt"));
        touch(&dir.join("note.txt"));
    }

    /// Number of records classified as video.
    fn count_videos(files: &[MediaFile]) -> usize {
        files
            .iter()
            .filter(|f| matches!(f.file_type, MediaFileType::Video))
            .count()
    }

    /// Number of records classified as subtitle.
    fn count_subtitles(files: &[MediaFile]) -> usize {
        files
            .iter()
            .filter(|f| matches!(f.file_type, MediaFileType::Subtitle))
            .count()
    }

    #[test]
    fn test_file_discovery_non_recursive() {
        let temp = TempDir::new().unwrap();
        create_test_files(temp.path());
        let disco = FileDiscovery::new();
        let files = disco.scan_directory(temp.path(), false).unwrap();
        assert_eq!(count_videos(&files), 2);
        assert_eq!(count_subtitles(&files), 1);
        // Nothing below `season1/` may leak into a non-recursive scan.
        assert!(!files.iter().any(|f| f.relative_path.contains("episode1")));
    }

    #[test]
    fn test_file_discovery_recursive() {
        let temp = TempDir::new().unwrap();
        create_test_files(temp.path());
        let disco = FileDiscovery::new();
        let files = disco.scan_directory(temp.path(), true).unwrap();
        assert_eq!(count_videos(&files), 3);
        assert_eq!(count_subtitles(&files), 2);
        assert!(files.iter().any(|f| f.relative_path.contains("episode1")));
    }

    #[test]
    fn test_file_classification_and_extensions() {
        let temp = TempDir::new().unwrap();
        let v = temp.path().join("t.mp4");
        touch(&v);
        let s = temp.path().join("t.srt");
        touch(&s);
        let x = temp.path().join("t.txt");
        touch(&x);
        let disco = FileDiscovery::new();
        let vf = disco
            .classify_file(&v, temp.path(), &mut Uuidv7Generator::new())
            .unwrap()
            .unwrap();
        assert!(matches!(vf.file_type, MediaFileType::Video));
        assert_eq!(vf.name, "t.mp4");
        let sf = disco
            .classify_file(&s, temp.path(), &mut Uuidv7Generator::new())
            .unwrap()
            .unwrap();
        assert!(matches!(sf.file_type, MediaFileType::Subtitle));
        assert_eq!(sf.name, "t.srt");
        // Unrecognized extensions are reported as "not a media file", not as an error.
        let none = disco
            .classify_file(&x, temp.path(), &mut Uuidv7Generator::new())
            .unwrap();
        assert!(none.is_none());
        assert!(disco.video_extensions.contains(&"mp4".to_string()));
        assert!(disco.subtitle_extensions.contains(&"srt".to_string()));
    }

    #[test]
    fn test_empty_and_nonexistent_directory() {
        let temp = TempDir::new().unwrap();
        let disco = FileDiscovery::new();
        let files = disco.scan_directory(temp.path(), false).unwrap();
        assert!(files.is_empty());
        let res = disco.scan_directory(std::path::Path::new("/nonexistent/path"), false);
        assert!(res.is_err());
    }
}
148
// Unit tests for unique ID generation and MediaFile structure
#[cfg(test)]
mod id_tests {
    use super::*;
    use crate::core::uuidv7::unix_time_ms;
    use std::fs;
    use tempfile::TempDir;

    /// Strips the `file_` prefix from `id` and parses the remainder as a UUID.
    fn parse_file_id(id: &str) -> uuid::Uuid {
        let uuid_text = id
            .strip_prefix("file_")
            .expect("file id must begin with `file_`");
        uuid::Uuid::parse_str(uuid_text).expect("file id must contain a valid UUID")
    }

    /// Checks the shape every file ID must have (prefix, total length of 41,
    /// UUID version 7) and hands back the parsed UUID for further checks.
    fn assert_file_id_shape(id: &str) -> uuid::Uuid {
        assert!(id.starts_with("file_"));
        assert_eq!(id.len(), 41);
        let uuid = parse_file_id(id);
        assert_eq!(uuid.get_version_num(), 7);
        uuid
    }

    #[test]
    fn test_media_file_structure_with_unique_id() {
        let temp = TempDir::new().unwrap();
        let video_path = temp.path().join("[Test][01].mkv");
        fs::write(&video_path, b"dummy content").unwrap();

        let files = FileDiscovery::new()
            .scan_directory(temp.path(), false)
            .unwrap();

        let video_file = files
            .iter()
            .find(|f| matches!(f.file_type, MediaFileType::Video))
            .unwrap();

        assert!(!video_file.id.is_empty());
        assert_file_id_shape(&video_file.id);

        assert_eq!(video_file.name, "[Test][01].mkv");
        assert_eq!(video_file.extension, "mkv");
        assert_eq!(video_file.relative_path, "[Test][01].mkv");
    }

    #[test]
    fn test_uuidv7_id_generation() {
        let mut generator = Uuidv7Generator::new();

        let first = assert_file_id_shape(&generate_file_id(&mut generator));
        let second = parse_file_id(&generate_file_id(&mut generator));
        assert_eq!(second.get_version_num(), 7);

        assert!(
            unix_time_ms(&second) > unix_time_ms(&first),
            "second id's unix_time_ts must strictly exceed the first"
        );
    }

    #[test]
    fn test_recursive_mode_with_unique_ids() {
        let temp = TempDir::new().unwrap();
        let season_dir = temp.path().join("season1");
        fs::create_dir_all(&season_dir).unwrap();

        fs::write(temp.path().join("movie.mkv"), b"content1").unwrap();
        fs::write(season_dir.join("episode1.mkv"), b"content2").unwrap();

        let files = FileDiscovery::new()
            .scan_directory(temp.path(), true)
            .unwrap();

        let by_name = |wanted: &str| files.iter().find(|f| f.name == wanted).unwrap();
        let root_video = by_name("movie.mkv");
        let sub_video = by_name("episode1.mkv");

        assert_ne!(root_video.id, sub_video.id);
        assert_file_id_shape(&root_video.id);
        assert_file_id_shape(&sub_video.id);
        assert_eq!(root_video.relative_path, "movie.mkv");
        assert_eq!(sub_video.relative_path, "season1/episode1.mkv");
    }

    #[test]
    fn test_uuidv7_id_shape_basic() {
        let mut generator = Uuidv7Generator::new();
        assert_file_id_shape(&generate_file_id(&mut generator));
    }
}
244
245impl Default for FileDiscovery {
246    fn default() -> Self {
247        Self::new()
248    }
249}
250
/// Enumeration of supported media file types.
///
/// Classifies discovered files into their primary categories for
/// processing by the subtitle matching system.
///
/// The type is comparable and hashable, so variants can be checked with
/// `==` / `assert_eq!` and used as keys in hashed collections.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum MediaFileType {
    /// Video file (e.g., .mp4, .mkv, .avi)
    Video,
    /// Subtitle file (e.g., .srt, .ass, .vtt)
    Subtitle,
}
262
/// File discovery engine for scanning and classifying media files.
///
/// Holds the two extension lists used for classification. Entries are
/// compared against the lowercased extension of each candidate file, so
/// they are expected to be lowercase and without a leading dot.
#[derive(Debug, Clone)]
pub struct FileDiscovery {
    /// Extensions that classify a file as a video.
    video_extensions: Vec<String>,
    /// Extensions that classify a file as a subtitle.
    subtitle_extensions: Vec<String>,
}
268
269impl FileDiscovery {
270    /// Creates a new `FileDiscovery` with default video and subtitle extensions.
271    pub fn new() -> Self {
272        Self {
273            video_extensions: vec![
274                "mp4".to_string(),
275                "mkv".to_string(),
276                "avi".to_string(),
277                "mov".to_string(),
278                "wmv".to_string(),
279                "flv".to_string(),
280                "m4v".to_string(),
281                "webm".to_string(),
282            ],
283            subtitle_extensions: vec![
284                "srt".to_string(),
285                "ass".to_string(),
286                "vtt".to_string(),
287                "sub".to_string(),
288                "ssa".to_string(),
289                "idx".to_string(),
290            ],
291        }
292    }
293
294    /// Scans the given directory and returns all media files found.
295    ///
296    /// # Arguments
297    ///
298    /// * `path` - The root directory to scan.
299    /// * `recursive` - Whether to scan subdirectories recursively.
300    pub fn scan_directory(&self, root_path: &Path, recursive: bool) -> Result<Vec<MediaFile>> {
301        let mut files = Vec::new();
302        let mut id_gen = Uuidv7Generator::new();
303
304        let walker = if recursive {
305            WalkDir::new(root_path).into_iter()
306        } else {
307            WalkDir::new(root_path).max_depth(1).into_iter()
308        };
309
310        for entry in walker {
311            let entry = entry?;
312            let path = entry.path();
313
314            let ft = entry.file_type();
315            if ft.is_symlink() {
316                log::debug!("Skipping symlink: {}", path.display());
317                continue;
318            }
319            if ft.is_file() {
320                if let Some(media_file) = self.classify_file(path, root_path, &mut id_gen)? {
321                    files.push(media_file);
322                }
323            }
324        }
325
326        Ok(files)
327    }
328
329    /// Creates MediaFile objects from a list of file paths.
330    ///
331    /// This method processes each file path individually, creating MediaFile objects
332    /// with consistent IDs that match those generated by scan_directory.
333    ///
334    /// # Arguments
335    ///
336    /// * `file_paths` - A slice of file paths to process
337    ///
338    /// # Returns
339    ///
340    /// A vector of `MediaFile` objects for valid media files, or an error if file access fails.
341    pub fn scan_file_list(&self, file_paths: &[PathBuf]) -> Result<Vec<MediaFile>> {
342        let mut media_files = Vec::new();
343        let mut id_gen = Uuidv7Generator::new();
344
345        for path in file_paths {
346            if !path.exists() {
347                continue; // Skip non-existent files
348            }
349
350            if !path.is_file() {
351                continue; // Skip directories
352            }
353
354            if let Some(extension) = path.extension().and_then(|e| e.to_str()) {
355                let extension_lower = extension.to_lowercase();
356
357                // Check if it's a video or subtitle file
358                let file_type = if self.video_extensions.contains(&extension_lower) {
359                    MediaFileType::Video
360                } else if self.subtitle_extensions.contains(&extension_lower) {
361                    MediaFileType::Subtitle
362                } else {
363                    continue; // Skip non-media files
364                };
365
366                if let Ok(metadata) = path.metadata() {
367                    let name = path
368                        .file_name()
369                        .and_then(|n| n.to_str())
370                        .unwrap_or("")
371                        .to_string();
372
373                    // For file list scanning, use filename as relative path
374                    // This maintains compatibility with existing display logic
375                    let relative_path = name.clone();
376
377                    let media_file = MediaFile {
378                        id: generate_file_id(&mut id_gen),
379                        path: path.clone(),
380                        file_type,
381                        size: metadata.len(),
382                        name,
383                        extension: extension_lower,
384                        relative_path,
385                    };
386                    media_files.push(media_file);
387                }
388            }
389        }
390
391        Ok(media_files)
392    }
393
394    /// Classifies a file by its extension and gathers its metadata.
395    ///
396    /// Returns `Some(MediaFile)` if the file is a recognized media type,
397    /// or `None` otherwise.
398    fn classify_file(
399        &self,
400        path: &Path,
401        scan_root: &Path,
402        id_gen: &mut Uuidv7Generator,
403    ) -> Result<Option<MediaFile>> {
404        let extension = path
405            .extension()
406            .and_then(|ext| ext.to_str())
407            .map(|s| s.to_lowercase())
408            .unwrap_or_default();
409
410        let file_type = if self.video_extensions.contains(&extension) {
411            MediaFileType::Video
412        } else if self.subtitle_extensions.contains(&extension) {
413            MediaFileType::Subtitle
414        } else {
415            return Ok(None);
416        };
417
418        let metadata = std::fs::metadata(path)?;
419        // Complete filename with extension
420        let name = path
421            .file_name()
422            .and_then(|n| n.to_str())
423            .unwrap_or_default()
424            .to_string();
425
426        // Compute relative path with normalized separators
427        let relative_path = path
428            .strip_prefix(scan_root)
429            .unwrap_or(path)
430            .to_string_lossy()
431            .replace('\\', "/"); // Normalize to Unix-style separators for consistency
432
433        // Generate a per-scan unique UUIDv7-based identifier
434        let id = generate_file_id(id_gen);
435
436        Ok(Some(MediaFile {
437            id,
438            path: path.to_path_buf(),
439            file_type,
440            size: metadata.len(),
441            name,
442            extension,
443            relative_path,
444        }))
445    }
446}