subx_cli/core/matcher/
mod.rs

1//! AI-powered subtitle file matching and discovery engine.
2//!
3//! This module provides sophisticated algorithms for automatically matching subtitle
4//! files with their corresponding video files using AI analysis, language detection,
5//! and intelligent filename pattern recognition. It handles complex scenarios including
6//! multiple subtitle languages, season/episode structures, and various naming conventions.
7//!
8//! # Core Features
9//!
10//! ## Intelligent File Discovery
11//! - **Recursive Search**: Traverses directory structures to find media and subtitle files
12//! - **Format Detection**: Automatically identifies video and subtitle file formats
13//! - **Pattern Recognition**: Understands common naming patterns and conventions
14//! - **Language Detection**: Identifies subtitle languages from filenames and content
15//!
16//! ## AI-Powered Matching
17//! - **Semantic Analysis**: Uses AI to understand filename semantics beyond patterns
18//! - **Content Correlation**: Matches based on content similarity and timing patterns
19//! - **Multi-Language Support**: Handles subtitle files in different languages
20//! - **Confidence Scoring**: Provides match confidence levels for user validation
21//!
22//! ## Advanced Matching Algorithms
23//! - **Fuzzy Matching**: Tolerates variations in naming conventions
24//! - **Episode Detection**: Recognizes season/episode patterns in TV series
25//! - **Quality Assessment**: Evaluates subtitle quality and completeness
26//! - **Conflict Resolution**: Handles multiple subtitle candidates intelligently
27//!
28//! # Architecture Overview
29//!
30//! The matching system consists of several interconnected components:
31//!
32//! ```text
33//! ┌─────────────────┐    ┌──────────────────┐    ┌─────────────────┐
34//! │   Discovery     │────│   AI Analysis    │────│   Match Engine  │
35//! │   - Find files  │    │   - Semantic     │    │   - Score calc  │
36//! │   - Language    │    │   - Content      │    │   - Validation  │
37//! │   - Metadata    │    │   - Confidence   │    │   - Ranking     │
38//! └─────────────────┘    └──────────────────┘    └─────────────────┘
39//!         │                        │                        │
40//!         └────────────────────────┼────────────────────────┘
41//!                                  │
42//!                    ┌─────────────────────────┐
43//!                    │       Cache System      │
44//!                    │   - Analysis results    │
45//!                    │   - Match history       │
46//!                    │   - Performance data    │
47//!                    └─────────────────────────┘
48//! ```
49//!
50//! # Usage Examples
51//!
52//! ## Basic File Matching
53//!
54//! ```rust,ignore
55//! use subx_cli::core::matcher::{MatchEngine, MatchConfig, FileDiscovery};
56//! use std::path::Path;
57//!
58//! // Configure matching parameters
59//! let config = MatchConfig {
60//!     confidence_threshold: 0.8,
61//!     dry_run: false,
62//!     ai_provider: Some("openai".to_string()),
63//!     ..Default::default()
64//! };
65//!
66//! // Initialize the matching engine
67//! let engine = MatchEngine::new(config);
68//!
69//! // Discover files in directories
70//! let discovery = FileDiscovery::new();
71//! let video_files = discovery.find_media_files(Path::new("/videos"))?;
72//! let subtitle_files = discovery.find_subtitle_files(Path::new("/subtitles"))?;
73//!
74//! // Perform matching
75//! let matches = engine.match_files(&video_files, &subtitle_files).await?;
76//!
77//! for match_result in matches {
78//!     println!("Matched: {} -> {} (confidence: {:.2})",
79//!         match_result.video_file.name,
80//!         match_result.subtitle_file.name,
81//!         match_result.confidence
82//!     );
83//! }
84//! ```
85//!
86//! ## Advanced Matching with Language Filtering
87//!
88//! ```rust,ignore
89//! use subx_cli::core::matcher::MatchConfig;
90//!
91//! let config = MatchConfig {
92//!     target_languages: vec!["zh".to_string(), "en".to_string()],
//!     exclude_languages: vec!["ja".to_string()],
94//!     confidence_threshold: 0.75,
95//!     max_matches_per_video: 2, // Allow multiple subtitle languages
96//!     ..Default::default()
97//! };
98//!
99//! let matches = engine.match_files_with_config(&video_files, &subtitle_files, config).await?;
100//! ```
101//!
102//! ## TV Series Episode Matching
103//!
104//! ```rust,ignore
105//! // For TV series with season/episode structure
106//! let tv_config = MatchConfig {
107//!     series_mode: true,
108//!     season_episode_patterns: vec![
109//!         r"S(\d+)E(\d+)".to_string(),
110//!         r"Season (\d+) Episode (\d+)".to_string(),
111//!     ],
112//!     ..Default::default()
113//! };
114//!
115//! let tv_matches = engine.match_tv_series(&video_files, &subtitle_files, tv_config).await?;
116//! ```
117//!
118//! # Matching Algorithms
119//!
120//! ## 1. Filename Analysis
121//! - **Pattern Extraction**: Identifies common patterns like episode numbers, years, quality markers
122//! - **Language Code Detection**: Recognizes language codes in various formats (en, eng, english, etc.)
123//! - **Normalization**: Standardizes filenames for comparison by removing common variations
124//!
125//! ## 2. AI Semantic Analysis
126//! - **Title Extraction**: Uses AI to identify actual titles from complex filenames
127//! - **Content Understanding**: Analyzes subtitle content to understand context and themes
128//! - **Cross-Reference**: Compares extracted information between video and subtitle files
129//!
130//! ## 3. Confidence Scoring
131//! - **Multiple Factors**: Combines filename similarity, language match, content correlation
132//! - **Weighted Scoring**: Applies different weights based on reliability of each factor
133//! - **Threshold Filtering**: Only presents matches above configurable confidence levels
134//!
135//! ## 4. Conflict Resolution
136//! - **Ranking**: Orders multiple candidates by confidence score
137//! - **Deduplication**: Removes duplicate or overlapping matches
138//! - **User Preferences**: Applies user-defined preferences for language, quality, etc.
139//!
140//! # Performance Characteristics
141//!
142//! - **Caching**: Results are cached to avoid re-analysis of unchanged files
143//! - **Parallel Processing**: File analysis is performed concurrently for speed
144//! - **Incremental Updates**: Only processes new or modified files in subsequent runs
145//! - **Memory Efficient**: Streams large directory structures without loading all data
146//!
147//! # Error Handling
148//!
149//! The matching system provides comprehensive error handling for:
150//! - File system access issues (permissions, missing directories)
151//! - AI service connectivity and quota problems
152//! - Invalid or corrupted subtitle files
153//! - Configuration validation errors
154//! - Network timeouts and service degradation
155//!
156//! # Thread Safety
157//!
158//! All matching operations are thread-safe and can be used concurrently.
159//! The cache system uses appropriate synchronization for multi-threaded access.
160
161#![allow(dead_code)]
162
163pub mod discovery;
164pub mod engine;
165// Filename analyzer removed to simplify matching logic.
166
167pub use discovery::{FileDiscovery, MediaFile, MediaFileType};
168pub use engine::{MatchConfig, MatchEngine, MatchOperation};
169// pub use filename_analyzer::{FilenameAnalyzer, ParsedFilename};
170pub mod cache;
171use crate::Result;
172use crate::core::language::{LanguageDetector, LanguageInfo};
173use crate::error::SubXError;
174use std::path::{Path, PathBuf};
175
/// Path and language metadata describing one discovered media or subtitle file.
///
/// Values are produced by [`FileInfo::new`], which derives every field from the
/// file's path and the root directory the search started from. The struct is
/// plain data: all fields are public and it can be cloned freely.
///
/// # Path Relationships
///
/// Three views of the same location are stored:
/// - `name`: the file name (including extension) only
/// - `relative_path`: the path below the search root, written with `/` separators
/// - `full_path`: the path exactly as it was handed to [`FileInfo::new`]
///
/// # Language Detection
///
/// `language` holds whatever `LanguageDetector::detect_from_path` reports for
/// the file's path. The detection heuristics live in `crate::core::language`
/// and are not visible from this module; presumably names such as
/// "movie.en.srt" or "film.zh-tw.ass" are recognised (confirm against the detector).
///
/// # Examples
///
/// ```rust,ignore
/// use subx_cli::core::matcher::FileInfo;
/// use std::path::PathBuf;
///
/// let root = PathBuf::from("/media/movies");
/// let file_path = PathBuf::from("/media/movies/Action/movie.en.srt");
///
/// // `new` takes ownership of the file path and borrows the root.
/// let file_info = FileInfo::new(file_path, &root)?;
///
/// assert_eq!(file_info.name, "movie.en.srt");
/// assert_eq!(file_info.relative_path, "Action/movie.en.srt");
/// assert_eq!(file_info.directory, "Action");
/// assert_eq!(file_info.depth, 1);
///
/// if let Some(lang) = &file_info.language {
///     println!("Detected language: {}", lang.code);
/// }
/// ```
#[derive(Debug, Clone)]
pub struct FileInfo {
    /// File name without any directory part, extension included.
    ///
    /// Empty when the path has no final component or the name is not
    /// valid UTF-8.
    pub name: String,

    /// Path relative to the search root, using `/` as the separator.
    ///
    /// Keeps the directory context of the file while staying independent
    /// of where the search root sits on disk.
    pub relative_path: String,

    /// The path supplied to [`FileInfo::new`], used for file system access.
    ///
    /// Callers are expected to pass an absolute path; this is not verified.
    pub full_path: PathBuf,

    /// Name of the directory that directly contains the file.
    ///
    /// Empty when the parent has no name (for example a file system root)
    /// or the name is not valid UTF-8.
    pub directory: String,

    /// Number of directory levels between the search root and the file.
    ///
    /// Depth 0 means the file sits directly in the root directory.
    pub depth: usize,

    /// Language detected for this file, if any.
    ///
    /// `None` when the detector reported nothing. The language code is
    /// available as `code`; other members of `LanguageInfo` are defined in
    /// `crate::core::language`.
    pub language: Option<LanguageInfo>,
}
259
260impl FileInfo {
261    /// Construct a new `FileInfo` from a file path and search root directory.
262    ///
263    /// This method performs comprehensive analysis of the file location,
264    /// extracting path relationships, directory structure, and attempting
265    /// automatic language detection from the filename and path.
266    ///
267    /// # Arguments
268    ///
269    /// * `full_path` - Absolute path to the media or subtitle file
270    /// * `root_path` - Root directory for file discovery (used to compute relative paths)
271    ///
272    /// # Returns
273    ///
274    /// Returns a `FileInfo` struct with all metadata populated, including
275    /// optional language detection results.
276    ///
277    /// # Errors
278    ///
279    /// Returns `SubXError::Other` if:
280    /// - The file path cannot be made relative to the root path
281    /// - Path contains invalid Unicode characters
282    /// - File system access issues occur during analysis
283    ///
284    /// # Examples
285    ///
286    /// ```rust,ignore
287    /// use subx_cli::core::matcher::FileInfo;
288    /// use std::path::PathBuf;
289    ///
290    /// // Simple file in root directory
291    /// let root = PathBuf::from("/media/videos");
292    /// let file_path = root.join("movie.mp4");
293    /// let info = FileInfo::new(file_path, &root)?;
294    ///
295    /// assert_eq!(info.name, "movie.mp4");
296    /// assert_eq!(info.relative_path, "movie.mp4");
297    /// assert_eq!(info.depth, 0);
298    ///
299    /// // File in subdirectory with language
300    /// let sub_file = root.join("English").join("movie.en.srt");
301    /// let sub_info = FileInfo::new(sub_file, &root)?;
302    ///
303    /// assert_eq!(sub_info.name, "movie.en.srt");
304    /// assert_eq!(sub_info.relative_path, "English/movie.en.srt");
305    /// assert_eq!(sub_info.directory, "English");
306    /// assert_eq!(sub_info.depth, 1);
307    /// assert!(sub_info.language.is_some());
308    /// ```
309    ///
310    /// # Implementation Details
311    ///
312    /// - Path separators are normalized to Unix style (/) for consistency
313    /// - Directory depth is calculated based on relative path components
314    /// - Language detection runs automatically using multiple detection methods
315    /// - All path operations are Unicode-safe with fallback to empty strings
316    pub fn new(full_path: PathBuf, root_path: &Path) -> Result<Self> {
317        // Calculate relative path by stripping the root prefix
318        let relative_path = full_path
319            .strip_prefix(root_path)
320            .map_err(|e| SubXError::Other(e.into()))?
321            .to_string_lossy()
322            .replace('\\', "/"); // Normalize to Unix-style separators
323
324        // Extract the base filename
325        let name = full_path
326            .file_name()
327            .and_then(|n| n.to_str())
328            .unwrap_or_default()
329            .to_string();
330
331        // Get the immediate parent directory name
332        let directory = full_path
333            .parent()
334            .and_then(|p| p.file_name())
335            .and_then(|n| n.to_str())
336            .unwrap_or_default()
337            .to_string();
338
339        // Calculate directory depth by counting path separators
340        let depth = relative_path.matches('/').count();
341
342        // Attempt automatic language detection from path and filename
343        let detector = LanguageDetector::new();
344        let language = detector.detect_from_path(&full_path);
345
346        Ok(Self {
347            name,
348            relative_path,
349            full_path,
350            directory,
351            depth,
352            language,
353        })
354    }
355
356    /// Get the file extension without the leading dot.
357    ///
358    /// Returns the file extension in lowercase, or an empty string if
359    /// no extension is present.
360    ///
361    /// # Examples
362    ///
363    /// ```rust,ignore
364    /// assert_eq!(file_info.extension(), "mp4");
365    /// assert_eq!(subtitle_info.extension(), "srt");
366    /// ```
367    pub fn extension(&self) -> String {
368        self.full_path
369            .extension()
370            .and_then(|ext| ext.to_str())
371            .unwrap_or_default()
372            .to_lowercase()
373    }
374
375    /// Get the filename without extension (stem).
376    ///
377    /// Returns the base filename with the extension removed, useful
378    /// for comparison and matching operations.
379    ///
380    /// # Examples
381    ///
382    /// ```rust,ignore
383    /// // For "movie.en.srt"
384    /// assert_eq!(file_info.stem(), "movie.en");
385    ///
386    /// // For "episode01.mp4"
387    /// assert_eq!(file_info.stem(), "episode01");
388    /// ```
389    pub fn stem(&self) -> String {
390        self.full_path
391            .file_stem()
392            .and_then(|stem| stem.to_str())
393            .unwrap_or_default()
394            .to_string()
395    }
396
397    /// Check if this file is in the root directory (depth 0).
398    ///
399    /// Returns `true` if the file is directly in the search root,
400    /// `false` if it's in a subdirectory.
401    pub fn is_in_root(&self) -> bool {
402        self.depth == 0
403    }
404
405    /// Check if this file has detected language information.
406    ///
407    /// Returns `true` if language detection was successful and
408    /// confidence is above the detection threshold.
409    pub fn has_language(&self) -> bool {
410        self.language.is_some()
411    }
412
413    /// Get the detected language code if available.
414    ///
415    /// Returns the language code string (e.g., "en", "zh", "ja")
416    /// or `None` if no language was detected.
417    ///
418    /// # Examples
419    ///
420    /// ```rust,ignore
421    /// if let Some(lang) = file_info.language_code() {
422    ///     println!("Detected language: {}", lang);
423    /// }
424    /// ```
425    pub fn language_code(&self) -> Option<&str> {
426        self.language.as_ref().map(|lang| lang.code.as_str())
427    }
428
429    /// Create a normalized version of the filename for comparison.
430    ///
431    /// Applies various normalization rules to make filenames more
432    /// comparable during matching operations:
433    /// - Converts to lowercase
434    /// - Removes common separators and special characters
435    /// - Standardizes whitespace
436    /// - Removes quality indicators and release group tags
437    ///
438    /// # Returns
439    ///
440    /// A normalized filename string suitable for fuzzy matching.
441    ///
442    /// # Examples
443    ///
444    /// ```rust,ignore
445    /// // "Movie.Name.2023.1080p.BluRay.x264-GROUP.mkv"
446    /// // becomes "movie name 2023"
447    /// let normalized = file_info.normalized_name();
448    /// ```
449    pub fn normalized_name(&self) -> String {
450        let mut name = self.stem().to_lowercase();
451
452        // Remove common separators
453        name = name.replace(['.', '_', '-'], " ");
454
455        // Remove quality indicators
456        let quality_patterns = [
457            "1080p", "720p", "480p", "4k", "2160p", "bluray", "webrip", "hdtv", "dvdrip", "x264",
458            "x265", "h264", "h265",
459        ];
460
461        for pattern in &quality_patterns {
462            name = name.replace(pattern, "");
463        }
464
465        // Remove release group tags (text within brackets/parentheses)
466        name = regex::Regex::new(r"\[.*?\]|\(.*?\)")
467            .unwrap()
468            .replace_all(&name, "")
469            .to_string();
470
471        // Normalize whitespace
472        name.split_whitespace().collect::<Vec<_>>().join(" ")
473    }
474}
475
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Create an empty file at `root/<segments...>`, including any missing
    /// parent directories, and return its path.
    fn touch(root: &Path, segments: &[&str]) -> PathBuf {
        let target = segments
            .iter()
            .fold(root.to_path_buf(), |acc, segment| acc.join(segment));
        std::fs::create_dir_all(target.parent().unwrap()).unwrap();
        std::fs::write(&target, b"").unwrap();
        target
    }

    /// A file one directory below the root reports its name, relative path,
    /// parent directory and a depth of 1.
    #[test]
    fn test_file_info_creation() -> Result<()> {
        let sandbox = TempDir::new().unwrap();
        let episode = touch(sandbox.path(), &["season1", "episode1.mp4"]);

        let info = FileInfo::new(episode, sandbox.path())?;

        assert_eq!(info.name, "episode1.mp4");
        assert_eq!(info.relative_path, "season1/episode1.mp4");
        assert_eq!(info.directory, "season1");
        assert_eq!(info.depth, 1);
        Ok(())
    }

    /// A file nested three directories deep keeps the full relative path and
    /// reports a depth of 3.
    #[test]
    fn test_file_info_deep_path() -> Result<()> {
        let sandbox = TempDir::new().unwrap();
        let episode = touch(
            sandbox.path(),
            &["series", "season1", "episodes", "ep01.mp4"],
        );

        let info = FileInfo::new(episode, sandbox.path())?;

        assert_eq!(info.relative_path, "series/season1/episodes/ep01.mp4");
        assert_eq!(info.depth, 3);
        Ok(())
    }
}