Skip to main content

subx_cli/commands/
detect_encoding_command.rs

1//! Advanced character encoding detection command implementation.
2//!
3//! This module provides sophisticated character encoding detection capabilities
4//! for subtitle files, helping users identify and resolve encoding issues that
5//! can cause display problems with non-ASCII characters. It uses multiple
6//! detection algorithms and heuristics to provide accurate encoding identification.
7//!
8//! # Detection Algorithms
9//!
10//! The encoding detection system employs multiple complementary approaches:
11//!
12//! ## Byte Order Mark (BOM) Detection
13//! - **UTF-8**: EF BB BF byte sequence
14//! - **UTF-16LE**: FF FE byte sequence
15//! - **UTF-16BE**: FE FF byte sequence
16//! - **UTF-32**: Various 4-byte BOM sequences
17//!
18//! ## Statistical Analysis
19//! - **Character Frequency**: Analyze byte patterns for specific encodings
20//! - **Bigram Analysis**: Examine two-byte character combinations
21//! - **Language Heuristics**: Apply language-specific character patterns
22//! - **Confidence Scoring**: Quantify detection reliability
23//!
24//! ## Format-Specific Detection
25//! - **ASCII Compatibility**: Check for pure ASCII content
26//! - **Extended ASCII**: Detect Windows-1252, ISO-8859-1 variants
27//! - **Multi-byte Encodings**: Identify UTF-8, GB2312, Shift_JIS patterns
28//! - **Legacy Encodings**: Support for regional and historical encodings
29//!
30//! # Supported Encodings
31//!
32//! ## Unicode Family
33//! - **UTF-8**: Universal encoding, recommended for all new files
34//! - **UTF-16LE/BE**: Unicode with byte order variants
35//! - **UTF-32**: Full Unicode support with fixed width
36//!
37//! ## Western European
38//! - **ISO-8859-1 (Latin-1)**: Basic Western European characters
39//! - **Windows-1252**: Microsoft's Western European encoding
40//! - **ISO-8859-15**: Latin-1 with Euro symbol support
41//!
42//! ## East Asian
43//! - **GB2312/GBK**: Simplified Chinese encodings
44//! - **Big5**: Traditional Chinese encoding
45//! - **Shift_JIS**: Japanese encoding
46//! - **EUC-JP**: Alternative Japanese encoding
47//! - **EUC-KR**: Korean encoding
48//!
49//! ## Cyrillic and Others
50//! - **Windows-1251**: Russian and Cyrillic languages
51//! - **KOI8-R**: Russian encoding
52//! - **ISO-8859-5**: Cyrillic alphabet
53//!
54//! # Detection Features
55//!
56//! - **Confidence Scoring**: Reliability percentage for each detection
57//! - **Alternative Suggestions**: Multiple encoding candidates with scores
58//! - **Content Sampling**: Display decoded text samples for verification
59//! - **Language Hints**: Detect probable language from character patterns
60//! - **Format Validation**: Verify encoding produces valid subtitle content
61//!
62//! # Examples
63//!
64//! ```rust,ignore
65//! use subx_cli::commands::detect_encoding_command;
66//!
67//! // Detect encoding for multiple files
68//! let files = vec![
69//!     "subtitle1.srt".to_string(),
70//!     "subtitle2.ass".to_string(),
71//! ];
72//! detect_encoding_command::detect_encoding_command(&files, true)?;
73//!
74//! // Basic detection without verbose output
75//! detect_encoding_command::detect_encoding_command(&["file.srt".to_string()], false)?;
76//! ```
77
78use std::path::PathBuf;
79
80use crate::Result;
81use crate::config::ConfigService;
82use crate::core::formats::encoding::EncodingDetector;
83use log::error;
84
85/// Execute character encoding detection for subtitle files with comprehensive analysis.
86///
87/// This function performs advanced character encoding detection on subtitle files,
88/// providing detailed information about detected encodings, confidence levels,
89/// and content samples. It supports both basic detection and verbose analysis
90/// modes to meet different user needs.
91///
92/// # Detection Process
93///
94/// 1. **File Validation**: Verify file existence and accessibility
95/// 2. **Initial Scanning**: Read file header and sample content
96/// 3. **BOM Detection**: Check for Unicode Byte Order Marks
97/// 4. **Statistical Analysis**: Analyze byte patterns and character frequencies
98/// 5. **Language Heuristics**: Apply language-specific detection rules
99/// 6. **Confidence Calculation**: Score each potential encoding
100/// 7. **Result Ranking**: Order candidates by confidence level
101/// 8. **Output Generation**: Format results for user presentation
102///
103/// # Verbose Mode Features
104///
105/// When `verbose` is enabled, the output includes:
106/// - **Confidence Percentages**: Numerical reliability scores
107/// - **Content Samples**: Decoded text previews
108/// - **Alternative Encodings**: Other possible encodings with scores
109/// - **Detection Metadata**: Technical details about the detection process
110/// - **Language Hints**: Probable content language indicators
111///
112/// # Error Handling
113///
114/// The function provides robust error handling:
115/// - **File Access**: Clear messages for permission or existence issues
116/// - **Corruption Detection**: Identification of damaged or invalid files
117/// - **Encoding Failures**: Graceful handling of undetectable encodings
118/// - **Partial Processing**: Continue with other files if individual files fail
119///
120/// # Output Formats
121///
122/// ## Basic Mode
123/// ```text
124/// file1.srt: UTF-8
125/// file2.ass: Windows-1252
126/// file3.vtt: GB2312
127/// ```
128///
129/// ## Verbose Mode
130/// ```text
131/// file1.srt: UTF-8 (99.5% confidence)
132/// Sample: "1\n00:00:01,000 --> 00:00:03,000\nHello World"
133/// Alternatives: ISO-8859-1 (15.2%), Windows-1252 (12.8%)
134/// Language: English (detected)
135///
136/// file2.ass: Windows-1252 (87.3% confidence)
137/// Sample: "[Script Info]\nTitle: Movie Subtitle"
138/// Alternatives: ISO-8859-1 (45.1%), UTF-8 (23.7%)
139/// Language: Mixed/Unknown
140/// ```
141///
142/// # Performance Considerations
143///
144/// - **Streaming Analysis**: Large files processed efficiently
145/// - **Sample-based Detection**: Uses representative file portions
146/// - **Caching**: Results cached for repeated operations
147/// - **Parallel Processing**: Multiple files analyzed concurrently
148///
149/// # Arguments
150///
/// * `args` - Parsed command arguments. `args.input_paths` (the `-i` option) takes
///   precedence over the positional `args.file_paths`; `args.verbose` prints the
///   full sample text instead of a truncated preview
153///
154/// # Returns
155///
156/// Returns `Ok(())` on successful analysis completion, or an error if:
157/// - Critical system resources are unavailable
158/// - All specified files are inaccessible
159/// - The encoding detection system fails to initialize
160///
161/// # Examples
162///
163/// ```rust,ignore
164/// use subx_cli::commands::detect_encoding_command;
165///
166/// // Quick encoding check for single file
167/// detect_encoding_command::detect_encoding_command(
168///     &["subtitle.srt".to_string()],
169///     false
170/// )?;
171///
172/// // Detailed analysis for multiple files
173/// let files = vec![
174///     "episode1.srt".to_string(),
175///     "episode2.ass".to_string(),
176///     "episode3.vtt".to_string(),
177/// ];
178/// detect_encoding_command::detect_encoding_command(&files, true)?;
179///
180/// // Batch analysis with glob patterns (shell expansion)
181/// let glob_files = vec![
182///     "season1/*.srt".to_string(),
183///     "season2/*.ass".to_string(),
184/// ];
185/// detect_encoding_command::detect_encoding_command(&glob_files, false)?;
186/// ```
187///
188/// # Use Cases
189///
190/// - **Troubleshooting**: Identify encoding issues causing display problems
191/// - **Conversion Planning**: Determine current encoding before conversion
192/// - **Quality Assurance**: Verify encoding consistency across file collections
193/// - **Migration**: Assess encoding diversity when migrating subtitle libraries
194/// - **Automation**: Integrate encoding detection into batch processing workflows
195use crate::cli::DetectEncodingArgs;
196use crate::error::SubXError;
197
198/// Execute character encoding detection for subtitle files based on input arguments.
199pub fn detect_encoding_command(args: &DetectEncodingArgs) -> Result<()> {
200    // Initialize the encoding detection engine
201    let detector = EncodingDetector::with_defaults();
202
203    // For `-i` input paths, use InputPathHandler with archive extraction support,
204    // keeping CollectedFiles alive so archive temp dirs persist through processing.
205    // For positional file_paths, pass them through directly — they may include
206    // nonexistent paths that are gracefully handled in the processing loop below.
207    let collected;
208    let direct_paths: Vec<PathBuf>;
209    let paths: &[PathBuf] = if !args.input_paths.is_empty() {
210        let handler = args
211            .get_input_handler()
212            .map_err(|e| SubXError::CommandExecution(e.to_string()))?;
213        collected = handler
214            .collect_files()
215            .map_err(|e| SubXError::CommandExecution(e.to_string()))?;
216        &collected
217    } else if !args.file_paths.is_empty() {
218        direct_paths = args.file_paths.iter().map(PathBuf::from).collect();
219        &direct_paths
220    } else {
221        return Err(SubXError::NoInputSpecified);
222    };
223
224    // Process each file individually to provide isolated error handling
225    for path in paths {
226        if !path.exists() {
227            error!("Path does not exist: {}", path.display());
228            continue;
229        }
230        let file_str = path.to_string_lossy();
231        match detector.detect_file_encoding(&file_str) {
232            Ok(info) => {
233                let name = path
234                    .file_name()
235                    .and_then(|n| n.to_str())
236                    .unwrap_or(&file_str);
237                println!("File: {name}");
238                println!(
239                    "  Encoding: {:?} (Confidence: {:.1}%) BOM: {}",
240                    info.charset,
241                    info.confidence * 100.0,
242                    if info.bom_detected { "Yes" } else { "No" }
243                );
244                let sample = if args.verbose {
245                    info.sample_text.clone()
246                } else if info.sample_text.len() > 50 {
247                    format!("{}...", &info.sample_text[..47])
248                } else {
249                    info.sample_text.clone()
250                };
251                println!("  Sample text: {sample}\n");
252            }
253            Err(e) => error!("Unable to detect encoding for {}: {}", path.display(), e),
254        }
255    }
256    Ok(())
257}
258
259/// Execute encoding detection command with injected configuration service.
260///
261/// This function provides the new dependency injection interface for the detect_encoding command,
262/// accepting a configuration service instead of loading configuration globally.
263///
264/// # Arguments
265///
266/// * `file_paths` - File paths to analyze for encoding detection
267/// * `verbose` - Whether to show verbose output
268/// * `config_service` - Configuration service providing access to settings
269///
270/// # Returns
271///
272/// Returns `Ok(())` on successful completion, or an error if detection fails.
273pub fn detect_encoding_command_with_config(
274    args: DetectEncodingArgs,
275    _config_service: &dyn ConfigService,
276) -> Result<()> {
277    // Delegate to new implementation based on input argument struct
278    detect_encoding_command(&args)
279}