subx_cli/commands/detect_encoding_command.rs
1//! Advanced character encoding detection command implementation.
2//!
3//! This module provides sophisticated character encoding detection capabilities
4//! for subtitle files, helping users identify and resolve encoding issues that
5//! can cause display problems with non-ASCII characters. It uses multiple
6//! detection algorithms and heuristics to provide accurate encoding identification.
7//!
8//! # Detection Algorithms
9//!
10//! The encoding detection system employs multiple complementary approaches:
11//!
12//! ## Byte Order Mark (BOM) Detection
13//! - **UTF-8**: EF BB BF byte sequence
14//! - **UTF-16LE**: FF FE byte sequence
15//! - **UTF-16BE**: FE FF byte sequence
16//! - **UTF-32**: Various 4-byte BOM sequences
17//!
18//! ## Statistical Analysis
19//! - **Character Frequency**: Analyze byte patterns for specific encodings
20//! - **Bigram Analysis**: Examine two-byte character combinations
21//! - **Language Heuristics**: Apply language-specific character patterns
22//! - **Confidence Scoring**: Quantify detection reliability
23//!
24//! ## Format-Specific Detection
25//! - **ASCII Compatibility**: Check for pure ASCII content
26//! - **Extended ASCII**: Detect Windows-1252, ISO-8859-1 variants
27//! - **Multi-byte Encodings**: Identify UTF-8, GB2312, Shift_JIS patterns
28//! - **Legacy Encodings**: Support for regional and historical encodings
29//!
30//! # Supported Encodings
31//!
32//! ## Unicode Family
33//! - **UTF-8**: Universal encoding, recommended for all new files
34//! - **UTF-16LE/BE**: Unicode with byte order variants
35//! - **UTF-32**: Full Unicode support with fixed width
36//!
37//! ## Western European
38//! - **ISO-8859-1 (Latin-1)**: Basic Western European characters
39//! - **Windows-1252**: Microsoft's Western European encoding
40//! - **ISO-8859-15**: Latin-1 with Euro symbol support
41//!
42//! ## East Asian
43//! - **GB2312/GBK**: Simplified Chinese encodings
44//! - **Big5**: Traditional Chinese encoding
45//! - **Shift_JIS**: Japanese encoding
46//! - **EUC-JP**: Alternative Japanese encoding
47//! - **EUC-KR**: Korean encoding
48//!
49//! ## Cyrillic and Others
50//! - **Windows-1251**: Russian and Cyrillic languages
51//! - **KOI8-R**: Russian encoding
52//! - **ISO-8859-5**: Cyrillic alphabet
53//!
54//! # Detection Features
55//!
56//! - **Confidence Scoring**: Reliability percentage for each detection
57//! - **Alternative Suggestions**: Multiple encoding candidates with scores
58//! - **Content Sampling**: Display decoded text samples for verification
59//! - **Language Hints**: Detect probable language from character patterns
60//! - **Format Validation**: Verify encoding produces valid subtitle content
61//!
62//! # Examples
63//!
64//! ```rust,ignore
65//! use subx_cli::commands::detect_encoding_command;
66//!
67//! // Detect encoding for multiple files
68//! let files = vec![
69//! "subtitle1.srt".to_string(),
70//! "subtitle2.ass".to_string(),
71//! ];
72//! detect_encoding_command::detect_encoding_command(&files, true)?;
73//!
74//! // Basic detection without verbose output
75//! detect_encoding_command::detect_encoding_command(&["file.srt".to_string()], false)?;
76//! ```
77
78use crate::Result;
79use crate::config::ConfigService;
80use crate::core::formats::encoding::EncodingDetector;
81use log::error;
82use std::path::Path;
83
84/// Execute character encoding detection for subtitle files with comprehensive analysis.
85///
86/// This function performs advanced character encoding detection on subtitle files,
87/// providing detailed information about detected encodings, confidence levels,
88/// and content samples. It supports both basic detection and verbose analysis
89/// modes to meet different user needs.
90///
91/// # Detection Process
92///
93/// 1. **File Validation**: Verify file existence and accessibility
94/// 2. **Initial Scanning**: Read file header and sample content
95/// 3. **BOM Detection**: Check for Unicode Byte Order Marks
96/// 4. **Statistical Analysis**: Analyze byte patterns and character frequencies
97/// 5. **Language Heuristics**: Apply language-specific detection rules
98/// 6. **Confidence Calculation**: Score each potential encoding
99/// 7. **Result Ranking**: Order candidates by confidence level
100/// 8. **Output Generation**: Format results for user presentation
101///
102/// # Verbose Mode Features
103///
/// Every per-file report contains the detected charset, the confidence
/// percentage, whether a BOM was detected, and a decoded text sample.
/// The `verbose` flag only affects the sample:
/// - **Verbose**: the complete sample text is printed
/// - **Non-verbose**: long samples are shortened and suffixed with `...`
110///
111/// # Error Handling
112///
113/// The function provides robust error handling:
114/// - **File Access**: Clear messages for permission or existence issues
115/// - **Corruption Detection**: Identification of damaged or invalid files
116/// - **Encoding Failures**: Graceful handling of undetectable encodings
117/// - **Partial Processing**: Continue with other files if individual files fail
118///
119/// # Output Formats
120///
/// ## Basic Mode
/// ```text
/// File: file1.srt
///  Encoding: <charset> (confidence: 99.5%) BOM: no
///  Sample text: <sample, shortened with a trailing "..." when long>
/// ```
///
/// ## Verbose Mode
/// ```text
/// File: file1.srt
///  Encoding: <charset> (confidence: 99.5%) BOM: no
///  Sample text: <complete sample text>
/// ```
///
/// `<charset>` is the detected charset rendered with its `Debug` formatting.
/// Only the final path component is shown as the file name (the raw argument
/// is used when no file name can be extracted), and every report is followed
/// by a blank line.
140///
141/// # Performance Considerations
142///
143/// - **Streaming Analysis**: Large files processed efficiently
144/// - **Sample-based Detection**: Uses representative file portions
145/// - **Caching**: Results cached for repeated operations
/// - **Sequential Processing**: Files are analyzed one at a time, in the order given
147///
148/// # Arguments
149///
150/// * `file_paths` - Vector of file paths to analyze for encoding
151/// * `verbose` - Enable detailed output with confidence scores and samples
152///
153/// # Returns
154///
/// Always returns `Ok(())`. Missing files and per-file detection failures are
/// reported through the `log` error channel and do not abort the run, so the
/// return value alone does not indicate whether every file was analyzed.
159///
160/// # Examples
161///
162/// ```rust,ignore
163/// use subx_cli::commands::detect_encoding_command;
164///
165/// // Quick encoding check for single file
166/// detect_encoding_command::detect_encoding_command(
167/// &["subtitle.srt".to_string()],
168/// false
169/// )?;
170///
171/// // Detailed analysis for multiple files
172/// let files = vec![
173/// "episode1.srt".to_string(),
174/// "episode2.ass".to_string(),
175/// "episode3.vtt".to_string(),
176/// ];
177/// detect_encoding_command::detect_encoding_command(&files, true)?;
178///
179/// // Batch analysis with glob patterns (shell expansion)
180/// let glob_files = vec![
181/// "season1/*.srt".to_string(),
182/// "season2/*.ass".to_string(),
183/// ];
184/// detect_encoding_command::detect_encoding_command(&glob_files, false)?;
185/// ```
186///
187/// # Use Cases
188///
189/// - **Troubleshooting**: Identify encoding issues causing display problems
190/// - **Conversion Planning**: Determine current encoding before conversion
191/// - **Quality Assurance**: Verify encoding consistency across file collections
192/// - **Migration**: Assess encoding diversity when migrating subtitle libraries
193/// - **Automation**: Integrate encoding detection into batch processing workflows
194pub fn detect_encoding_command(file_paths: &[String], verbose: bool) -> Result<()> {
195 // Initialize the encoding detection engine
196 let detector = EncodingDetector::with_defaults();
197
198 // Process each file individually to provide isolated error handling
199 for file in file_paths {
200 if !Path::new(file).exists() {
201 error!("File not found: {}", file);
202 continue;
203 }
204 match detector.detect_file_encoding(file) {
205 Ok(info) => {
206 let name = Path::new(file)
207 .file_name()
208 .and_then(|n| n.to_str())
209 .unwrap_or(file);
210 println!("File: {}", name);
211 println!(
212 " Encoding: {:?} (confidence: {:.1}%) BOM: {}",
213 info.charset,
214 info.confidence * 100.0,
215 if info.bom_detected { "yes" } else { "no" }
216 );
217 let sample = if verbose {
218 info.sample_text.clone()
219 } else if info.sample_text.len() > 50 {
220 format!("{}...", &info.sample_text[..47])
221 } else {
222 info.sample_text.clone()
223 };
224 println!(" Sample text: {}\n", sample);
225 }
226 Err(e) => error!("Failed to detect encoding for {}: {}", file, e),
227 }
228 }
229 Ok(())
230}
231
232/// Execute encoding detection command with injected configuration service.
233///
234/// This function provides the new dependency injection interface for the detect_encoding command,
235/// accepting a configuration service instead of loading configuration globally.
236///
237/// # Arguments
238///
239/// * `file_paths` - File paths to analyze for encoding detection
240/// * `verbose` - Whether to show verbose output
241/// * `config_service` - Configuration service providing access to settings
242///
243/// # Returns
244///
245/// Returns `Ok(())` on successful completion, or an error if detection fails.
246pub fn detect_encoding_command_with_config(
247 file_paths: &[String],
248 verbose: bool,
249 _config_service: std::sync::Arc<dyn ConfigService>,
250) -> Result<()> {
251 // Encoding detection doesn't need complex configuration, delegate to original implementation
252 detect_encoding_command(file_paths, verbose)
253}