Skip to main content

subx_cli/commands/
detect_encoding_command.rs

1//! Advanced character encoding detection command implementation.
2//!
3//! This module provides sophisticated character encoding detection capabilities
4//! for subtitle files, helping users identify and resolve encoding issues that
5//! can cause display problems with non-ASCII characters. It uses multiple
6//! detection algorithms and heuristics to provide accurate encoding identification.
7//!
8//! # Detection Algorithms
9//!
10//! The encoding detection system employs multiple complementary approaches:
11//!
12//! ## Byte Order Mark (BOM) Detection
13//! - **UTF-8**: EF BB BF byte sequence
14//! - **UTF-16LE**: FF FE byte sequence
15//! - **UTF-16BE**: FE FF byte sequence
16//! - **UTF-32**: Various 4-byte BOM sequences
17//!
18//! ## Statistical Analysis
19//! - **Character Frequency**: Analyze byte patterns for specific encodings
20//! - **Bigram Analysis**: Examine two-byte character combinations
21//! - **Language Heuristics**: Apply language-specific character patterns
22//! - **Confidence Scoring**: Quantify detection reliability
23//!
24//! ## Format-Specific Detection
25//! - **ASCII Compatibility**: Check for pure ASCII content
26//! - **Extended ASCII**: Detect Windows-1252, ISO-8859-1 variants
27//! - **Multi-byte Encodings**: Identify UTF-8, GB2312, Shift_JIS patterns
28//! - **Legacy Encodings**: Support for regional and historical encodings
29//!
30//! # Supported Encodings
31//!
32//! ## Unicode Family
33//! - **UTF-8**: Universal encoding, recommended for all new files
34//! - **UTF-16LE/BE**: Unicode with byte order variants
35//! - **UTF-32**: Full Unicode support with fixed width
36//!
37//! ## Western European
38//! - **ISO-8859-1 (Latin-1)**: Basic Western European characters
39//! - **Windows-1252**: Microsoft's Western European encoding
40//! - **ISO-8859-15**: Latin-1 with Euro symbol support
41//!
42//! ## East Asian
43//! - **GB2312/GBK**: Simplified Chinese encodings
44//! - **Big5**: Traditional Chinese encoding
45//! - **Shift_JIS**: Japanese encoding
46//! - **EUC-JP**: Alternative Japanese encoding
47//! - **EUC-KR**: Korean encoding
48//!
49//! ## Cyrillic and Others
50//! - **Windows-1251**: Russian and Cyrillic languages
51//! - **KOI8-R**: Russian encoding
52//! - **ISO-8859-5**: Cyrillic alphabet
53//!
54//! # Detection Features
55//!
56//! - **Confidence Scoring**: Reliability percentage for each detection
57//! - **Alternative Suggestions**: Multiple encoding candidates with scores
58//! - **Content Sampling**: Display decoded text samples for verification
59//! - **Language Hints**: Detect probable language from character patterns
60//! - **Format Validation**: Verify encoding produces valid subtitle content
61//!
62//! # Examples
63//!
64//! ```rust,ignore
65//! use subx_cli::commands::detect_encoding_command;
66//!
67//! // Detect encoding for multiple files
68//! let files = vec![
69//!     "subtitle1.srt".to_string(),
70//!     "subtitle2.ass".to_string(),
71//! ];
72//! detect_encoding_command::detect_encoding_command(&files, true)?;
73//!
74//! // Basic detection without verbose output
75//! detect_encoding_command::detect_encoding_command(&["file.srt".to_string()], false)?;
76//! ```
77
78use std::path::PathBuf;
79
80use crate::Result;
81use crate::cli::output::{self, emit_success};
82use crate::config::ConfigService;
83use crate::core::formats::encoding::{Charset, EncodingDetector};
84use log::error;
85use serde::Serialize;
86
87/// Per-item error carried inside [`DetectEncodingItem::error`].
88///
89/// This intentionally omits `exit_code`; per the
90/// `machine-readable-output` spec only the top-level error envelope
91/// carries an exit code. Per-item errors describe a single failed file
92/// while the overall command status remains `"ok"`.
93#[derive(Debug, Serialize)]
94pub struct DetectEncodingItemError {
95    /// Snake_case category mirroring [`crate::error::SubXError::category`].
96    pub category: String,
97    /// Upper-snake-case machine code mirroring
98    /// [`crate::error::SubXError::machine_code`].
99    pub code: String,
100    /// Human-readable English message.
101    pub message: String,
102}
103
104/// Per-file encoding-detection record emitted in JSON mode.
105///
106/// Successful entries carry `encoding`/`confidence`/`has_bom`/
107/// `bytes_sampled` and omit `error`. Failed entries carry `error` and
108/// omit the detection fields.
109#[derive(Debug, Serialize)]
110pub struct DetectEncodingItem {
111    /// Path as supplied on the command line (or as resolved through
112    /// `-i`/recursive directory walk).
113    pub path: String,
114    /// `"ok"` when the file was successfully sampled, `"error"`
115    /// otherwise.
116    pub status: &'static str,
117    /// Canonical encoding label (e.g. `"UTF-8"`, `"GBK"`, `"Big5"`).
118    #[serde(skip_serializing_if = "Option::is_none")]
119    pub encoding: Option<String>,
120    /// Detection confidence in `[0.0, 1.0]`.
121    #[serde(skip_serializing_if = "Option::is_none")]
122    pub confidence: Option<f32>,
123    /// Whether a Byte Order Mark was detected.
124    #[serde(skip_serializing_if = "Option::is_none")]
125    pub has_bom: Option<bool>,
126    /// Number of bytes the detector inspected. Capped at the detector's
127    /// internal sample window (currently 8 KiB).
128    #[serde(skip_serializing_if = "Option::is_none")]
129    pub bytes_sampled: Option<u64>,
130    /// Per-item error envelope when `status == "error"`.
131    #[serde(skip_serializing_if = "Option::is_none")]
132    pub error: Option<DetectEncodingItemError>,
133}
134
135/// Payload for the `detect-encoding` command's `data` field in JSON mode.
136///
137/// One element per resolved input path. Single-file invocations emit a
138/// `files` array of length 1; batch and `-i` invocations emit one entry
139/// per file in the resolution order.
140#[derive(Debug, Serialize)]
141pub struct DetectEncodingPayload {
142    /// Per-file detection records.
143    pub files: Vec<DetectEncodingItem>,
144}
145
146fn charset_to_label(c: &Charset) -> &'static str {
147    match c {
148        Charset::Utf8 => "UTF-8",
149        Charset::Utf16Le => "UTF-16LE",
150        Charset::Utf16Be => "UTF-16BE",
151        Charset::Utf32Le => "UTF-32LE",
152        Charset::Utf32Be => "UTF-32BE",
153        Charset::Gbk => "GBK",
154        Charset::ShiftJis => "Shift_JIS",
155        Charset::Iso88591 => "ISO-8859-1",
156        Charset::Windows1252 => "Windows-1252",
157        Charset::Big5 => "Big5",
158        Charset::Euckr => "EUC-KR",
159        Charset::Unknown => "unknown",
160    }
161}
162
163/// Execute character encoding detection for subtitle files with comprehensive analysis.
164///
165/// This function performs advanced character encoding detection on subtitle files,
166/// providing detailed information about detected encodings, confidence levels,
167/// and content samples. It supports both basic detection and verbose analysis
168/// modes to meet different user needs.
169///
170/// # Detection Process
171///
172/// 1. **File Validation**: Verify file existence and accessibility
173/// 2. **Initial Scanning**: Read file header and sample content
174/// 3. **BOM Detection**: Check for Unicode Byte Order Marks
175/// 4. **Statistical Analysis**: Analyze byte patterns and character frequencies
176/// 5. **Language Heuristics**: Apply language-specific detection rules
177/// 6. **Confidence Calculation**: Score each potential encoding
178/// 7. **Result Ranking**: Order candidates by confidence level
179/// 8. **Output Generation**: Format results for user presentation
180///
181/// # Verbose Mode Features
182///
183/// When `verbose` is enabled, the output includes:
184/// - **Confidence Percentages**: Numerical reliability scores
185/// - **Content Samples**: Decoded text previews
186/// - **Alternative Encodings**: Other possible encodings with scores
187/// - **Detection Metadata**: Technical details about the detection process
188/// - **Language Hints**: Probable content language indicators
189///
190/// # Error Handling
191///
192/// The function provides robust error handling:
193/// - **File Access**: Clear messages for permission or existence issues
194/// - **Corruption Detection**: Identification of damaged or invalid files
195/// - **Encoding Failures**: Graceful handling of undetectable encodings
196/// - **Partial Processing**: Continue with other files if individual files fail
197///
198/// # Output Formats
199///
200/// ## Basic Mode
201/// ```text
202/// file1.srt: UTF-8
203/// file2.ass: Windows-1252
204/// file3.vtt: GB2312
205/// ```
206///
207/// ## Verbose Mode
208/// ```text
209/// file1.srt: UTF-8 (99.5% confidence)
210/// Sample: "1\n00:00:01,000 --> 00:00:03,000\nHello World"
211/// Alternatives: ISO-8859-1 (15.2%), Windows-1252 (12.8%)
212/// Language: English (detected)
213///
214/// file2.ass: Windows-1252 (87.3% confidence)
215/// Sample: "[Script Info]\nTitle: Movie Subtitle"
216/// Alternatives: ISO-8859-1 (45.1%), UTF-8 (23.7%)
217/// Language: Mixed/Unknown
218/// ```
219///
220/// # Performance Considerations
221///
222/// - **Streaming Analysis**: Large files processed efficiently
223/// - **Sample-based Detection**: Uses representative file portions
224/// - **Caching**: Results cached for repeated operations
225/// - **Parallel Processing**: Multiple files analyzed concurrently
226///
227/// # Arguments
228///
229/// * `file_paths` - Vector of file paths to analyze for encoding
230/// * `verbose` - Enable detailed output with confidence scores and samples
231///
232/// # Returns
233///
234/// Returns `Ok(())` on successful analysis completion, or an error if:
235/// - Critical system resources are unavailable
236/// - All specified files are inaccessible
237/// - The encoding detection system fails to initialize
238///
239/// # Examples
240///
241/// ```rust,ignore
242/// use subx_cli::commands::detect_encoding_command;
243///
244/// // Quick encoding check for single file
245/// detect_encoding_command::detect_encoding_command(
246///     &["subtitle.srt".to_string()],
247///     false
248/// )?;
249///
250/// // Detailed analysis for multiple files
251/// let files = vec![
252///     "episode1.srt".to_string(),
253///     "episode2.ass".to_string(),
254///     "episode3.vtt".to_string(),
255/// ];
256/// detect_encoding_command::detect_encoding_command(&files, true)?;
257///
258/// // Batch analysis with glob patterns (shell expansion)
259/// let glob_files = vec![
260///     "season1/*.srt".to_string(),
261///     "season2/*.ass".to_string(),
262/// ];
263/// detect_encoding_command::detect_encoding_command(&glob_files, false)?;
264/// ```
265///
266/// # Use Cases
267///
268/// - **Troubleshooting**: Identify encoding issues causing display problems
269/// - **Conversion Planning**: Determine current encoding before conversion
270/// - **Quality Assurance**: Verify encoding consistency across file collections
271/// - **Migration**: Assess encoding diversity when migrating subtitle libraries
272/// - **Automation**: Integrate encoding detection into batch processing workflows
273use crate::cli::DetectEncodingArgs;
274use crate::error::SubXError;
275
276/// Execute character encoding detection for subtitle files based on input arguments.
277pub fn detect_encoding_command(args: &DetectEncodingArgs) -> Result<()> {
278    // Initialize the encoding detection engine
279    let detector = EncodingDetector::with_defaults();
280
281    // For `-i` input paths, use InputPathHandler with archive extraction support,
282    // keeping CollectedFiles alive so archive temp dirs persist through processing.
283    // For positional file_paths, pass them through directly — they may include
284    // nonexistent paths that are gracefully handled in the processing loop below.
285    let collected;
286    let direct_paths: Vec<PathBuf>;
287    let paths: &[PathBuf] = if !args.input_paths.is_empty() {
288        let handler = args
289            .get_input_handler()
290            .map_err(|e| SubXError::CommandExecution(e.to_string()))?;
291        collected = handler
292            .collect_files()
293            .map_err(|e| SubXError::CommandExecution(e.to_string()))?;
294        &collected
295    } else if !args.file_paths.is_empty() {
296        direct_paths = args.file_paths.iter().map(PathBuf::from).collect();
297        &direct_paths
298    } else {
299        return Err(SubXError::NoInputSpecified);
300    };
301
302    let mode = output::active_mode();
303    let json_mode = mode.is_json();
304
305    // Per spec: when a single positional path is supplied and it does
306    // not exist, surface a top-level error envelope instead of a
307    // per-item error. `-i` paths already error in
308    // `InputPathHandler::validate`.
309    if paths.len() == 1 && !paths[0].exists() {
310        return Err(SubXError::PathNotFound(paths[0].clone()));
311    }
312
313    let mut items: Vec<DetectEncodingItem> = Vec::new();
314
315    // Process each file individually to provide isolated error handling
316    for path in paths {
317        let path_display = path.to_string_lossy().into_owned();
318
319        if !path.exists() {
320            if json_mode {
321                items.push(DetectEncodingItem {
322                    path: path_display.clone(),
323                    status: "error",
324                    encoding: None,
325                    confidence: None,
326                    has_bom: None,
327                    bytes_sampled: None,
328                    error: Some(DetectEncodingItemError {
329                        category: "path_not_found".to_string(),
330                        code: "E_PATH_NOT_FOUND".to_string(),
331                        message: format!("Path does not exist: {}", path.display()),
332                    }),
333                });
334            } else {
335                error!("Path does not exist: {}", path.display());
336            }
337            continue;
338        }
339
340        let bytes_sampled = std::fs::metadata(path).ok().map(|m| m.len().min(8192));
341
342        match detector.detect_file_encoding(&path_display) {
343            Ok(info) => {
344                if json_mode {
345                    items.push(DetectEncodingItem {
346                        path: path_display.clone(),
347                        status: "ok",
348                        encoding: Some(charset_to_label(&info.charset).to_string()),
349                        confidence: Some(info.confidence),
350                        has_bom: Some(info.bom_detected),
351                        bytes_sampled,
352                        error: None,
353                    });
354                } else {
355                    let name = path
356                        .file_name()
357                        .and_then(|n| n.to_str())
358                        .unwrap_or(&path_display);
359                    println!("File: {name}");
360                    println!(
361                        "  Encoding: {:?} (Confidence: {:.1}%) BOM: {}",
362                        info.charset,
363                        info.confidence * 100.0,
364                        if info.bom_detected { "Yes" } else { "No" }
365                    );
366                    let sample = if args.verbose {
367                        info.sample_text.clone()
368                    } else if info.sample_text.len() > 50 {
369                        format!("{}...", &info.sample_text[..47])
370                    } else {
371                        info.sample_text.clone()
372                    };
373                    println!("  Sample text: {sample}\n");
374                }
375            }
376            Err(e) => {
377                if json_mode {
378                    items.push(DetectEncodingItem {
379                        path: path_display.clone(),
380                        status: "error",
381                        encoding: None,
382                        confidence: None,
383                        has_bom: None,
384                        bytes_sampled: None,
385                        error: Some(DetectEncodingItemError {
386                            category: e.category().to_string(),
387                            code: e.machine_code().to_string(),
388                            message: e.user_friendly_message(),
389                        }),
390                    });
391                } else {
392                    error!("Unable to detect encoding for {}: {}", path.display(), e);
393                }
394            }
395        }
396    }
397
398    if json_mode {
399        emit_success(
400            mode,
401            "detect-encoding",
402            DetectEncodingPayload { files: items },
403        );
404    }
405
406    Ok(())
407}
408
409/// Execute encoding detection command with injected configuration service.
410///
411/// This function provides the new dependency injection interface for the detect_encoding command,
412/// accepting a configuration service instead of loading configuration globally.
413///
414/// # Arguments
415///
416/// * `file_paths` - File paths to analyze for encoding detection
417/// * `verbose` - Whether to show verbose output
418/// * `config_service` - Configuration service providing access to settings
419///
420/// # Returns
421///
422/// Returns `Ok(())` on successful completion, or an error if detection fails.
423pub fn detect_encoding_command_with_config(
424    args: DetectEncodingArgs,
425    _config_service: &dyn ConfigService,
426) -> Result<()> {
427    // Delegate to new implementation based on input argument struct
428    detect_encoding_command(&args)
429}