ass_core/analysis/linting/rules/
encoding.rs

1//! Encoding issue detection rule for ASS script linting.
2//!
3//! Detects potential encoding or character issues in subtitle scripts
4//! that could cause display problems or compatibility issues.
5
6use crate::{
7    analysis::{
8        linting::{IssueCategory, IssueSeverity, LintIssue, LintRule},
9        ScriptAnalysis,
10    },
11    parser::Section,
12};
13use alloc::{format, string::ToString, vec::Vec};
14
/// Lint rule that flags encoding and character problems in subtitle scripts.
///
/// The rule inspects dialogue text and `[Script Info]` values for characters
/// that tend to cause display or compatibility problems across subtitle
/// renderers and media players.
///
/// # Encoding Checks
///
/// - Control characters: event text containing control characters other than
///   `\n`, `\r` and `\t`, and script info values containing control characters
///   other than `\n` and `\r`
/// - Replacement characters: event text containing U+FFFD, which indicates
///   corrupted or incorrectly decoded input
/// - Multi-byte density: event text averaging more than three UTF-8 bytes per
///   character (reported as a hint rather than a warning)
///
/// # Performance
///
/// - Time complexity: O(n * m) for n events and m characters per event
/// - Memory: constant working space; allocations are limited to the reported issues
/// - Target: <2ms for typical scripts with 1000 events
///
/// # Example
///
/// ```rust
/// use ass_core::analysis::linting::rules::encoding::EncodingRule;
/// use ass_core::analysis::linting::LintRule;
/// use ass_core::{Script, ScriptAnalysis};
///
/// let script_text = format!("[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\nDialogue: 0,0:00:00.00,0:00:05.00,Default,,0,0,0,,Text with{}invalid character", '\x00');
/// let script = Script::parse(&script_text)?;
///
/// let rule = EncodingRule;
/// let analysis = ScriptAnalysis::analyze(&script).unwrap();
/// let issues = rule.check_script(&analysis);
/// assert!(!issues.is_empty()); // Should detect the control character
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct EncodingRule;
50
51impl LintRule for EncodingRule {
52    fn id(&self) -> &'static str {
53        "encoding"
54    }
55
56    fn name(&self) -> &'static str {
57        "Encoding"
58    }
59
60    fn description(&self) -> &'static str {
61        "Detects potential encoding or character issues"
62    }
63
64    fn default_severity(&self) -> IssueSeverity {
65        IssueSeverity::Warning
66    }
67
68    fn category(&self) -> IssueCategory {
69        IssueCategory::Encoding
70    }
71
72    fn check_script(&self, analysis: &ScriptAnalysis) -> Vec<LintIssue> {
73        let mut issues = Vec::new();
74
75        if let Some(Section::Events(events)) = analysis
76            .script()
77            .sections()
78            .iter()
79            .find(|s| matches!(s, Section::Events(_)))
80        {
81            for event in events {
82                self.check_event_encoding(&mut issues, event);
83            }
84        }
85
86        self.check_script_info_encoding(&mut issues, analysis.script());
87
88        issues
89    }
90}
91
92impl EncodingRule {
93    /// Check encoding issues in a single event
94    fn check_event_encoding(&self, issues: &mut Vec<LintIssue>, event: &crate::parser::Event) {
95        if event
96            .text
97            .chars()
98            .any(|c| c.is_control() && c != '\n' && c != '\r' && c != '\t')
99        {
100            let issue = LintIssue::new(
101                self.default_severity(),
102                IssueCategory::Encoding,
103                self.id(),
104                "Event contains non-printable control characters".to_string(),
105            )
106            .with_description(
107                "Control characters may cause display issues in subtitle renderers".to_string(),
108            )
109            .with_suggested_fix(
110                "Remove or replace control characters with appropriate text".to_string(),
111            );
112            issues.push(issue);
113        }
114
115        if event.text.contains('\u{FFFD}') {
116            let issue = LintIssue::new(
117                self.default_severity(),
118                IssueCategory::Encoding,
119                self.id(),
120                "Event contains Unicode replacement character (�)".to_string(),
121            )
122            .with_description(
123                "Replacement characters indicate corrupted or invalid encoding".to_string(),
124            )
125            .with_suggested_fix("Check source file encoding and re-import".to_string());
126            issues.push(issue);
127        }
128
129        let char_count = event.text.chars().count();
130        let byte_count = event.text.len();
131
132        // Check for heavy multi-byte character usage
133        if char_count > 0 && byte_count > char_count * 3 {
134            let issue = LintIssue::new(
135                IssueSeverity::Hint,
136                IssueCategory::Encoding,
137                self.id(),
138                "Event contains many multi-byte characters".to_string(),
139            )
140            .with_description(
141                "Heavy use of multi-byte characters may impact performance".to_string(),
142            );
143            issues.push(issue);
144        }
145    }
146
147    /// Check encoding issues in script info section
148    fn check_script_info_encoding(
149        &self,
150        issues: &mut Vec<LintIssue>,
151        script: &crate::parser::Script,
152    ) {
153        if let Some(Section::ScriptInfo(info)) = script
154            .sections()
155            .iter()
156            .find(|s| matches!(s, Section::ScriptInfo(_)))
157        {
158            for (key, value) in &info.fields {
159                if value
160                    .chars()
161                    .any(|c| c.is_control() && c != '\n' && c != '\r')
162                {
163                    let issue = LintIssue::new(
164                        self.default_severity(),
165                        IssueCategory::Encoding,
166                        self.id(),
167                        format!("Script info field '{key}' contains control characters"),
168                    );
169                    issues.push(issue);
170                }
171            }
172        }
173    }
174}
175
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses and analyzes `script_text`, then returns the issues reported by
    /// `EncodingRule`. Panics if parsing or analysis fails.
    fn lint(script_text: &str) -> Vec<LintIssue> {
        let script = crate::parser::Script::parse(script_text).unwrap();
        let analysis = ScriptAnalysis::analyze(&script).unwrap();
        EncodingRule.check_script(&analysis)
    }

    /// Returns `true` when at least one issue message contains `needle`.
    fn any_message_contains(issues: &[LintIssue], needle: &str) -> bool {
        issues.iter().any(|issue| issue.message().contains(needle))
    }

    #[test]
    fn rule_metadata_correct() {
        let rule = EncodingRule;

        assert_eq!(rule.id(), "encoding");
        assert_eq!(rule.name(), "Encoding");
        assert_eq!(
            rule.description(),
            "Detects potential encoding or character issues"
        );
        assert_eq!(rule.default_severity(), IssueSeverity::Warning);
        assert_eq!(rule.category(), IssueCategory::Encoding);
    }

    #[test]
    fn empty_script_no_issues() {
        let issues = lint("[Script Info]\nTitle: Test");

        assert!(issues.is_empty());
    }

    #[test]
    fn valid_text_no_issues() {
        // Accented Latin characters are ordinary text and must not be flagged.
        let issues = lint(
            r"[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
Dialogue: 0,0:00:00.00,0:00:05.00,Default,,0,0,0,,Valid text with unicode: ñáéíóú",
        );

        assert!(issues.is_empty());
    }

    #[test]
    fn newlines_allowed() {
        // The raw string keeps `\N` as a literal backslash followed by `N`.
        let issues = lint(
            r"[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
Dialogue: 0,0:00:00.00,0:00:05.00,Default,,0,0,0,,Text with\Nline break",
        );

        assert!(issues.is_empty());
    }

    #[test]
    fn tabs_allowed() {
        let issues = lint("[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\nDialogue: 0,0:00:00.00,0:00:05.00,Default,,0,0,0,,Text with\ttab");

        assert!(issues.is_empty());
    }

    #[test]
    fn replacement_character_detected() {
        let issues = lint(
            r"[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
Dialogue: 0,0:00:00.00,0:00:05.00,Default,,0,0,0,,Text with � replacement",
        );

        assert!(!issues.is_empty());
        assert!(any_message_contains(&issues, "replacement character"));
    }

    #[test]
    fn control_character_in_script_info() {
        let issues = lint("[Script Info]\nTitle: Test\x00\n\n[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text");

        assert!(!issues.is_empty());
        assert!(any_message_contains(&issues, "control characters"));
    }

    #[test]
    fn no_events_section_no_issues() {
        let issues = lint(
            r"[Script Info]
Title: Test

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,20,&H00FFFFFF&,&H000000FF&,&H00000000&,&H00000000&,0,0,0,0,100,100,0,0,1,2,0,2,10,10,10,1",
        );

        assert!(issues.is_empty());
    }

    #[test]
    fn multibyte_characters_hint() {
        // Every character here is a four-byte code point, so the text exceeds
        // the three-bytes-per-character threshold.
        let heavy_unicode = "🎵🎶🎵🎶".repeat(20);
        let script_text = format!(
            r"[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
Dialogue: 0,0:00:00.00,0:00:05.00,Default,,0,0,0,,{heavy_unicode}"
        );

        let issues = lint(&script_text);

        assert!(any_message_contains(&issues, "multi-byte characters"));
    }

    #[test]
    fn control_character_in_event_detected() {
        // Exercises the control-character branch for event text, as opposed to
        // the script info branch covered above.
        let issues = lint("[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\nDialogue: 0,0:00:00.00,0:00:05.00,Default,,0,0,0,,Text with\x00control char");

        assert!(!issues.is_empty());
        assert!(any_message_contains(
            &issues,
            "non-printable control characters"
        ));

        // The event-level issue must carry both a description and a suggested fix.
        let control_issue = issues
            .iter()
            .find(|issue| issue.message().contains("non-printable control characters"))
            .unwrap();
        assert!(control_issue.description().is_some());
        assert!(control_issue.suggested_fix().is_some());
    }
}