Skip to main content

zantetsu_core/parser/
heuristic.rs

1use regex::Regex;
2
3use crate::error::{Result, ZantetsuError};
4use crate::types::{
5    AudioCodec, EpisodeSpec, MediaSource, ParseMode, ParseResult, Resolution, VideoCodec,
6};
7
/// Heuristic parser using optimized regex patterns and scene naming rules.
///
/// This is the `ParseMode::Light` engine: every pattern is compiled once in
/// [`HeuristicParser::new`] and reused by each call to `parse`, so parsing
/// itself involves no ML inference. Per the original author's note, accuracy
/// is lower than the Neural CRF engine while latency is much lower.
pub struct HeuristicParser {
    // Resolution patterns
    // `NNNNp` / `NNNNi` labels (2160, 1080, 720, 480).
    re_resolution: Regex,
    // `WIDTHxHEIGHT` dimensions; only the height capture is interpreted.
    re_resolution_dim: Regex,

    // Codec patterns
    // Video codec tokens (x264/x265/h264/h265/hevc/av1/vp9/mpeg4/xvid).
    re_vcodec: Regex,
    // Audio codec tokens (flac/aac/opus/ac3/dts/truehd/mp3/vorbis/ogg/eaac).
    re_acodec: Regex,

    // Source patterns
    // Release source tokens (Blu-ray, remux, WEB-DL, WEBRip, HDTV, DVD, LD, VHS).
    re_source: Regex,

    // CRC32 patterns
    // Eight hex characters wrapped in square brackets.
    re_crc32: Regex,
    // Eight hex characters after a separator, without brackets.
    re_crc32_no_bracket: Regex,

    // Season and episode patterns
    // Combined `S##E##` notation.
    re_season_episode: Regex,
    // Episode ranges such as `01-12` or `01~12`.
    re_episode_range: Regex,
    // Versioned episodes such as `12v2`.
    re_episode_version: Regex,
    // Bare or marker-prefixed episode numbers (last-resort fallback).
    re_episode: Regex,
    // Explicit markers: `E##`, `Ep##`, `Episode ##`, `Session ##`.
    re_explicit_episode: Regex,
    // Anime-style ` - ## ` separator.
    re_dash_episode: Regex,
    // `S##` or `season ##`.
    re_season: Regex,
    // Compiled in `new` but not read by any method in this file section.
    #[allow(dead_code)]
    re_season_long: Regex,

    // Version patterns
    // Release version as `[v2]` or a standalone `v2`.
    re_version: Regex,

    // Year patterns
    // Four-digit numbers starting with 19 or 20.
    re_year: Regex,

    // File patterns
    // Trailing `.ext` of two to four word characters.
    re_extension: Regex,
    // Leading `[Group]` tag.
    re_group: Regex,

    // Special episode patterns
    // OVA/ONA/Movie/etc. markers; compiled but not read in this file section.
    #[allow(dead_code)]
    re_special_episode: Regex,

    // Multi-audio patterns
    // Dual/multi audio markers; compiled but not read in this file section.
    #[allow(dead_code)]
    re_dual_audio: Regex,

    // Subtitle patterns
    // Multi-subtitle markers; compiled but not read in this file section.
    #[allow(dead_code)]
    re_multi_sub: Regex,
}
62
63impl HeuristicParser {
64    /// Constructs a new `HeuristicParser` with pre-compiled regex patterns.
65    ///
66    /// # Errors
67    ///
68    /// Returns `ZantetsuError::RegexError` if any pattern fails to compile
69    /// (should never happen with the static patterns defined here).
70    pub fn new() -> Result<Self> {
71        Ok(Self {
72            // Resolution patterns
73            re_resolution: Regex::new(r"(?i)\b(2160|1080|720|480)[pi]\b")?,
74            re_resolution_dim: Regex::new(r"(?i)(\d{3,4})\s*x\s*(\d{3,4})")?,
75
76            // Codec patterns
77            re_vcodec: Regex::new(
78                r"(?i)\b(x\.?264|x\.?265|h\.?264|h\.?265|hevc|av1|vp9|mpeg4|xvid)\b",
79            )?,
80            re_acodec: Regex::new(
81                r"(?i)\b(flac|aac|opus|ac3|dts(?:-?hd)?|truehd|true\shd|mp3|vorbis|ogg|e-?aac\+?)\b",
82            )?,
83
84            // Source patterns
85            re_source: Regex::new(
86                r"(?i)(?:\b|_)(blu-?ray\s*remux|bdremux|bd-?remux|blu-?ray|bdrip|web-?dl|webrip|web-?rip|web|hdtv|dvd(?:rip)?|laserdisc|ld|vhs|bd)(?:\b|_)",
87            )?,
88
89            // CRC32 patterns
90            re_crc32: Regex::new(r"\[([0-9A-Fa-f]{8})\]")?,
91            re_crc32_no_bracket: Regex::new(r"(?i)(?:^|[\s\-_\.\(\[])((?:[0-9a-f]{8}))")?,
92
93            // Season and episode patterns
94            re_season_episode: Regex::new(r"(?i)\bS(\d{1,2})E(\d{1,4})\b")?,
95            re_episode_range: Regex::new(
96                r"(?i)(?:[\s\-_\.]|(?:^|[\s\-_\.\[\(])ep?\.?\s*)(\d{1,4})\s*[-~]\s*(\d{1,4})\b",
97            )?,
98            re_episode_version: Regex::new(
99                r"(?i)(?:[\s\-_\.]|(?:^|[\s\-_\.\[\(])ep?\.?\s*)(\d{1,4})v(\d)\b",
100            )?,
101            re_episode: Regex::new(
102                r"(?i)(?:[\s\-_\.]|(?:^|[\s\-_\.\[\(])(?:ep?\.?|episode|session)\s*)(\d{1,4})(?:\b|[^0-9v\-~])",
103            )?,
104            // Explicit episode markers: .E##., EP##, Episode ##, Session ##
105            re_explicit_episode: Regex::new(
106                r"(?i)(?:[\s\.\-_\[\(])(?:ep?\.?|episode|session)\s*(\d{1,4})\b",
107            )?,
108            // Standard anime separator: " - ## " with flexible spacing
109            re_dash_episode: Regex::new(r"(?:\s+-\s+)(\d{1,4})(?:\b|[^0-9v\-~])")?,
110
111            // Season patterns
112            re_season: Regex::new(r"(?i)(?:\bS|season\s*)(\d{1,2})\b")?,
113            re_season_long: Regex::new(r"(?i)\bseason\s*(\d{1,2})\b")?,
114
115            // Version patterns
116            re_version: Regex::new(r"(?i)\[v(\d)\]|\bv(\d)\b")?,
117
118            // Year patterns
119            re_year: Regex::new(r"\b((?:19|20)\d{2})\b")?,
120
121            // File patterns
122            re_extension: Regex::new(r"\.(\w{2,4})$")?,
123            re_group: Regex::new(r"^\[([^\]]+)\]")?,
124
125            // Special episode patterns (OVA, ONA, Movie, etc.)
126            re_special_episode: Regex::new(
127                r"(?i)\b(OVA|ONA|OAD|Movie|Film|Special|SP|ED|NCOP|NCED|Preview|Trailer|Extra)\b",
128            )?,
129
130            // Multi-audio patterns
131            re_dual_audio: Regex::new(
132                r"(?i)\b(?:dual[\s\-_]?audio|multi[\s\-_]?audio|multi[\s\-_]?(?:lang|language))\b",
133            )?,
134
135            // Subtitle patterns
136            re_multi_sub: Regex::new(
137                r"(?i)\b(?:multi[\s\-_]?(?:sub|subs|subtitle)|multiple[\s\-_]?subtitle|multi)\b",
138            )?,
139        })
140    }
141
142    /// Parses the given filename/torrent name using heuristic regex patterns.
143    ///
144    /// # Errors
145    ///
146    /// Returns `ZantetsuError::EmptyInput` if the input is empty or whitespace-only.
147    pub fn parse(&self, input: &str) -> Result<ParseResult> {
148        let trimmed = input.trim();
149        if trimmed.is_empty() {
150            return Err(ZantetsuError::EmptyInput);
151        }
152
153        let mut result = ParseResult::new(trimmed, ParseMode::Light);
154
155        // Extract structured metadata (order matters for disambiguation)
156        result.group = self.extract_group(trimmed);
157        result.extension = self.extract_extension(trimmed);
158
159        // Try CRC32 with brackets first, then without
160        result.crc32 = self
161            .extract_crc32(trimmed)
162            .or_else(|| self.extract_crc32_no_bracket(trimmed));
163
164        result.resolution = self.extract_resolution(trimmed);
165        result.video_codec = self.extract_video_codec(trimmed);
166        result.audio_codec = self.extract_audio_codec(trimmed);
167        result.source = self.extract_source(trimmed);
168        result.year = self.extract_year(trimmed);
169
170        // Season and episode: try S##E## combined first
171        let (se_season, se_episode) = self.extract_season_episode(trimmed);
172        result.season = se_season.or_else(|| self.extract_season(trimmed));
173        result.episode = se_episode.or_else(|| self.extract_episode(trimmed, &result));
174        result.version = self.extract_version(trimmed, &result.episode);
175
176        // Title extraction: everything between group tag and first metadata token
177        result.title = self.extract_title(trimmed, &result);
178
179        // Compute confidence based on how many fields were extracted
180        result.confidence = self.compute_confidence(&result);
181
182        Ok(result)
183    }
184
185    fn extract_group(&self, input: &str) -> Option<String> {
186        self.re_group
187            .captures(input)
188            .map(|c| c[1].trim().to_string())
189    }
190
191    fn extract_extension(&self, input: &str) -> Option<String> {
192        self.re_extension
193            .captures(input)
194            .map(|c| c[1].to_lowercase())
195    }
196
197    fn extract_crc32(&self, input: &str) -> Option<String> {
198        self.re_crc32.captures(input).map(|c| c[1].to_uppercase())
199    }
200
201    fn extract_crc32_no_bracket(&self, input: &str) -> Option<String> {
202        self.re_crc32_no_bracket.captures(input).and_then(|c| {
203            let crc = c.get(2)?.as_str();
204            // Only return if it looks like a valid CRC32 (8 hex chars)
205            if crc.len() == 8 && crc.chars().all(|ch| ch.is_ascii_hexdigit()) {
206                // Make sure it's not part of a number (like 1080p or episode number)
207                let prefix = &input[..c.get(1).map(|m| m.start()).unwrap_or(0)];
208                if !prefix.ends_with(char::is_numeric) {
209                    return Some(crc.to_uppercase());
210                }
211            }
212            None
213        })
214    }
215
216    fn extract_resolution(&self, input: &str) -> Option<Resolution> {
217        // Try standard NNNNp/NNNNi format first
218        if let Some(res) = self
219            .re_resolution
220            .captures(input)
221            .and_then(|c| match &c[1] {
222                "2160" => Some(Resolution::UHD2160),
223                "1080" => Some(Resolution::FHD1080),
224                "720" => Some(Resolution::HD720),
225                "480" => Some(Resolution::SD480),
226                _ => None,
227            })
228        {
229            return Some(res);
230        }
231
232        // Try WIDTHxHEIGHT format (e.g. 1920x1080, 1280x720)
233        self.re_resolution_dim.captures(input).and_then(|c| {
234            let height: u32 = c[2].parse().ok()?;
235            match height {
236                2160 => Some(Resolution::UHD2160),
237                1080 => Some(Resolution::FHD1080),
238                720 => Some(Resolution::HD720),
239                480 => Some(Resolution::SD480),
240                _ => None,
241            }
242        })
243    }
244
245    fn extract_video_codec(&self, input: &str) -> Option<VideoCodec> {
246        self.re_vcodec.captures(input).and_then(|c| {
247            let codec = c[1].to_lowercase();
248            match codec.as_str() {
249                "x264" | "x.264" | "h264" | "h.264" => Some(VideoCodec::H264),
250                "x265" | "x.265" | "h265" | "h.265" | "hevc" => Some(VideoCodec::HEVC),
251                "av1" => Some(VideoCodec::AV1),
252                "vp9" => Some(VideoCodec::VP9),
253                "mpeg4" | "xvid" => Some(VideoCodec::MPEG4),
254                _ => None,
255            }
256        })
257    }
258
259    fn extract_audio_codec(&self, input: &str) -> Option<AudioCodec> {
260        self.re_acodec.captures(input).and_then(|c| {
261            let codec = c[1].to_lowercase();
262            match codec.as_str() {
263                "flac" => Some(AudioCodec::FLAC),
264                "aac" => Some(AudioCodec::AAC),
265                "opus" => Some(AudioCodec::Opus),
266                "ac3" => Some(AudioCodec::AC3),
267                s if s.starts_with("dts") => Some(AudioCodec::DTS),
268                s if s.contains("truehd") || s.contains("true hd") => Some(AudioCodec::TrueHD),
269                "mp3" => Some(AudioCodec::MP3),
270                "vorbis" | "ogg" => Some(AudioCodec::Vorbis),
271                s if s.starts_with("e-aac") || s.starts_with("eaac") => Some(AudioCodec::EAAC),
272                _ => None,
273            }
274        })
275    }
276
277    fn extract_source(&self, input: &str) -> Option<MediaSource> {
278        // Normalize underscores to spaces for matching (e.g. _Blu-Ray_ patterns)
279        let normalized = input.replace('_', " ");
280        self.re_source.captures(&normalized).and_then(|c| {
281            let source = c[1].to_lowercase().replace([' ', '-'], "");
282            match source.as_str() {
283                s if s.contains("remux") => Some(MediaSource::BluRayRemux),
284                s if s.contains("blu") => Some(MediaSource::BluRay),
285                "bdrip" => Some(MediaSource::BluRay),
286                "bd" => Some(MediaSource::BluRay),
287                "webdl" => Some(MediaSource::WebDL),
288                "web" => Some(MediaSource::WebDL),
289                "webrip" => Some(MediaSource::WebRip),
290                "hdtv" => Some(MediaSource::HDTV),
291                s if s.starts_with("dvd") => Some(MediaSource::DVD),
292                s if s == "laserdisc" || s == "ld" => Some(MediaSource::LaserDisc),
293                "vhs" => Some(MediaSource::VHS),
294                _ => None,
295            }
296        })
297    }
298
299    fn extract_season(&self, input: &str) -> Option<u32> {
300        // Try S## pattern (but not S##E## which is handled by extract_season_episode)
301        self.re_season.captures(input).and_then(|c| {
302            // Verify it's not part of S##E## — if so, re_season_episode handles it
303            let full_match = c.get(0)?;
304            let after = &input[full_match.end()..];
305            // If immediately followed by E+digits, skip it (handled elsewhere)
306            if after.starts_with('E') || after.starts_with('e') {
307                let rest = &after[1..];
308                if rest.starts_with(|ch: char| ch.is_ascii_digit()) {
309                    return None;
310                }
311            }
312            c[1].parse().ok()
313        })
314    }
315
316    /// Extract combined S##E## season+episode notation.
317    fn extract_season_episode(&self, input: &str) -> (Option<u32>, Option<EpisodeSpec>) {
318        if let Some(caps) = self.re_season_episode.captures(input) {
319            let season: u32 = caps[1].parse().ok().unwrap_or(0);
320            let episode: u32 = caps[2].parse().ok().unwrap_or(0);
321            return (Some(season), Some(EpisodeSpec::Single(episode)));
322        }
323        (None, None)
324    }
325
326    fn extract_year(&self, input: &str) -> Option<u16> {
327        // Find all year-like matches and pick the one most likely to be a release year
328        // (between 1980 and current year + 1)
329        self.re_year.captures(input).and_then(|c| {
330            let year: u16 = c[1].parse().ok()?;
331            if (1980..=2030).contains(&year) {
332                Some(year)
333            } else {
334                None
335            }
336        })
337    }
338
    /// Extracts the episode (single, versioned, or range) when the name does
    /// not use combined `S##E##` notation.
    ///
    /// Candidates are tried in strict priority order and the first phase that
    /// yields a valid value wins:
    ///
    /// 1. versioned episodes (`12v2`),
    /// 2. ascending ranges (`01-12`, `01~12`),
    /// 3. explicit markers (`E##`, `Ep##`, `Episode ##`, `Session ##`),
    /// 4. the last ` - ## ` dash-separated number,
    /// 5. the first acceptable bare number.
    ///
    /// `result` is read only through `is_year_or_resolution`, i.e. for its
    /// already-extracted `year`, to reject numbers that are a year or a
    /// resolution dimension.
    fn extract_episode(&self, input: &str, result: &ParseResult) -> Option<EpisodeSpec> {
        // S##E## is handled by extract_season_episode, skip if present
        if self.re_season_episode.is_match(input) {
            return None;
        }

        // Phase 1: Versioned episodes "12v2" — try all, validate
        for caps in self.re_episode_version.captures_iter(input) {
            let episode: u32 = match caps[1].parse().ok() {
                Some(v) => v,
                None => continue,
            };
            let version: u8 = match caps[2].parse().ok() {
                Some(v) => v,
                None => continue,
            };
            if !self.is_year_or_resolution(episode, result) {
                return Some(EpisodeSpec::Version { episode, version });
            }
        }

        // Phase 2: Episode ranges "01-12" — validate the range is not "Part X-Y"
        for caps in self.re_episode_range.captures_iter(input) {
            let start: u32 = match caps[1].parse().ok() {
                Some(v) => v,
                None => continue,
            };
            let end: u32 = match caps[2].parse().ok() {
                Some(v) => v,
                None => continue,
            };
            // A range must be strictly ascending, and its start must not be a
            // resolution dimension (only `start` is checked here).
            if start >= end || self.is_resolution_number(start) {
                continue;
            }
            // Reject if preceded by "Part" or "Season" (e.g. "Part 2 - 25")
            if let Some(m) = caps.get(0) {
                let prefix = input[..m.start()].to_lowercase();
                let prefix_trimmed = prefix.trim_end();
                if prefix_trimmed.ends_with("part") || prefix_trimmed.ends_with("season") {
                    continue;
                }
            }
            return Some(EpisodeSpec::Range(start, end));
        }

        // Phase 3: Explicit episode markers (E##, Ep##, Episode ##, Session ##)
        // These are the strongest signal and override bare numbers
        // Only the first marker is examined; if it is rejected, later phases run.
        if let Some(caps) = self.re_explicit_episode.captures(input) {
            // NOTE(review): `?` here returns None from the whole function on a
            // parse failure, whereas the other phases `continue` — confirm
            // this difference is intended.
            let ep: u32 = caps[1].parse().ok()?;
            if !self.is_year_or_resolution(ep, result) {
                return Some(EpisodeSpec::Single(ep));
            }
        }

        // Phase 4: " - ## " separator pattern — find the LAST valid match
        // This covers the standard anime naming convention: [Group] Title - ## (quality)
        let mut last_dash_ep: Option<u32> = None;
        for caps in self.re_dash_episode.captures_iter(input) {
            let ep: u32 = match caps[1].parse().ok() {
                Some(v) => v,
                None => continue,
            };
            if self.is_year_or_resolution(ep, result) {
                continue;
            }
            // Reject Vol numbers
            if let Some(m) = caps.get(0) {
                let prefix = input[..m.start()].to_lowercase();
                let trimmed = prefix.trim_end();
                if trimmed.ends_with("vol.") || trimmed.ends_with("vol") {
                    continue;
                }
            }
            last_dash_ep = Some(ep); // Keep updating — we want the LAST one
        }
        if let Some(ep) = last_dash_ep {
            return Some(EpisodeSpec::Single(ep));
        }

        // Phase 5: Bare number fallback — only if no explicit or dash patterns matched
        // Be conservative: skip numbers that look like version parts (X.Y)
        for caps in self.re_episode.captures_iter(input) {
            // `full_match` includes the leading separator/marker; `digit_match`
            // is just the number.
            let full_match = match caps.get(0) {
                Some(m) => m,
                None => continue,
            };
            let digit_match = match caps.get(1) {
                Some(m) => m,
                None => continue,
            };
            let ep: u32 = match digit_match.as_str().parse().ok() {
                Some(v) => v,
                None => continue,
            };

            if self.is_year_or_resolution(ep, result) {
                continue;
            }

            // Skip version-embedded numbers: digit.digit pattern (e.g. "2.0", "1.1")
            // The match starts on the separator, so the byte at `start()` is
            // the '.' and the byte before it is the preceding digit.
            // NOTE(review): the `>= 2` bound skips this check for a match that
            // starts at byte offset 1 — confirm whether `>= 1` was intended.
            if full_match.start() > 0 {
                let prefix_byte = input.as_bytes()[full_match.start()];
                if prefix_byte == b'.' && full_match.start() >= 2 {
                    let before = input.as_bytes()[full_match.start() - 1];
                    if before.is_ascii_digit() {
                        continue;
                    }
                }
            }

            // Skip numbers followed by ".digit" (decimal: 2.0)
            if digit_match.end() < input.len() {
                let next_byte = input.as_bytes()[digit_match.end()];
                if next_byte == b'.'
                    && digit_match.end() + 1 < input.len()
                    && input.as_bytes()[digit_match.end() + 1].is_ascii_digit()
                {
                    continue;
                }
            }

            // Skip Vol numbers
            if full_match.start() >= 3 {
                let prefix = input[..full_match.start()].to_lowercase();
                if prefix.ends_with("vol")
                    || prefix.trim_end().ends_with("vol.")
                    || prefix.trim_end().ends_with("vol")
                {
                    continue;
                }
            }

            return Some(EpisodeSpec::Single(ep));
        }

        None
    }
476
477    /// Check if a number is a common video resolution height.
478    fn is_resolution_number(&self, n: u32) -> bool {
479        matches!(n, 480 | 576 | 720 | 1080 | 2160 | 1280 | 1920 | 3840)
480    }
481
482    /// Check if a number is likely a year or resolution, not an episode.
483    fn is_year_or_resolution(&self, n: u32, result: &ParseResult) -> bool {
484        if let Some(year) = result.year
485            && n == u32::from(year)
486        {
487            return true;
488        }
489        self.is_resolution_number(n)
490    }
491
492    fn extract_version(&self, input: &str, episode: &Option<EpisodeSpec>) -> Option<u8> {
493        // If the episode already captured a version (e.g. "12v2"), don't double-extract
494        if let Some(EpisodeSpec::Version { .. }) = episode {
495            return None;
496        }
497
498        self.re_version.captures(input).and_then(|c| {
499            // Try group 1 (bracket form [v2]) then group 2 (bare v2)
500            c.get(1)
501                .or_else(|| c.get(2))
502                .and_then(|m| m.as_str().parse().ok())
503        })
504    }
505
506    /// Extracts the title from the input by identifying the text region
507    /// between the group tag (if any) and the first metadata token.
508    fn extract_title(&self, input: &str, result: &ParseResult) -> Option<String> {
509        let mut work = input.to_string();
510
511        // Remove the group tag from the start
512        if result.group.is_some()
513            && let Some(end) = work.find(']')
514        {
515            work = work[end + 1..].to_string();
516        }
517
518        // Remove file extension from the end
519        if let Some(ref ext) = result.extension
520            && let Some(pos) = work.rfind(&format!(".{ext}"))
521        {
522            work = work[..pos].to_string();
523        }
524
525        // Remove known metadata tokens (NOT episode)
526        let patterns_to_strip: Vec<&Regex> = vec![
527            &self.re_resolution,
528            &self.re_resolution_dim,
529            &self.re_vcodec,
530            &self.re_acodec,
531            &self.re_source,
532            &self.re_crc32,
533            &self.re_season_episode,
534            &self.re_episode_range,
535            &self.re_episode_version,
536            &self.re_season,
537            &self.re_version,
538        ];
539
540        for pattern in &patterns_to_strip {
541            work = pattern.replace_all(&work, "\x00").to_string();
542        }
543
544        // For episode: instead of replace_all (which matches title numbers too),
545        // find the correct episode position using priority-based matching
546        self.sentinel_episode_in_title(&mut work, result);
547
548        // Also strip year if it's in brackets or clearly separate
549        if let Some(year) = result.year {
550            let year_str = year.to_string();
551            let bracketed_year = format!("({year_str})");
552            work = work.replace(&bracketed_year, "\x00");
553            let bracketed_year = format!("[{year_str}]");
554            work = work.replace(&bracketed_year, "\x00");
555        }
556
557        // Remove any remaining bracketed content (typically metadata tags)
558        let re_brackets = Regex::new(r"\[[^\]]*\]|\([^\)]*\)").ok()?;
559        work = re_brackets.replace_all(&work, " ").to_string();
560
561        // Take text before the first sentinel (null byte)
562        let title_region = work.split('\x00').next().unwrap_or("");
563
564        // Clean up: replace dots, underscores with spaces; normalize whitespace
565        let cleaned = title_region
566            .replace(['.', '_'], " ")
567            .split_whitespace()
568            .collect::<Vec<_>>()
569            .join(" ")
570            .trim_matches(|c: char| c == '-' || c == ' ')
571            .to_string();
572
573        // Strip common non-title tokens from the end
574        let cleaned = strip_trailing_noise(&cleaned);
575
576        if cleaned.is_empty() {
577            None
578        } else {
579            Some(cleaned)
580        }
581    }
582
    /// Marks the episode position inside the title work string with a NUL
    /// sentinel so that `extract_title` can cut the title there.
    ///
    /// Mirrors the priority order of `extract_episode` in a reduced form:
    /// explicit markers first (all matches are replaced), then the last valid
    /// ` - ## ` separator, then the first acceptable bare number. Unlike
    /// `extract_episode`, no "Vol" filtering is applied here. At most one
    /// phase modifies `work`; if nothing qualifies, `work` is left unchanged.
    fn sentinel_episode_in_title(&self, work: &mut String, result: &ParseResult) {
        // If S##E## was the episode source, it's already stripped above
        // NOTE(review): the visible caller (`extract_title`) replaces S##E##
        // matches with the sentinel before calling this, so this guard
        // presumably never fires from there — confirm intent.
        if self.re_season_episode.is_match(work) {
            return;
        }

        // Phase 1: explicit E##/Ep## markers — sentinel these
        // Every match is replaced, without the year/resolution filter.
        if self.re_explicit_episode.is_match(work) {
            *work = self
                .re_explicit_episode
                .replace_all(work, "\x00")
                .to_string();
            return;
        }

        // Phase 2: " - ## " dash separator — find the LAST valid match
        let mut last_dash_pos: Option<(usize, usize)> = None;
        for caps in self.re_dash_episode.captures_iter(work) {
            let m = match caps.get(0) {
                Some(m) => m,
                None => continue,
            };
            let digit = match caps.get(1) {
                Some(d) => d,
                None => continue,
            };
            let ep: u32 = match digit.as_str().parse().ok() {
                Some(v) => v,
                None => continue,
            };
            if self.is_year_or_resolution(ep, result) {
                continue;
            }
            last_dash_pos = Some((m.start(), m.end()));
        }
        // The sentinel goes in front of the separator; the matched text itself
        // is kept (only the start offset is used).
        if let Some((start, _end)) = last_dash_pos {
            work.insert(start, '\x00');
            return;
        }

        // Phase 3: Bare episode matches — use first valid one
        for caps in self.re_episode.captures_iter(work) {
            let full = match caps.get(0) {
                Some(m) => m,
                None => continue,
            };
            let digit = match caps.get(1) {
                Some(d) => d,
                None => continue,
            };
            let ep: u32 = match digit.as_str().parse().ok() {
                Some(v) => v,
                None => continue,
            };
            if self.is_year_or_resolution(ep, result) {
                continue;
            }
            // Skip version-embedded numbers (digit.digit)
            // `full` starts on the separator, so the byte at `start()` is the
            // '.' and the byte before it is the preceding digit.
            if full.start() > 0 {
                let prefix_byte = work.as_bytes()[full.start()];
                if prefix_byte == b'.' && full.start() >= 2 {
                    let before = work.as_bytes()[full.start() - 1];
                    if before.is_ascii_digit() {
                        continue;
                    }
                }
            }
            // Skip numbers followed by ".digit"
            if digit.end() < work.len() {
                let next = work.as_bytes()[digit.end()];
                if next == b'.'
                    && digit.end() + 1 < work.len()
                    && work.as_bytes()[digit.end() + 1].is_ascii_digit()
                {
                    continue;
                }
            }
            // Insert in front of the separator and stop at the first hit.
            work.insert(full.start(), '\x00');
            return;
        }
    }
666
667    /// Computes a confidence score based on how many metadata fields
668    /// were successfully extracted.
669    fn compute_confidence(&self, result: &ParseResult) -> f32 {
670        let mut fields_present = 0u32;
671        let mut fields_total = 7u32; // title, group, episode, resolution, vcodec, acodec, source
672
673        if result.title.is_some() {
674            fields_present += 2; // Title is worth double
675            fields_total += 1;
676        }
677        if result.group.is_some() {
678            fields_present += 1;
679        }
680        if result.episode.is_some() {
681            fields_present += 1;
682        }
683        if result.resolution.is_some() {
684            fields_present += 1;
685        }
686        if result.video_codec.is_some() {
687            fields_present += 1;
688        }
689        if result.audio_codec.is_some() {
690            fields_present += 1;
691        }
692        if result.source.is_some() {
693            fields_present += 1;
694        }
695
696        (fields_present as f32 / fields_total as f32).min(1.0)
697    }
698}
699
/// Strips common non-title tokens (and trailing `-` / space runs) from the
/// end of a title string.
///
/// Tokens are matched ASCII-case-insensitively and only as whole trailing
/// words: the token must start the string or follow a non-alphanumeric
/// character. This keeps titles such as `Straw` intact instead of cutting the
/// `RAW` suffix out of a real word. Stripping repeats until nothing changes,
/// so stacked noise (`Title - Dual Audio - 10bit`) is removed completely.
fn strip_trailing_noise(title: &str) -> String {
    const NOISE_TOKENS: [&str; 9] = [
        "RAW",
        "VOSTFR",
        "MULTI",
        "Hi10P",
        "10bit",
        "Dual Audio",
        "Multiple Subtitle",
        "Multi-Subs",
        "Main 10",
    ];

    // Only ever shrink a borrowed view; allocate once at the end.
    let mut kept = title;
    loop {
        let len_before = kept.len();
        kept = kept.trim_end_matches(['-', ' ']);

        for token in NOISE_TOKENS {
            let Some(split) = kept.len().checked_sub(token.len()) else {
                continue;
            };
            // Tokens are ASCII; a split inside a multi-byte char cannot match.
            if !kept.is_char_boundary(split) {
                continue;
            }
            let (head, tail) = kept.split_at(split);
            let starts_a_word = !head.ends_with(char::is_alphanumeric);
            if starts_a_word && tail.eq_ignore_ascii_case(token) {
                kept = head;
            }
        }

        if kept.len() == len_before {
            break;
        }
    }
    kept.to_string()
}
731
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a parser; the static patterns are expected to compile.
    fn parser() -> HeuristicParser {
        HeuristicParser::new().unwrap()
    }

    /// Parses `name` with a fresh parser, panicking on any error.
    fn parse_ok(name: &str) -> ParseResult {
        parser().parse(name).unwrap()
    }

    #[test]
    fn empty_input_errors() {
        let heuristic = parser();
        for blank in ["", "   "] {
            assert!(matches!(
                heuristic.parse(blank),
                Err(ZantetsuError::EmptyInput)
            ));
        }
    }

    #[test]
    fn subsplease_standard_format() {
        let parsed = parse_ok("[SubsPlease] Jujutsu Kaisen - 24 (1080p) [A1B2C3D4].mkv");

        assert_eq!(parsed.title.as_deref(), Some("Jujutsu Kaisen"));
        assert_eq!(parsed.group.as_deref(), Some("SubsPlease"));
        assert_eq!(parsed.episode, Some(EpisodeSpec::Single(24)));
        assert_eq!(parsed.resolution, Some(Resolution::FHD1080));
        assert_eq!(parsed.crc32.as_deref(), Some("A1B2C3D4"));
        assert_eq!(parsed.extension.as_deref(), Some("mkv"));
        assert_eq!(parsed.parse_mode, ParseMode::Light);
    }

    #[test]
    fn erai_raws_versioned_episode() {
        let parsed = parse_ok(
            "[Erai-raws] Shingeki no Kyojin - The Final Season - 28v2 [1080p][HEVC].mkv",
        );

        let expected_episode = EpisodeSpec::Version {
            episode: 28,
            version: 2,
        };
        assert_eq!(parsed.group.as_deref(), Some("Erai-raws"));
        assert_eq!(parsed.episode, Some(expected_episode));
        assert_eq!(parsed.resolution, Some(Resolution::FHD1080));
        assert_eq!(parsed.video_codec, Some(VideoCodec::HEVC));
        assert_eq!(parsed.extension.as_deref(), Some("mkv"));
    }

    #[test]
    fn batch_episode_range() {
        let parsed = parse_ok("[Judas] Golden Kamuy S3 - 01-12 (1080p) [Batch]");

        assert_eq!(parsed.group.as_deref(), Some("Judas"));
        assert_eq!(parsed.season, Some(3));
        assert_eq!(parsed.episode, Some(EpisodeSpec::Range(1, 12)));
        assert_eq!(parsed.resolution, Some(Resolution::FHD1080));
    }

    #[test]
    fn dot_separated_format() {
        let parsed = parse_ok("One.Piece.1084.VOSTFR.1080p.WEB.x264-AAC.mkv");

        assert_eq!(parsed.title.as_deref(), Some("One Piece"));
        assert_eq!(parsed.episode, Some(EpisodeSpec::Single(1084)));
        assert_eq!(parsed.resolution, Some(Resolution::FHD1080));
        assert_eq!(parsed.video_codec, Some(VideoCodec::H264));
        assert_eq!(parsed.audio_codec, Some(AudioCodec::AAC));
        assert_eq!(parsed.extension.as_deref(), Some("mkv"));
    }

    #[test]
    fn resolution_extraction() {
        let heuristic = parser();

        let cases = [
            ("[Test] Show - 01 (480p).mkv", Resolution::SD480),
            ("[Test] Show - 01 (720p).mkv", Resolution::HD720),
            ("[Test] Show - 01 (2160p).mkv", Resolution::UHD2160),
        ];
        for (name, expected) in cases {
            let parsed = heuristic.parse(name).unwrap();
            assert_eq!(parsed.resolution, Some(expected));
        }
    }

    #[test]
    fn video_codec_variants() {
        let heuristic = parser();

        let cases = [
            ("x264", VideoCodec::H264),
            ("H.264", VideoCodec::H264),
            ("x265", VideoCodec::HEVC),
            ("HEVC", VideoCodec::HEVC),
            ("H.265", VideoCodec::HEVC),
            ("AV1", VideoCodec::AV1),
            ("VP9", VideoCodec::VP9),
        ];
        for (input, expected) in cases {
            let name = format!("[Group] Title - 01 [{input}].mkv");
            let parsed = heuristic.parse(&name).unwrap();
            assert_eq!(
                parsed.video_codec,
                Some(expected),
                "failed for input: {input}"
            );
        }
    }

    #[test]
    fn audio_codec_variants() {
        let heuristic = parser();

        let cases = [
            ("FLAC", AudioCodec::FLAC),
            ("AAC", AudioCodec::AAC),
            ("Opus", AudioCodec::Opus),
            ("AC3", AudioCodec::AC3),
            ("DTS", AudioCodec::DTS),
            ("MP3", AudioCodec::MP3),
        ];
        for (input, expected) in cases {
            let name = format!("[Group] Title - 01 [{input}].mkv");
            let parsed = heuristic.parse(&name).unwrap();
            assert_eq!(
                parsed.audio_codec,
                Some(expected),
                "failed for input: {input}"
            );
        }
    }

    #[test]
    fn source_extraction() {
        let heuristic = parser();

        let cases = [
            ("[Group] Title - 01 Blu-ray 1080p.mkv", MediaSource::BluRay),
            ("[Group] Title - 01 WEB-DL 1080p.mkv", MediaSource::WebDL),
            ("[Group] Title - 01 HDTV 720p.mkv", MediaSource::HDTV),
        ];
        for (name, expected) in cases {
            let parsed = heuristic.parse(name).unwrap();
            assert_eq!(parsed.source, Some(expected));
        }
    }

    #[test]
    fn year_extraction() {
        let parsed = parse_ok("[Group] Title (2024) - 01 (1080p).mkv");
        assert_eq!(parsed.year, Some(2024));
    }

    #[test]
    fn confidence_scales_with_fields() {
        let heuristic = parser();

        // Minimal parse — only title
        let sparse = heuristic.parse("Some Random Title.mkv").unwrap();
        assert!(
            sparse.confidence < 0.5,
            "confidence should be low: {}",
            sparse.confidence
        );

        // Rich parse — many fields
        let rich = heuristic
            .parse("[SubsPlease] Jujutsu Kaisen - 24 (1080p) [H264] [AAC] [A1B2C3D4].mkv")
            .unwrap();
        assert!(
            rich.confidence > 0.7,
            "confidence should be high: {}",
            rich.confidence
        );
    }

    #[test]
    fn parse_result_is_serializable() {
        let original = parse_ok("[SubsPlease] Jujutsu Kaisen - 24 (1080p) [A1B2C3D4].mkv");

        // Round-trip through JSON and require structural equality.
        let json = serde_json::to_string(&original).unwrap();
        let restored: ParseResult = serde_json::from_str(&json).unwrap();
        assert_eq!(original, restored);
    }
}
919}