use std::ops::Range;
use std::sync::LazyLock;
use log::debug;
use crate::tokenizer::TokenStream;
/// Title/tech split of an input path, as produced by `build_zone_map`.
///
/// All ranges are byte offsets into the full input string.
#[derive(Debug, Clone)]
pub struct ZoneMap {
/// From the filename start up to the point where the tech zone begins.
pub title_zone: Range<usize>,
/// From the earliest detected anchor (or selected year) to the end of the
/// input; empty when nothing was detected.
#[allow(dead_code)]
pub tech_zone: Range<usize>,
/// `true` when a regex anchor or tier-2 token was found in the filename, or
/// when a year was selected with at least 6 bytes of filename before it.
pub has_anchors: bool,
/// The year selected by `disambiguate_years`, if any.
pub year: Option<YearInfo>,
/// One entry per directory segment that has at least one token.
pub dir_zones: Vec<SegmentZone>,
}
/// Title/tech split of a single directory segment, as produced by
/// `build_dir_zones`.
///
/// Ranges are byte offsets into the full input string.
#[derive(Debug, Clone)]
pub struct SegmentZone {
/// Index of the segment within the token stream's `segments`.
pub segment_idx: usize,
/// From the segment start up to the first anchor found in the segment.
pub title_zone: Range<usize>,
/// From the first anchor to the segment end; empty when no anchor was found.
#[allow(dead_code)]
pub tech_zone: Range<usize>,
/// `true` when an anchor was found before the end of the segment.
pub has_anchors: bool,
}
/// The year selected for a filename, plus the year-like numbers that were
/// passed over in its favour.
#[derive(Debug, Clone)]
pub struct YearInfo {
/// Numeric value of the selected year.
pub value: u32,
/// Byte offset into the full input where the selected year starts. For a
/// parenthesized year this is the position of the opening parenthesis.
pub start: usize,
/// Byte offset one past the selected year (past the closing parenthesis for
/// a parenthesized year).
#[allow(dead_code)]
pub end: usize,
/// Every other year candidate found in the filename, ordered by position.
pub title_years: Vec<TitleYear>,
}
/// A year-like number that was found in the filename but not selected as the
/// year (see `YearInfo::title_years`).
#[derive(Debug, Clone)]
pub struct TitleYear {
/// Numeric value of the candidate.
#[allow(dead_code)]
pub value: u32,
/// Byte offset into the full input where the candidate starts.
pub start: usize,
/// Byte offset one past the end of the candidate.
pub end: usize,
}
/// Case-insensitive season/episode marker: `S`, 1-3 digits, an optional `.` or
/// space, `E`, then 1-4 digits (e.g. `S01E02`, `s1.e3`).
///
/// The marker must sit at the start of the haystack or directly after a
/// character outside `[a-zA-Z0-9]`. That preceding character, when present, is
/// part of the match, so the match start is not necessarily the `S`. There is
/// no trailing boundary check.
static SXXEXX_ANCHOR: LazyLock<regex::Regex> = LazyLock::new(|| {
regex::Regex::new(r"(?i)(?:^|[^a-zA-Z0-9])S\d{1,3}[. ]?E\d{1,4}")
.expect("SXXEXX_ANCHOR regex is valid")
});
/// `NxN` style marker: 1-2 digits, `x` or `X`, then 1-4 digits (e.g. `1x05`).
///
/// The marker must be delimited on both sides by the haystack edge or a
/// character outside `[a-zA-Z0-9]`; those delimiter characters, when present,
/// are part of the match.
static NXN_ANCHOR: LazyLock<regex::Regex> = LazyLock::new(|| {
regex::Regex::new(r"(?:^|[^a-zA-Z0-9])\d{1,2}[xX]\d{1,4}(?:$|[^a-zA-Z0-9])")
.expect("NXN_ANCHOR regex is valid")
});
/// Case-insensitive resolution marker: one of 480, 576, 720, 1080, 1440, 2160
/// or 4320 immediately followed by `p` or `i` (e.g. `1080p`, `576i`).
///
/// The marker must be delimited on both sides by the haystack edge or a
/// character outside `[a-zA-Z0-9]`; those delimiter characters, when present,
/// are part of the match.
static SUFFIXED_RESOLUTION: LazyLock<regex::Regex> = LazyLock::new(|| {
regex::Regex::new(
r"(?i)(?:^|[^a-zA-Z0-9])(?:480|576|720|1080|1440|2160|4320)[pi](?:$|[^a-zA-Z0-9])",
)
.expect("SUFFIXED_RESOLUTION regex is valid")
});
/// Lower-case token texts that mark the start of technical metadata when they
/// appear as a whole token (codec, audio, source and release-tag names).
const TIER2_TOKENS: &[&str] = &[
    "x264",
    "x265",
    "h264",
    "h265",
    "hevc",
    "xvid",
    "divx",
    "av1",
    "avc",
    "aac",
    "ac3",
    "dts",
    "flac",
    "opus",
    "truehd",
    "atmos",
    "eac3",
    "pcm",
    "bluray",
    "bdrip",
    "brrip",
    "dvdrip",
    "webrip",
    "hdrip",
    "hdtv",
    "pdtv",
    "sdtv",
    "dsr",
    "dvdscr",
    "hddvd",
    "dvd",
    "dvdr",
    "bd",
    "web-dl",
    "web-rip",
    "pal",
    "ntsc",
    "secam",
    "remux",
    "repack",
    "proper",
];

/// Returns `true` when `text`, lower-cased, is exactly one of
/// [`TIER2_TOKENS`].
///
/// The comparison is against the whole token: substrings and tokens with
/// extra suffixes (e.g. `x264-GROUP`) do not match.
pub fn is_tier2_token(text: &str) -> bool {
    let needle = text.to_lowercase();
    TIER2_TOKENS.iter().any(|known| needle.as_str() == *known)
}
/// Four-character year candidate: `19` or `20` followed by two digits, exposed
/// as the named group `year`.
///
/// The pattern has no boundary checks of its own, so it also matches inside
/// longer digit runs; `disambiguate_years` filters those out with
/// `year_has_boundaries`.
static YEAR_CANDIDATE: LazyLock<regex::Regex> = LazyLock::new(|| {
regex::Regex::new(r"(?P<year>(?:19|20)\d{2})").expect("YEAR_CANDIDATE regex is valid")
});
/// Returns `true` when the byte span `start..end` of `input` is not directly
/// adjacent to an ASCII digit on either side.
///
/// The edges of `input` count as boundaries: a span starting at offset 0 has
/// no digit before it, and a span ending at or beyond `input.len()` has none
/// after it.
fn year_has_boundaries(input: &[u8], start: usize, end: usize) -> bool {
    let digit_before = start != 0 && input[start - 1].is_ascii_digit();
    let digit_after = end < input.len() && input[end].is_ascii_digit();
    !(digit_before || digit_after)
}
/// Parenthesized year: a literal `(`, then `19` or `20` followed by two digits
/// (named group `year`), then a literal `)`.
///
/// The whole match includes both parentheses.
static PAREN_YEAR: LazyLock<regex::Regex> = LazyLock::new(|| {
regex::Regex::new(r"\((?P<year>(?:19|20)\d{2})\)").expect("PAREN_YEAR regex is valid")
});
/// A year-like match collected by `disambiguate_years` before one of them is
/// selected as the year.
#[derive(Debug, Clone)]
struct YearCandidate {
// Parsed numeric value of the four matched digits.
value: u32,
// Byte offset into the full input where the candidate starts; for a
// parenthesized candidate this is the opening parenthesis.
start: usize,
// Byte offset one past the candidate; for a parenthesized candidate this is
// one past the closing parenthesis.
end: usize,
// Whether the candidate came from `PAREN_YEAR` rather than `YEAR_CANDIDATE`.
parenthesized: bool,
}
/// Splits the filename part of `input` into a title zone followed by a tech
/// zone and records the selected year and the per-directory zones.
///
/// The tech zone starts at the earliest of:
///
/// 1. the first match of each structural anchor pattern (`SxxExx`, `NxN`,
///    suffixed resolution) in the filename,
/// 2. a tier-2 token (see [`is_tier2_token`]) that starts before that point,
/// 3. the year selected by `disambiguate_years`.
///
/// `has_anchors` is `true` when step 1 or 2 found something, or when a year
/// was selected and at least 6 bytes of filename precede it.
///
/// All offsets in the returned [`ZoneMap`] are byte offsets into `input` and
/// fall on `char` boundaries.
///
/// # Panics
///
/// Panics if `token_stream.filename_start` is past the end of `input` or not
/// on a `char` boundary.
pub fn build_zone_map(input: &str, token_stream: &TokenStream) -> ZoneMap {
    let fn_start = token_stream.filename_start;
    let fn_end = input.len();
    let filename = &input[fn_start..];

    // Structural anchors: keep the earliest position across all patterns,
    // falling back to the end of the input when none matches.
    let mut tech_zone_start = [&*SXXEXX_ANCHOR, &*NXN_ANCHOR, &*SUFFIXED_RESOLUTION]
        .iter()
        .filter_map(|re| re.find(filename))
        .map(|m| {
            // A match that does not begin at offset 0 cannot have used the
            // pattern's `^` branch, so its first character is the separator
            // consumed by `[^a-zA-Z0-9]`. That separator may be a multi-byte
            // character; skipping its full UTF-8 length (rather than a fixed
            // 1 byte) keeps the zone boundary on a `char` boundary.
            let separator_len = if m.start() == 0 {
                0
            } else {
                m.as_str().chars().next().map_or(0, char::len_utf8)
            };
            fn_start + m.start() + separator_len
        })
        .fold(fn_end, usize::min);

    // Tier-2 tokens: a known tech token that starts inside the filename and
    // before the current boundary pulls the boundary back to itself.
    for segment in &token_stream.segments {
        for token in &segment.tokens {
            if token.start < fn_start {
                continue;
            }
            if token.start >= tech_zone_start {
                break;
            }
            if is_tier2_token(&token.text) {
                tech_zone_start = token.start;
            }
        }
    }

    // Recorded before the year is applied: a year on its own is a weaker
    // signal and only counts as an anchor under the length rule below.
    let has_tier12 = tech_zone_start < fn_end;

    let year_info = disambiguate_years(input, fn_start, tech_zone_start);
    if let Some(yi) = &year_info {
        tech_zone_start = tech_zone_start.min(yi.start);
    }

    let has_anchors = has_tier12
        || year_info
            .as_ref()
            .is_some_and(|yi| yi.start.saturating_sub(fn_start) >= 6);

    let dir_zones = build_dir_zones(input, token_stream);

    debug!(
        "zone map built: title={}..{}, tech={}..{}, anchors={}, year={:?}, dir_zones={}",
        fn_start,
        tech_zone_start,
        tech_zone_start,
        fn_end,
        has_anchors,
        year_info.as_ref().map(|y| y.value),
        dir_zones.len()
    );

    ZoneMap {
        title_zone: fn_start..tech_zone_start,
        tech_zone: tech_zone_start..fn_end,
        has_anchors,
        year: year_info,
        dir_zones,
    }
}
/// Builds one [`SegmentZone`] for every directory segment of `token_stream`
/// that has at least one token.
///
/// Within a segment the tech zone starts at the earliest of: the first match
/// of each structural anchor pattern (`SxxExx`, `NxN`, suffixed resolution), a
/// bare season marker (`S01`, `Season 1`), or a tier-2 token (see
/// [`is_tier2_token`]) that starts before that point. Years are not considered
/// here.
///
/// All offsets are byte offsets into `input` and fall on `char` boundaries.
///
/// # Panics
///
/// Panics if a directory segment's `start..end` is not a valid `char`-aligned
/// range of `input`.
fn build_dir_zones(input: &str, token_stream: &TokenStream) -> Vec<SegmentZone> {
    use crate::tokenizer::SegmentKind;

    // Season-only marker (`S` + 1-3 digits, or `Season` + digits), delimited on
    // both sides by the haystack edge or a character outside `[a-zA-Z0-9]`.
    static DIR_SEASON_ANCHOR: LazyLock<regex::Regex> = LazyLock::new(|| {
        regex::Regex::new(r"(?i)(?:^|[^a-zA-Z0-9])(?:S\d{1,3}|Season\s*\d+)(?:$|[^a-zA-Z0-9])")
            .expect("DIR_SEASON_ANCHOR regex is valid")
    });

    let mut zones = Vec::new();
    for (idx, segment) in token_stream.segments.iter().enumerate() {
        if segment.kind != SegmentKind::Directory || segment.tokens.is_empty() {
            continue;
        }

        let seg_start = segment.start;
        let seg_end = segment.end;
        let seg_text = &input[seg_start..seg_end];

        // Earliest anchor across all patterns, or the segment end when none
        // of them matches.
        let mut tech_start = [
            &*SXXEXX_ANCHOR,
            &*NXN_ANCHOR,
            &*SUFFIXED_RESOLUTION,
            &*DIR_SEASON_ANCHOR,
        ]
        .iter()
        .filter_map(|re| re.find(seg_text))
        .map(|m| {
            // A match that does not begin at offset 0 cannot have used the
            // pattern's `^` branch, so its first character is the separator
            // consumed by `[^a-zA-Z0-9]`. That separator may be a multi-byte
            // character; skipping its full UTF-8 length (rather than a fixed
            // 1 byte) keeps the zone boundary on a `char` boundary.
            let separator_len = if m.start() == 0 {
                0
            } else {
                m.as_str().chars().next().map_or(0, char::len_utf8)
            };
            seg_start + m.start() + separator_len
        })
        .fold(seg_end, usize::min);

        // A tier-2 token that starts before the current boundary pulls the
        // boundary back to itself.
        for token in &segment.tokens {
            if token.start >= tech_start {
                break;
            }
            if is_tier2_token(&token.text) {
                tech_start = token.start;
            }
        }

        zones.push(SegmentZone {
            segment_idx: idx,
            title_zone: seg_start..tech_start,
            tech_zone: tech_start..seg_end,
            has_anchors: tech_start < seg_end,
        });
    }
    zones
}
/// Picks the year of the filename part of `input` (everything from
/// `fn_start`) among all year-like numbers found there.
///
/// Candidates are:
///
/// * every `(YYYY)` match of `PAREN_YEAR`, spanning both parentheses;
/// * every `YYYY` match of `YEAR_CANDIDATE` that is not adjacent to an ASCII
///   digit and does not lie inside a parenthesized candidate.
///
/// Selection: the first parenthesized candidate wins; when there is none, the
/// last candidate in the filename wins. Every other candidate is returned in
/// `title_years`, ordered by position. Returns `None` when there are no
/// candidates.
///
/// All offsets in the result are byte offsets into `input`. The
/// `_tech_zone_start` parameter is currently unused.
///
/// # Panics
///
/// Panics if `fn_start` is past the end of `input` or not on a `char`
/// boundary.
fn disambiguate_years(input: &str, fn_start: usize, _tech_zone_start: usize) -> Option<YearInfo> {
    let filename = &input[fn_start..];
    let bytes = input.as_bytes();
    let mut candidates: Vec<YearCandidate> = Vec::new();

    for cap in PAREN_YEAR.captures_iter(filename) {
        let full = cap.get(0).expect("group 0 always present in a regex match");
        let digits = cap
            .name("year")
            .expect("year group always present in PAREN_YEAR");
        // `\d` is Unicode-aware, so the pattern can match non-ASCII digits
        // that `parse` rejects. Skip those instead of recording a year of 0.
        let Ok(value) = digits.as_str().parse::<u32>() else {
            continue;
        };
        candidates.push(YearCandidate {
            value,
            start: fn_start + full.start(),
            end: fn_start + full.end(),
            parenthesized: true,
        });
    }

    for cap in YEAR_CANDIDATE.captures_iter(filename) {
        let digits = cap
            .name("year")
            .expect("year group always present in YEAR_CANDIDATE");
        let abs_start = fn_start + digits.start();
        let abs_end = fn_start + digits.end();

        // Reject four digits that are part of a longer digit run.
        if !year_has_boundaries(bytes, abs_start, abs_end) {
            continue;
        }
        // Already recorded as a parenthesized candidate.
        let inside_paren = candidates
            .iter()
            .any(|c| c.parenthesized && abs_start >= c.start && abs_end <= c.end);
        if inside_paren {
            continue;
        }
        // Same non-ASCII-digit guard as above.
        let Ok(value) = digits.as_str().parse::<u32>() else {
            continue;
        };
        candidates.push(YearCandidate {
            value,
            start: abs_start,
            end: abs_end,
            parenthesized: false,
        });
    }

    candidates.sort_by_key(|c| c.start);

    // First parenthesized candidate if any, otherwise the last candidate.
    // `checked_sub` yields `None` for an empty list, which ends the function.
    let chosen_idx = match candidates.iter().position(|c| c.parenthesized) {
        Some(idx) => idx,
        None => candidates.len().checked_sub(1)?,
    };
    let chosen = &candidates[chosen_idx];

    let title_years: Vec<TitleYear> = candidates
        .iter()
        .enumerate()
        .filter(|(idx, _)| *idx != chosen_idx)
        .map(|(_, c)| TitleYear {
            value: c.value,
            start: c.start,
            end: c.end,
        })
        .collect();

    Some(YearInfo {
        value: chosen.value,
        start: chosen.start,
        end: chosen.end,
        title_years,
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenizer;

    /// Tokenizes `input` and builds its zone map.
    fn zones(input: &str) -> ZoneMap {
        build_zone_map(input, &tokenizer::tokenize(input))
    }

    /// Asserts that `input` selects `release` as its year and reports exactly
    /// one passed-over year, `in_title`.
    fn assert_year_in_title(input: &str, release: u32, in_title: u32) {
        let map = zones(input);
        let info = map.year.as_ref().unwrap();
        assert_eq!(info.value, release);
        assert_eq!(info.title_years.len(), 1);
        assert_eq!(info.title_years[0].value, in_title);
    }

    /// Asserts that `input` selects `release` as its year with no other year
    /// candidates.
    fn assert_sole_year(input: &str, release: u32) {
        let map = zones(input);
        let info = map.year.as_ref().unwrap();
        assert_eq!(info.value, release);
        assert!(info.title_years.is_empty());
    }

    #[test]
    fn test_basic_movie() {
        let map = zones("The.Matrix.1999.1080p.BluRay.x264-GROUP.mkv");
        assert!(map.year.is_some());
        assert_eq!(map.year.as_ref().unwrap().value, 1999);
        assert!(map.year.as_ref().unwrap().title_years.is_empty());
        assert!(map.title_zone.end <= 15);
    }

    #[test]
    fn test_year_as_title_2001() {
        assert_year_in_title(
            "2001.A.Space.Odyssey.1968.HDDVD.1080p.DTS.x264.mkv",
            1968,
            2001,
        );
    }

    #[test]
    fn test_year_as_title_2012() {
        assert_year_in_title("2012.2009.720p.BluRay.x264.DTS.mkv", 2009, 2012);
    }

    #[test]
    fn test_year_as_title_1917() {
        assert_year_in_title("1917.2019.1080p.BluRay.x264-GROUP.mkv", 2019, 1917);
    }

    #[test]
    fn test_year_as_title_1922() {
        assert_year_in_title("1922.2017.WEB-DL.x264.mkv", 2017, 1922);
    }

    #[test]
    fn test_parenthesized_year() {
        assert_sole_year("Movie (2019).mkv", 2019);
    }

    #[test]
    fn test_episode_anchor() {
        let map = zones("Show.Name.S01E02.720p.HDTV.x264-GROUP.mkv");
        assert!(map.title_zone.end <= 11);
        assert!(map.year.is_none());
    }

    #[test]
    fn test_no_tech_tokens() {
        let map = zones("Just A Simple Title.mkv");
        assert_eq!(map.title_zone.start, 0);
        assert_eq!(map.title_zone.end, map.tech_zone.start);
    }

    #[test]
    fn test_single_year_no_tech() {
        assert_sole_year("Movie.2019.mkv", 2019);
    }
}