pub(crate) mod context;
mod invariance;
mod matching;
mod pass2_helpers;
mod proper_count;
mod rule_registry;
pub(crate) mod token_context;
mod zone_rules;
use crate::hunch_result::HunchResult;
use crate::matcher::engine;
use crate::matcher::span::{MatchSpan, Property};
use crate::tokenizer::{self, TokenStream};
use crate::zone_map::{self, TitleYear, ZoneMap};
use matching::MatchContext;
use rule_registry::{LegacyMatcherFn, SegmentScope, TomlRule};
/// Reports whether the range `match_start..match_end` shares at least one
/// position with any span in `title_years`.
///
/// Both comparisons are strict, so a match that merely touches a title-year
/// span at one endpoint is not considered overlapping. An empty `title_years`
/// slice always yields `false`.
pub(super) fn match_overlaps_any_title_year(
    match_start: usize,
    match_end: usize,
    title_years: &[TitleYear],
) -> bool {
    for year_span in title_years {
        let begins_before_span_ends = match_start < year_span.end;
        let finishes_after_span_begins = year_span.start < match_end;
        if begins_before_span_ends && finishes_after_span_begins {
            return true;
        }
    }
    false
}
use log::{debug, trace};
use crate::priority;
use crate::properties::part;
use crate::properties::release_group;
use crate::properties::title;
use crate::properties::title::{TitleConfidence, TitleExtraction};
/// A title suggested from outside the normal title extraction, handed to
/// `pass2` and weighed against the extracted title in `pick_final_title`.
struct TitleHint {
// The suggested title text.
value: String,
// Offset of the title inside the input, when known. When `None`,
// `hint_to_match` falls back to searching the input for `value`.
position: Option<usize>,
// Label naming where the hint came from ("invariance" or "fallback" in this
// file); only used in trace logging.
source: &'static str,
}
/// The parsing pipeline: holds the rule sets and turns an input string into a
/// `HunchResult` via `run` / `run_with_context` /
/// `run_with_context_and_fallback`.
#[must_use = "a Pipeline is only useful when you call `.run()` / `.run_with_context()` on it"]
pub struct Pipeline {
// Rules obtained from `rule_registry::build_toml_rules()`; applied per
// token-stream segment in `match_all`.
toml_rules: Vec<TomlRule>,
// Matcher functions obtained from `rule_registry::build_legacy_matchers()`;
// each is called with the whole input string in `match_all`.
legacy_matchers: Vec<LegacyMatcherFn>,
}
impl Default for Pipeline {
fn default() -> Self {
Self::new()
}
}
impl Pipeline {
/// Creates a pipeline populated with the TOML rules and the legacy matcher
/// functions provided by `rule_registry`.
pub fn new() -> Self {
    let toml_rules = rule_registry::build_toml_rules();
    let legacy_matchers = rule_registry::build_legacy_matchers();
    Self {
        toml_rules,
        legacy_matchers,
    }
}
/// Parses a single `input` on its own: no sibling files, no title hint and
/// no invariance report are involved.
pub fn run(&self, input: &str) -> HunchResult {
    let (mut spans, tokens, zones) = self.pass1(input);
    let no_title_hint = None;
    let no_invariance_report = None;
    self.pass2(
        input,
        &mut spans,
        &zones,
        &tokens,
        no_title_hint,
        no_invariance_report,
    )
}
/// Parses `input` using `siblings` as cross-file context.
///
/// Generic front door: borrows every sibling as `&str` and hands off to the
/// non-generic implementation.
pub fn run_with_context<S: AsRef<str>>(&self, input: &str, siblings: &[S]) -> HunchResult {
    let borrowed_siblings = siblings
        .iter()
        .map(<S as AsRef<str>>::as_ref)
        .collect::<Vec<&str>>();
    self.run_with_context_inner(input, &borrowed_siblings)
}
/// Non-generic core of `run_with_context`: forwards to the fallback-aware
/// implementation without a fallback title.
fn run_with_context_inner(&self, input: &str, siblings: &[&str]) -> HunchResult {
    let fallback_title: Option<&str> = None;
    self.run_with_context_and_fallback_inner(input, siblings, fallback_title)
}
/// Parses `input` using `siblings` as cross-file context, with an optional
/// `fallback_title` that is offered as a title hint when the sibling analysis
/// yields no title (or when there are no siblings at all).
///
/// Generic front door: borrows every sibling as `&str` and hands off to the
/// non-generic implementation.
pub fn run_with_context_and_fallback<S: AsRef<str>>(
    &self,
    input: &str,
    siblings: &[S],
    fallback_title: Option<&str>,
) -> HunchResult {
    let borrowed_siblings = siblings
        .iter()
        .map(<S as AsRef<str>>::as_ref)
        .collect::<Vec<&str>>();
    self.run_with_context_and_fallback_inner(input, &borrowed_siblings, fallback_title)
}
/// Non-generic implementation behind the context-aware entry points.
///
/// * No siblings and no fallback title: identical to `run`.
/// * No siblings but a fallback title: a normal two-pass run in which the
///   fallback is supplied as a title hint (without a position).
/// * At least one sibling: `pass1` is run on the target and on every sibling,
///   the results are compared by `invariance::analyze_invariance`, and the
///   resulting report is handed to `pass2`. The title hint is the report's
///   title when it has one, otherwise the fallback title when given.
fn run_with_context_and_fallback_inner(
    &self,
    input: &str,
    siblings: &[&str],
    fallback_title: Option<&str>,
) -> HunchResult {
    // Without siblings there is nothing to compare against, so no invariance
    // analysis takes place.
    if siblings.is_empty() {
        return match fallback_title {
            None => self.run(input),
            Some(fb) => {
                let (mut spans, tokens, zones) = self.pass1(input);
                let hint = TitleHint {
                    value: fb.to_string(),
                    position: None,
                    source: "fallback",
                };
                self.pass2(input, &mut spans, &zones, &tokens, Some(hint), None)
            }
        };
    }

    // First pass over the target, then over each sibling in order.
    let (mut target_spans, target_tokens, target_zones) = self.pass1(input);
    let sibling_passes: Vec<_> = siblings
        .iter()
        .copied()
        .map(|sibling| self.pass1(sibling))
        .collect();

    // Pair every sibling's text with the matches its first pass produced.
    let sibling_analyses: Vec<_> = sibling_passes
        .iter()
        .zip(siblings.iter().copied())
        .map(|(pass, sibling)| invariance::FileAnalysis {
            input: sibling,
            matches: pass.0.as_slice(),
        })
        .collect();

    let report = invariance::analyze_invariance(
        &invariance::FileAnalysis {
            input,
            matches: &target_spans,
        },
        &sibling_analyses,
    );

    debug!(
        "cross-file context: {} sibling(s), title={:?}, {} year signal(s), {} episode signal(s)",
        siblings.len(),
        report.title,
        report.year_signals.len(),
        report.episode_signals.len(),
    );
    for year_signal in &report.year_signals {
        trace!(
            " [YEAR] {} at {}..{} invariant={}",
            year_signal.value, year_signal.start, year_signal.end, year_signal.is_invariant
        );
    }
    for episode_signal in &report.episode_signals {
        trace!(
            " [EPISODE] {} at {}..{} sequential={} digits={}",
            episode_signal.value,
            episode_signal.start,
            episode_signal.end,
            episode_signal.is_sequential,
            episode_signal.digit_count
        );
    }

    // The report's title takes precedence; the caller's fallback is only
    // consulted when the report has none.
    let title_hint = report
        .title
        .as_ref()
        .map(|invariant_title| TitleHint {
            value: invariant_title.clone(),
            position: report.title_start,
            source: "invariance",
        })
        .or_else(|| {
            fallback_title.map(|fb| TitleHint {
                value: fb.to_string(),
                position: None,
                source: "fallback",
            })
        });

    self.pass2(
        input,
        &mut target_spans,
        &target_zones,
        &target_tokens,
        title_hint,
        Some(&report),
    )
}
/// First pass: tokenizes `input`, builds its zone map, collects every raw
/// match, and then narrows the match list (title-year filtering, conflict
/// resolution, zone rules).
///
/// Returns the surviving matches together with the token stream and zone map
/// so that `pass2` (and the cross-file analysis) can reuse them. The steps
/// below mutate `all_matches` in sequence, so their order is significant.
fn pass1(&self, input: &str) -> (Vec<MatchSpan>, TokenStream, ZoneMap) {
// Step 1: split the input into segments of tokens.
let token_stream = tokenizer::tokenize(input);
debug!(
"step 1: tokenized into {} segment(s), {} total token(s)",
token_stream.segments.len(),
token_stream
.segments
.iter()
.map(|s| s.tokens.len())
.sum::<usize>()
);
// Step 1b: derive the zone map from the input and its tokens.
let zone_map = zone_map::build_zone_map(input, &token_stream);
debug!(
"step 1b: zone map — has_anchors={}, title_zone={}..{}, year={:?}",
zone_map.has_anchors,
zone_map.title_zone.start,
zone_map.title_zone.end,
zone_map.year.as_ref().map(|y| y.value)
);
// Step 2: run every rule and matcher to gather raw, possibly conflicting matches.
let mut all_matches = self.match_all(input, &token_stream, &zone_map);
debug!(
"step 2: matching produced {} raw match(es)",
all_matches.len()
);
for m in &all_matches {
trace!(
" raw match: {:?}={} at {}..{} (pri={})",
m.property, m.value, m.start, m.end, m.priority
);
}
// When the zone map carries year information, discard Year matches that
// overlap one of its `title_years` spans. Matches of any other property
// are kept. This happens before conflict resolution.
if let Some(ref yi) = zone_map.year {
all_matches.retain(|m| {
if m.property != Property::Year {
return true;
}
!match_overlaps_any_title_year(m.start, m.end, &yi.title_years)
});
}
// Step 3: conflict resolution; the count is captured first for the log line.
let pre_resolve_count = all_matches.len();
engine::resolve_conflicts(&mut all_matches);
debug!(
"step 3: conflict resolution — {} → {} match(es)",
pre_resolve_count,
all_matches.len()
);
// Step 4: zone-based disambiguation; again logs the before/after counts.
let pre_zone_count = all_matches.len();
zone_rules::apply_zone_rules(input, &zone_map, &token_stream, &mut all_matches);
debug!(
"step 4: zone disambiguation — {} → {} match(es)",
pre_zone_count,
all_matches.len()
);
// NOTE(review): judging by its name, this marks part matches as reclaimable
// when an episode match is present — confirm against the `part` module.
part::mark_reclaimable_when_episode_present(&mut all_matches);
for m in &all_matches {
trace!(
" resolved: {:?}={} at {}..{}",
m.property, m.value, m.start, m.end
);
}
(all_matches, token_stream, zone_map)
}
fn pass2(
&self,
input: &str,
all_matches: &mut Vec<MatchSpan>,
zone_map: &ZoneMap,
token_stream: &TokenStream,
title_hint: Option<TitleHint>,
report: Option<&invariance::InvarianceReport>,
) -> HunchResult {
let rg_matches = release_group::find_matches(input, all_matches, zone_map, token_stream);
debug!(
"step 5a: release group — found {:?}",
rg_matches
.iter()
.map(|m| m.value.as_str())
.collect::<Vec<_>>()
);
all_matches.extend(rg_matches);
zone_rules::apply_post_release_group_rules(all_matches);
if let Some(report) = report {
pass2_helpers::apply_invariance_signals(all_matches, report);
}
let extraction = title::extract_title(input, all_matches, zone_map, token_stream);
if let Some(final_title) = pick_final_title(input, extraction, title_hint.as_ref()) {
debug!(
"step 5b: title → {:?} at {}..{}",
final_title.value, final_title.start, final_title.end
);
title::absorb_reclaimable(&final_title, all_matches);
all_matches.push(final_title);
}
if let Some((film_title, adjusted_title)) =
title::extract_film_title(input, all_matches, token_stream)
{
all_matches.retain(|m| m.property != Property::Title);
all_matches.push(film_title);
all_matches.push(adjusted_title);
}
if let Some(ep_title) = title::extract_episode_title(input, all_matches, token_stream) {
debug!("step 5c: episode title — \"{}\"", ep_title.value);
let ep_start = ep_title.start;
let ep_end = ep_title.end;
all_matches.retain(|m| {
if m.property != Property::ReleaseGroup {
return true;
}
!pass2_helpers::release_group_overlaps_episode_title(
m.start, m.end, ep_start, ep_end,
)
});
all_matches.push(ep_title);
}
let alt_titles = title::extract_alternative_titles(input, all_matches, token_stream);
for alt_title in alt_titles {
all_matches.push(alt_title);
}
let media_type = title::infer_media_type(input, all_matches);
let proper_count = proper_count::compute_proper_count(input, all_matches);
if media_type == "movie" {
all_matches.retain(|m| {
!(m.property == Property::Episode && m.priority <= priority::HEURISTIC)
});
}
pass2_helpers::strip_tech_from_subtitle_containers(all_matches);
debug!(
"step 6: building result from {} final match(es), media_type={}",
all_matches.len(),
media_type
);
let mut result = HunchResult::from_matches(all_matches);
result.set(Property::MediaType, media_type);
if proper_count > 0 {
result.set(Property::ProperCount, proper_count.to_string());
}
let confidence =
pass2_helpers::compute_confidence(&result, title_hint.is_some(), all_matches);
result.set_confidence(confidence);
debug!("step 7: confidence = {:?}", confidence);
result
}
/// Collects every raw match for `input`, in this order: TOML rules (rule by
/// rule, segment by segment), then the legacy matchers, then — when the token
/// stream reports an extension — a Container match covering the trailing
/// extension bytes of `input`.
///
/// No conflict resolution happens here; the returned list may contain
/// overlapping matches.
fn match_all(
    &self,
    input: &str,
    token_stream: &TokenStream,
    zone_map: &ZoneMap,
) -> Vec<MatchSpan> {
    let mut collected = Vec::new();

    for toml_rule in &self.toml_rules {
        for (segment_index, seg) in token_stream.segments.iter().enumerate() {
            let in_directory = seg.kind == tokenizer::SegmentKind::Directory;
            // Filename-only rules never run against directory segments.
            if in_directory && toml_rule.scope == SegmentScope::FilenameOnly {
                continue;
            }
            let context = MatchContext {
                input,
                tokens: &seg.tokens,
                rule_set: toml_rule.rules,
                property: toml_rule.property,
                // The rule's priority is adjusted depending on the segment kind.
                priority: matching::effective_priority_for_segment(
                    toml_rule.priority,
                    in_directory,
                ),
                zone_map,
                // A directory zone is only looked up for directory segments.
                dir_zone: if in_directory {
                    matching::find_dir_zone_for_segment(&zone_map.dir_zones, segment_index)
                } else {
                    None
                },
            };
            matching::match_tokens_in_segment(&context, &mut collected);
        }
    }

    // Legacy matchers operate on the whole input string.
    collected.extend(
        self.legacy_matchers
            .iter()
            .flat_map(|legacy_matcher| legacy_matcher(input)),
    );

    // The extension match spans the last `extension.len()` bytes of the input.
    if let Some(extension) = token_stream.extension.as_ref() {
        let input_end = input.len();
        let extension_start = input_end - extension.len();
        let container =
            MatchSpan::new(extension_start, input_end, Property::Container, extension.as_str())
                .with_extension()
                .with_priority(priority::EXTENSION);
        collected.push(container);
    }

    collected
}
}
/// Decides which title span, if any, ends up in the result.
///
/// Precedence:
/// 1. an extraction with `TitleConfidence::Strong` (any hint is ignored);
/// 2. the hint, converted to a span by `hint_to_match`;
/// 3. a non-strong extraction, when no hint exists;
/// 4. nothing, when there is neither an extraction nor a hint.
fn pick_final_title(
    input: &str,
    extraction: Option<TitleExtraction>,
    hint: Option<&TitleHint>,
) -> Option<MatchSpan> {
    // A strong extraction settles the question immediately.
    let extraction = match extraction {
        Some(strong) if strong.confidence == TitleConfidence::Strong => {
            trace!(
                "title decision: STRONG extraction wins ({:?}); hint discarded",
                strong.span.value
            );
            return Some(strong.span);
        }
        not_strong => not_strong,
    };

    // Otherwise a hint, when present, beats whatever was extracted.
    if let Some(h) = hint {
        trace!(
            "title decision: hint wins (source={}, value={:?}); extraction was {:?}",
            h.source,
            h.value,
            extraction.as_ref().map(|e| (&e.span.value, e.confidence))
        );
        return Some(hint_to_match(input, h));
    }

    // No hint: fall back to the weak extraction, if there is one.
    match extraction {
        Some(weak) => {
            trace!(
                "title decision: weak extraction wins ({:?}); no hint available",
                weak.span.value
            );
            Some(weak.span)
        }
        None => {
            trace!("title decision: no extraction, no hint — no title");
            None
        }
    }
}
/// Converts a title hint into a `Property::Title` span.
///
/// The hint's own position is used when it has one; otherwise the first
/// occurrence of the hint text in `input` is used. When neither is available
/// the title is emitted with the empty span `0..0`.
fn hint_to_match(input: &str, hint: &TitleHint) -> crate::matcher::span::MatchSpan {
    use crate::matcher::span::{MatchSpan, Property};

    let title_text = hint.value.as_str();
    match hint.position.or_else(|| input.find(title_text)) {
        Some(offset) => {
            // The helper derives the final span bounds from the offset, the
            // title length and the input length.
            let (span_start, span_end) = pass2_helpers::compute_override_title_span(
                offset,
                title_text.len(),
                input.len(),
            );
            MatchSpan::new(span_start, span_end, Property::Title, title_text)
        }
        None => MatchSpan::new(0, 0, Property::Title, title_text),
    }
}
// Unit tests for `match_overlaps_any_title_year`, pinning its overlap
// semantics: containment and partial overlap count, merely touching at an
// endpoint does not.
#[cfg(test)]
mod tests {
use super::*;
// Builds a `TitleYear` spanning `start..end`; the year value itself is
// irrelevant to the overlap check.
fn ty(start: usize, end: usize) -> TitleYear {
TitleYear {
value: 2049, start,
end,
}
}
// With no title years there is nothing to overlap.
#[test]
fn overlap_empty_title_years_returns_false() {
assert!(!match_overlaps_any_title_year(0, 100, &[]));
}
// Match strictly inside the title-year span.
#[test]
fn overlap_match_fully_inside_title_year_returns_true() {
assert!(match_overlaps_any_title_year(11, 13, &[ty(10, 14)]));
}
// Match that completely covers the title-year span.
#[test]
fn overlap_match_fully_contains_title_year_returns_true() {
assert!(match_overlaps_any_title_year(5, 20, &[ty(10, 14)]));
}
// Match that ends well before the span starts.
#[test]
fn overlap_match_disjoint_before_returns_false() {
assert!(!match_overlaps_any_title_year(0, 5, &[ty(10, 14)]));
}
// Match that starts well after the span ends.
#[test]
fn overlap_match_disjoint_after_returns_false() {
assert!(!match_overlaps_any_title_year(20, 25, &[ty(10, 14)]));
}
// Match end equals span start: touching, not overlapping.
#[test]
fn overlap_match_touching_at_left_returns_false() {
assert!(!match_overlaps_any_title_year(5, 10, &[ty(10, 14)]));
}
// Match start equals span end: touching, not overlapping.
#[test]
fn overlap_match_touching_at_right_returns_false() {
assert!(!match_overlaps_any_title_year(14, 20, &[ty(10, 14)]));
}
// Match starts one position before the span ends.
#[test]
fn overlap_match_one_byte_inside_at_right_edge_returns_true() {
assert!(match_overlaps_any_title_year(13, 20, &[ty(10, 14)]));
}
// Match ends one position after the span starts.
#[test]
fn overlap_match_one_byte_inside_at_left_edge_returns_true() {
assert!(match_overlaps_any_title_year(5, 11, &[ty(10, 14)]));
}
// Overlapping any single span out of several is enough.
#[test]
fn overlap_with_multiple_title_years_returns_true_if_any_match() {
let years = vec![ty(0, 4), ty(10, 14), ty(20, 24)];
assert!(match_overlaps_any_title_year(22, 23, &years));
}
// A match that falls in the gap between spans overlaps none of them.
#[test]
fn overlap_with_multiple_title_years_returns_false_if_none_match() {
let years = vec![ty(0, 4), ty(10, 14), ty(20, 24)];
assert!(!match_overlaps_any_title_year(15, 19, &years));
}
}