hunch 2.0.2

A media filename parser for movies, TV, and anime — built in Rust, inspired by guessit
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
//! Title extraction — positional rule ("whatever's left" after other matchers).
//!
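//! Split into submodules:
//! - `clean` — string cleaning (separators, brackets, casing)
//! - `secondary` — episode_title, film_title, alternative_title, media_type
//! - `strategies` — fallback title strategies (`ParentDir`, `UnclaimedBracket`,
//!   `AfterBracketGroup`) plus the `TitleExtraction` / `TitleConfidence` types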
//!
//! ## Why this lives in Rust (not `src/rules/`)
//!
//! Title is the *residual* extractor: it can't be expressed as a
//! pattern because its definition is "the substring no other property
//! claimed". The cleanup phase then applies algorithmic post-processing
//! (separator normalization, bracket stripping, casing rules) that's
//! pure logic, not vocabulary. See DESIGN.md D2 decision table →
//! "cross-pattern coordination" (depends on every other matcher's
//! output spans).

mod clean;
mod secondary;
mod strategies;

pub use secondary::{
    extract_alternative_titles, extract_episode_title, extract_film_title, infer_media_type,
};
pub use strategies::{TitleConfidence, TitleExtraction};

use crate::FILENAME_SEPS as SEPS;
use crate::matcher::span::{MatchSpan, Property};
use crate::tokenizer::TokenStream;
use crate::zone_map::ZoneMap;
use clean::{clean_title, is_abbreviated, is_likely_extension, pick_better_casing};
use strategies::{StrategyContext, TitleStrategy};

/// Characters we strip from title boundaries: the opening and closing forms
/// of round, square, and curly brackets.
///
/// NOTE(review): nothing in the part of this module shown here references
/// this constant; presumably the submodules read it via `super::BRACKETS` —
/// confirm before removing or changing it.
const BRACKETS: &[char] = &['(', ')', '[', ']', '{', '}'];

/// Reports whether `p` is technical release metadata rather than something
/// that could be a title word.
///
/// Title extraction uses this to grade how trustworthy a title boundary is
/// (a boundary formed only by a tech token is weak) and to skip tech tokens
/// that lead a filename.
fn is_tech_property(p: Property) -> bool {
    // Attributes of the picture stream.
    let describes_video = matches!(
        p,
        Property::VideoCodec
            | Property::VideoProfile
            | Property::ScreenSize
            | Property::FrameRate
            | Property::ColorDepth
    );

    // Attributes of the sound stream.
    let describes_audio = matches!(
        p,
        Property::AudioCodec | Property::AudioChannels | Property::AudioProfile
    );

    // Provenance and packaging of the release itself.
    let describes_release = matches!(
        p,
        Property::Source | Property::StreamingService | Property::Edition | Property::Other
    );

    describes_video || describes_audio || describes_release
}

/// Extract title from the input string by finding the gap before the first
/// recognized match. This is a post-processing step, not a `PropertyMatcher`.
///
/// The `zone_map` is used for year-as-title disambiguation (e.g., "2001" in
/// "2001.A.Space.Odyssey.1968" is a title word, not the release year).
///
/// Reclaimable matches (marked by TOML `requires_nearby`) are transparent
/// to the title boundary: they don't stop the title, and if absorbed into
/// the title span they are removed from `matches`.
/// Extract title from the input string by finding the gap before the first
/// recognized match. This is a post-processing step, not a `PropertyMatcher`.
///
/// The `zone_map` is used for year-as-title disambiguation (e.g., "2001" in
/// "2001.A.Space.Odyssey.1968" is a title word, not the release year).
///
/// to the title boundary: they don't stop the title, and if absorbed into
/// the title span they are removed from `matches`.
///
/// Returns a [`TitleExtraction`] pairing the title span with a confidence
/// score that the pipeline uses to decide whether cross-file context
/// (invariance, ancestor fallback) should override this title.
pub fn extract_title(
    input: &str,
    matches: &[MatchSpan],
    zone_map: &ZoneMap,
    _token_stream: &TokenStream,
) -> Option<TitleExtraction> {
    let filename_start = input.rfind(['/', '\\']).map(|i| i + 1).unwrap_or(0);
    let filename = &input[filename_start..];

    // Title boundary: first non-extension match in the filename.
    // Reclaimable matches are skipped ONLY if there's title content before
    // them (e.g., "Pacific.Rim.3D" → skip 3D, absorb into title).
    // If a reclaimable match starts at the filename beginning, it's treated
    // normally (e.g., "3D.2019" → 3D is Other, not title content).
    let first_match_in_filename = matches
        .iter()
        .filter(|m| {
            m.start >= filename_start
                && !m.is_extension
                && (!m.reclaimable || m.start == filename_start)
        })
        .min_by_key(|m| m.start);

    let title_end_abs = match first_match_in_filename {
        Some(m) => m.start,
        None => {
            let ext_start = filename.rfind('.').unwrap_or(filename.len());
            if ext_start < filename.len() {
                let candidate_ext = &filename[ext_start + 1..];
                if is_likely_extension(&candidate_ext.to_lowercase()) {
                    filename_start + ext_start
                } else {
                    filename_start + filename.len()
                }
            } else {
                filename_start + filename.len()
            }
        }
    };

    if title_end_abs <= filename_start {
        return handle_empty_title(
            input,
            filename_start,
            filename,
            matches,
            zone_map,
            first_match_in_filename,
        );
    }

    let raw_title = &input[filename_start..title_end_abs];

    // Truncate at structural separators (" - ", "--", "(").
    let structural_sep_offset = find_first_structural_separator(raw_title);
    let title_end_abs = structural_sep_offset
        .map(|offset| filename_start + offset)
        .unwrap_or(title_end_abs);
    let raw_title = &input[filename_start..title_end_abs];

    // Confidence classification for the residual extractor.
    //
    // Strong: the title was bounded by an explicit author-placed marker
    //   - a structural separator (" - ", "(", "--")
    //   - a content property (Year, Season, Episode, Part, ...)
    //   The file is self-describing.
    //
    // Weak: the title is just "the bytes before the first tech token"
    //   or "the bytes before the extension" — no deliberate marker. A
    //   filename like `Special.720p.mkv` extracts "Special" as a Weak
    //   title, and the pipeline will prefer ancestor fallback if any.
    let confidence = if structural_sep_offset.is_some() {
        TitleConfidence::Strong
    } else {
        match first_match_in_filename {
            Some(m) if !is_tech_property(m.property) => TitleConfidence::Strong,
            _ => TitleConfidence::Weak,
        }
    };

    let cleaned = clean_title(raw_title);

    if cleaned.is_empty() {
        let ctx = StrategyContext {
            input,
            matches,
            filename_start,
        };
        if let Some(title) = strategies::run_fallback_ladder(&ctx) {
            return Some(title);
        }
        return None;
    }

    // Prefer parent dir casing when titles match case-insensitively.
    if has_parent_dir(input)
        && let Some(parent_match) = strategies::ParentDir.try_extract(&StrategyContext {
            input,
            matches,
            filename_start,
        })
        && parent_match.value.to_lowercase() == cleaned.to_lowercase()
        && parent_match.value != cleaned
    {
        let best = pick_better_casing(&cleaned, &parent_match.value);
        if best != cleaned {
            return Some(TitleExtraction::new(
                MatchSpan::new(filename_start, title_end_abs, Property::Title, best),
                confidence,
            ));
        }
    }

    // Abbreviated filenames fall back to parent directory.
    if is_abbreviated(&cleaned)
        && has_parent_dir(input)
        && let Some(parent_title) = strategies::ParentDir.try_extract(&StrategyContext {
            input,
            matches,
            filename_start,
        })
    {
        return Some(TitleExtraction::new(
            parent_title,
            strategies::ParentDir.confidence(),
        ));
    }

    Some(TitleExtraction::new(
        MatchSpan::new(filename_start, title_end_abs, Property::Title, cleaned),
        confidence,
    ))
}

/// Drop every reclaimable match that lies entirely inside the title span.
///
/// Meant to run after title extraction. A match is removed only when it is
/// flagged `reclaimable` AND its whole byte range sits within
/// `title.start..title.end`; a match that merely straddles a title edge is
/// kept, and non-reclaimable matches are never touched.
pub fn absorb_reclaimable(title: &MatchSpan, matches: &mut Vec<MatchSpan>) {
    matches.retain(|candidate| {
        let inside_title = title.start <= candidate.start && candidate.end <= title.end;
        !(candidate.reclaimable && inside_title)
    });
}

/// Handle the case where title_end_abs <= filename_start (empty title zone).
fn handle_empty_title(
    input: &str,
    filename_start: usize,
    filename: &str,
    matches: &[MatchSpan],
    zone_map: &ZoneMap,
    first_match_in_filename: Option<&MatchSpan>,
) -> Option<TitleExtraction> {
    // Year-as-title via ZoneMap: e.g., "2001" in "2001.A.Space.Odyssey.1968".
    if let Some(ref yi) = zone_map.year
        && let Some(ty) = yi.title_years.iter().find(|ty| ty.start == filename_start)
        && let Some(title) =
            extract_title_after_position(input, ty.end, filename_start, filename, matches)
    {
        // Year-anchored title is a structural marker.
        return Some(TitleExtraction::new(title, TitleConfidence::Strong));
    }
    // Fallback: first match is a Year at filename start.
    if let Some(first_m) = first_match_in_filename
        && first_m.property == Property::Year
        && first_m.start == filename_start
        && let Some(title) =
            extract_title_after_position(input, first_m.end, filename_start, filename, matches)
    {
        return Some(TitleExtraction::new(title, TitleConfidence::Strong));
    }
    // Leading tech tokens at filename start (e.g., "h265 - HEVC Riddick...").
    // Skip past all contiguous tech matches at the start to find the title gap.
    if let Some(first_m) = first_match_in_filename
        && first_m.start == filename_start
        && is_tech_property(first_m.property)
    {
        // Find the end of the last contiguous tech match at the start.
        let mut skip_end = first_m.end;
        loop {
            let next = matches.iter().find(|m| {
                m.start >= skip_end
                    && m.start <= skip_end + 3 // allow small separator gap
                    && m.start < filename_start + filename.len()
                    && !m.is_extension
                    && is_tech_property(m.property)
            });
            match next {
                Some(m) => skip_end = m.end,
                None => break,
            }
        }
        if let Some(title) =
            extract_title_after_position(input, skip_end, filename_start, filename, matches)
        {
            // Bounded by tech only — weak (residual extraction).
            return Some(TitleExtraction::new(title, TitleConfidence::Weak));
        }
    }
    // Single short word with no path/extension → treat as title.
    if !input.contains(['/', '\\']) && !input.contains('.') && input.len() <= 10 {
        let cleaned = clean_title(input);
        if !cleaned.is_empty() {
            return Some(TitleExtraction::new(
                MatchSpan::new(0, input.len(), Property::Title, cleaned),
                TitleConfidence::Weak,
            ));
        }
    }
    // Unclaimed bracket content: when everything is in brackets and one
    // bracket group isn't claimed by any matcher, it's likely the title.
    // E.g., [DBD-Raws][4K_HDR][ready.player.one][2160P][...].mkv
    let ctx = StrategyContext {
        input,
        matches,
        filename_start,
    };
    if let Some(title) = strategies::UnclaimedBracket.try_extract(&ctx) {
        return Some(TitleExtraction::new(
            title,
            strategies::UnclaimedBracket.confidence(),
        ));
    }
    strategies::ParentDir
        .try_extract(&ctx)
        .map(|t| TitleExtraction::new(t, strategies::ParentDir.confidence()))
}

/// Build a title span from byte offset `start` up to the next match.
///
/// The span ends at the smallest `start` offset among non-extension matches
/// that begin strictly after `start`; with no such match it ends at the end
/// of the filename. Returns `None` when that range is empty or cleans down
/// to an empty string.
fn extract_title_after_position(
    input: &str,
    start: usize,
    filename_start: usize,
    filename: &str,
    matches: &[MatchSpan],
) -> Option<MatchSpan> {
    let filename_end = filename_start + filename.len();

    let title_end = matches
        .iter()
        .filter(|m| !m.is_extension)
        .map(|m| m.start)
        .filter(|&candidate| candidate > start)
        .min()
        .unwrap_or(filename_end);

    if title_end <= start {
        return None;
    }

    let cleaned = clean_title(&input[start..title_end]);
    if cleaned.is_empty() {
        None
    } else {
        Some(MatchSpan::new(start, title_end, Property::Title, cleaned))
    }
}

/// Reports whether `input` contains a path separator (`/` or `\`), i.e.
/// whether there is a directory component in front of the filename.
fn has_parent_dir(input: &str) -> bool {
    input.chars().any(|c| matches!(c, '/' | '\\'))
}

/// Locate the earliest structural separator in `raw` and return its byte
/// offset, or `None` when no separator qualifies.
///
/// "Structural separators" are the punctuation patterns release-naming
/// conventions use to split a title from its trailing metadata: `" ("`,
/// `" - "`, `"--"`, and their `_`/`.`-flanked equivalents.
///
/// A separator qualifies only if at least 3 bytes precede it; anything
/// shorter is too short to be a real title prefix. Only the FIRST
/// occurrence of each pattern is examined: when that occurrence is too
/// early, later occurrences of the same pattern are not searched, although
/// a different pattern may still qualify.
///
/// # Semantics: first wins
///
/// **All current callers want this**: a parenthesized year, alt-title,
/// or `" - "` segment marks the *end* of the canonical title; everything
/// after it is metadata or a sub-title. So the result is the smallest
/// qualifying offset across the per-pattern hits.
///
/// # When NOT to use this
///
/// Some inputs legitimately contain `" - "` *inside* the title:
///
/// - Anime multi-segment releases:
///   `[Group] Show - Sub-arc Part 2 - 13 [tags].mkv` — here the first
///   `" - "` separates two title segments, NOT title from metadata.
/// - Spider-style hyphenated names that survive `normalize_separators`
///   are a separate concern (they keep the `-`, no surrounding spaces).
///
/// In those cases the caller already knows the boundary structurally
/// (e.g. an Episode `MatchSpan` after the title) and should compute the
/// trim point directly rather than asking this function. See
/// `strategies::AfterBracketGroup` for the canonical example: it skips
/// `find_first_structural_separator` on the anime-episode branch and
/// trims trailing separators by hand instead. See also #124 / #127.
pub(super) fn find_first_structural_separator(raw: &str) -> Option<usize> {
    /// Minimum length the title prefix must have for a separator to count.
    /// Guards against pathological inputs like `"a - b"` where `" - "`
    /// at offset 1 would leave almost nothing as the title.
    const MIN_TITLE_LEN: usize = 3;

    /// Every pattern that counts as a structural separator.
    const SEPARATORS: [&str; 7] = [" (", "_(", ".(", " - ", "_-_", ".-.", "--"];

    let mut earliest: Option<usize> = None;
    for sep in SEPARATORS.iter() {
        match raw.find(*sep) {
            // Keep the hit only if it beats the best offset seen so far.
            Some(pos) if pos >= MIN_TITLE_LEN => {
                earliest = Some(match earliest {
                    Some(best) if best <= pos => best,
                    _ => pos,
                });
            }
            // Pattern absent, or its first occurrence is too early.
            _ => {}
        }
    }
    earliest
}

// Unit tests for title extraction, the structural-separator helper, and the
// re-exported `infer_media_type`. `MatchSpan` fixtures are built by hand with
// explicit byte offsets into each input string.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenizer;
    use crate::zone_map;

    /// Tokenize `input` and build the `ZoneMap` that `extract_title` expects.
    fn test_zone_map(input: &str) -> ZoneMap {
        let ts = tokenizer::tokenize(input);
        zone_map::build_zone_map(input, &ts)
    }

    /// Tokenize `input`; `extract_title` takes the stream but ignores it.
    fn test_ts(input: &str) -> tokenizer::TokenStream {
        tokenizer::tokenize(input)
    }

    // ── find_first_structural_separator ──────────────────────────
    //
    // These tests pin the "first wins" semantic. They exist so that
    // anyone tempted to change `min` to `max` (or to add an EpisodeAware
    // mode without a use case) reads this comment first. See the rustdoc
    // on the function for the rationale.

    #[test]
    fn first_separator_wins_picks_earliest_offset() {
        // " - " at offset 4 wins over " (" at offset 12.
        assert_eq!(
            find_first_structural_separator("Show - Subtitle (2020)"),
            Some(4)
        );
    }

    #[test]
    fn first_separator_skips_too_short_prefix() {
        // "a - b": " - " at offset 1 < MIN_TITLE_LEN, so None.
        assert_eq!(find_first_structural_separator("a - b"), None);
        // "abc - d": offset 3 ≥ MIN_TITLE_LEN, accepted.
        assert_eq!(find_first_structural_separator("abc - d"), Some(3));
    }

    #[test]
    fn first_separator_returns_none_on_separatorless_input() {
        assert_eq!(
            find_first_structural_separator("PlainTitleNoSeparator"),
            None
        );
    }

    #[test]
    fn first_separator_caveat_anime_multi_segment() {
        // KNOWN LIMITATION (documented on the function): for anime-style
        // multi-segment titles, the FIRST " - " is INSIDE the title, not at
        // the boundary. Callers facing this case must NOT use this function.
        // This test pins the limitation so a future "fix" doesn't silently
        // break `strategies::AfterBracketGroup`'s anime-episode branch.
        let raw = "Enen no Shouboutai - San no Shou Part 2";
        assert_eq!(
            find_first_structural_separator(raw),
            Some(18),
            "function returns the first \" - \"; AfterBracketGroup must \
             bypass it on the anime-episode branch (#124 / #127)"
        );
    }

    // ── extract_title ────────────────────────────────────────────

    // The title is the text before the Year match (bytes 11..15), with the
    // dot separators cleaned into spaces.
    #[test]
    fn test_title_before_year() {
        let input = "The.Matrix.1999.1080p.mkv";
        let matches = vec![MatchSpan::new(11, 15, Property::Year, "1999")];
        let zm = test_zone_map(input);
        let ts = test_ts(input);
        let title = extract_title(input, &matches, &zm, &ts).unwrap();
        assert_eq!(title.span.value, "The Matrix");
    }

    // With no matches at all, the title is the filename minus its extension.
    #[test]
    fn test_title_no_matches() {
        let input = "JustATitle.mkv";
        let zm = test_zone_map(input);
        let ts = test_ts(input);
        let title = extract_title(input, &[], &zm, &ts).unwrap();
        assert_eq!(title.span.value, "JustATitle");
    }

    // Leading directories are ignored: only the last path component is
    // scanned, up to the Year match at bytes 22..26.
    #[test]
    fn test_title_with_path() {
        let input = "/movies/dir/The.Movie.2020.mkv";
        let matches = vec![MatchSpan::new(22, 26, Property::Year, "2020")];
        let zm = test_zone_map(input);
        let ts = test_ts(input);
        let title = extract_title(input, &matches, &zm, &ts).unwrap();
        assert_eq!(title.span.value, "The Movie");
    }

    // The filename "dmd-aw" is expected to be replaced by the title taken
    // from the parent directory. The only match supplied lies in the
    // directory part of the path, not in the filename.
    #[test]
    fn test_abbreviated_fallback() {
        let input = "Movies/Alice in Wonderland DVDRip.XviD-DiAMOND/dmd-aw.avi";
        let matches = vec![MatchSpan::new(27, 34, Property::Source, "DVD")];
        let zm = test_zone_map(input);
        let ts = test_ts(input);
        let title = extract_title(input, &matches, &zm, &ts);
        assert!(title.is_some());
        assert_eq!(title.unwrap().span.value, "Alice in Wonderland");
    }

    // Season + Episode matches classify the input as an episode.
    // NOTE(review): the span offsets here do not line up with "S01E03" in
    // the input; presumably only the property kinds matter — confirm.
    #[test]
    fn test_infer_episode() {
        let matches = vec![
            MatchSpan::new(0, 5, Property::Season, "1"),
            MatchSpan::new(5, 10, Property::Episode, "3"),
        ];
        assert_eq!(infer_media_type("Show.S01E03.mkv", &matches), "episode");
    }

    // A reclaimable match in the middle of the filename does not end the
    // title, and `absorb_reclaimable` then removes it from `matches`.
    #[test]
    fn test_reclaimable_absorbed_into_title() {
        let input = "Harold.And.Kumar.3D.Christmas.mkv";
        let reclaimable_3d = MatchSpan::new(17, 19, Property::Other, "3D").with_reclaimable();
        let mut matches = vec![reclaimable_3d];
        let zm = test_zone_map(input);
        let ts = test_ts(input);
        let title = extract_title(input, &matches, &zm, &ts).unwrap();
        assert_eq!(title.span.value, "Harold And Kumar 3D Christmas");
        // Absorb should remove the reclaimable match.
        absorb_reclaimable(&title.span, &mut matches);
        assert!(matches.is_empty(), "reclaimable 3D should be absorbed");
    }

    #[test]
    fn test_confident_3d_stops_title() {
        // When 3D is NOT reclaimable (confident), it sets the title boundary.
        let input = "Pacific.Rim.3D.2013.BluRay.mkv";
        let confident_3d = MatchSpan::new(12, 14, Property::Other, "3D");
        let year = MatchSpan::new(15, 19, Property::Year, "2013");
        let matches = vec![confident_3d, year];
        let zm = test_zone_map(input);
        let ts = test_ts(input);
        let title = extract_title(input, &matches, &zm, &ts).unwrap();
        assert_eq!(title.span.value, "Pacific Rim");
    }

    // ── infer_media_type ─────────────────────────────────────────

    // A lone Year match (no season/episode) classifies the input as a movie.
    #[test]
    fn test_infer_movie() {
        let matches = vec![MatchSpan::new(0, 4, Property::Year, "2024")];
        assert_eq!(infer_media_type("Movie.2024.mkv", &matches), "movie");
    }

    #[test]
    fn test_movie_dir_suppresses_heuristic_episode() {
        // "Movie 10" in a movie/ directory: bare number is a franchise number,
        // not an episode. Path context should win over heuristic episode.
        let matches = vec![
            MatchSpan::new(52, 56, Property::Episode, "10")
                .with_priority(crate::priority::HEURISTIC),
        ];
        assert_eq!(
            infer_media_type(
                "movie/Japanese/Detective Conan/Detective.Conan.Movie.10.mkv",
                &matches
            ),
            "movie"
        );
    }

    #[test]
    fn test_movie_dir_keeps_strong_episode() {
        // SxxExx in a movie/ directory: strong signal overrides path context.
        let matches = vec![
            MatchSpan::new(0, 6, Property::Season, "1"),
            MatchSpan::new(0, 6, Property::Episode, "3").with_priority(crate::priority::STRUCTURAL),
        ];
        assert_eq!(
            infer_media_type("movie/Show.S01E03.mkv", &matches),
            "episode"
        );
    }
}