/// A single token produced by splitting one path segment on separators and
/// brackets.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
/// Token text exactly as it appears in the input.
pub text: String,
/// Byte offset of the token's first byte within the tokenized input.
pub start: usize,
/// Byte offset one past the token's last byte within the tokenized input.
pub end: usize,
/// Separator run that immediately preceded this token; `Separator::None`
/// when nothing preceded it (e.g. first token, or right after a bracket group).
pub separator: Separator,
/// `true` when the tokenizer found this token inside a `[...]` or `(...)` group.
pub in_brackets: bool,
// Lowercased copy of `text`, computed once in `Token::new` and exposed
// read-only through `Token::lower`.
lower: String,
}
impl Token {
    /// Builds a token from its text, byte range, preceding separator and
    /// bracket flag, caching the lowercased form of `text` alongside it.
    pub(crate) fn new(
        text: String,
        start: usize,
        end: usize,
        separator: Separator,
        in_brackets: bool,
    ) -> Self {
        Self {
            // Field initializers run in written order, so this borrows `text`
            // before it is moved into the struct on the next line.
            lower: text.to_lowercase(),
            text,
            start,
            end,
            separator,
            in_brackets,
        }
    }

    /// Returns the lowercased token text that was cached at construction.
    pub fn lower(&self) -> &str {
        self.lower.as_str()
    }
}
/// The kind of separator that preceded a [`Token`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Separator {
/// No separator preceded the token.
None,
/// A `.` character.
Dot,
/// A `-` character.
Dash,
/// A space; `byte_to_separator` also maps `,` to this variant.
Space,
/// A `_` character.
Underscore,
/// A path separator: `byte_to_separator` maps `/` and `\` to this variant.
PathSep,
}
/// Classifies a [`PathSegment`] by its position in the path.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum SegmentKind {
/// Any path component other than the last one.
Directory,
/// The last path component; only this kind has its extension split off.
Filename,
}
/// One retained component of the input path together with its tokens.
#[derive(Debug, Clone)]
pub struct PathSegment {
/// Whether this component is a directory or the final filename.
pub kind: SegmentKind,
/// Tokens found in this component (extension excluded for filenames).
pub tokens: Vec<Token>,
/// Byte offset of the component's first byte within the input.
pub start: usize,
/// Byte offset one past the component's last byte within the input
/// (the full component text, including any extension).
pub end: usize,
/// Index of this segment among the retained segments (empty components
/// and drive letters are skipped and do not count).
#[allow(dead_code)]
pub depth: usize,
}
/// A bracketed span (`[...]`, `(...)` or `{...}`) located in the raw input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BracketGroup {
/// Which bracket pair delimits the group.
pub kind: BracketKind,
/// Byte offset of the opening bracket within the input.
pub open: usize,
/// Byte offset of the matching closing bracket within the input.
pub close: usize,
/// Text strictly between the opening and closing brackets.
pub content: String,
/// Index into `TokenStream::segments` of the segment containing the opening
/// bracket; falls back to the last segment's index when none contains it.
pub segment_idx: usize,
}
impl BracketGroup {
// No inherent methods are currently defined.
}
/// The bracket pair that delimits a [`BracketGroup`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum BracketKind {
/// `[` ... `]`
Square,
/// `(` ... `)`
Round,
/// `{` ... `}`
Curly,
}
/// Everything [`tokenize`] extracts from one input string.
#[derive(Debug, Clone)]
pub struct TokenStream {
/// Owned copy of the string that was tokenized.
#[allow(dead_code)]
pub input: String,
/// Tokens of every segment, flattened in segment order.
pub tokens: Vec<Token>,
/// Retained path segments (empty components and drive letters are skipped).
pub segments: Vec<PathSegment>,
/// Value returned by `crate::filename_start` for the input.
/// NOTE(review): presumably the byte offset where the filename begins — confirm
/// against that helper.
pub filename_start: usize,
/// Lowercased extension of the final path component when it is a recognized
/// extension; `None` otherwise.
pub extension: Option<String>,
/// Bracketed spans found in the raw input, in order of appearance.
pub bracket_groups: Vec<BracketGroup>,
}
pub fn tokenize(input: &str) -> TokenStream {
let filename_start = crate::filename_start(input);
let raw_parts = split_path_segments(input);
let mut segments = Vec::new();
let mut extension = None;
let last_idx = raw_parts.len().saturating_sub(1);
for (depth, &(seg_text, seg_start)) in raw_parts.iter().enumerate() {
if seg_text.is_empty() {
continue;
}
if is_drive_letter(seg_text) {
continue;
}
let is_filename = depth == last_idx;
let kind = if is_filename {
SegmentKind::Filename
} else {
SegmentKind::Directory
};
let (name_part, seg_ext) = if is_filename {
split_extension(seg_text)
} else {
(seg_text, None)
};
if is_filename {
extension = seg_ext;
}
let protected = find_dot_acronyms(name_part);
let tokens = split_into_tokens(name_part, seg_start, &protected);
let actual_depth = segments.len();
segments.push(PathSegment {
kind,
tokens,
start: seg_start,
end: seg_start + seg_text.len(),
depth: actual_depth,
});
}
let tokens: Vec<Token> = segments
.iter()
.flat_map(|seg| seg.tokens.iter().cloned())
.collect();
let bracket_groups = extract_bracket_groups(input, &segments);
TokenStream {
input: input.to_string(),
tokens,
segments,
filename_start,
extension,
bracket_groups,
}
}
/// Scans the raw input for `[...]`, `(...)` and `{...}` spans.
///
/// Each opening bracket is paired with the first matching closing character
/// that follows it; scanning then resumes after that closing character, so
/// groups never overlap. An opening bracket with no closing partner is skipped.
/// `segment_idx` is the segment whose byte range contains the opening bracket,
/// or the last segment's index (0 when there are no segments) if none does.
fn extract_bracket_groups(input: &str, segments: &[PathSegment]) -> Vec<BracketGroup> {
    let bytes = input.as_bytes();
    let fallback_idx = segments.len().saturating_sub(1);

    let mut groups = Vec::new();
    let mut pos = 0;
    while let Some(&byte) = bytes.get(pos) {
        let (closer, kind) = match byte {
            b'[' => (']', BracketKind::Square),
            b'(' => (')', BracketKind::Round),
            b'{' => ('}', BracketKind::Curly),
            _ => {
                pos += 1;
                continue;
            }
        };

        // The opening bracket is a single ASCII byte, so `pos + 1` is a char boundary.
        let inner_start = pos + 1;
        let Some(rel_close) = input[inner_start..].find(closer) else {
            pos += 1;
            continue;
        };
        let close = inner_start + rel_close;

        let segment_idx = segments
            .iter()
            .position(|seg| (seg.start..seg.end).contains(&pos))
            .unwrap_or(fallback_idx);

        groups.push(BracketGroup {
            kind,
            open: pos,
            close,
            content: input[inner_start..close].to_string(),
            segment_idx,
        });
        pos = close + 1;
    }
    groups
}
/// Splits `input` on `/` and `\`, returning each component with the byte
/// offset at which it starts.
///
/// Empty components (leading, trailing or doubled separators) are kept, so the
/// result always has exactly one more entry than there are separators.
fn split_path_segments(input: &str) -> Vec<(&str, usize)> {
    let mut parts = Vec::new();
    let mut seg_start = 0;
    for (sep_idx, _) in input.match_indices(|c: char| matches!(c, '/' | '\\')) {
        parts.push((&input[seg_start..sep_idx], seg_start));
        // Both separators are one byte wide.
        seg_start = sep_idx + 1;
    }
    parts.push((&input[seg_start..], seg_start));
    parts
}
/// Returns `true` when `seg` is exactly one ASCII letter followed by `:`
/// (for example `C:`).
fn is_drive_letter(seg: &str) -> bool {
    matches!(seg.as_bytes(), [letter, b':'] if letter.is_ascii_alphabetic())
}
/// Splits a recognized extension off the end of `filename`.
///
/// Returns the stem and the lowercased extension when the text after the last
/// `.` is non-empty, at most five bytes long and accepted by
/// `is_known_extension`. Otherwise returns the whole name and `None`.
fn split_extension(filename: &str) -> (&str, Option<String>) {
    match filename.rsplit_once('.') {
        Some((stem, ext)) if !ext.is_empty() && ext.len() <= 5 && is_known_extension(ext) => {
            (stem, Some(ext.to_lowercase()))
        }
        _ => (filename, None),
    }
}
/// Returns `true` when `ext`, compared case-insensitively (via
/// `str::to_lowercase`), is one of the recognized file extensions.
fn is_known_extension(ext: &str) -> bool {
    const KNOWN: &[&str] = &[
        // video
        "mkv", "mp4", "avi", "wmv", "flv", "mov", "webm", "ogm", "ogv", "ts", "m2ts", "m4v",
        "mpg", "mpeg", "vob", "divx", "3gp",
        // subtitles
        "srt", "sub", "ssa", "ass", "idx", "sup", "vtt",
        // text / info
        "nfo", "txt",
        // images
        "jpg", "jpeg", "png",
        // usenet / parity
        "nzb", "par", "par2",
        // disc images and archives
        "iso", "img", "rar", "zip", "7z",
    ];
    let lowered = ext.to_lowercase();
    KNOWN.contains(&lowered.as_str())
}
/// Finds dotted acronyms such as `S.H.I.E.L.D` so their dots are not treated
/// as token separators.
///
/// A run starts at an ASCII alphanumeric byte that is not preceded by another
/// alphanumeric byte and is followed by `.`; it continues through alternating
/// `.` + single alphanumeric pairs. If the run ends in the first character of
/// a longer word (e.g. the `M` of `.Movie`), that last pair is dropped. Runs of
/// at least three characters are returned as half-open byte ranges
/// `(start, end)` relative to `s`.
fn find_dot_acronyms(s: &str) -> Vec<(usize, usize)> {
    let bytes = s.as_bytes();
    let len = bytes.len();
    let is_alnum = |idx: usize| bytes[idx].is_ascii_alphanumeric();

    let mut ranges = Vec::new();
    let mut pos = 0;
    while pos < len {
        let starts_run = is_alnum(pos)
            && (pos == 0 || !is_alnum(pos - 1))
            && pos + 2 < len
            && bytes[pos + 1] == b'.';
        if !starts_run {
            pos += 1;
            continue;
        }

        // `end` always sits just past the last accepted character.
        let mut end = pos + 1;
        while end + 1 < len && bytes[end] == b'.' && is_alnum(end + 1) {
            end += 2;
        }
        // The last accepted character was really the start of a longer word:
        // give back that character and the dot before it.
        if end < len && is_alnum(end) {
            end -= 2;
        }

        let char_count = (end - pos).div_ceil(2);
        if char_count >= 3 {
            ranges.push((pos, end));
            pos = end;
        } else {
            pos += 1;
        }
    }
    ranges
}
/// Returns `true` when `pos` lies inside any half-open `(start, end)` range
/// in `protected`.
fn in_protected(pos: usize, protected: &[(usize, usize)]) -> bool {
    protected
        .iter()
        .any(|&(start, end)| (start..end).contains(&pos))
}
/// Splits one path component into tokens.
///
/// `base_offset` is added to every token's byte range, and separators inside
/// the `protected` ranges are not split on. This is the entry point for the
/// recursive tokenizer and starts it at bracket-nesting depth zero.
fn split_into_tokens(name: &str, base_offset: usize, protected: &[(usize, usize)]) -> Vec<Token> {
    let top_level_depth = 0;
    split_into_tokens_inner(name, base_offset, protected, top_level_depth)
}
/// Recursive worker behind `split_into_tokens`.
///
/// Walks `name` byte by byte and emits tokens:
///
/// * `[`/`(` opens a bracket group. Its content (up to the first matching
///   closing byte, or the end of `name` if there is none) is tokenized
///   recursively with no protected ranges, and every resulting token is marked
///   `in_brackets`.
/// * A stray `]`/`)` is skipped.
/// * A run of separator bytes outside `protected` is collapsed into the single
///   highest-priority separator of the run, which is attached to the next token.
/// * Anything else accumulates into a token until an unprotected separator or
///   a bracket byte is reached.
///
/// `base_offset` is added to all positions so token ranges are expressed in the
/// caller's coordinate space. `protected` holds half-open byte ranges relative
/// to `name`. `depth` is the current bracket nesting level; beyond depth 3 the
/// remaining text is returned as one bracketed token instead of recursing.
///
/// NOTE(review): `{`/`}` are not treated as brackets here although
/// `extract_bracket_groups` recognizes them — confirm that is intended.
fn split_into_tokens_inner(
    name: &str,
    base_offset: usize,
    protected: &[(usize, usize)],
    depth: u32,
) -> Vec<Token> {
    // Recursion guard: stop splitting and hand back the remaining text whole.
    if depth > 3 {
        if name.is_empty() {
            return Vec::new();
        }
        return vec![Token::new(
            name.to_string(),
            base_offset,
            base_offset + name.len(),
            Separator::None,
            true,
        )];
    }

    let bytes = name.as_bytes();
    let len = bytes.len();
    let mut tokens = Vec::new();
    let mut current_sep = Separator::None;
    let mut i = 0;

    while i < len {
        let byte = bytes[i];

        if byte == b'[' || byte == b'(' {
            let close_char = if byte == b'[' { b']' } else { b')' };
            let content_start = i + 1;
            // An unterminated group runs to the end of `name`.
            let content_end = bytes[content_start..]
                .iter()
                .position(|&b| b == close_char)
                .map_or(len, |offset| content_start + offset);

            let bracket_content = &name[content_start..content_end];
            if !bracket_content.is_empty() {
                let inner_tokens = split_into_tokens_inner(
                    bracket_content,
                    base_offset + content_start,
                    &[],
                    depth + 1,
                );
                tokens.extend(inner_tokens.into_iter().map(|mut token| {
                    token.in_brackets = true;
                    token
                }));
            }

            // Step over the closing byte when there is one.
            i = if content_end < len {
                content_end + 1
            } else {
                content_end
            };
            current_sep = Separator::None;
            continue;
        }

        if byte == b']' || byte == b')' {
            // Stray closing bracket: skip it without touching `current_sep`.
            i += 1;
            continue;
        }

        if is_separator(byte) && !in_protected(i, protected) {
            current_sep = byte_to_separator(byte);
            i += 1;
            // Collapse the run, remembering its highest-priority separator.
            while i < len && is_separator(bytes[i]) && !in_protected(i, protected) {
                let next_sep = byte_to_separator(bytes[i]);
                if sep_priority(next_sep) > sep_priority(current_sep) {
                    current_sep = next_sep;
                }
                i += 1;
            }
            continue;
        }

        let token_start = i;
        // The bounds check must guard BOTH alternatives. Without the
        // parentheses, `a && b || c` parses as `(a && b) || c`, so a protected
        // range reaching past the end of `name` would index out of bounds.
        while i < len && (!is_separator(bytes[i]) || in_protected(i, protected)) {
            if matches!(bytes[i], b'[' | b'(' | b']' | b')') {
                break;
            }
            i += 1;
        }

        let text = &name[token_start..i];
        if !text.is_empty() {
            tokens.push(Token::new(
                text.to_string(),
                base_offset + token_start,
                base_offset + i,
                current_sep,
                // Tokens emitted at this level are outside any bracket group.
                // Bracket content is handled by the recursive call above,
                // whose caller sets the flag.
                false,
            ));
            current_sep = Separator::None;
        }
    }
    tokens
}
/// Returns `true` for the bytes that split tokens inside a path component:
/// `.`, `-`, `_`, space and `,`.
fn is_separator(b: u8) -> bool {
    b".-_ ,".contains(&b)
}
/// Maps a separator byte to its [`Separator`] variant.
///
/// A comma is reported as `Space`, both slash directions as `PathSep`, and any
/// unrecognized byte as `Separator::None`.
fn byte_to_separator(b: u8) -> Separator {
    match b {
        b'/' | b'\\' => Separator::PathSep,
        b' ' | b',' => Separator::Space,
        b'-' => Separator::Dash,
        b'_' => Separator::Underscore,
        b'.' => Separator::Dot,
        _ => Separator::None,
    }
}
/// Ranks separators so that a run of mixed separators can be reduced to its
/// strongest member; a larger value wins.
fn sep_priority(s: Separator) -> u8 {
    // Listed strongest first.
    match s {
        Separator::PathSep => 5,
        Separator::Space => 4,
        Separator::Dash => 3,
        Separator::Underscore => 2,
        Separator::Dot => 1,
        Separator::None => 0,
    }
}
// Unit tests for the tokenizer: separator handling, dotted acronyms, bracket
// handling, extension splitting, path segmentation and bracket-group extraction.
#[cfg(test)]
mod tests {
use super::*;
// Dots split tokens and the known extension is stripped and reported.
#[test]
fn test_basic_dot_separated() {
let ts = tokenize("The.Walking.Dead.S05E03.720p.mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["The", "Walking", "Dead", "S05E03", "720p"]);
assert_eq!(ts.extension, Some("mkv".to_string()));
}
// Spaces split tokens the same way dots do.
#[test]
fn test_space_separated() {
let ts = tokenize("The Walking Dead S05E03 720p.mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["The", "Walking", "Dead", "S05E03", "720p"]);
}
// Underscores split tokens the same way dots do.
#[test]
fn test_underscore_separated() {
let ts = tokenize("The_Walking_Dead_S05E03_720p.mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["The", "Walking", "Dead", "S05E03", "720p"]);
}
// A dotted acronym stays one token; the word following it is not absorbed.
#[test]
fn test_dot_acronym_shield() {
let ts = tokenize("Marvels.Agents.of.S.H.I.E.L.D.S01E06.720p.mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(
texts,
vec!["Marvels", "Agents", "of", "S.H.I.E.L.D", "S01E06", "720p"]
);
}
// Bracket content becomes a token flagged `in_brackets`; the token before it is not flagged.
#[test]
fn test_bracket_group() {
let ts = tokenize("Movie.720p.x264-GROUP[rarbg].mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["Movie", "720p", "x264", "GROUP", "rarbg"]);
assert!(!ts.tokens[3].in_brackets); assert!(ts.tokens[4].in_brackets); }
// The token after a dash records `Separator::Dash`.
#[test]
fn test_dash_release_group() {
let ts = tokenize("Movie.720p.BluRay.x264-DEMAND.mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["Movie", "720p", "BluRay", "x264", "DEMAND"]);
assert_eq!(ts.tokens[4].separator, Separator::Dash);
}
// Directory components contribute tokens; 14 is the byte offset just past "/media/movies/".
#[test]
fn test_path_with_directory() {
let ts = tokenize("/media/movies/Movie.720p.mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["media", "movies", "Movie", "720p"]);
assert_eq!(ts.filename_start, 14); }
// A run of identical separators produces no empty token.
#[test]
fn test_consecutive_separators() {
let ts = tokenize("Movie..720p.mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["Movie", "720p"]);
}
// In the run " - " the space outranks the dash, so the next token records `Space`.
#[test]
fn test_mixed_separators() {
let ts = tokenize("Movie.Name - 720p.mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["Movie", "Name", "720p"]);
assert_eq!(ts.tokens[2].separator, Separator::Space);
}
// An unrecognized trailing part is kept as a token, not treated as an extension.
#[test]
fn test_no_extension() {
let ts = tokenize("Movie.Name.S01E02");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["Movie", "Name", "S01E02"]);
assert_eq!(ts.extension, None);
}
// Parentheses are handled like square brackets.
#[test]
fn test_parenthesized_year() {
let ts = tokenize("Movie Name (2024) 720p.mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["Movie", "Name", "2024", "720p"]);
assert!(ts.tokens[2].in_brackets);
}
// Leading and trailing bracket groups are flagged; tokens between them are not.
#[test]
fn test_anime_brackets() {
let ts = tokenize("[SubGroup] Series Name - 01 [720p].mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["SubGroup", "Series", "Name", "01", "720p"]);
assert!(ts.tokens[0].in_brackets); assert!(!ts.tokens[1].in_brackets); assert!(ts.tokens[4].in_brackets); }
// Two dotted letters are too short to be protected as an acronym.
#[test]
fn test_dot_acronym_minimum() {
let ts = tokenize("A.B.Movie.mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["A", "B", "Movie"]);
}
// Three dotted letters are the minimum that is protected as an acronym.
#[test]
fn test_dot_acronym_three_letters() {
let ts = tokenize("A.B.C.Movie.mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["A.B.C", "Movie"]);
}
// Each token records the separator that preceded it; the first token records `None`.
#[test]
fn test_separator_types() {
let ts = tokenize("A.B-C_D E.mkv");
assert_eq!(ts.tokens[0].separator, Separator::None); assert_eq!(ts.tokens[1].separator, Separator::Dot); assert_eq!(ts.tokens[2].separator, Separator::Dash); assert_eq!(ts.tokens[3].separator, Separator::Underscore); assert_eq!(ts.tokens[4].separator, Separator::Space); }
// Empty input yields no tokens and no extension.
#[test]
fn test_empty_input() {
let ts = tokenize("");
assert!(ts.tokens.is_empty());
assert_eq!(ts.extension, None);
}
// A bare "name.ext" yields one token plus the extension.
#[test]
fn test_extension_only() {
let ts = tokenize("movie.mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["movie"]);
assert_eq!(ts.extension, Some("mkv".to_string()));
}
// "5.1" has only two dotted characters, so it is split rather than protected.
#[test]
fn test_dts_hd_ma_tokens() {
let ts = tokenize("Movie.DTS-HD.MA.5.1.mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["Movie", "DTS", "HD", "MA", "5", "1"]);
}
// A dash inside "WEB-DL" splits it into two tokens.
#[test]
fn test_web_dl_tokens() {
let ts = tokenize("Movie.WEB-DL.1080p.mkv");
let texts: Vec<&str> = ts.tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["Movie", "WEB", "DL", "1080p"]);
}
// One directory plus a filename gives two segments with depths 0 and 1.
#[test]
fn test_path_segments_basic() {
let ts = tokenize("Movies/Movie.720p.mkv");
assert_eq!(ts.segments.len(), 2);
assert_eq!(ts.segments[0].kind, SegmentKind::Directory);
assert_eq!(ts.segments[0].tokens[0].text, "Movies");
assert_eq!(ts.segments[0].depth, 0);
assert_eq!(ts.segments[1].kind, SegmentKind::Filename);
assert_eq!(ts.segments[1].depth, 1);
assert!(ts.tokens.iter().any(|t| t.text == "Movies"));
}
// Depth increases by one per retained segment.
#[test]
fn test_path_segments_deep() {
let ts = tokenize("TV/Show Name/Season 01/Show.S01E01.720p.mkv");
assert_eq!(ts.segments.len(), 4);
assert_eq!(ts.segments[0].kind, SegmentKind::Directory);
assert_eq!(ts.segments[3].kind, SegmentKind::Filename);
assert_eq!(ts.segments[0].depth, 0);
assert_eq!(ts.segments[3].depth, 3);
}
// Input without a path separator is a single Filename segment.
#[test]
fn test_path_segments_no_path() {
let ts = tokenize("Movie.720p.mkv");
assert_eq!(ts.segments.len(), 1);
assert_eq!(ts.segments[0].kind, SegmentKind::Filename);
assert_eq!(ts.segments[0].depth, 0);
}
// Directory names are tokenized too, including bracketed content.
#[test]
fn test_path_segments_dir_metadata() {
let ts = tokenize("movies/Movie Name (2009) BRrip 720p/abbreviated.avi");
assert_eq!(ts.segments.len(), 3);
let dir_tokens: Vec<&str> = ts.segments[1]
.tokens
.iter()
.map(|t| t.text.as_str())
.collect();
assert!(dir_tokens.contains(&"BRrip"));
assert!(dir_tokens.contains(&"720p"));
assert!(dir_tokens.contains(&"2009"));
}
// Backslash-separated paths with a drive letter still end in a Filename segment.
#[test]
fn test_windows_path() {
let ts = tokenize("D:\\TV\\Show.S01E01.mkv");
assert_eq!(ts.segments.last().unwrap().kind, SegmentKind::Filename);
let filename_tokens: Vec<&str> = ts
.segments
.last()
.unwrap()
.tokens
.iter()
.map(|t| t.text.as_str())
.collect();
assert!(filename_tokens.contains(&"Show"));
assert!(filename_tokens.contains(&"S01E01"));
}
// The empty component before a leading slash produces no segment.
#[test]
fn test_leading_slash_skips_empty() {
let ts = tokenize("/movies/Movie.mkv");
assert!(ts.segments.iter().all(|s| !s.tokens.is_empty()));
assert_eq!(ts.segments.last().unwrap().kind, SegmentKind::Filename);
}
// Directory names never have an extension split off.
#[test]
fn test_dir_no_extension_stripping() {
let ts = tokenize("movie.2009/file.mkv");
let dir_tokens: Vec<&str> = ts.segments[0]
.tokens
.iter()
.map(|t| t.text.as_str())
.collect();
assert!(dir_tokens.contains(&"2009"));
}
// Every square-bracket span is reported, in input order.
#[test]
fn test_bracket_groups() {
let ts = tokenize("[SubGroup] Title - 01 [1080p][DEADBEEF].mkv");
assert_eq!(ts.bracket_groups.len(), 3);
assert_eq!(ts.bracket_groups[0].content, "SubGroup");
assert_eq!(ts.bracket_groups[0].kind, BracketKind::Square);
assert_eq!(ts.bracket_groups[1].content, "1080p");
assert_eq!(ts.bracket_groups[2].content, "DEADBEEF");
}
// Round, curly and square groups are each reported with their own kind.
#[test]
fn test_bracket_groups_mixed() {
let ts = tokenize("Movie (2019) {Fr-Eng} [Group].mkv");
assert_eq!(ts.bracket_groups.len(), 3);
assert_eq!(ts.bracket_groups[0].kind, BracketKind::Round);
assert_eq!(ts.bracket_groups[0].content, "2019");
assert_eq!(ts.bracket_groups[1].kind, BracketKind::Curly);
assert_eq!(ts.bracket_groups[1].content, "Fr-Eng");
assert_eq!(ts.bracket_groups[2].kind, BracketKind::Square);
assert_eq!(ts.bracket_groups[2].content, "Group");
}
// A group's content is kept verbatim, including spaces and dots.
#[test]
fn test_compound_bracket_groups() {
let ts = tokenize("Movie (1080p BluRay x265 HEVC 10bit AAC 7.1 Tigole) [QxR].mkv");
assert_eq!(ts.bracket_groups.len(), 2);
assert_eq!(
ts.bracket_groups[0].content,
"1080p BluRay x265 HEVC 10bit AAC 7.1 Tigole"
);
assert_eq!(ts.bracket_groups[1].content, "QxR");
}
}