/// One normalized piece of an input string, as produced by [`Tokenizer::tokenize`].
///
/// `start` and `end` describe the raw (un-normalized) segment the token was cut
/// from; `text` holds the normalized form, so `text.len()` may be smaller than
/// `end - start`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// Normalized token text (lowercased, restricted to ASCII alphanumerics).
    pub text: String,
    /// Byte offset in the input where the raw segment begins (inclusive).
    pub start: usize,
    /// Byte offset in the input where the raw segment ends (exclusive).
    pub end: usize,
    /// Zero-based position of this token among the tokens that were emitted.
    pub index: usize,
}
/// Characters that separate tokens; a delimiter never becomes part of a token.
const DELIMITERS: &[char] = &[
    '[', ']', // square brackets
    '(', ')', // parentheses
    '_', '.', '-', ' ', // word separators
];
/// Stateless splitter that cuts a string into normalized [`Token`]s.
///
/// The type carries no data; every instance behaves the same.
#[derive(Debug, Clone, Default)]
pub struct Tokenizer;
impl Tokenizer {
    /// Creates a tokenizer. The type is stateless, so this is equivalent to
    /// any other instance.
    pub fn new() -> Self {
        Self
    }

    /// Splits `input` on the characters in `DELIMITERS` and returns the
    /// normalized, non-empty segments in order of appearance.
    ///
    /// For every returned [`Token`]:
    /// * `text` is the segment after normalization (lowercased, only ASCII
    ///   alphanumerics kept);
    /// * `start..end` is the byte range of the *raw* segment inside `input`;
    /// * `index` counts emitted tokens from zero.
    ///
    /// Segments whose normalized form is empty (for example a run of
    /// punctuation or non-ASCII letters) produce no token and do not consume
    /// an index.
    pub fn tokenize(&self, input: &str) -> Vec<Token> {
        let mut out: Vec<Token> = Vec::new();
        let mut seg_start = 0usize;

        // Each boundary is (where the current segment ends, where the next one
        // begins). A final sentinel at the end of the input lets the trailing
        // segment go through the same code path as all the others.
        let boundaries = input
            .char_indices()
            .filter(|(_, ch)| DELIMITERS.contains(ch))
            .map(|(pos, ch)| (pos, pos + ch.len_utf8()))
            .chain(std::iter::once((input.len(), input.len())));

        for (seg_end, next_start) in boundaries {
            if seg_end > seg_start {
                let cleaned = self.normalize(&input[seg_start..seg_end]);
                if !cleaned.is_empty() {
                    // The number of tokens emitted so far is the next index.
                    let index = out.len();
                    out.push(Token {
                        text: cleaned,
                        start: seg_start,
                        end: seg_end,
                        index,
                    });
                }
            }
            seg_start = next_start;
        }

        out
    }

    /// Lowercases `text`, drops every character that is neither an ASCII
    /// letter/digit nor a space, and trims surrounding whitespace.
    fn normalize(&self, text: &str) -> String {
        let lowered = text.to_lowercase();
        let kept: String = lowered
            .chars()
            .filter(|ch| *ch == ' ' || ch.is_ascii_alphanumeric())
            .collect();
        kept.trim().to_owned()
    }

    /// Returns the byte span covered by `tokens[start_idx..end_idx]`.
    ///
    /// The result is `(start of the first token, end of the last token)`.
    /// `None` is returned when the range is empty, reversed, or reaches past
    /// the end of `tokens`.
    pub fn get_spans(
        &self,
        tokens: &[Token],
        start_idx: usize,
        end_idx: usize,
    ) -> Option<(usize, usize)> {
        // `get` rejects reversed and out-of-bounds ranges; `first`/`last`
        // reject the empty range.
        let window = tokens.get(start_idx..end_idx)?;
        let first = window.first()?;
        let last = window.last()?;
        Some((first.start, last.end))
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the tokenizer over `input` and keeps only the normalized texts.
    fn texts_of(input: &str) -> Vec<String> {
        Tokenizer::new()
            .tokenize(input)
            .into_iter()
            .map(|token| token.text)
            .collect()
    }

    #[test]
    fn test_tokenize_basic() {
        let tokens = Tokenizer::new().tokenize("[SubsPlease] Jujutsu Kaisen - 24 (1080p)");

        // Pin text and raw byte span of every token, not just the first one.
        let spans: Vec<(&str, usize, usize)> = tokens
            .iter()
            .map(|t| (t.text.as_str(), t.start, t.end))
            .collect();
        assert_eq!(
            spans,
            [
                ("subsplease", 1, 11),
                ("jujutsu", 13, 20),
                ("kaisen", 21, 27),
                ("24", 30, 32),
                ("1080p", 34, 39),
            ]
        );

        // Indices are consecutive and start at zero.
        for (position, token) in tokens.iter().enumerate() {
            assert_eq!(token.index, position);
        }
    }

    #[test]
    fn test_tokenize_with_dots() {
        assert_eq!(
            texts_of("One.Piece.1084.VOSTFR.1080p"),
            ["one", "piece", "1084", "vostfr", "1080p"]
        );
    }

    #[test]
    fn test_tokenize_with_underscores() {
        assert_eq!(
            texts_of("[Judas]_Golden_Kamuy_S3_01"),
            ["judas", "golden", "kamuy", "s3", "01"]
        );
    }

    #[test]
    fn test_tokenize_empty() {
        assert!(Tokenizer::new().tokenize("").is_empty());
    }

    #[test]
    fn test_tokenize_only_delimiters() {
        assert!(Tokenizer::new().tokenize("[[[]]]()..--__").is_empty());
    }

    #[test]
    fn test_tokenize_skips_segments_that_normalize_to_nothing() {
        let tokens = Tokenizer::new().tokenize("a.!!.b");
        let summary: Vec<(&str, usize, usize, usize)> = tokens
            .iter()
            .map(|t| (t.text.as_str(), t.start, t.end, t.index))
            .collect();
        // The "!!" segment yields no token and does not consume an index.
        assert_eq!(summary, [("a", 0, 1, 0), ("b", 5, 6, 1)]);
    }

    #[test]
    fn test_tokenize_non_ascii_uses_byte_offsets() {
        // The three leading characters occupy nine bytes and normalize to
        // nothing, so the only token starts at byte 10.
        let tokens = Tokenizer::new().tokenize("日本語.Title");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "title");
        assert_eq!((tokens[0].start, tokens[0].end), (10, 15));
        assert_eq!(tokens[0].index, 0);
    }

    #[test]
    fn test_get_spans() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize("[SubsPlease] Jujutsu Kaisen - 24");
        assert_eq!(tokens.len(), 4);

        // "Jujutsu Kaisen" occupies bytes 13..27 of the input.
        assert_eq!(tokenizer.get_spans(&tokens, 1, 3), Some((13, 27)));
        // The full range runs from the first token's start to the last one's end.
        assert_eq!(tokenizer.get_spans(&tokens, 0, 4), Some((1, 32)));
    }

    #[test]
    fn test_get_spans_rejects_invalid_ranges() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize("[SubsPlease] Jujutsu Kaisen - 24");

        assert_eq!(tokenizer.get_spans(&tokens, 2, 2), None); // empty range
        assert_eq!(tokenizer.get_spans(&tokens, 3, 1), None); // reversed range
        assert_eq!(tokenizer.get_spans(&tokens, 0, 5), None); // end past the slice
        assert_eq!(tokenizer.get_spans(&tokens, 4, 5), None); // start past the slice
        assert_eq!(tokenizer.get_spans(&[], 0, 1), None); // no tokens at all
    }
}