#![allow(dead_code)]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SearchHit {
pub index: usize,
pub byte_offset: usize,
pub match_len: usize,
}
#[derive(Clone, Debug)]
pub struct SearchOptions {
pub case_insensitive: bool,
pub whole_word: bool,
pub max_results: usize,
}
impl Default for SearchOptions {
fn default() -> Self {
Self {
case_insensitive: true,
whole_word: false,
max_results: 0,
}
}
}
impl SearchOptions {
#[must_use]
pub fn new() -> Self {
Self::default()
}
#[must_use]
pub fn case_insensitive(mut self, val: bool) -> Self {
self.case_insensitive = val;
self
}
#[must_use]
pub fn whole_word(mut self, val: bool) -> Self {
self.whole_word = val;
self
}
#[must_use]
pub fn max_results(mut self, val: usize) -> Self {
self.max_results = val;
self
}
}
fn is_word_boundary(text: &str, pos: usize, match_len: usize) -> bool {
let before_ok = if pos == 0 {
true
} else {
text.as_bytes()
.get(pos - 1)
.map_or(true, |&b| !b.is_ascii_alphanumeric())
};
let after_ok = {
let end = pos + match_len;
if end >= text.len() {
true
} else {
text.as_bytes()
.get(end)
.map_or(true, |&b| !b.is_ascii_alphanumeric())
}
};
before_ok && after_ok
}
fn find_all_in_text(text: &str, query: &str, opts: &SearchOptions) -> Vec<(usize, usize)> {
let mut hits = Vec::new();
if query.is_empty() {
return hits;
}
let (haystack, needle) = if opts.case_insensitive {
(text.to_lowercase(), query.to_lowercase())
} else {
(text.to_string(), query.to_string())
};
let mut start = 0;
while let Some(pos) = haystack[start..].find(&needle) {
let abs_pos = start + pos;
if !opts.whole_word || is_word_boundary(text, abs_pos, needle.len()) {
hits.push((abs_pos, needle.len()));
}
start = abs_pos + 1;
}
hits
}
#[derive(Clone, Debug)]
pub struct SearchableSubtitle {
pub text: String,
pub start_ms: i64,
pub end_ms: i64,
}
impl SearchableSubtitle {
#[must_use]
pub fn new(text: impl Into<String>, start_ms: i64, end_ms: i64) -> Self {
Self {
text: text.into(),
start_ms,
end_ms,
}
}
}
#[must_use]
pub fn search_subtitles(
subtitles: &[SearchableSubtitle],
query: &str,
opts: &SearchOptions,
) -> Vec<SearchHit> {
let mut results = Vec::new();
for (idx, sub) in subtitles.iter().enumerate() {
let matches = find_all_in_text(&sub.text, query, opts);
for (offset, len) in matches {
results.push(SearchHit {
index: idx,
byte_offset: offset,
match_len: len,
});
if opts.max_results > 0 && results.len() >= opts.max_results {
return results;
}
}
}
results
}
#[must_use]
pub fn search_in_time_range(
subtitles: &[SearchableSubtitle],
query: &str,
opts: &SearchOptions,
range_start_ms: i64,
range_end_ms: i64,
) -> Vec<SearchHit> {
let mut results = Vec::new();
for (idx, sub) in subtitles.iter().enumerate() {
if sub.end_ms < range_start_ms || sub.start_ms > range_end_ms {
continue;
}
let matches = find_all_in_text(&sub.text, query, opts);
for (offset, len) in matches {
results.push(SearchHit {
index: idx,
byte_offset: offset,
match_len: len,
});
if opts.max_results > 0 && results.len() >= opts.max_results {
return results;
}
}
}
results
}
#[must_use]
pub fn count_occurrences(
subtitles: &[SearchableSubtitle],
query: &str,
case_insensitive: bool,
) -> usize {
let opts = SearchOptions {
case_insensitive,
whole_word: false,
max_results: 0,
};
subtitles
.iter()
.map(|sub| find_all_in_text(&sub.text, query, &opts).len())
.sum()
}
#[derive(Clone, Debug, Default)]
pub struct SubtitleWordIndex {
entries: Vec<(String, Vec<usize>)>,
}
impl SubtitleWordIndex {
#[must_use]
pub fn build(subtitles: &[SearchableSubtitle]) -> Self {
let mut entries: Vec<(String, Vec<usize>)> = Vec::new();
for (idx, sub) in subtitles.iter().enumerate() {
for word in sub.text.split(|c: char| !c.is_alphanumeric()) {
if word.is_empty() {
continue;
}
let lower = word.to_lowercase();
if let Some(entry) = entries.iter_mut().find(|(w, _)| w == &lower) {
if entry.1.last() != Some(&idx) {
entry.1.push(idx);
}
} else {
entries.push((lower, vec![idx]));
}
}
}
Self { entries }
}
#[must_use]
pub fn lookup(&self, word: &str) -> Vec<usize> {
let lower = word.to_lowercase();
self.entries
.iter()
.find(|(w, _)| w == &lower)
.map_or_else(Vec::new, |(_, indices)| indices.clone())
}
#[must_use]
pub fn word_count(&self) -> usize {
self.entries.len()
}
}
#[cfg(test)]
mod tests {
use super::*;
fn sample_subs() -> Vec<SearchableSubtitle> {
vec![
SearchableSubtitle::new("Hello world", 0, 2000),
SearchableSubtitle::new("The quick brown fox", 2000, 4000),
SearchableSubtitle::new("Hello again, world!", 4000, 6000),
SearchableSubtitle::new("Goodbye everyone", 6000, 8000),
]
}
#[test]
fn test_basic_search() {
let subs = sample_subs();
let hits = search_subtitles(&subs, "hello", &SearchOptions::default());
assert_eq!(hits.len(), 2);
assert_eq!(hits[0].index, 0);
assert_eq!(hits[1].index, 2);
}
#[test]
fn test_case_sensitive_search() {
let subs = sample_subs();
let opts = SearchOptions::new().case_insensitive(false);
let hits = search_subtitles(&subs, "hello", &opts);
assert_eq!(hits.len(), 0);
let hits2 = search_subtitles(&subs, "Hello", &opts);
assert_eq!(hits2.len(), 2);
}
#[test]
fn test_whole_word_search() {
let subs = vec![SearchableSubtitle::new("the cat sat on the mat", 0, 1000)];
let opts = SearchOptions::new().whole_word(true);
let hits = search_subtitles(&subs, "the", &opts);
assert_eq!(hits.len(), 2);
}
#[test]
fn test_whole_word_no_partial() {
let subs = vec![SearchableSubtitle::new("theater", 0, 1000)];
let opts = SearchOptions::new().whole_word(true);
let hits = search_subtitles(&subs, "the", &opts);
assert_eq!(hits.len(), 0);
}
#[test]
fn test_max_results() {
let subs = sample_subs();
let opts = SearchOptions::new().max_results(1);
let hits = search_subtitles(&subs, "hello", &opts);
assert_eq!(hits.len(), 1);
}
#[test]
fn test_empty_query() {
let subs = sample_subs();
let hits = search_subtitles(&subs, "", &SearchOptions::default());
assert!(hits.is_empty());
}
#[test]
fn test_search_in_time_range() {
let subs = sample_subs();
let hits = search_in_time_range(&subs, "hello", &SearchOptions::default(), 3000, 7000);
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].index, 2);
}
#[test]
fn test_search_time_range_no_match() {
let subs = sample_subs();
let hits = search_in_time_range(&subs, "hello", &SearchOptions::default(), 6500, 8000);
assert!(hits.is_empty());
}
#[test]
fn test_count_occurrences() {
let subs = sample_subs();
let count = count_occurrences(&subs, "world", true);
assert_eq!(count, 2);
}
#[test]
fn test_word_index_build_and_lookup() {
let subs = sample_subs();
let index = SubtitleWordIndex::build(&subs);
let hello_indices = index.lookup("hello");
assert_eq!(hello_indices, vec![0, 2]);
let fox_indices = index.lookup("fox");
assert_eq!(fox_indices, vec![1]);
}
#[test]
fn test_word_index_missing_word() {
let subs = sample_subs();
let index = SubtitleWordIndex::build(&subs);
let indices = index.lookup("nonexistent");
assert!(indices.is_empty());
}
#[test]
fn test_word_index_word_count() {
let subs = vec![SearchableSubtitle::new("one two three", 0, 1000)];
let index = SubtitleWordIndex::build(&subs);
assert_eq!(index.word_count(), 3);
}
#[test]
fn test_search_hit_byte_offset() {
let subs = vec![SearchableSubtitle::new("abc hello xyz", 0, 1000)];
let hits = search_subtitles(&subs, "hello", &SearchOptions::default());
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].byte_offset, 4);
assert_eq!(hits[0].match_len, 5);
}
}