ass_editor/utils/
indexing.rs

1//! FST-based search indexing for fast ASS content queries
2//!
3//! Provides trie-based indexing for regex and fuzzy search queries as specified
4//! in the architecture (lines 142-143). Fallback to linear search for WASM.
5
6use crate::core::{EditorDocument, Position, Range, Result};
7#[cfg(feature = "search-index")]
8use crate::utils::search::SearchScope;
9use crate::utils::search::{DocumentSearch, SearchOptions, SearchResult, SearchStats};
10
11#[cfg(feature = "search-index")]
12use fst::{automaton, IntoStreamer, Set, SetBuilder, Streamer};
13
14#[cfg(feature = "std")]
15use std::{borrow::Cow, collections::HashMap};
16
17#[cfg(not(feature = "std"))]
18use alloc::{borrow::Cow, collections::BTreeMap as HashMap, string::String};
19
20#[cfg(not(feature = "std"))]
21use alloc::{boxed::Box, string::ToString, vec::Vec};
22
23#[cfg(feature = "std")]
24use std::time::Instant;
25
26#[cfg(not(feature = "std"))]
27type Instant = u64;
28
29/// FST-based search index for high-performance queries
30#[cfg(feature = "search-index")]
31pub struct FstSearchIndex {
32    /// FST set for fast prefix/fuzzy matching
33    fst_set: Option<Set<Vec<u8>>>,
34
35    /// Map from FST keys to document positions
36    position_map: HashMap<String, Vec<IndexEntry>>,
37
38    /// Last known document content hash for cache invalidation
39    content_hash: u64,
40
41    /// Index build timestamp for statistics
42    build_time: Instant,
43
44    /// Index size in bytes
45    index_size: usize,
46}
47
48/// Linear search fallback for when FST is not available
49pub struct LinearSearchIndex {
50    /// Simple word -> positions mapping
51    word_positions: HashMap<String, Vec<IndexEntry>>,
52
53    /// Content hash for invalidation
54    content_hash: u64,
55
56    /// Build timestamp
57    build_time: Instant,
58}
59
60/// Entry in the search index
61#[derive(Debug, Clone)]
62pub struct IndexEntry {
63    /// Position in document
64    pub position: Position,
65
66    /// Context around the match
67    pub context: String,
68
69    /// Line number (0-based)
70    pub line: usize,
71
72    /// Column number (0-based)  
73    pub column: usize,
74
75    /// Section type (Events, Styles, etc.)
76    pub section_type: Option<String>,
77}
78
79#[cfg(feature = "search-index")]
80impl Default for FstSearchIndex {
81    fn default() -> Self {
82        Self::new()
83    }
84}
85
86#[cfg(feature = "search-index")]
87impl FstSearchIndex {
88    /// Create a new FST search index
89    pub fn new() -> Self {
90        Self {
91            fst_set: None,
92            position_map: HashMap::new(),
93            content_hash: 0,
94            build_time: {
95                #[cfg(feature = "std")]
96                {
97                    Instant::now()
98                }
99                #[cfg(not(feature = "std"))]
100                {
101                    0
102                }
103            },
104            index_size: 0,
105        }
106    }
107
108    /// Extract words and positions from document content
109    fn extract_words(&self, content: &str) -> Vec<(String, IndexEntry)> {
110        let mut words = Vec::new();
111        let mut line_start = 0;
112        let mut current_section = None;
113
114        for (current_line, line) in content.lines().enumerate() {
115            // Track current section
116            if line.starts_with('[') && line.ends_with(']') {
117                current_section = Some(line[1..line.len() - 1].to_string());
118            }
119
120            // Extract words from line
121            let mut word_start = 0;
122            let mut in_word = false;
123
124            for (char_offset, ch) in line.char_indices() {
125                if ch.is_alphanumeric() || ch == '_' {
126                    if !in_word {
127                        word_start = char_offset;
128                        in_word = true;
129                    }
130                } else if in_word {
131                    // End of word
132                    let word = &line[word_start..char_offset];
133                    if word.len() >= 2 {
134                        // Index words with 2+ characters
135                        let entry = IndexEntry {
136                            position: Position::new(line_start + word_start),
137                            context: line.to_string(),
138                            line: current_line,
139                            column: word_start,
140                            section_type: current_section.clone(),
141                        };
142                        words.push((word.to_lowercase(), entry));
143                    }
144                    in_word = false;
145                }
146            }
147
148            // Handle word at end of line
149            if in_word {
150                let word = &line[word_start..];
151                if word.len() >= 2 {
152                    let entry = IndexEntry {
153                        position: Position::new(line_start + word_start),
154                        context: line.to_string(),
155                        line: current_line,
156                        column: word_start,
157                        section_type: current_section.clone(),
158                    };
159                    words.push((word.to_lowercase(), entry));
160                }
161            }
162
163            line_start += line.len() + 1; // +1 for newline
164        }
165
166        words
167    }
168
169    /// Build FST from extracted words
170    fn build_fst(&mut self, words: Vec<(String, IndexEntry)>) -> Result<()> {
171        // Group entries by word
172        let mut word_map: HashMap<String, Vec<IndexEntry>> = HashMap::new();
173        for (word, entry) in words {
174            word_map.entry(word).or_default().push(entry);
175        }
176
177        // Build FST set
178        let mut set_builder = SetBuilder::memory();
179        let mut keys: Vec<_> = word_map.keys().cloned().collect();
180        keys.sort();
181
182        for key in &keys {
183            set_builder
184                .insert(key)
185                .map_err(|e| crate::EditorError::SearchIndexError {
186                    message: format!("FST insert error: {e}"),
187                })?;
188        }
189
190        let fst_bytes =
191            set_builder
192                .into_inner()
193                .map_err(|e| crate::EditorError::SearchIndexError {
194                    message: format!("FST build error: {e}"),
195                })?;
196        self.index_size = fst_bytes.len();
197        self.fst_set =
198            Some(
199                Set::new(fst_bytes).map_err(|e| crate::EditorError::SearchIndexError {
200                    message: format!("FST creation error: {e}"),
201                })?,
202            );
203        self.position_map = word_map;
204
205        Ok(())
206    }
207}
208
209#[cfg(feature = "search-index")]
210impl DocumentSearch for FstSearchIndex {
211    fn build_index(&mut self, document: &EditorDocument) -> Result<()> {
212        let content = document.text();
213        self.content_hash = calculate_hash(&content);
214        self.build_time = {
215            #[cfg(feature = "std")]
216            {
217                Instant::now()
218            }
219            #[cfg(not(feature = "std"))]
220            {
221                0
222            }
223        };
224
225        let words = self.extract_words(&content);
226        self.build_fst(words)?;
227
228        Ok(())
229    }
230
231    fn update_index(&mut self, document: &EditorDocument, _changes: &[Range]) -> Result<()> {
232        // For simplicity, rebuild entire index on changes
233        // In production, this could be optimized for incremental updates
234        self.build_index(document)
235    }
236
237    fn search(&self, pattern: &str, options: &SearchOptions) -> Result<Vec<SearchResult>> {
238        #[cfg(feature = "std")]
239        let _start_time = Instant::now();
240        let mut results = Vec::new();
241
242        if let Some(ref fst_set) = self.fst_set {
243            let query = if options.case_sensitive {
244                pattern.to_string()
245            } else {
246                pattern.to_lowercase()
247            };
248
249            // For simplicity, use subsequence automaton for all searches
250            // In production, this could be optimized with different automaton types
251            let automaton = automaton::Subsequence::new(&query);
252            let mut stream = fst_set.search(automaton).into_stream();
253            let mut count = 0;
254
255            while let Some(key) = stream.next() {
256                if options.max_results > 0 && count >= options.max_results {
257                    break;
258                }
259
260                let key_str = String::from_utf8_lossy(key);
261                if let Some(entries) = self.position_map.get(key_str.as_ref()) {
262                    for entry in entries {
263                        // Apply scope filtering
264                        if self.matches_scope(entry, &options.scope) {
265                            results.push(SearchResult {
266                                start: entry.position,
267                                end: Position::new(entry.position.offset + pattern.len()),
268                                text: std::borrow::Cow::Owned(pattern.to_string()),
269                                context: std::borrow::Cow::Owned(entry.context.clone()),
270                                line: entry.line,
271                                column: entry.column,
272                            });
273                            count += 1;
274
275                            if options.max_results > 0 && count >= options.max_results {
276                                break;
277                            }
278                        }
279                    }
280                }
281            }
282        }
283
284        Ok(results)
285    }
286
287    fn find_replace(
288        &self,
289        document: &mut EditorDocument,
290        pattern: &str,
291        replacement: &str,
292        options: &SearchOptions,
293    ) -> Result<Vec<SearchResult>> {
294        let results = self.search(pattern, options)?;
295
296        // Apply replacements in reverse order to maintain position validity
297        let mut sorted_results = results.clone();
298        sorted_results.sort_by(|a, b| b.start.offset.cmp(&a.start.offset));
299
300        for result in &sorted_results {
301            let range = Range::new(result.start, result.end);
302            document.replace(range, replacement)?;
303        }
304
305        Ok(results)
306    }
307
308    fn stats(&self) -> SearchStats {
309        SearchStats {
310            match_count: self.position_map.len(),
311            search_time_us: {
312                #[cfg(feature = "std")]
313                {
314                    self.build_time.elapsed().as_micros() as u64
315                }
316                #[cfg(not(feature = "std"))]
317                {
318                    0
319                }
320            },
321            hit_limit: false,
322            index_size: self.index_size,
323        }
324    }
325
326    fn clear_index(&mut self) {
327        self.fst_set = None;
328        self.position_map.clear();
329        self.index_size = 0;
330    }
331}
332
333#[cfg(feature = "search-index")]
334impl FstSearchIndex {
335    fn matches_scope(&self, entry: &IndexEntry, scope: &SearchScope) -> bool {
336        match scope {
337            SearchScope::All => true,
338            SearchScope::Lines { start, end } => entry.line >= *start && entry.line <= *end,
339            SearchScope::Sections(sections) => {
340                if let Some(ref section) = entry.section_type {
341                    sections.contains(section)
342                } else {
343                    false
344                }
345            }
346            SearchScope::Range(range) => {
347                entry.position.offset >= range.start.offset
348                    && entry.position.offset <= range.end.offset
349            }
350        }
351    }
352}
353
354// Linear search fallback implementation
355impl Default for LinearSearchIndex {
356    fn default() -> Self {
357        Self::new()
358    }
359}
360
361impl LinearSearchIndex {
362    /// Create a new linear search index
363    pub fn new() -> Self {
364        Self {
365            word_positions: HashMap::new(),
366            content_hash: 0,
367            build_time: {
368                #[cfg(feature = "std")]
369                {
370                    Instant::now()
371                }
372                #[cfg(not(feature = "std"))]
373                {
374                    0
375                }
376            },
377        }
378    }
379}
380
381impl DocumentSearch for LinearSearchIndex {
382    fn build_index(&mut self, document: &EditorDocument) -> Result<()> {
383        let content = document.text();
384        self.content_hash = calculate_hash(&content);
385        self.build_time = {
386            #[cfg(feature = "std")]
387            {
388                Instant::now()
389            }
390            #[cfg(not(feature = "std"))]
391            {
392                0
393            }
394        };
395
396        // Simple word extraction for linear search
397        self.word_positions.clear();
398
399        let mut line_start = 0;
400
401        for (current_line, line) in content.lines().enumerate() {
402            for (word_start, word) in line.split_whitespace().enumerate() {
403                let entry = IndexEntry {
404                    position: Position::new(line_start + word_start),
405                    context: line.to_string(),
406                    line: current_line,
407                    column: word_start,
408                    section_type: None,
409                };
410                self.word_positions
411                    .entry(word.to_lowercase())
412                    .or_default()
413                    .push(entry);
414            }
415            line_start += line.len() + 1;
416        }
417
418        Ok(())
419    }
420
421    fn update_index(&mut self, document: &EditorDocument, _changes: &[Range]) -> Result<()> {
422        self.build_index(document)
423    }
424
425    fn search(&self, pattern: &str, _options: &SearchOptions) -> Result<Vec<SearchResult>> {
426        let query = pattern.to_lowercase();
427        let mut results = Vec::new();
428
429        // Simple linear search through indexed words
430        for (word, entries) in &self.word_positions {
431            if word.contains(&query) {
432                for entry in entries {
433                    results.push(SearchResult {
434                        start: entry.position,
435                        end: Position::new(entry.position.offset + pattern.len()),
436                        text: Cow::Owned(pattern.to_string()),
437                        context: Cow::Owned(entry.context.clone()),
438                        line: entry.line,
439                        column: entry.column,
440                    });
441                }
442            }
443        }
444
445        Ok(results)
446    }
447
448    fn find_replace(
449        &self,
450        document: &mut EditorDocument,
451        pattern: &str,
452        replacement: &str,
453        options: &SearchOptions,
454    ) -> Result<Vec<SearchResult>> {
455        let results = self.search(pattern, options)?;
456
457        for result in &results {
458            let range = Range::new(result.start, result.end);
459            document.replace(range, replacement)?;
460        }
461
462        Ok(results)
463    }
464
465    fn stats(&self) -> SearchStats {
466        SearchStats {
467            match_count: self.word_positions.len(),
468            search_time_us: {
469                #[cfg(feature = "std")]
470                {
471                    self.build_time.elapsed().as_micros() as u64
472                }
473                #[cfg(not(feature = "std"))]
474                {
475                    0
476                }
477            },
478            hit_limit: false,
479            index_size: self.word_positions.len() * 64, // Rough estimate
480        }
481    }
482
483    fn clear_index(&mut self) {
484        self.word_positions.clear();
485    }
486}
487
488/// Factory function to create the appropriate search index
489pub fn create_search_index() -> Box<dyn DocumentSearch> {
490    #[cfg(feature = "search-index")]
491    {
492        Box::new(FstSearchIndex::new())
493    }
494    #[cfg(not(feature = "search-index"))]
495    {
496        Box::new(LinearSearchIndex::new())
497    }
498}
499
500/// Simple hash function for content change detection
501fn calculate_hash(content: &str) -> u64 {
502    // Simple FNV hash - in production might use a proper hasher
503    let mut hash = 0xcbf29ce484222325u64;
504    for byte in content.bytes() {
505        hash ^= byte as u64;
506        hash = hash.wrapping_mul(0x100000001b3);
507    }
508    hash
509}
510
511#[cfg(test)]
512mod tests {
513    use super::*;
514    use crate::utils::search::SearchScope;
515    #[cfg(not(feature = "std"))]
516    use crate::EditorDocument;
517
518    #[test]
519    fn test_linear_search_index() {
520        let content = r#"[Script Info]
521Title: Test Movie
522Author: Test Author
523
524[Events]
525Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
526Dialogue: 0,0:00:05.00,0:00:10.00,Default,John,0,0,0,,Hello world
527Dialogue: 0,0:00:12.00,0:00:15.00,Default,Jane,0,0,0,,How are you"#;
528
529        let document = EditorDocument::from_content(content).unwrap();
530        let mut search_index = LinearSearchIndex::new();
531
532        search_index.build_index(&document).unwrap();
533
534        let results = search_index
535            .search("Hello", &SearchOptions::default())
536            .unwrap();
537        assert!(!results.is_empty());
538
539        let stats = search_index.stats();
540        assert!(stats.match_count > 0);
541    }
542
543    #[cfg(feature = "search-index")]
544    #[test]
545    fn test_fst_search_index() {
546        let content = r#"[Script Info]
547Title: Test Movie
548
549[Events]  
550Dialogue: 0,0:00:05.00,0:00:10.00,Default,John,0,0,0,,Hello world"#;
551
552        let document = EditorDocument::from_content(content).unwrap();
553        let mut search_index = FstSearchIndex::new();
554
555        search_index.build_index(&document).unwrap();
556
557        let results = search_index
558            .search("hello", &SearchOptions::default())
559            .unwrap();
560        assert!(!results.is_empty());
561
562        let stats = search_index.stats();
563        assert!(stats.index_size > 0);
564    }
565
566    #[test]
567    fn test_search_factory() {
568        let index = create_search_index();
569
570        let content = "[Script Info]\nTitle: Test";
571        let _document = EditorDocument::from_content(content).unwrap();
572
573        // Just test that we can create and use the index
574        let stats = index.stats();
575        assert_eq!(stats.match_count, 0); // No index built yet
576    }
577
578    #[test]
579    fn test_scope_filtering() {
580        let content = r#"[Script Info]
581Title: Test
582
583[Events]
584Dialogue: Hello world"#;
585
586        let document = EditorDocument::from_content(content).unwrap();
587        let mut index = LinearSearchIndex::new();
588        index.build_index(&document).unwrap();
589
590        // Test different scopes
591        let line_scope = SearchOptions {
592            scope: SearchScope::Lines { start: 0, end: 2 },
593            ..Default::default()
594        };
595
596        let _results = index.search("Test", &line_scope).unwrap();
597        // Should find results in the specified line range
598    }
599}