blz_core/
index.rs

1use crate::profiling::{ComponentTimings, OperationTimer, PerformanceMetrics};
2use crate::{Error, HeadingBlock, Result, SearchHit};
3use base64::{Engine, engine::general_purpose::STANDARD as B64};
4use sha2::{Digest, Sha256};
5use std::path::Path;
6use tantivy::collector::TopDocs;
7use tantivy::query::QueryParser;
8use tantivy::schema::{Field, STORED, STRING, Schema, TEXT, Value};
9use tantivy::{Index, IndexReader, doc};
10use tracing::{Level, debug, info};
11
/// Tantivy-based search index for llms.txt documentation
///
/// Bundles the Tantivy index handle, the resolved schema fields, and a reader.
/// One document is stored per heading block; see `index_blocks` for how each
/// field is populated and `search` for how they are read back.
pub struct SearchIndex {
    /// Underlying Tantivy index; writers and query parsers are created from it.
    index: Index,
    // Kept alongside the resolved fields but not read anywhere in this type,
    // hence the `dead_code` allowance.
    #[allow(dead_code)]
    schema: Schema,
    /// `content` field: the block body, searched by default and stored.
    content_field: Field,
    /// `path` field: the source file path passed to `index_blocks`.
    path_field: Field,
    /// `heading_path` field: heading hierarchy joined with `" > "`, searched by default.
    heading_path_field: Field,
    /// `lines` field: the block's line range formatted as `"start-end"`.
    lines_field: Field,
    /// `alias` field: the source alias, used to filter searches and to delete
    /// a source's previous documents before re-indexing.
    alias_field: Field,
    /// `anchor` field: `None` when an opened index predates the field.
    anchor_field: Option<Field>,
    /// Reader built with the `OnCommitWithDelay` reload policy; also reloaded
    /// explicitly after each commit in `index_blocks`.
    reader: IndexReader,
    /// Optional metrics sink handed to the operation timers.
    metrics: Option<PerformanceMetrics>,
}
26
27impl SearchIndex {
28    /// Enable performance metrics collection
29    #[must_use]
30    pub fn with_metrics(mut self, metrics: PerformanceMetrics) -> Self {
31        self.metrics = Some(metrics);
32        self
33    }
34
35    /// Get the performance metrics instance
36    #[must_use]
37    pub const fn metrics(&self) -> Option<&PerformanceMetrics> {
38        self.metrics.as_ref()
39    }
40    /// Creates a new search index at the specified path
41    pub fn create(index_path: &Path) -> Result<Self> {
42        let mut schema_builder = Schema::builder();
43
44        let content_field = schema_builder.add_text_field("content", TEXT | STORED);
45        let path_field = schema_builder.add_text_field("path", STRING | STORED);
46        let heading_path_field = schema_builder.add_text_field("heading_path", TEXT | STORED);
47        let lines_field = schema_builder.add_text_field("lines", STRING | STORED);
48        let alias_field = schema_builder.add_text_field("alias", STRING | STORED);
49        let anchor_field = schema_builder.add_text_field("anchor", STRING | STORED);
50
51        let schema = schema_builder.build();
52
53        std::fs::create_dir_all(index_path)
54            .map_err(|e| Error::Index(format!("Failed to create index directory: {e}")))?;
55
56        let index = Index::create_in_dir(index_path, schema.clone())
57            .map_err(|e| Error::Index(format!("Failed to create index: {e}")))?;
58
59        let reader = index
60            .reader_builder()
61            .reload_policy(tantivy::ReloadPolicy::OnCommitWithDelay)
62            .try_into()
63            .map_err(|e| Error::Index(format!("Failed to create reader: {e}")))?;
64
65        Ok(Self {
66            index,
67            schema,
68            content_field,
69            path_field,
70            heading_path_field,
71            lines_field,
72            alias_field,
73            reader,
74            anchor_field: Some(anchor_field),
75            metrics: None,
76        })
77    }
78
79    /// Creates a new search index or opens an existing one at the specified path
80    pub fn create_or_open(index_path: &Path) -> Result<Self> {
81        if index_path.exists() {
82            Self::open(index_path)
83        } else {
84            Self::create(index_path)
85        }
86    }
87
88    /// Opens an existing search index at the specified path
89    pub fn open(index_path: &Path) -> Result<Self> {
90        let index = Index::open_in_dir(index_path)
91            .map_err(|e| Error::Index(format!("Failed to open index: {e}")))?;
92
93        let schema = index.schema();
94
95        let content_field = schema
96            .get_field("content")
97            .map_err(|_| Error::Index("Missing content field".into()))?;
98        let path_field = schema
99            .get_field("path")
100            .map_err(|_| Error::Index("Missing path field".into()))?;
101        let heading_path_field = schema
102            .get_field("heading_path")
103            .map_err(|_| Error::Index("Missing heading_path field".into()))?;
104        let lines_field = schema
105            .get_field("lines")
106            .map_err(|_| Error::Index("Missing lines field".into()))?;
107        let alias_field = schema
108            .get_field("alias")
109            .map_err(|_| Error::Index("Missing alias field".into()))?;
110
111        // Anchor is optional for backward compatibility with older indexes
112        let anchor_field = schema.get_field("anchor").ok();
113
114        let reader = index
115            .reader_builder()
116            .reload_policy(tantivy::ReloadPolicy::OnCommitWithDelay)
117            .try_into()
118            .map_err(|e| Error::Index(format!("Failed to create reader: {e}")))?;
119
120        Ok(Self {
121            index,
122            schema,
123            content_field,
124            path_field,
125            heading_path_field,
126            lines_field,
127            alias_field,
128            reader,
129            anchor_field,
130            metrics: None,
131        })
132    }
133
134    /// Indexes a collection of heading blocks for a given alias
135    pub fn index_blocks(
136        &self,
137        alias: &str,
138        file_path: &str,
139        blocks: &[HeadingBlock],
140    ) -> Result<()> {
141        let timer = self.metrics.as_ref().map_or_else(
142            || OperationTimer::new(&format!("index_{alias}")),
143            |metrics| OperationTimer::with_metrics(&format!("index_{alias}"), metrics.clone()),
144        );
145
146        let mut timings = ComponentTimings::new();
147
148        let mut writer = timings.time("writer_creation", || {
149            self.index
150                .writer(50_000_000)
151                .map_err(|e| Error::Index(format!("Failed to create writer: {e}")))
152        })?;
153
154        let _deleted = timings.time("delete_existing", || {
155            writer.delete_term(tantivy::Term::from_field_text(self.alias_field, alias))
156        });
157
158        let mut total_content_bytes = 0usize;
159
160        timings.time("document_creation", || {
161            for block in blocks {
162                total_content_bytes += block.content.len();
163                let heading_path_str = block.path.join(" > ");
164                let lines_str = format!("{}-{}", block.start_line, block.end_line);
165                // Compute anchor from last heading text
166                let anchor = block.path.last().map(|h| Self::compute_anchor(h));
167
168                let mut doc = doc!(
169                    self.content_field => block.content.as_str(),  // Use &str instead of clone
170                    self.path_field => file_path,
171                    self.heading_path_field => heading_path_str,
172                    self.lines_field => lines_str,
173                    self.alias_field => alias
174                );
175                if let (Some(f), Some(a)) = (self.anchor_field, anchor) {
176                    doc.add_text(f, a);
177                }
178
179                writer
180                    .add_document(doc)
181                    .map_err(|e| Error::Index(format!("Failed to add document: {e}")))?;
182            }
183            Ok::<(), Error>(())
184        })?;
185
186        timings.time("commit", || {
187            writer
188                .commit()
189                .map_err(|e| Error::Index(format!("Failed to commit: {e}")))
190        })?;
191
192        timings.time("reader_reload", || {
193            self.reader
194                .reload()
195                .map_err(|e| Error::Index(format!("Failed to reload reader: {e}")))
196        })?;
197
198        let duration = timer.finish_index(total_content_bytes);
199
200        // Print detailed breakdown if debug logging is enabled
201        if tracing::enabled!(Level::DEBUG) {
202            timings.print_breakdown();
203        }
204
205        info!(
206            "Indexed {} blocks ({} bytes) for {} in {:.2}ms",
207            blocks.len(),
208            total_content_bytes,
209            alias,
210            duration.as_millis()
211        );
212
213        Ok(())
214    }
215
216    /// Searches the index with optional alias filtering
217    #[allow(clippy::too_many_lines)] // Complex search logic requires detailed implementation
218    pub fn search(
219        &self,
220        query_str: &str,
221        alias: Option<&str>,
222        limit: usize,
223    ) -> Result<Vec<SearchHit>> {
224        let timer = self.metrics.as_ref().map_or_else(
225            || OperationTimer::new(&format!("search_{query_str}")),
226            |metrics| OperationTimer::with_metrics(&format!("search_{query_str}"), metrics.clone()),
227        );
228
229        let mut timings = ComponentTimings::new();
230        let mut lines_searched = 0usize;
231
232        let searcher = timings.time("searcher_creation", || self.reader.searcher());
233
234        let query_parser = timings.time("query_parser_creation", || {
235            QueryParser::for_index(
236                &self.index,
237                vec![self.content_field, self.heading_path_field],
238            )
239        });
240
241        // Sanitize query more efficiently with a single allocation
242        let needs_escaping = query_str.chars().any(|c| {
243            matches!(
244                c,
245                '\\' | '"' | '(' | ')' | '[' | ']' | '{' | '}' | '^' | '~'
246            )
247        });
248
249        let full_query_str = if needs_escaping {
250            // Only allocate if we need to escape characters
251            let mut sanitized = String::with_capacity(query_str.len() * 2);
252
253            for ch in query_str.chars() {
254                match ch {
255                    '\\' => sanitized.push_str("\\\\"),
256                    '"' => sanitized.push_str("\\\""),
257                    '(' => sanitized.push_str("\\("),
258                    ')' => sanitized.push_str("\\)"),
259                    '[' => sanitized.push_str("\\["),
260                    ']' => sanitized.push_str("\\]"),
261                    '{' => sanitized.push_str("\\{"),
262                    '}' => sanitized.push_str("\\}"),
263                    '^' => sanitized.push_str("\\^"),
264                    '~' => sanitized.push_str("\\~"),
265                    _ => sanitized.push(ch),
266                }
267            }
268
269            if let Some(alias) = alias {
270                // Alias is internally controlled, no need to sanitize
271                format!("alias:{alias} AND ({sanitized})")
272            } else {
273                sanitized
274            }
275        } else {
276            // No escaping needed, minimize allocations
277            alias.map_or_else(
278                || query_str.to_string(),
279                |alias| format!("alias:{alias} AND ({query_str})"),
280            )
281        };
282
283        let query = timings.time("query_parsing", || {
284            query_parser
285                .parse_query(&full_query_str)
286                .map_err(|e| Error::Index(format!("Failed to parse query: {e}")))
287        })?;
288
289        let top_docs = timings.time("tantivy_search", || {
290            searcher
291                .search(&query, &TopDocs::with_limit(limit))
292                .map_err(|e| Error::Index(format!("Search failed: {e}")))
293        })?;
294
295        let mut hits = Vec::new();
296
297        timings.time("result_processing", || {
298            for (score, doc_address) in top_docs {
299                let doc = searcher
300                    .doc(doc_address)
301                    .map_err(|e| Error::Index(format!("Failed to retrieve doc: {e}")))?;
302
303                let alias = Self::get_field_text(&doc, self.alias_field)?;
304                let file = Self::get_field_text(&doc, self.path_field)?;
305                let heading_path_str = Self::get_field_text(&doc, self.heading_path_field)?;
306                let lines = Self::get_field_text(&doc, self.lines_field)?;
307                let content = Self::get_field_text(&doc, self.content_field)?;
308                let anchor = self.anchor_field.and_then(|f| {
309                    doc.get_first(f)
310                        .and_then(|v| v.as_str())
311                        .map(std::string::ToString::to_string)
312                });
313
314                // Count lines for metrics
315                lines_searched += content.lines().count();
316
317                let heading_path: Vec<String> = heading_path_str
318                    .split(" > ")
319                    .map(std::string::ToString::to_string)
320                    .collect();
321
322                let snippet = Self::extract_snippet(&content, query_str, 100);
323
324                // Prefer exact match line(s) when possible for better citations
325                let exact_lines = Self::compute_match_lines(&content, query_str, &lines)
326                    .unwrap_or_else(|| lines.clone());
327
328                // Parse numeric line range for convenience
329                let line_numbers = Self::parse_lines_range(&exact_lines);
330
331                hits.push(SearchHit {
332                    alias: alias.clone(),
333                    source: alias,
334                    file,
335                    heading_path,
336                    lines: exact_lines,
337                    line_numbers,
338                    snippet,
339                    score,
340                    source_url: None,
341                    checksum: String::new(),
342                    anchor,
343                });
344            }
345            Ok::<(), Error>(())
346        })?;
347
348        let duration = timer.finish_search(lines_searched);
349
350        // Print detailed breakdown if debug logging is enabled
351        if tracing::enabled!(Level::DEBUG) {
352            timings.print_breakdown();
353        }
354
355        debug!(
356            "Found {} hits for query '{}' in {:.2}ms (searched {} lines)",
357            hits.len(),
358            query_str,
359            duration.as_millis(),
360            lines_searched
361        );
362
363        Ok(hits)
364    }
365
366    fn compute_anchor(heading_text: &str) -> String {
367        let mut hasher = Sha256::new();
368        hasher.update(heading_text.trim().to_lowercase().as_bytes());
369        let digest = hasher.finalize();
370        let full = B64.encode(digest);
371        full[..22.min(full.len())].to_string()
372    }
373
374    fn get_field_text(doc: &tantivy::TantivyDocument, field: Field) -> Result<String> {
375        doc.get_first(field)
376            .and_then(|v| v.as_str())
377            .map(std::string::ToString::to_string)
378            .ok_or_else(|| Error::Index("Field not found in document".into()))
379    }
380
381    /// Compute exact match line(s) within a block's content relative to its stored line range.
382    /// Returns a "start-end" string (typically a single line) falling back to the original range on failure.
383    fn compute_match_lines(content: &str, query: &str, block_lines: &str) -> Option<String> {
384        // Parse the block's starting line
385        let block_start: usize = block_lines
386            .split(['-', ':'])
387            .next()
388            .and_then(|s| s.trim().parse::<usize>().ok())?;
389
390        // Tokenize query naively by whitespace; try to find the earliest occurrence
391        let mut best_pos: Option<usize> = None;
392        for token in query.split_whitespace() {
393            if token.is_empty() {
394                continue;
395            }
396            if let Some(pos) = content.find(token) {
397                match best_pos {
398                    Some(cur) if pos < cur => best_pos = Some(pos),
399                    None => best_pos = Some(pos),
400                    _ => {},
401                }
402            }
403        }
404
405        let pos = best_pos?;
406        // Count newlines before position to get 0-based line offset
407        let local_line = content[..pos].bytes().filter(|&b| b == b'\n').count();
408        let abs_line = block_start.saturating_add(local_line);
409        Some(format!("{abs_line}-{abs_line}"))
410    }
411
412    /// Parse a `"start-end"` or `"start:end"` range into a two-element vector.
413    /// Returns None if parsing fails or inputs are invalid.
414    fn parse_lines_range(range: &str) -> Option<Vec<usize>> {
415        let mut parts = range.split(['-', ':']);
416        let start = parts.next()?.trim().parse::<usize>().ok()?;
417        let end = parts.next()?.trim().parse::<usize>().ok()?;
418        Some(vec![start, end])
419    }
420
421    fn extract_snippet(content: &str, query: &str, max_len: usize) -> String {
422        let query_lower = query.to_lowercase();
423
424        // Find match position using character indices to handle Unicode correctly
425        let mut match_char_pos = None;
426
427        // Use a sliding window approach with character iteration
428        let content_chars: Vec<char> = content.chars().collect();
429        let query_chars: Vec<char> = query_lower.chars().collect();
430
431        if !query_chars.is_empty() {
432            for window_start in 0..content_chars.len() {
433                let window_end = (window_start + query_chars.len()).min(content_chars.len());
434                if window_end - window_start < query_chars.len() {
435                    break;
436                }
437
438                // Check if this window matches (case-insensitive)
439                let window_matches = content_chars[window_start..window_end]
440                    .iter()
441                    .zip(query_chars.iter())
442                    .all(|(c1, c2)| c1.to_lowercase().eq(c2.to_lowercase()));
443
444                if window_matches {
445                    match_char_pos = Some(window_start);
446                    break;
447                }
448            }
449        }
450
451        if let Some(char_pos) = match_char_pos {
452            // Derive context from max_len so we don't overshoot the requested length.
453            let total_chars = content_chars.len();
454            let qlen = query_chars.len();
455            let ctx_each_side = max_len.saturating_sub(qlen) / 2;
456
457            let start_char = char_pos.saturating_sub(ctx_each_side);
458            let mut end_char = (char_pos + qlen + ctx_each_side).min(total_chars);
459
460            // Clamp to at most max_len characters around the match.
461            let span = end_char.saturating_sub(start_char);
462            if span > max_len {
463                end_char = start_char + max_len;
464            }
465
466            let left_trunc = start_char > 0;
467            let right_trunc = end_char < total_chars;
468
469            // Build snippet
470            let mut snippet = String::with_capacity((end_char - start_char) * 4 + 6);
471            if left_trunc {
472                snippet.push_str("...");
473            }
474            for &ch in content_chars.iter().take(end_char).skip(start_char) {
475                snippet.push(ch);
476            }
477            if right_trunc {
478                snippet.push_str("...");
479            }
480            return snippet;
481        }
482
483        // No match found - return truncated content using character count
484        let content_chars: Vec<char> = content.chars().collect();
485        if content_chars.len() <= max_len {
486            content.to_string()
487        } else {
488            // Truncate based on character count, not byte count
489            let mut result = String::with_capacity(max_len * 4 + 3);
490            for (i, ch) in content_chars.iter().enumerate() {
491                if i >= max_len {
492                    break;
493                }
494                result.push(*ch);
495            }
496            result.push_str("...");
497            result
498        }
499    }
500}
501
502#[cfg(test)]
503mod tests {
504    #![allow(clippy::panic)]
505    #![allow(clippy::disallowed_macros)]
506    #![allow(clippy::unwrap_used)]
507    use super::*;
508    use crate::HeadingBlock;
509    use std::time::Instant;
510    use tempfile::TempDir;
511
512    fn create_test_blocks() -> Vec<HeadingBlock> {
513        vec![
514            HeadingBlock {
515                path: vec!["React".to_string(), "Hooks".to_string()],
516                content: "useState is a React hook that lets you add state to functional components. It returns an array with the current state value and a function to update it.".to_string(),
517                start_line: 100,
518                end_line: 120,
519            },
520            HeadingBlock {
521                path: vec!["React".to_string(), "Components".to_string()],
522                content: "Components are the building blocks of React applications. They can be function components or class components.".to_string(),
523                start_line: 50,
524                end_line: 75,
525            },
526            HeadingBlock {
527                path: vec!["Next.js".to_string(), "Routing".to_string()],
528                content: "App Router is the new routing system in Next.js 13+. It provides better performance and developer experience.".to_string(),
529                start_line: 200,
530                end_line: 250,
531            },
532        ]
533    }
534
535    #[test]
536    fn test_index_creation() {
537        let temp_dir = TempDir::new().expect("Failed to create temp dir");
538        let index_path = temp_dir.path().join("test_index");
539
540        let result = SearchIndex::create(&index_path);
541        assert!(result.is_ok(), "Should create index successfully");
542
543        // Verify index directory was created
544        assert!(index_path.exists());
545    }
546
547    #[test]
548    fn test_index_open_nonexistent() {
549        let temp_dir = TempDir::new().expect("Failed to create temp dir");
550        let index_path = temp_dir.path().join("nonexistent");
551
552        let result = SearchIndex::open(&index_path);
553        assert!(result.is_err(), "Should fail to open non-existent index");
554    }
555
556    #[test]
557    fn test_index_and_search_basic() {
558        let temp_dir = TempDir::new().expect("Failed to create temp dir");
559        let index_path = temp_dir.path().join("test_index");
560
561        // Create index and add blocks
562        let index = SearchIndex::create(&index_path).expect("Should create index");
563        let blocks = create_test_blocks();
564
565        index
566            .index_blocks("test", "test.md", &blocks)
567            .expect("Should index blocks");
568
569        // Search for content
570        let hits = index
571            .search("useState", Some("test"), 10)
572            .expect("Should search");
573
574        assert!(!hits.is_empty(), "Should find results for useState");
575        assert!(
576            hits[0].snippet.contains("useState"),
577            "Result should contain useState"
578        );
579        assert_eq!(hits[0].alias, "test");
580        assert_eq!(hits[0].file, "test.md");
581    }
582
583    #[test]
584    fn test_search_limit() {
585        let temp_dir = TempDir::new().expect("Failed to create temp dir");
586        let index_path = temp_dir.path().join("test_index");
587
588        let index = SearchIndex::create(&index_path).expect("Should create index");
589        let blocks = create_test_blocks();
590
591        index
592            .index_blocks("test", "test.md", &blocks)
593            .expect("Should index blocks");
594
595        // Search with limit
596        let hits = index
597            .search("React", Some("test"), 1)
598            .expect("Should search");
599
600        assert!(!hits.is_empty(), "Should find results");
601        assert!(hits.len() <= 1, "Should respect limit");
602    }
603
604    #[test]
605    fn test_search_includes_anchor() {
606        let temp_dir = TempDir::new().expect("Failed to create temp dir");
607        let index_path = temp_dir.path().join("test_index");
608
609        let index = SearchIndex::create(&index_path).expect("Should create index");
610
611        let blocks = vec![HeadingBlock {
612            path: vec!["API".to_string(), "Reference".to_string()],
613            content: "token auth key".to_string(),
614            start_line: 10,
615            end_line: 20,
616        }];
617
618        index
619            .index_blocks("test", "api.md", &blocks)
620            .expect("Should index blocks");
621
622        let hits = index
623            .search("token", Some("test"), 10)
624            .expect("Should search");
625
626        assert!(!hits.is_empty());
627        assert!(hits[0].anchor.is_some(), "anchor should be present in hits");
628        // Anchor should be derived from the last heading segment
629        let expected = SearchIndex::compute_anchor("Reference");
630        assert_eq!(hits[0].anchor.clone().unwrap(), expected);
631    }
632
633    #[test]
634    fn test_search_no_results() {
635        let temp_dir = TempDir::new().expect("Failed to create temp dir");
636        let index_path = temp_dir.path().join("test_index");
637
638        let index = SearchIndex::create(&index_path).expect("Should create index");
639        let blocks = create_test_blocks();
640
641        index
642            .index_blocks("test", "test.md", &blocks)
643            .expect("Should index blocks");
644
645        // Search for non-existent term
646        let hits = index
647            .search("nonexistentterm12345", Some("test"), 10)
648            .expect("Should search");
649
650        assert!(
651            hits.is_empty(),
652            "Should find no results for non-existent term"
653        );
654    }
655
656    #[test]
657    fn test_search_performance() {
658        let temp_dir = TempDir::new().expect("Failed to create temp dir");
659        let index_path = temp_dir.path().join("test_index");
660
661        let index = SearchIndex::create(&index_path).expect("Should create index");
662
663        // Create many blocks for performance testing
664        let mut blocks = Vec::new();
665        for i in 0..100 {
666            blocks.push(HeadingBlock {
667                path: vec![format!("Section{}", i)],
668                content: format!("This is content block {i} with various keywords like React, hooks, components, and performance testing."),
669                start_line: i * 10,
670                end_line: i * 10 + 5,
671            });
672        }
673
674        index
675            .index_blocks("perftest", "large.md", &blocks)
676            .expect("Should index many blocks");
677
678        // Test search performance
679        let start = Instant::now();
680        let hits = index
681            .search("React", Some("perftest"), 50)
682            .expect("Should search");
683        let duration = start.elapsed();
684
685        assert!(!hits.is_empty(), "Should find results");
686        assert!(
687            duration.as_millis() < 100,
688            "Search should be fast (<100ms), took {}ms",
689            duration.as_millis()
690        );
691    }
692
693    #[test]
694    fn test_search_scoring() {
695        let temp_dir = TempDir::new().expect("Failed to create temp dir");
696        let index_path = temp_dir.path().join("test_index");
697
698        let index = SearchIndex::create(&index_path).expect("Should create index");
699
700        let blocks = vec![
701            HeadingBlock {
702                path: vec!["Exact Match".to_string()],
703                content: "React hooks".to_string(),
704                start_line: 1,
705                end_line: 5,
706            },
707            HeadingBlock {
708                path: vec!["Partial Match".to_string()],
709                content: "React components and hooks are useful features".to_string(),
710                start_line: 10,
711                end_line: 15,
712            },
713            HeadingBlock {
714                path: vec!["Distant Match".to_string()],
715                content: "In React, you can use various hooks for different purposes".to_string(),
716                start_line: 20,
717                end_line: 25,
718            },
719        ];
720
721        index
722            .index_blocks("test", "test.md", &blocks)
723            .expect("Should index blocks");
724
725        let hits = index
726            .search("React hooks", Some("test"), 10)
727            .expect("Should search");
728
729        assert!(!hits.is_empty(), "Should find results");
730
731        // Results should be ordered by relevance (score)
732        for i in 1..hits.len() {
733            assert!(
734                hits[i - 1].score >= hits[i].score,
735                "Results should be ordered by descending score"
736            );
737        }
738
739        // The exact match should have the highest score
740        assert!(
741            hits[0].snippet.contains("React hooks"),
742            "Highest scored result should contain exact match"
743        );
744    }
745
746    #[test]
747    fn test_heading_path_in_results() {
748        let temp_dir = TempDir::new().expect("Failed to create temp dir");
749        let index_path = temp_dir.path().join("test_index");
750
751        let index = SearchIndex::create(&index_path).expect("Should create index");
752
753        let blocks = vec![HeadingBlock {
754            path: vec![
755                "API".to_string(),
756                "Reference".to_string(),
757                "Hooks".to_string(),
758            ],
759            content: "useState hook documentation".to_string(),
760            start_line: 100,
761            end_line: 120,
762        }];
763
764        index
765            .index_blocks("test", "api.md", &blocks)
766            .expect("Should index blocks");
767
768        let hits = index
769            .search("useState", Some("test"), 10)
770            .expect("Should search");
771
772        assert!(!hits.is_empty(), "Should find results");
773        assert_eq!(hits[0].heading_path, vec!["API", "Reference", "Hooks"]);
774        assert_eq!(hits[0].file, "api.md");
775        // Lines should point to the exact match within the block (first line)
776        assert!(
777            hits[0].lines.starts_with("100-"),
778            "Expected match to start at line 100, got {}",
779            hits[0].lines
780        );
781    }
782
783    #[test]
784    fn test_unicode_snippet_extraction() {
785        let temp_dir = TempDir::new().expect("Failed to create temp dir");
786        let index_path = temp_dir.path().join("test_index");
787        let index = SearchIndex::create(&index_path).expect("Should create index");
788
789        // Test with various Unicode content
790        let unicode_blocks = vec![
791            HeadingBlock {
792                path: vec!["Unicode".to_string(), "Emoji".to_string()],
793                content: "This is a test with emojis: πŸ‘‹ Hello 🌍 World! πŸš€ Let's go! πŸŽ‰"
794                    .to_string(),
795                start_line: 1,
796                end_line: 10,
797            },
798            HeadingBlock {
799                path: vec!["Unicode".to_string(), "Chinese".to_string()],
800                content: "θΏ™ζ˜―δΈ­ζ–‡ζ΅‹θ―•γ€‚Hello δΈ–η•ŒοΌProgramming 编程 is εΎˆζœ‰θΆ£γ€‚".to_string(),
801                start_line: 20,
802                end_line: 30,
803            },
804            HeadingBlock {
805                path: vec!["Unicode".to_string(), "Mixed".to_string()],
806                content: "ζ—₯本θͺž γƒ†γ‚Ήγƒˆ πŸ‡―πŸ‡΅ with mixed content".to_string(),
807                start_line: 40,
808                end_line: 50,
809            },
810        ];
811
812        index
813            .index_blocks("unicode_test", "test.md", &unicode_blocks)
814            .expect("Should index blocks");
815
816        // Test searching for various Unicode content
817        let test_cases = vec![("emoji", "πŸ‘‹"), ("δΈ­ζ–‡", "ζ΅‹θ―•"), ("programming", "编程")];
818
819        for (query, _expected_content) in test_cases {
820            let results = index
821                .search(query, Some("unicode_test"), 10)
822                .unwrap_or_else(|_| panic!("Should search for '{query}'"));
823
824            if !results.is_empty() {
825                let hit = &results[0];
826                // Verify snippet doesn't panic on Unicode boundaries
827                assert!(hit.snippet.is_char_boundary(0));
828                assert!(hit.snippet.is_char_boundary(hit.snippet.len()));
829
830                // Verify we can iterate over chars without panic
831                let _char_count = hit.snippet.chars().count();
832            }
833        }
834    }
835
836    #[test]
837    fn test_edge_case_unicode_truncation() {
838        let temp_dir = TempDir::new().expect("Failed to create temp dir");
839        let index_path = temp_dir.path().join("test_index");
840        let index = SearchIndex::create(&index_path).expect("Should create index");
841
842        // Create content where truncation would happen in middle of multi-byte chars
843        let mut long_content = String::new();
844        for _ in 0..20 {
845            long_content.push_str("πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦"); // Family emoji (complex grapheme cluster)
846        }
847        long_content.push_str(" MARKER ");
848        for _ in 0..20 {
849            long_content.push_str("πŸ³οΈβ€πŸŒˆ"); // Rainbow flag (another complex emoji)
850        }
851
852        let blocks = vec![HeadingBlock {
853            path: vec!["Test".to_string()],
854            content: long_content.clone(),
855            start_line: 1,
856            end_line: 10,
857        }];
858
859        index
860            .index_blocks("edge_test", "test.md", &blocks)
861            .expect("Should index blocks");
862
863        let results = index
864            .search("MARKER", Some("edge_test"), 10)
865            .expect("Should search");
866
867        assert!(!results.is_empty());
868        let snippet = &results[0].snippet;
869
870        // Verify the snippet is valid UTF-8 and doesn't panic
871        assert!(snippet.is_char_boundary(0));
872        assert!(snippet.is_char_boundary(snippet.len()));
873        assert!(snippet.contains("MARKER"));
874
875        // Verify we can iterate over chars without panic
876        let char_count = snippet.chars().count();
877        assert!(char_count > 0);
878    }
879}