blz_core/
index.rs

1use crate::profiling::{ComponentTimings, OperationTimer, PerformanceMetrics};
2use crate::{Error, HeadingBlock, Result, SearchHit};
3use base64::{Engine, engine::general_purpose::STANDARD as B64};
4use sha2::{Digest, Sha256};
5use std::path::Path;
6use tantivy::collector::TopDocs;
7use tantivy::query::QueryParser;
8use tantivy::schema::{Field, STORED, STRING, Schema, TEXT, Value};
9use tantivy::{Index, IndexReader, doc};
10use tracing::{Level, debug, info};
11
/// Snippet length, in characters, used when the caller does not ask for one (ellipses excluded).
pub const DEFAULT_SNIPPET_CHAR_LIMIT: usize = 200;
/// Smallest snippet length, in characters, that a request is allowed to resolve to.
pub const MIN_SNIPPET_CHAR_LIMIT: usize = 50;
/// Largest snippet length, in characters, that a request is allowed to resolve to.
pub const MAX_SNIPPET_CHAR_LIMIT: usize = 1_000;

/// Forces a requested snippet length into the inclusive range
/// `MIN_SNIPPET_CHAR_LIMIT..=MAX_SNIPPET_CHAR_LIMIT`.
///
/// Values inside the range are returned untouched.
pub(crate) const fn clamp_snippet_chars(chars: usize) -> usize {
    match chars {
        too_small if too_small < MIN_SNIPPET_CHAR_LIMIT => MIN_SNIPPET_CHAR_LIMIT,
        too_large if too_large > MAX_SNIPPET_CHAR_LIMIT => MAX_SNIPPET_CHAR_LIMIT,
        in_range => in_range,
    }
}
28
29/// Tantivy-based search index for llms.txt documentation
30pub struct SearchIndex {
31    index: Index,
32    #[allow(dead_code)]
33    schema: Schema,
34    content_field: Field,
35    path_field: Field,
36    heading_path_field: Field,
37    lines_field: Field,
38    alias_field: Field,
39    anchor_field: Option<Field>,
40    reader: IndexReader,
41    metrics: Option<PerformanceMetrics>,
42}
43
44impl SearchIndex {
45    /// Enable performance metrics collection
46    #[must_use]
47    pub fn with_metrics(mut self, metrics: PerformanceMetrics) -> Self {
48        self.metrics = Some(metrics);
49        self
50    }
51
52    /// Get the performance metrics instance
53    #[must_use]
54    pub const fn metrics(&self) -> Option<&PerformanceMetrics> {
55        self.metrics.as_ref()
56    }
57    /// Creates a new search index at the specified path
58    pub fn create(index_path: &Path) -> Result<Self> {
59        let mut schema_builder = Schema::builder();
60
61        let content_field = schema_builder.add_text_field("content", TEXT | STORED);
62        let path_field = schema_builder.add_text_field("path", STRING | STORED);
63        let heading_path_field = schema_builder.add_text_field("heading_path", TEXT | STORED);
64        let lines_field = schema_builder.add_text_field("lines", STRING | STORED);
65        let alias_field = schema_builder.add_text_field("alias", STRING | STORED);
66        let anchor_field = schema_builder.add_text_field("anchor", STRING | STORED);
67
68        let schema = schema_builder.build();
69
70        std::fs::create_dir_all(index_path)
71            .map_err(|e| Error::Index(format!("Failed to create index directory: {e}")))?;
72
73        let index = Index::create_in_dir(index_path, schema.clone())
74            .map_err(|e| Error::Index(format!("Failed to create index: {e}")))?;
75
76        let reader = index
77            .reader_builder()
78            .reload_policy(tantivy::ReloadPolicy::OnCommitWithDelay)
79            .try_into()
80            .map_err(|e| Error::Index(format!("Failed to create reader: {e}")))?;
81
82        Ok(Self {
83            index,
84            schema,
85            content_field,
86            path_field,
87            heading_path_field,
88            lines_field,
89            alias_field,
90            reader,
91            anchor_field: Some(anchor_field),
92            metrics: None,
93        })
94    }
95
96    /// Creates a new search index or opens an existing one at the specified path
97    pub fn create_or_open(index_path: &Path) -> Result<Self> {
98        if index_path.exists() {
99            Self::open(index_path)
100        } else {
101            Self::create(index_path)
102        }
103    }
104
105    /// Opens an existing search index at the specified path
106    pub fn open(index_path: &Path) -> Result<Self> {
107        let index = Index::open_in_dir(index_path)
108            .map_err(|e| Error::Index(format!("Failed to open index: {e}")))?;
109
110        let schema = index.schema();
111
112        let content_field = schema
113            .get_field("content")
114            .map_err(|_| Error::Index("Missing content field".into()))?;
115        let path_field = schema
116            .get_field("path")
117            .map_err(|_| Error::Index("Missing path field".into()))?;
118        let heading_path_field = schema
119            .get_field("heading_path")
120            .map_err(|_| Error::Index("Missing heading_path field".into()))?;
121        let lines_field = schema
122            .get_field("lines")
123            .map_err(|_| Error::Index("Missing lines field".into()))?;
124        let alias_field = schema
125            .get_field("alias")
126            .map_err(|_| Error::Index("Missing alias field".into()))?;
127
128        // Anchor is optional for backward compatibility with older indexes
129        let anchor_field = schema.get_field("anchor").ok();
130
131        let reader = index
132            .reader_builder()
133            .reload_policy(tantivy::ReloadPolicy::OnCommitWithDelay)
134            .try_into()
135            .map_err(|e| Error::Index(format!("Failed to create reader: {e}")))?;
136
137        Ok(Self {
138            index,
139            schema,
140            content_field,
141            path_field,
142            heading_path_field,
143            lines_field,
144            alias_field,
145            reader,
146            anchor_field,
147            metrics: None,
148        })
149    }
150
151    /// Indexes a collection of heading blocks for a given alias
152    pub fn index_blocks(&self, alias: &str, blocks: &[HeadingBlock]) -> Result<()> {
153        let timer = self.metrics.as_ref().map_or_else(
154            || OperationTimer::new(&format!("index_{alias}")),
155            |metrics| OperationTimer::with_metrics(&format!("index_{alias}"), metrics.clone()),
156        );
157
158        let mut timings = ComponentTimings::new();
159
160        let mut writer = timings.time("writer_creation", || {
161            self.index
162                .writer(50_000_000)
163                .map_err(|e| Error::Index(format!("Failed to create writer: {e}")))
164        })?;
165
166        // Delete all existing documents for this alias
167        let _deleted = timings.time("delete_existing", || {
168            writer.delete_term(tantivy::Term::from_field_text(self.alias_field, alias))
169        });
170
171        let mut total_content_bytes = 0usize;
172
173        timings.time("document_creation", || {
174            for block in blocks {
175                total_content_bytes += block.content.len();
176                let heading_path_str = block.path.join(" > ");
177                let lines_str = format!("{}-{}", block.start_line, block.end_line);
178                // Compute anchor from last heading text
179                let anchor = block.path.last().map(|h| Self::compute_anchor(h));
180
181                let mut doc = doc!(
182                    self.content_field => block.content.as_str(),  // Use &str instead of clone
183                    self.path_field => "llms.txt",  // Always llms.txt (no flavor variants)
184                    self.heading_path_field => heading_path_str,
185                    self.lines_field => lines_str,
186                    self.alias_field => alias
187                );
188                if let (Some(f), Some(a)) = (self.anchor_field, anchor) {
189                    doc.add_text(f, a);
190                }
191
192                writer
193                    .add_document(doc)
194                    .map_err(|e| Error::Index(format!("Failed to add document: {e}")))?;
195            }
196            Ok::<(), Error>(())
197        })?;
198
199        timings.time("commit", || {
200            writer
201                .commit()
202                .map_err(|e| Error::Index(format!("Failed to commit: {e}")))
203        })?;
204
205        timings.time("reader_reload", || {
206            self.reader
207                .reload()
208                .map_err(|e| Error::Index(format!("Failed to reload reader: {e}")))
209        })?;
210
211        let duration = timer.finish_index(total_content_bytes);
212
213        // Print detailed breakdown if debug logging is enabled
214        if tracing::enabled!(Level::DEBUG) {
215            timings.print_breakdown();
216        }
217
218        info!(
219            "Indexed {} blocks ({} bytes) for {} in {:.2}ms",
220            blocks.len(),
221            total_content_bytes,
222            alias,
223            duration.as_millis()
224        );
225
226        Ok(())
227    }
228
229    /// Searches the index with optional alias filtering
230    pub fn search(
231        &self,
232        query_str: &str,
233        alias: Option<&str>,
234        limit: usize,
235    ) -> Result<Vec<SearchHit>> {
236        self.search_with_snippet_limit(query_str, alias, limit, DEFAULT_SNIPPET_CHAR_LIMIT)
237    }
238
239    /// Searches the index with optional alias filtering and an explicit snippet character limit.
240    #[allow(clippy::too_many_lines)] // Complex search logic requires detailed implementation
241    pub fn search_with_snippet_limit(
242        &self,
243        query_str: &str,
244        alias: Option<&str>,
245        limit: usize,
246        snippet_max_chars: usize,
247    ) -> Result<Vec<SearchHit>> {
248        let timer = self.metrics.as_ref().map_or_else(
249            || OperationTimer::new(&format!("search_{query_str}")),
250            |metrics| OperationTimer::with_metrics(&format!("search_{query_str}"), metrics.clone()),
251        );
252
253        let mut timings = ComponentTimings::new();
254        let mut lines_searched = 0usize;
255        let snippet_limit = clamp_snippet_chars(snippet_max_chars);
256
257        let searcher = timings.time("searcher_creation", || self.reader.searcher());
258
259        let query_parser = timings.time("query_parser_creation", || {
260            QueryParser::for_index(
261                &self.index,
262                vec![self.content_field, self.heading_path_field],
263            )
264        });
265
266        // Sanitize query more efficiently with a single allocation
267        let needs_escaping = query_str.chars().any(|c| {
268            matches!(
269                c,
270                '\\' | '(' | ')' | '[' | ']' | '{' | '}' | '^' | '~' | ':'
271            )
272        });
273
274        let mut filter_clauses = Vec::new();
275        if let Some(alias) = alias {
276            filter_clauses.push(format!("alias:{alias}"));
277        }
278
279        let sanitized_query = if needs_escaping {
280            // Only allocate if we need to escape characters
281            let mut sanitized = String::with_capacity(query_str.len() * 2);
282
283            for ch in query_str.chars() {
284                match ch {
285                    '\\' => sanitized.push_str("\\\\"),
286                    '(' => sanitized.push_str("\\("),
287                    ')' => sanitized.push_str("\\)"),
288                    '[' => sanitized.push_str("\\["),
289                    ']' => sanitized.push_str("\\]"),
290                    '{' => sanitized.push_str("\\{"),
291                    '}' => sanitized.push_str("\\}"),
292                    '^' => sanitized.push_str("\\^"),
293                    '~' => sanitized.push_str("\\~"),
294                    ':' => sanitized.push_str("\\:"),
295                    _ => sanitized.push(ch),
296                }
297            }
298
299            sanitized
300        } else {
301            query_str.to_string()
302        };
303
304        let full_query_str = if filter_clauses.is_empty() {
305            sanitized_query
306        } else {
307            format!("{} AND ({sanitized_query})", filter_clauses.join(" AND "))
308        };
309
310        let query = timings.time("query_parsing", || {
311            query_parser
312                .parse_query(&full_query_str)
313                .map_err(|e| Error::Index(format!("Failed to parse query: {e}")))
314        })?;
315
316        let top_docs = timings.time("tantivy_search", || {
317            searcher
318                .search(&query, &TopDocs::with_limit(limit))
319                .map_err(|e| Error::Index(format!("Search failed: {e}")))
320        })?;
321
322        let mut hits = Vec::new();
323
324        timings.time("result_processing", || {
325            for (score, doc_address) in top_docs {
326                let doc = searcher
327                    .doc(doc_address)
328                    .map_err(|e| Error::Index(format!("Failed to retrieve doc: {e}")))?;
329
330                let alias = Self::get_field_text(&doc, self.alias_field)?;
331                let file = Self::get_field_text(&doc, self.path_field)?;
332                let heading_path_str = Self::get_field_text(&doc, self.heading_path_field)?;
333                let lines = Self::get_field_text(&doc, self.lines_field)?;
334                let content = Self::get_field_text(&doc, self.content_field)?;
335                let anchor = self.anchor_field.and_then(|f| {
336                    doc.get_first(f)
337                        .and_then(|v| v.as_str())
338                        .map(std::string::ToString::to_string)
339                });
340
341                // Count lines for metrics
342                lines_searched += content.lines().count();
343
344                let heading_path: Vec<String> = heading_path_str
345                    .split(" > ")
346                    .map(std::string::ToString::to_string)
347                    .collect();
348
349                let snippet = Self::extract_snippet(&content, query_str, snippet_limit);
350
351                // Prefer exact match line(s) when possible for better citations
352                let exact_lines = Self::compute_match_lines(&content, query_str, &lines)
353                    .unwrap_or_else(|| lines.clone());
354
355                // Parse numeric line range for convenience
356                let line_numbers = Self::parse_lines_range(&exact_lines);
357
358                hits.push(SearchHit {
359                    source: alias,
360                    file,
361                    heading_path,
362                    lines: exact_lines,
363                    line_numbers,
364                    snippet,
365                    score,
366                    source_url: None,
367                    fetched_at: None,
368                    is_stale: false,
369                    checksum: String::new(),
370                    anchor,
371                    context: None,
372                });
373            }
374            Ok::<(), Error>(())
375        })?;
376
377        let duration = timer.finish_search(lines_searched);
378
379        // Print detailed breakdown if debug logging is enabled
380        if tracing::enabled!(Level::DEBUG) {
381            timings.print_breakdown();
382        }
383
384        debug!(
385            "Found {} hits for query '{}' in {:.2}ms (searched {} lines)",
386            hits.len(),
387            query_str,
388            duration.as_millis(),
389            lines_searched
390        );
391
392        Ok(hits)
393    }
394
395    fn compute_anchor(heading_text: &str) -> String {
396        let mut hasher = Sha256::new();
397        hasher.update(heading_text.trim().to_lowercase().as_bytes());
398        let digest = hasher.finalize();
399        let full = B64.encode(digest);
400        full[..22.min(full.len())].to_string()
401    }
402
403    fn get_field_text(doc: &tantivy::TantivyDocument, field: Field) -> Result<String> {
404        doc.get_first(field)
405            .and_then(|v| v.as_str())
406            .map(std::string::ToString::to_string)
407            .ok_or_else(|| Error::Index("Field not found in document".into()))
408    }
409
410    /// Compute exact match line(s) within a block's content relative to its stored line range.
411    /// Returns a "start-end" string (typically a single line) falling back to the original range on failure.
412    fn compute_match_lines(content: &str, query: &str, block_lines: &str) -> Option<String> {
413        // Parse the block's starting line
414        let block_start: usize = block_lines
415            .split(['-', ':'])
416            .next()
417            .and_then(|s| s.trim().parse::<usize>().ok())?;
418
419        // Tokenize while preserving quoted phrases so we prefer the full phrase when present.
420        let mut phrases = Vec::new();
421        let mut terms = Vec::new();
422        let mut current = String::new();
423        let mut in_quotes = false;
424        for ch in query.chars() {
425            match ch {
426                '"' => {
427                    if in_quotes {
428                        if !current.is_empty() {
429                            phrases.push(current.clone());
430                            current.clear();
431                        }
432                        in_quotes = false;
433                    } else {
434                        in_quotes = true;
435                    }
436                },
437                ch if ch.is_whitespace() && !in_quotes => {
438                    if !current.is_empty() {
439                        terms.push(current.clone());
440                        current.clear();
441                    }
442                },
443                _ => current.push(ch),
444            }
445        }
446        if !current.is_empty() {
447            if in_quotes {
448                phrases.push(current);
449            } else {
450                terms.push(current);
451            }
452        }
453
454        let phrases: Vec<String> = phrases
455            .into_iter()
456            .map(|token| {
457                token
458                    .trim_matches('"')
459                    .trim_start_matches(['+', '-'])
460                    .trim()
461                    .to_string()
462            })
463            .filter(|s| !s.is_empty())
464            .collect();
465        let terms: Vec<String> = terms
466            .into_iter()
467            .map(|token| {
468                token
469                    .trim_matches('"')
470                    .trim_start_matches(['+', '-'])
471                    .trim()
472                    .to_string()
473            })
474            .filter(|s| !s.is_empty())
475            .collect();
476
477        let mut best_pos: Option<usize> = None;
478        for token in phrases.iter().chain(terms.iter()) {
479            if let Some(pos) = content.find(token) {
480                best_pos = Some(best_pos.map_or(pos, |cur| pos.min(cur)));
481            }
482        }
483
484        let pos = best_pos?;
485        // Count newlines before position to get 0-based line offset
486        let local_line = content[..pos].bytes().filter(|&b| b == b'\n').count();
487        let abs_line = block_start.saturating_add(local_line);
488        Some(format!("{abs_line}-{abs_line}"))
489    }
490
491    /// Parse a `"start-end"` or `"start:end"` range into a two-element vector.
492    /// Returns None if parsing fails or inputs are invalid.
493    fn parse_lines_range(range: &str) -> Option<Vec<usize>> {
494        let mut parts = range.split(['-', ':']);
495        let start = parts.next()?.trim().parse::<usize>().ok()?;
496        let end = parts.next()?.trim().parse::<usize>().ok()?;
497        Some(vec![start, end])
498    }
499
500    fn extract_snippet(content: &str, query: &str, max_len: usize) -> String {
501        // Prefer phrase matching when the whole query is quoted; otherwise use the raw query.
502        let trimmed = query.trim();
503        let phrase_candidate =
504            if trimmed.len() >= 2 && trimmed.starts_with('"') && trimmed.ends_with('"') {
505                &trimmed[1..trimmed.len() - 1]
506            } else {
507                query
508            };
509        let query_lower = phrase_candidate.to_lowercase();
510
511        // Find match position using character indices to handle Unicode correctly
512        let mut match_char_pos = None;
513
514        // Use a sliding window approach with character iteration
515        let content_chars: Vec<char> = content.chars().collect();
516        let query_chars: Vec<char> = query_lower.chars().collect();
517
518        if !query_chars.is_empty() {
519            for window_start in 0..content_chars.len() {
520                let window_end = (window_start + query_chars.len()).min(content_chars.len());
521                if window_end - window_start < query_chars.len() {
522                    break;
523                }
524
525                // Check if this window matches (case-insensitive)
526                let window_matches = content_chars[window_start..window_end]
527                    .iter()
528                    .zip(query_chars.iter())
529                    .all(|(c1, c2)| c1.to_lowercase().eq(c2.to_lowercase()));
530
531                if window_matches {
532                    match_char_pos = Some(window_start);
533                    break;
534                }
535            }
536        }
537
538        if let Some(char_pos) = match_char_pos {
539            // Derive context from max_len so we don't overshoot the requested length.
540            let total_chars = content_chars.len();
541            let qlen = query_chars.len();
542            let ctx_each_side = max_len.saturating_sub(qlen) / 2;
543
544            let start_char = char_pos.saturating_sub(ctx_each_side);
545            let mut end_char = (char_pos + qlen + ctx_each_side).min(total_chars);
546
547            // Clamp to at most max_len characters around the match.
548            let span = end_char.saturating_sub(start_char);
549            if span > max_len {
550                end_char = start_char + max_len;
551            }
552
553            let left_trunc = start_char > 0;
554            let right_trunc = end_char < total_chars;
555
556            // Build snippet
557            let mut snippet = String::with_capacity((end_char - start_char) * 4 + 6);
558            if left_trunc {
559                snippet.push_str("...");
560            }
561            for &ch in content_chars.iter().take(end_char).skip(start_char) {
562                snippet.push(ch);
563            }
564            if right_trunc {
565                snippet.push_str("...");
566            }
567            return snippet;
568        }
569
570        // No match found - return truncated content using character count
571        let content_chars: Vec<char> = content.chars().collect();
572        if content_chars.len() <= max_len {
573            content.to_string()
574        } else {
575            // Truncate based on character count, not byte count
576            let mut result = String::with_capacity(max_len * 4 + 3);
577            for (i, ch) in content_chars.iter().enumerate() {
578                if i >= max_len {
579                    break;
580                }
581                result.push(*ch);
582            }
583            result.push_str("...");
584            result
585        }
586    }
587}
588
589#[cfg(test)]
590mod tests {
591    #![allow(clippy::panic)]
592    #![allow(clippy::disallowed_macros)]
593    #![allow(clippy::unwrap_used)]
594    use super::*;
595    use crate::HeadingBlock;
596    use std::time::Instant;
597    use tempfile::TempDir;
598
599    fn create_test_blocks() -> Vec<HeadingBlock> {
600        vec![
601            HeadingBlock {
602                path: vec!["React".to_string(), "Hooks".to_string()],
603                content: "useState is a React hook that lets you add state to functional components. It returns an array with the current state value and a function to update it.".to_string(),
604                start_line: 100,
605                end_line: 120,
606            },
607            HeadingBlock {
608                path: vec!["React".to_string(), "Components".to_string()],
609                content: "Components are the building blocks of React applications. They can be function components or class components.".to_string(),
610                start_line: 50,
611                end_line: 75,
612            },
613            HeadingBlock {
614                path: vec!["Next.js".to_string(), "Routing".to_string()],
615                content: "App Router is the new routing system in Next.js 13+. It provides better performance and developer experience.".to_string(),
616                start_line: 200,
617                end_line: 250,
618            },
619        ]
620    }
621
622    #[test]
623    fn test_index_creation() {
624        let temp_dir = TempDir::new().expect("Failed to create temp dir");
625        let index_path = temp_dir.path().join("test_index");
626
627        let result = SearchIndex::create(&index_path);
628        assert!(result.is_ok(), "Should create index successfully");
629
630        // Verify index directory was created
631        assert!(index_path.exists());
632    }
633
634    #[test]
635    fn test_index_open_nonexistent() {
636        let temp_dir = TempDir::new().expect("Failed to create temp dir");
637        let index_path = temp_dir.path().join("nonexistent");
638
639        let result = SearchIndex::open(&index_path);
640        assert!(result.is_err(), "Should fail to open non-existent index");
641    }
642
643    #[test]
644    fn test_index_and_search_basic() {
645        let temp_dir = TempDir::new().expect("Failed to create temp dir");
646        let index_path = temp_dir.path().join("test_index");
647
648        // Create index and add blocks
649        let index = SearchIndex::create(&index_path).expect("Should create index");
650        let blocks = create_test_blocks();
651
652        index
653            .index_blocks("test", &blocks)
654            .expect("Should index blocks");
655
656        // Search for content
657        let hits = index
658            .search("useState", Some("test"), 10)
659            .expect("Should search");
660
661        assert!(!hits.is_empty(), "Should find results for useState");
662        assert!(
663            hits[0].snippet.contains("useState"),
664            "Result should contain useState"
665        );
666        assert_eq!(hits[0].source, "test");
667        assert_eq!(hits[0].file, "llms.txt");
668    }
669
670    #[test]
671    fn test_search_limit() {
672        let temp_dir = TempDir::new().expect("Failed to create temp dir");
673        let index_path = temp_dir.path().join("test_index");
674
675        let index = SearchIndex::create(&index_path).expect("Should create index");
676        let blocks = create_test_blocks();
677
678        index
679            .index_blocks("test", &blocks)
680            .expect("Should index blocks");
681
682        // Search with limit
683        let hits = index
684            .search("React", Some("test"), 1)
685            .expect("Should search");
686
687        assert!(!hits.is_empty(), "Should find results");
688        assert!(hits.len() <= 1, "Should respect limit");
689    }
690
691    #[test]
692    fn test_search_includes_anchor() {
693        let temp_dir = TempDir::new().expect("Failed to create temp dir");
694        let index_path = temp_dir.path().join("test_index");
695
696        let index = SearchIndex::create(&index_path).expect("Should create index");
697
698        let blocks = vec![HeadingBlock {
699            path: vec!["API".to_string(), "Reference".to_string()],
700            content: "token auth key".to_string(),
701            start_line: 10,
702            end_line: 20,
703        }];
704
705        index
706            .index_blocks("test", &blocks)
707            .expect("Should index blocks");
708
709        let hits = index
710            .search("token", Some("test"), 10)
711            .expect("Should search");
712
713        assert!(!hits.is_empty());
714        assert!(hits[0].anchor.is_some(), "anchor should be present in hits");
715        // Anchor should be derived from the last heading segment
716        let expected = SearchIndex::compute_anchor("Reference");
717        assert_eq!(hits[0].anchor.clone().unwrap(), expected);
718    }
719
720    #[test]
721    fn test_search_no_results() {
722        let temp_dir = TempDir::new().expect("Failed to create temp dir");
723        let index_path = temp_dir.path().join("test_index");
724
725        let index = SearchIndex::create(&index_path).expect("Should create index");
726        let blocks = create_test_blocks();
727
728        index
729            .index_blocks("test", &blocks)
730            .expect("Should index blocks");
731
732        // Search for non-existent term
733        let hits = index
734            .search("nonexistentterm12345", Some("test"), 10)
735            .expect("Should search");
736
737        assert!(
738            hits.is_empty(),
739            "Should find no results for non-existent term"
740        );
741    }
742
743    #[test]
744    fn test_search_performance() {
745        let temp_dir = TempDir::new().expect("Failed to create temp dir");
746        let index_path = temp_dir.path().join("test_index");
747
748        let index = SearchIndex::create(&index_path).expect("Should create index");
749
750        // Create many blocks for performance testing
751        let mut blocks = Vec::new();
752        for i in 0..100 {
753            blocks.push(HeadingBlock {
754                path: vec![format!("Section{}", i)],
755                content: format!("This is content block {i} with various keywords like React, hooks, components, and performance testing."),
756                start_line: i * 10,
757                end_line: i * 10 + 5,
758            });
759        }
760
761        index
762            .index_blocks("perftest", &blocks)
763            .expect("Should index many blocks");
764
765        // Test search performance
766        let start = Instant::now();
767        let hits = index
768            .search("React", Some("perftest"), 50)
769            .expect("Should search");
770        let duration = start.elapsed();
771
772        assert!(!hits.is_empty(), "Should find results");
773        assert!(
774            duration.as_millis() < 100,
775            "Search should be fast (<100ms), took {}ms",
776            duration.as_millis()
777        );
778    }
779
780    #[test]
781    fn test_search_scoring() {
782        let temp_dir = TempDir::new().expect("Failed to create temp dir");
783        let index_path = temp_dir.path().join("test_index");
784
785        let index = SearchIndex::create(&index_path).expect("Should create index");
786
787        let blocks = vec![
788            HeadingBlock {
789                path: vec!["Exact Match".to_string()],
790                content: "React hooks".to_string(),
791                start_line: 1,
792                end_line: 5,
793            },
794            HeadingBlock {
795                path: vec!["Partial Match".to_string()],
796                content: "React components and hooks are useful features".to_string(),
797                start_line: 10,
798                end_line: 15,
799            },
800            HeadingBlock {
801                path: vec!["Distant Match".to_string()],
802                content: "In React, you can use various hooks for different purposes".to_string(),
803                start_line: 20,
804                end_line: 25,
805            },
806        ];
807
808        index
809            .index_blocks("test", &blocks)
810            .expect("Should index blocks");
811
812        let hits = index
813            .search("React hooks", Some("test"), 10)
814            .expect("Should search");
815
816        assert!(!hits.is_empty(), "Should find results");
817
818        // Results should be ordered by relevance (score)
819        for i in 1..hits.len() {
820            assert!(
821                hits[i - 1].score >= hits[i].score,
822                "Results should be ordered by descending score"
823            );
824        }
825
826        // The exact match should have the highest score
827        assert!(
828            hits[0].snippet.contains("React hooks"),
829            "Highest scored result should contain exact match"
830        );
831    }
832
833    #[test]
834    fn test_search_snippet_respects_limits() {
835        let temp_dir = TempDir::new().expect("Failed to create temp dir");
836        let index_path = temp_dir.path().join("test_index");
837
838        let index = SearchIndex::create(&index_path).expect("Should create index");
839
840        let blocks = vec![HeadingBlock {
841            path: vec!["Hooks".to_string()],
842            content: "React provides hooks for state and effect management. Hooks enable composing complex logic from simple primitives. Extensive documentation follows here to ensure the snippet must truncate properly when limits are applied.".to_string(),
843            start_line: 1,
844            end_line: 20,
845        }];
846
847        index
848            .index_blocks("test", &blocks)
849            .expect("Should index blocks");
850
851        let default_hits = index
852            .search("hooks", Some("test"), 5)
853            .expect("Should search with default limit");
854        assert!(!default_hits.is_empty());
855        let default_len = default_hits[0].snippet.chars().count();
856        assert!(
857            default_len <= DEFAULT_SNIPPET_CHAR_LIMIT + 6,
858            "Default snippet should clamp near default limit"
859        );
860
861        let custom_limit = 80;
862        let custom_hits = index
863            .search_with_snippet_limit("hooks", Some("test"), 5, custom_limit)
864            .expect("Should search with custom limit");
865        assert!(!custom_hits.is_empty());
866        let custom_len = custom_hits[0].snippet.chars().count();
867        assert!(
868            custom_len <= clamp_snippet_chars(custom_limit) + 6,
869            "Custom snippet should respect provided limit"
870        );
871
872        // Ensure custom limit produces a shorter snippet than the default when truncation occurs.
873        assert!(custom_len <= default_len);
874    }
875
876    #[test]
877    fn test_heading_path_in_results() {
878        let temp_dir = TempDir::new().expect("Failed to create temp dir");
879        let index_path = temp_dir.path().join("test_index");
880
881        let index = SearchIndex::create(&index_path).expect("Should create index");
882
883        let blocks = vec![HeadingBlock {
884            path: vec![
885                "API".to_string(),
886                "Reference".to_string(),
887                "Hooks".to_string(),
888            ],
889            content: "useState hook documentation".to_string(),
890            start_line: 100,
891            end_line: 120,
892        }];
893
894        index
895            .index_blocks("test", &blocks)
896            .expect("Should index blocks");
897
898        let hits = index
899            .search("useState", Some("test"), 10)
900            .expect("Should search");
901
902        assert!(!hits.is_empty(), "Should find results");
903        assert_eq!(hits[0].heading_path, vec!["API", "Reference", "Hooks"]);
904        assert_eq!(hits[0].file, "llms.txt");
905        // Lines should point to the exact match within the block (first line)
906        assert!(
907            hits[0].lines.starts_with("100-"),
908            "Expected match to start at line 100, got {}",
909            hits[0].lines
910        );
911    }
912
913    #[test]
914    fn test_unicode_snippet_extraction() {
915        let temp_dir = TempDir::new().expect("Failed to create temp dir");
916        let index_path = temp_dir.path().join("test_index");
917        let index = SearchIndex::create(&index_path).expect("Should create index");
918
919        // Test with various Unicode content
920        let unicode_blocks = vec![
921            HeadingBlock {
922                path: vec!["Unicode".to_string(), "Emoji".to_string()],
923                content: "This is a test with emojis: 👋 Hello 🌍 World! 🚀 Let's go! 🎉"
924                    .to_string(),
925                start_line: 1,
926                end_line: 10,
927            },
928            HeadingBlock {
929                path: vec!["Unicode".to_string(), "Chinese".to_string()],
930                content: "这是中文测试。Hello 世界！Programming 编程 is 很有趣。".to_string(),
931                start_line: 20,
932                end_line: 30,
933            },
934            HeadingBlock {
935                path: vec!["Unicode".to_string(), "Mixed".to_string()],
936                content: "日本語 テスト 🇯🇵 with mixed content".to_string(),
937                start_line: 40,
938                end_line: 50,
939            },
940        ];
941
942        index
943            .index_blocks("unicode_test", &unicode_blocks)
944            .expect("Should index blocks");
945
946        // Test searching for various Unicode content
947        let test_cases = vec![("emoji", "👋"), ("中文", "测试"), ("programming", "编程")];
948
949        for (query, _expected_content) in test_cases {
950            let results = index
951                .search(query, Some("unicode_test"), 10)
952                .unwrap_or_else(|_| panic!("Should search for '{query}'"));
953
954            if !results.is_empty() {
955                let hit = &results[0];
956                // Verify snippet doesn't panic on Unicode boundaries
957                assert!(hit.snippet.is_char_boundary(0));
958                assert!(hit.snippet.is_char_boundary(hit.snippet.len()));
959
960                // Verify we can iterate over chars without panic
961                let _char_count = hit.snippet.chars().count();
962            }
963        }
964    }
965
966    #[test]
967    fn test_edge_case_unicode_truncation() {
968        let temp_dir = TempDir::new().expect("Failed to create temp dir");
969        let index_path = temp_dir.path().join("test_index");
970        let index = SearchIndex::create(&index_path).expect("Should create index");
971
972        // Create content where truncation would happen in middle of multi-byte chars
973        let mut long_content = String::new();
974        for _ in 0..20 {
975            long_content.push_str("👨‍👩‍👧‍👦"); // Family emoji (complex grapheme cluster)
976        }
977        long_content.push_str(" MARKER ");
978        for _ in 0..20 {
979            long_content.push_str("🏳️‍🌈"); // Rainbow flag (another complex emoji)
980        }
981
982        let blocks = vec![HeadingBlock {
983            path: vec!["Test".to_string()],
984            content: long_content.clone(),
985            start_line: 1,
986            end_line: 10,
987        }];
988
989        index
990            .index_blocks("edge_test", &blocks)
991            .expect("Should index blocks");
992
993        let results = index
994            .search("MARKER", Some("edge_test"), 10)
995            .expect("Should search");
996
997        assert!(!results.is_empty());
998        let snippet = &results[0].snippet;
999
1000        // Verify the snippet is valid UTF-8 and doesn't panic
1001        assert!(snippet.is_char_boundary(0));
1002        assert!(snippet.is_char_boundary(snippet.len()));
1003        assert!(snippet.contains("MARKER"));
1004
1005        // Verify we can iterate over chars without panic
1006        let char_count = snippet.chars().count();
1007        assert!(char_count > 0);
1008    }
1009}
blz_core/index.rs

blz_core/
index.rs