blz_core/
index.rs

1use crate::profiling::{ComponentTimings, OperationTimer, PerformanceMetrics};
2use crate::types::normalize_flavor_filters;
3use crate::{Error, HeadingBlock, Result, SearchHit};
4use base64::{Engine, engine::general_purpose::STANDARD as B64};
5use sha2::{Digest, Sha256};
6use std::path::Path;
7use tantivy::collector::TopDocs;
8use tantivy::query::QueryParser;
9use tantivy::schema::{Field, STORED, STRING, Schema, TEXT, Value};
10use tantivy::{Index, IndexReader, doc};
11use tracing::{Level, debug, info};
12
/// Tantivy-based search index for llms.txt documentation
///
/// Bundles the Tantivy index, a reader used for searching, and handles to the
/// schema fields that `index_blocks` writes and `search` reads back.
pub struct SearchIndex {
    /// Underlying Tantivy index; writers and query parsers are created from it.
    index: Index,
    /// Schema the index was created or opened with. Stored but not read by this type.
    #[allow(dead_code)]
    schema: Schema,
    /// `content` field: the block's body text; one of the two default query fields.
    content_field: Field,
    /// `path` field: the source file path passed to `index_blocks`.
    path_field: Field,
    /// `heading_path` field: heading segments joined with `" > "`; the other default query field.
    heading_path_field: Field,
    /// `lines` field: the block's line range formatted as `"start-end"`.
    lines_field: Field,
    /// `alias` field: the source alias the block was indexed under.
    alias_field: Field,
    /// `flavor` field; `None` when an opened index's schema does not define it.
    flavor_field: Option<Field>,
    /// `alias_flavor` field (`"{alias}::{flavor}"`); `None` when an opened index's schema does not define it.
    alias_flavor_field: Option<Field>,
    /// `anchor` field; `None` when an opened index's schema does not define it.
    anchor_field: Option<Field>,
    /// Reader from which searchers are obtained; reloaded after each commit in `index_blocks`.
    reader: IndexReader,
    /// Optional metrics collector handed to operation timers; `None` unless `with_metrics` was called.
    metrics: Option<PerformanceMetrics>,
}
29
30impl SearchIndex {
31    /// Enable performance metrics collection
32    #[must_use]
33    pub fn with_metrics(mut self, metrics: PerformanceMetrics) -> Self {
34        self.metrics = Some(metrics);
35        self
36    }
37
    /// Returns the attached performance metrics collector, if any.
    ///
    /// This is `None` until `with_metrics` has been called: both `create` and
    /// `open` initialise the field to `None`.
    #[must_use]
    pub const fn metrics(&self) -> Option<&PerformanceMetrics> {
        self.metrics.as_ref()
    }
43    /// Creates a new search index at the specified path
44    pub fn create(index_path: &Path) -> Result<Self> {
45        let mut schema_builder = Schema::builder();
46
47        let content_field = schema_builder.add_text_field("content", TEXT | STORED);
48        let path_field = schema_builder.add_text_field("path", STRING | STORED);
49        let heading_path_field = schema_builder.add_text_field("heading_path", TEXT | STORED);
50        let lines_field = schema_builder.add_text_field("lines", STRING | STORED);
51        let alias_field = schema_builder.add_text_field("alias", STRING | STORED);
52        let flavor_field = schema_builder.add_text_field("flavor", STRING | STORED);
53        let alias_flavor_field = schema_builder.add_text_field("alias_flavor", STRING | STORED);
54        let anchor_field = schema_builder.add_text_field("anchor", STRING | STORED);
55
56        let schema = schema_builder.build();
57
58        std::fs::create_dir_all(index_path)
59            .map_err(|e| Error::Index(format!("Failed to create index directory: {e}")))?;
60
61        let index = Index::create_in_dir(index_path, schema.clone())
62            .map_err(|e| Error::Index(format!("Failed to create index: {e}")))?;
63
64        let reader = index
65            .reader_builder()
66            .reload_policy(tantivy::ReloadPolicy::OnCommitWithDelay)
67            .try_into()
68            .map_err(|e| Error::Index(format!("Failed to create reader: {e}")))?;
69
70        Ok(Self {
71            index,
72            schema,
73            content_field,
74            path_field,
75            heading_path_field,
76            lines_field,
77            alias_field,
78            flavor_field: Some(flavor_field),
79            alias_flavor_field: Some(alias_flavor_field),
80            reader,
81            anchor_field: Some(anchor_field),
82            metrics: None,
83        })
84    }
85
86    /// Creates a new search index or opens an existing one at the specified path
87    pub fn create_or_open(index_path: &Path) -> Result<Self> {
88        if index_path.exists() {
89            Self::open(index_path)
90        } else {
91            Self::create(index_path)
92        }
93    }
94
95    /// Opens an existing search index at the specified path
96    pub fn open(index_path: &Path) -> Result<Self> {
97        let index = Index::open_in_dir(index_path)
98            .map_err(|e| Error::Index(format!("Failed to open index: {e}")))?;
99
100        let schema = index.schema();
101
102        let content_field = schema
103            .get_field("content")
104            .map_err(|_| Error::Index("Missing content field".into()))?;
105        let path_field = schema
106            .get_field("path")
107            .map_err(|_| Error::Index("Missing path field".into()))?;
108        let heading_path_field = schema
109            .get_field("heading_path")
110            .map_err(|_| Error::Index("Missing heading_path field".into()))?;
111        let lines_field = schema
112            .get_field("lines")
113            .map_err(|_| Error::Index("Missing lines field".into()))?;
114        let alias_field = schema
115            .get_field("alias")
116            .map_err(|_| Error::Index("Missing alias field".into()))?;
117
118        let flavor_field = schema.get_field("flavor").ok();
119        let alias_flavor_field = schema.get_field("alias_flavor").ok();
120
121        // Anchor is optional for backward compatibility with older indexes
122        let anchor_field = schema.get_field("anchor").ok();
123
124        let reader = index
125            .reader_builder()
126            .reload_policy(tantivy::ReloadPolicy::OnCommitWithDelay)
127            .try_into()
128            .map_err(|e| Error::Index(format!("Failed to create reader: {e}")))?;
129
130        Ok(Self {
131            index,
132            schema,
133            content_field,
134            path_field,
135            heading_path_field,
136            lines_field,
137            alias_field,
138            flavor_field,
139            alias_flavor_field,
140            reader,
141            anchor_field,
142            metrics: None,
143        })
144    }
145
146    /// Indexes a collection of heading blocks for a given alias
147    pub fn index_blocks(
148        &self,
149        alias: &str,
150        file_path: &str,
151        blocks: &[HeadingBlock],
152        flavor: &str,
153    ) -> Result<()> {
154        let timer = self.metrics.as_ref().map_or_else(
155            || OperationTimer::new(&format!("index_{alias}")),
156            |metrics| OperationTimer::with_metrics(&format!("index_{alias}"), metrics.clone()),
157        );
158
159        let mut timings = ComponentTimings::new();
160
161        let mut writer = timings.time("writer_creation", || {
162            self.index
163                .writer(50_000_000)
164                .map_err(|e| Error::Index(format!("Failed to create writer: {e}")))
165        })?;
166
167        let alias_flavor_value = format!("{alias}::{flavor}");
168
169        let _deleted = timings.time("delete_existing", || {
170            self.alias_flavor_field.map_or_else(
171                || writer.delete_term(tantivy::Term::from_field_text(self.alias_field, alias)),
172                |field| {
173                    writer.delete_term(tantivy::Term::from_field_text(field, &alias_flavor_value))
174                },
175            )
176        });
177
178        let mut total_content_bytes = 0usize;
179
180        timings.time("document_creation", || {
181            for block in blocks {
182                total_content_bytes += block.content.len();
183                let heading_path_str = block.path.join(" > ");
184                let lines_str = format!("{}-{}", block.start_line, block.end_line);
185                // Compute anchor from last heading text
186                let anchor = block.path.last().map(|h| Self::compute_anchor(h));
187
188                let mut doc = doc!(
189                    self.content_field => block.content.as_str(),  // Use &str instead of clone
190                    self.path_field => file_path,
191                    self.heading_path_field => heading_path_str,
192                    self.lines_field => lines_str,
193                    self.alias_field => alias
194                );
195                if let Some(field) = self.flavor_field {
196                    doc.add_text(field, flavor);
197                }
198                if let Some(field) = self.alias_flavor_field {
199                    doc.add_text(field, &alias_flavor_value);
200                }
201                if let (Some(f), Some(a)) = (self.anchor_field, anchor) {
202                    doc.add_text(f, a);
203                }
204
205                writer
206                    .add_document(doc)
207                    .map_err(|e| Error::Index(format!("Failed to add document: {e}")))?;
208            }
209            Ok::<(), Error>(())
210        })?;
211
212        timings.time("commit", || {
213            writer
214                .commit()
215                .map_err(|e| Error::Index(format!("Failed to commit: {e}")))
216        })?;
217
218        timings.time("reader_reload", || {
219            self.reader
220                .reload()
221                .map_err(|e| Error::Index(format!("Failed to reload reader: {e}")))
222        })?;
223
224        let duration = timer.finish_index(total_content_bytes);
225
226        // Print detailed breakdown if debug logging is enabled
227        if tracing::enabled!(Level::DEBUG) {
228            timings.print_breakdown();
229        }
230
231        info!(
232            "Indexed {} blocks ({} bytes) for {} in {:.2}ms",
233            blocks.len(),
234            total_content_bytes,
235            alias,
236            duration.as_millis()
237        );
238
239        Ok(())
240    }
241
242    /// Searches the index with optional alias filtering
243    #[allow(clippy::too_many_lines)] // Complex search logic requires detailed implementation
244    pub fn search(
245        &self,
246        query_str: &str,
247        alias: Option<&str>,
248        flavor: Option<&str>,
249        limit: usize,
250    ) -> Result<Vec<SearchHit>> {
251        let timer = self.metrics.as_ref().map_or_else(
252            || OperationTimer::new(&format!("search_{query_str}")),
253            |metrics| OperationTimer::with_metrics(&format!("search_{query_str}"), metrics.clone()),
254        );
255
256        let mut timings = ComponentTimings::new();
257        let mut lines_searched = 0usize;
258
259        let searcher = timings.time("searcher_creation", || self.reader.searcher());
260
261        let query_parser = timings.time("query_parser_creation", || {
262            QueryParser::for_index(
263                &self.index,
264                vec![self.content_field, self.heading_path_field],
265            )
266        });
267
268        // Sanitize query more efficiently with a single allocation
269        let needs_escaping = query_str.chars().any(|c| {
270            matches!(
271                c,
272                '\\' | '"' | '(' | ')' | '[' | ']' | '{' | '}' | '^' | '~'
273            )
274        });
275
276        let mut filter_clauses = Vec::new();
277        if let Some(alias) = alias {
278            filter_clauses.push(format!("alias:{alias}"));
279        }
280
281        let normalized_flavors = flavor.and_then(|raw| {
282            let normalized = normalize_flavor_filters(raw);
283            if normalized.is_empty() {
284                if !raw.trim().is_empty() {
285                    tracing::debug!(
286                        filter = raw,
287                        "Ignoring flavor filter with no recognized values"
288                    );
289                }
290                None
291            } else {
292                Some(normalized)
293            }
294        });
295
296        match (self.flavor_field, normalized_flavors) {
297            (Some(_), Some(values)) => {
298                if values.len() == 1 {
299                    filter_clauses.push(format!("flavor:{}", values[0]));
300                } else {
301                    let clause = values
302                        .iter()
303                        .map(|value| format!("flavor:{value}"))
304                        .collect::<Vec<_>>()
305                        .join(" OR ");
306                    filter_clauses.push(format!("({clause})"));
307                }
308            },
309            (None, Some(values)) => {
310                tracing::warn!(
311                    filters = %values.join(","),
312                    "Flavor filtering requested but index schema has no flavor field; ignoring"
313                );
314            },
315            _ => {},
316        }
317
318        let sanitized_query = if needs_escaping {
319            // Only allocate if we need to escape characters
320            let mut sanitized = String::with_capacity(query_str.len() * 2);
321
322            for ch in query_str.chars() {
323                match ch {
324                    '\\' => sanitized.push_str("\\\\"),
325                    '"' => sanitized.push_str("\\\""),
326                    '(' => sanitized.push_str("\\("),
327                    ')' => sanitized.push_str("\\)"),
328                    '[' => sanitized.push_str("\\["),
329                    ']' => sanitized.push_str("\\]"),
330                    '{' => sanitized.push_str("\\{"),
331                    '}' => sanitized.push_str("\\}"),
332                    '^' => sanitized.push_str("\\^"),
333                    '~' => sanitized.push_str("\\~"),
334                    _ => sanitized.push(ch),
335                }
336            }
337
338            sanitized
339        } else {
340            query_str.to_string()
341        };
342
343        let full_query_str = if filter_clauses.is_empty() {
344            sanitized_query
345        } else {
346            format!("{} AND ({sanitized_query})", filter_clauses.join(" AND "))
347        };
348
349        let query = timings.time("query_parsing", || {
350            query_parser
351                .parse_query(&full_query_str)
352                .map_err(|e| Error::Index(format!("Failed to parse query: {e}")))
353        })?;
354
355        let top_docs = timings.time("tantivy_search", || {
356            searcher
357                .search(&query, &TopDocs::with_limit(limit))
358                .map_err(|e| Error::Index(format!("Search failed: {e}")))
359        })?;
360
361        let mut hits = Vec::new();
362
363        timings.time("result_processing", || {
364            for (score, doc_address) in top_docs {
365                let doc = searcher
366                    .doc(doc_address)
367                    .map_err(|e| Error::Index(format!("Failed to retrieve doc: {e}")))?;
368
369                let alias = Self::get_field_text(&doc, self.alias_field)?;
370                let file = Self::get_field_text(&doc, self.path_field)?;
371                let heading_path_str = Self::get_field_text(&doc, self.heading_path_field)?;
372                let lines = Self::get_field_text(&doc, self.lines_field)?;
373                let content = Self::get_field_text(&doc, self.content_field)?;
374                let anchor = self.anchor_field.and_then(|f| {
375                    doc.get_first(f)
376                        .and_then(|v| v.as_str())
377                        .map(std::string::ToString::to_string)
378                });
379                let flavor_value = self.flavor_field.and_then(|f| {
380                    doc.get_first(f)
381                        .and_then(|v| v.as_str())
382                        .map(std::string::ToString::to_string)
383                });
384
385                // Count lines for metrics
386                lines_searched += content.lines().count();
387
388                let heading_path: Vec<String> = heading_path_str
389                    .split(" > ")
390                    .map(std::string::ToString::to_string)
391                    .collect();
392
393                let snippet = Self::extract_snippet(&content, query_str, 100);
394
395                // Prefer exact match line(s) when possible for better citations
396                let exact_lines = Self::compute_match_lines(&content, query_str, &lines)
397                    .unwrap_or_else(|| lines.clone());
398
399                // Parse numeric line range for convenience
400                let line_numbers = Self::parse_lines_range(&exact_lines);
401
402                hits.push(SearchHit {
403                    alias: alias.clone(),
404                    source: alias,
405                    file,
406                    heading_path,
407                    lines: exact_lines,
408                    line_numbers,
409                    snippet,
410                    score,
411                    source_url: None,
412                    checksum: String::new(),
413                    anchor,
414                    flavor: flavor_value,
415                });
416            }
417            Ok::<(), Error>(())
418        })?;
419
420        let duration = timer.finish_search(lines_searched);
421
422        // Print detailed breakdown if debug logging is enabled
423        if tracing::enabled!(Level::DEBUG) {
424            timings.print_breakdown();
425        }
426
427        debug!(
428            "Found {} hits for query '{}' in {:.2}ms (searched {} lines)",
429            hits.len(),
430            query_str,
431            duration.as_millis(),
432            lines_searched
433        );
434
435        Ok(hits)
436    }
437
438    fn compute_anchor(heading_text: &str) -> String {
439        let mut hasher = Sha256::new();
440        hasher.update(heading_text.trim().to_lowercase().as_bytes());
441        let digest = hasher.finalize();
442        let full = B64.encode(digest);
443        full[..22.min(full.len())].to_string()
444    }
445
446    fn get_field_text(doc: &tantivy::TantivyDocument, field: Field) -> Result<String> {
447        doc.get_first(field)
448            .and_then(|v| v.as_str())
449            .map(std::string::ToString::to_string)
450            .ok_or_else(|| Error::Index("Field not found in document".into()))
451    }
452
453    /// Compute exact match line(s) within a block's content relative to its stored line range.
454    /// Returns a "start-end" string (typically a single line) falling back to the original range on failure.
455    fn compute_match_lines(content: &str, query: &str, block_lines: &str) -> Option<String> {
456        // Parse the block's starting line
457        let block_start: usize = block_lines
458            .split(['-', ':'])
459            .next()
460            .and_then(|s| s.trim().parse::<usize>().ok())?;
461
462        // Tokenize query naively by whitespace; try to find the earliest occurrence
463        let mut best_pos: Option<usize> = None;
464        for token in query.split_whitespace() {
465            if token.is_empty() {
466                continue;
467            }
468            if let Some(pos) = content.find(token) {
469                match best_pos {
470                    Some(cur) if pos < cur => best_pos = Some(pos),
471                    None => best_pos = Some(pos),
472                    _ => {},
473                }
474            }
475        }
476
477        let pos = best_pos?;
478        // Count newlines before position to get 0-based line offset
479        let local_line = content[..pos].bytes().filter(|&b| b == b'\n').count();
480        let abs_line = block_start.saturating_add(local_line);
481        Some(format!("{abs_line}-{abs_line}"))
482    }
483
484    /// Parse a `"start-end"` or `"start:end"` range into a two-element vector.
485    /// Returns None if parsing fails or inputs are invalid.
486    fn parse_lines_range(range: &str) -> Option<Vec<usize>> {
487        let mut parts = range.split(['-', ':']);
488        let start = parts.next()?.trim().parse::<usize>().ok()?;
489        let end = parts.next()?.trim().parse::<usize>().ok()?;
490        Some(vec![start, end])
491    }
492
493    fn extract_snippet(content: &str, query: &str, max_len: usize) -> String {
494        let query_lower = query.to_lowercase();
495
496        // Find match position using character indices to handle Unicode correctly
497        let mut match_char_pos = None;
498
499        // Use a sliding window approach with character iteration
500        let content_chars: Vec<char> = content.chars().collect();
501        let query_chars: Vec<char> = query_lower.chars().collect();
502
503        if !query_chars.is_empty() {
504            for window_start in 0..content_chars.len() {
505                let window_end = (window_start + query_chars.len()).min(content_chars.len());
506                if window_end - window_start < query_chars.len() {
507                    break;
508                }
509
510                // Check if this window matches (case-insensitive)
511                let window_matches = content_chars[window_start..window_end]
512                    .iter()
513                    .zip(query_chars.iter())
514                    .all(|(c1, c2)| c1.to_lowercase().eq(c2.to_lowercase()));
515
516                if window_matches {
517                    match_char_pos = Some(window_start);
518                    break;
519                }
520            }
521        }
522
523        if let Some(char_pos) = match_char_pos {
524            // Derive context from max_len so we don't overshoot the requested length.
525            let total_chars = content_chars.len();
526            let qlen = query_chars.len();
527            let ctx_each_side = max_len.saturating_sub(qlen) / 2;
528
529            let start_char = char_pos.saturating_sub(ctx_each_side);
530            let mut end_char = (char_pos + qlen + ctx_each_side).min(total_chars);
531
532            // Clamp to at most max_len characters around the match.
533            let span = end_char.saturating_sub(start_char);
534            if span > max_len {
535                end_char = start_char + max_len;
536            }
537
538            let left_trunc = start_char > 0;
539            let right_trunc = end_char < total_chars;
540
541            // Build snippet
542            let mut snippet = String::with_capacity((end_char - start_char) * 4 + 6);
543            if left_trunc {
544                snippet.push_str("...");
545            }
546            for &ch in content_chars.iter().take(end_char).skip(start_char) {
547                snippet.push(ch);
548            }
549            if right_trunc {
550                snippet.push_str("...");
551            }
552            return snippet;
553        }
554
555        // No match found - return truncated content using character count
556        let content_chars: Vec<char> = content.chars().collect();
557        if content_chars.len() <= max_len {
558            content.to_string()
559        } else {
560            // Truncate based on character count, not byte count
561            let mut result = String::with_capacity(max_len * 4 + 3);
562            for (i, ch) in content_chars.iter().enumerate() {
563                if i >= max_len {
564                    break;
565                }
566                result.push(*ch);
567            }
568            result.push_str("...");
569            result
570        }
571    }
572}
573
574#[cfg(test)]
575mod tests {
576    #![allow(clippy::panic)]
577    #![allow(clippy::disallowed_macros)]
578    #![allow(clippy::unwrap_used)]
579    use super::*;
580    use crate::HeadingBlock;
581    use std::time::Instant;
582    use tempfile::TempDir;
583
584    fn create_test_blocks() -> Vec<HeadingBlock> {
585        vec![
586            HeadingBlock {
587                path: vec!["React".to_string(), "Hooks".to_string()],
588                content: "useState is a React hook that lets you add state to functional components. It returns an array with the current state value and a function to update it.".to_string(),
589                start_line: 100,
590                end_line: 120,
591            },
592            HeadingBlock {
593                path: vec!["React".to_string(), "Components".to_string()],
594                content: "Components are the building blocks of React applications. They can be function components or class components.".to_string(),
595                start_line: 50,
596                end_line: 75,
597            },
598            HeadingBlock {
599                path: vec!["Next.js".to_string(), "Routing".to_string()],
600                content: "App Router is the new routing system in Next.js 13+. It provides better performance and developer experience.".to_string(),
601                start_line: 200,
602                end_line: 250,
603            },
604        ]
605    }
606
607    #[test]
608    fn test_index_creation() {
609        let temp_dir = TempDir::new().expect("Failed to create temp dir");
610        let index_path = temp_dir.path().join("test_index");
611
612        let result = SearchIndex::create(&index_path);
613        assert!(result.is_ok(), "Should create index successfully");
614
615        // Verify index directory was created
616        assert!(index_path.exists());
617    }
618
619    #[test]
620    fn test_index_open_nonexistent() {
621        let temp_dir = TempDir::new().expect("Failed to create temp dir");
622        let index_path = temp_dir.path().join("nonexistent");
623
624        let result = SearchIndex::open(&index_path);
625        assert!(result.is_err(), "Should fail to open non-existent index");
626    }
627
628    #[test]
629    fn test_index_and_search_basic() {
630        let temp_dir = TempDir::new().expect("Failed to create temp dir");
631        let index_path = temp_dir.path().join("test_index");
632
633        // Create index and add blocks
634        let index = SearchIndex::create(&index_path).expect("Should create index");
635        let blocks = create_test_blocks();
636
637        index
638            .index_blocks("test", "test.md", &blocks, "llms")
639            .expect("Should index blocks");
640
641        // Search for content
642        let hits = index
643            .search("useState", Some("test"), None, 10)
644            .expect("Should search");
645
646        assert!(!hits.is_empty(), "Should find results for useState");
647        assert!(
648            hits[0].snippet.contains("useState"),
649            "Result should contain useState"
650        );
651        assert_eq!(hits[0].alias, "test");
652        assert_eq!(hits[0].file, "test.md");
653    }
654
655    #[test]
656    fn test_search_limit() {
657        let temp_dir = TempDir::new().expect("Failed to create temp dir");
658        let index_path = temp_dir.path().join("test_index");
659
660        let index = SearchIndex::create(&index_path).expect("Should create index");
661        let blocks = create_test_blocks();
662
663        index
664            .index_blocks("test", "test.md", &blocks, "llms")
665            .expect("Should index blocks");
666
667        // Search with limit
668        let hits = index
669            .search("React", Some("test"), None, 1)
670            .expect("Should search");
671
672        assert!(!hits.is_empty(), "Should find results");
673        assert!(hits.len() <= 1, "Should respect limit");
674    }
675
676    #[test]
677    fn test_search_includes_anchor() {
678        let temp_dir = TempDir::new().expect("Failed to create temp dir");
679        let index_path = temp_dir.path().join("test_index");
680
681        let index = SearchIndex::create(&index_path).expect("Should create index");
682
683        let blocks = vec![HeadingBlock {
684            path: vec!["API".to_string(), "Reference".to_string()],
685            content: "token auth key".to_string(),
686            start_line: 10,
687            end_line: 20,
688        }];
689
690        index
691            .index_blocks("test", "api.md", &blocks, "llms")
692            .expect("Should index blocks");
693
694        let hits = index
695            .search("token", Some("test"), None, 10)
696            .expect("Should search");
697
698        assert!(!hits.is_empty());
699        assert!(hits[0].anchor.is_some(), "anchor should be present in hits");
700        // Anchor should be derived from the last heading segment
701        let expected = SearchIndex::compute_anchor("Reference");
702        assert_eq!(hits[0].anchor.clone().unwrap(), expected);
703    }
704
705    #[test]
706    fn test_search_filters_by_flavor() {
707        let temp_dir = TempDir::new().expect("Failed to create temp dir");
708        let index_path = temp_dir.path().join("test_index");
709
710        let index = SearchIndex::create(&index_path).expect("Should create index");
711
712        let llms_blocks = vec![HeadingBlock {
713            path: vec!["Docs".to_string()],
714            content: "base flavor content".to_string(),
715            start_line: 1,
716            end_line: 5,
717        }];
718
719        let full_blocks = vec![HeadingBlock {
720            path: vec!["Docs".to_string(), "Full".to_string()],
721            content: "full flavor content".to_string(),
722            start_line: 6,
723            end_line: 10,
724        }];
725
726        index
727            .index_blocks("alias", "llms.txt", &llms_blocks, "llms")
728            .expect("Should index base flavor");
729        index
730            .index_blocks("alias", "llms-full.txt", &full_blocks, "llms-full")
731            .expect("Should index full flavor");
732
733        let all_hits = index
734            .search("flavor", Some("alias"), None, 10)
735            .expect("Should search without flavor filter");
736        assert_eq!(all_hits.len(), 2);
737
738        let full_hits = index
739            .search("flavor", Some("alias"), Some("llms-full"), 10)
740            .expect("Should filter by llms-full");
741        assert_eq!(full_hits.len(), 1);
742        assert_eq!(full_hits[0].flavor.as_deref(), Some("llms-full"));
743        assert_eq!(full_hits[0].file, "llms-full.txt");
744
745        let base_hits = index
746            .search("flavor", Some("alias"), Some("llms"), 10)
747            .expect("Should filter by llms");
748        assert_eq!(base_hits.len(), 1);
749        assert_eq!(base_hits[0].flavor.as_deref(), Some("llms"));
750        assert_eq!(base_hits[0].file, "llms.txt");
751
752        let missing_hits = index
753            .search("flavor", Some("alias"), Some("nonexistent-flavor"), 10)
754            .expect("Should handle non-existent flavor");
755        assert_eq!(missing_hits.len(), 2);
756    }
757
758    #[test]
759    fn test_search_mixed_flavor_filters() {
760        let temp_dir = TempDir::new().expect("Failed to create temp dir");
761        let index_path = temp_dir.path().join("test_index_mixed");
762
763        let index = SearchIndex::create(&index_path).expect("Should create index");
764
765        let base_blocks = vec![HeadingBlock {
766            path: vec!["Docs".to_string()],
767            content: "base flavor content".to_string(),
768            start_line: 1,
769            end_line: 5,
770        }];
771
772        let full_blocks = vec![HeadingBlock {
773            path: vec!["Docs".to_string(), "Full".to_string()],
774            content: "full flavor content".to_string(),
775            start_line: 6,
776            end_line: 10,
777        }];
778
779        index
780            .index_blocks("alias", "llms.txt", &base_blocks, "llms")
781            .expect("Should index base flavor");
782        index
783            .index_blocks("alias", "llms-full.txt", &full_blocks, "llms-full")
784            .expect("Should index full flavor");
785
786        let mixed_full_hits = index
787            .search("flavor", Some("alias"), Some("llms-full,unknown"), 10)
788            .expect("Should ignore unknown flavor token");
789        assert_eq!(mixed_full_hits.len(), 1);
790        assert_eq!(mixed_full_hits[0].flavor.as_deref(), Some("llms-full"));
791
792        let mixed_base_hits = index
793            .search("flavor", Some("alias"), Some("unknown|llms"), 10)
794            .expect("Should ignore unknown flavor token and return base hits");
795        assert_eq!(mixed_base_hits.len(), 1);
796        assert_eq!(mixed_base_hits[0].flavor.as_deref(), Some("llms"));
797
798        let ignored_hits = index
799            .search("flavor", Some("alias"), Some("unknown"), 10)
800            .expect("Should ignore unknown-only filters");
801        assert_eq!(ignored_hits.len(), 2);
802    }
803
804    #[test]
805    fn test_search_no_results() {
806        let temp_dir = TempDir::new().expect("Failed to create temp dir");
807        let index_path = temp_dir.path().join("test_index");
808
809        let index = SearchIndex::create(&index_path).expect("Should create index");
810        let blocks = create_test_blocks();
811
812        index
813            .index_blocks("test", "test.md", &blocks, "llms")
814            .expect("Should index blocks");
815
816        // Search for non-existent term
817        let hits = index
818            .search("nonexistentterm12345", Some("test"), None, 10)
819            .expect("Should search");
820
821        assert!(
822            hits.is_empty(),
823            "Should find no results for non-existent term"
824        );
825    }
826
827    #[test]
828    fn test_search_performance() {
829        let temp_dir = TempDir::new().expect("Failed to create temp dir");
830        let index_path = temp_dir.path().join("test_index");
831
832        let index = SearchIndex::create(&index_path).expect("Should create index");
833
834        // Create many blocks for performance testing
835        let mut blocks = Vec::new();
836        for i in 0..100 {
837            blocks.push(HeadingBlock {
838                path: vec![format!("Section{}", i)],
839                content: format!("This is content block {i} with various keywords like React, hooks, components, and performance testing."),
840                start_line: i * 10,
841                end_line: i * 10 + 5,
842            });
843        }
844
845        index
846            .index_blocks("perftest", "large.md", &blocks, "llms")
847            .expect("Should index many blocks");
848
849        // Test search performance
850        let start = Instant::now();
851        let hits = index
852            .search("React", Some("perftest"), None, 50)
853            .expect("Should search");
854        let duration = start.elapsed();
855
856        assert!(!hits.is_empty(), "Should find results");
857        assert!(
858            duration.as_millis() < 100,
859            "Search should be fast (<100ms), took {}ms",
860            duration.as_millis()
861        );
862    }
863
864    #[test]
865    fn test_search_scoring() {
866        let temp_dir = TempDir::new().expect("Failed to create temp dir");
867        let index_path = temp_dir.path().join("test_index");
868
869        let index = SearchIndex::create(&index_path).expect("Should create index");
870
871        let blocks = vec![
872            HeadingBlock {
873                path: vec!["Exact Match".to_string()],
874                content: "React hooks".to_string(),
875                start_line: 1,
876                end_line: 5,
877            },
878            HeadingBlock {
879                path: vec!["Partial Match".to_string()],
880                content: "React components and hooks are useful features".to_string(),
881                start_line: 10,
882                end_line: 15,
883            },
884            HeadingBlock {
885                path: vec!["Distant Match".to_string()],
886                content: "In React, you can use various hooks for different purposes".to_string(),
887                start_line: 20,
888                end_line: 25,
889            },
890        ];
891
892        index
893            .index_blocks("test", "test.md", &blocks, "llms")
894            .expect("Should index blocks");
895
896        let hits = index
897            .search("React hooks", Some("test"), None, 10)
898            .expect("Should search");
899
900        assert!(!hits.is_empty(), "Should find results");
901
902        // Results should be ordered by relevance (score)
903        for i in 1..hits.len() {
904            assert!(
905                hits[i - 1].score >= hits[i].score,
906                "Results should be ordered by descending score"
907            );
908        }
909
910        // The exact match should have the highest score
911        assert!(
912            hits[0].snippet.contains("React hooks"),
913            "Highest scored result should contain exact match"
914        );
915    }
916
917    #[test]
918    fn test_heading_path_in_results() {
919        let temp_dir = TempDir::new().expect("Failed to create temp dir");
920        let index_path = temp_dir.path().join("test_index");
921
922        let index = SearchIndex::create(&index_path).expect("Should create index");
923
924        let blocks = vec![HeadingBlock {
925            path: vec![
926                "API".to_string(),
927                "Reference".to_string(),
928                "Hooks".to_string(),
929            ],
930            content: "useState hook documentation".to_string(),
931            start_line: 100,
932            end_line: 120,
933        }];
934
935        index
936            .index_blocks("test", "api.md", &blocks, "llms")
937            .expect("Should index blocks");
938
939        let hits = index
940            .search("useState", Some("test"), None, 10)
941            .expect("Should search");
942
943        assert!(!hits.is_empty(), "Should find results");
944        assert_eq!(hits[0].heading_path, vec!["API", "Reference", "Hooks"]);
945        assert_eq!(hits[0].file, "api.md");
946        // Lines should point to the exact match within the block (first line)
947        assert!(
948            hits[0].lines.starts_with("100-"),
949            "Expected match to start at line 100, got {}",
950            hits[0].lines
951        );
952    }
953
954    #[test]
955    fn test_unicode_snippet_extraction() {
956        let temp_dir = TempDir::new().expect("Failed to create temp dir");
957        let index_path = temp_dir.path().join("test_index");
958        let index = SearchIndex::create(&index_path).expect("Should create index");
959
960        // Test with various Unicode content
961        let unicode_blocks = vec![
962            HeadingBlock {
963                path: vec!["Unicode".to_string(), "Emoji".to_string()],
964                content: "This is a test with emojis: πŸ‘‹ Hello 🌍 World! πŸš€ Let's go! πŸŽ‰"
965                    .to_string(),
966                start_line: 1,
967                end_line: 10,
968            },
969            HeadingBlock {
970                path: vec!["Unicode".to_string(), "Chinese".to_string()],
971                content: "θΏ™ζ˜―δΈ­ζ–‡ζ΅‹θ―•γ€‚Hello δΈ–η•ŒοΌProgramming 编程 is εΎˆζœ‰θΆ£γ€‚".to_string(),
972                start_line: 20,
973                end_line: 30,
974            },
975            HeadingBlock {
976                path: vec!["Unicode".to_string(), "Mixed".to_string()],
977                content: "ζ—₯本θͺž γƒ†γ‚Ήγƒˆ πŸ‡―πŸ‡΅ with mixed content".to_string(),
978                start_line: 40,
979                end_line: 50,
980            },
981        ];
982
983        index
984            .index_blocks("unicode_test", "test.md", &unicode_blocks, "llms")
985            .expect("Should index blocks");
986
987        // Test searching for various Unicode content
988        let test_cases = vec![("emoji", "πŸ‘‹"), ("δΈ­ζ–‡", "ζ΅‹θ―•"), ("programming", "编程")];
989
990        for (query, _expected_content) in test_cases {
991            let results = index
992                .search(query, Some("unicode_test"), None, 10)
993                .unwrap_or_else(|_| panic!("Should search for '{query}'"));
994
995            if !results.is_empty() {
996                let hit = &results[0];
997                // Verify snippet doesn't panic on Unicode boundaries
998                assert!(hit.snippet.is_char_boundary(0));
999                assert!(hit.snippet.is_char_boundary(hit.snippet.len()));
1000
1001                // Verify we can iterate over chars without panic
1002                let _char_count = hit.snippet.chars().count();
1003            }
1004        }
1005    }
1006
1007    #[test]
1008    fn test_edge_case_unicode_truncation() {
1009        let temp_dir = TempDir::new().expect("Failed to create temp dir");
1010        let index_path = temp_dir.path().join("test_index");
1011        let index = SearchIndex::create(&index_path).expect("Should create index");
1012
1013        // Create content where truncation would happen in middle of multi-byte chars
1014        let mut long_content = String::new();
1015        for _ in 0..20 {
1016            long_content.push_str("πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦"); // Family emoji (complex grapheme cluster)
1017        }
1018        long_content.push_str(" MARKER ");
1019        for _ in 0..20 {
1020            long_content.push_str("πŸ³οΈβ€πŸŒˆ"); // Rainbow flag (another complex emoji)
1021        }
1022
1023        let blocks = vec![HeadingBlock {
1024            path: vec!["Test".to_string()],
1025            content: long_content.clone(),
1026            start_line: 1,
1027            end_line: 10,
1028        }];
1029
1030        index
1031            .index_blocks("edge_test", "test.md", &blocks, "llms")
1032            .expect("Should index blocks");
1033
1034        let results = index
1035            .search("MARKER", Some("edge_test"), None, 10)
1036            .expect("Should search");
1037
1038        assert!(!results.is_empty());
1039        let snippet = &results[0].snippet;
1040
1041        // Verify the snippet is valid UTF-8 and doesn't panic
1042        assert!(snippet.is_char_boundary(0));
1043        assert!(snippet.is_char_boundary(snippet.len()));
1044        assert!(snippet.contains("MARKER"));
1045
1046        // Verify we can iterate over chars without panic
1047        let char_count = snippet.chars().count();
1048        assert!(char_count > 0);
1049    }
1050}