1use crate::profiling::{ComponentTimings, OperationTimer, PerformanceMetrics};
2use crate::{Error, HeadingBlock, Result, SearchHit};
3use base64::{Engine, engine::general_purpose::STANDARD as B64};
4use sha2::{Digest, Sha256};
5use std::path::Path;
6use tantivy::collector::TopDocs;
7use tantivy::query::QueryParser;
8use tantivy::schema::{Field, STORED, STRING, Schema, TEXT, Value};
9use tantivy::{Index, IndexReader, doc};
10use tracing::{Level, debug, info};
11
/// Snippet length, in characters, used by [`SearchIndex::search`] when the caller gives no limit.
pub const DEFAULT_SNIPPET_CHAR_LIMIT: usize = 200;
/// Lower bound, in characters, that requested snippet limits are raised to.
pub const MIN_SNIPPET_CHAR_LIMIT: usize = 50;
/// Upper bound, in characters, that requested snippet limits are lowered to.
pub const MAX_SNIPPET_CHAR_LIMIT: usize = 1_000;

/// Forces a requested snippet length into the inclusive range
/// [`MIN_SNIPPET_CHAR_LIMIT`]..=[`MAX_SNIPPET_CHAR_LIMIT`].
///
/// Written with plain comparisons so the function can stay `const`.
pub(crate) const fn clamp_snippet_chars(chars: usize) -> usize {
    match chars {
        too_small if too_small < MIN_SNIPPET_CHAR_LIMIT => MIN_SNIPPET_CHAR_LIMIT,
        too_large if too_large > MAX_SNIPPET_CHAR_LIMIT => MAX_SNIPPET_CHAR_LIMIT,
        in_range => in_range,
    }
}
28
29pub struct SearchIndex {
31 index: Index,
32 #[allow(dead_code)]
33 schema: Schema,
34 content_field: Field,
35 path_field: Field,
36 heading_path_field: Field,
37 lines_field: Field,
38 alias_field: Field,
39 anchor_field: Option<Field>,
40 reader: IndexReader,
41 metrics: Option<PerformanceMetrics>,
42}
43
44impl SearchIndex {
45 #[must_use]
47 pub fn with_metrics(mut self, metrics: PerformanceMetrics) -> Self {
48 self.metrics = Some(metrics);
49 self
50 }
51
52 #[must_use]
54 pub const fn metrics(&self) -> Option<&PerformanceMetrics> {
55 self.metrics.as_ref()
56 }
57 pub fn create(index_path: &Path) -> Result<Self> {
59 let mut schema_builder = Schema::builder();
60
61 let content_field = schema_builder.add_text_field("content", TEXT | STORED);
62 let path_field = schema_builder.add_text_field("path", STRING | STORED);
63 let heading_path_field = schema_builder.add_text_field("heading_path", TEXT | STORED);
64 let lines_field = schema_builder.add_text_field("lines", STRING | STORED);
65 let alias_field = schema_builder.add_text_field("alias", STRING | STORED);
66 let anchor_field = schema_builder.add_text_field("anchor", STRING | STORED);
67
68 let schema = schema_builder.build();
69
70 std::fs::create_dir_all(index_path)
71 .map_err(|e| Error::Index(format!("Failed to create index directory: {e}")))?;
72
73 let index = Index::create_in_dir(index_path, schema.clone())
74 .map_err(|e| Error::Index(format!("Failed to create index: {e}")))?;
75
76 let reader = index
77 .reader_builder()
78 .reload_policy(tantivy::ReloadPolicy::OnCommitWithDelay)
79 .try_into()
80 .map_err(|e| Error::Index(format!("Failed to create reader: {e}")))?;
81
82 Ok(Self {
83 index,
84 schema,
85 content_field,
86 path_field,
87 heading_path_field,
88 lines_field,
89 alias_field,
90 reader,
91 anchor_field: Some(anchor_field),
92 metrics: None,
93 })
94 }
95
96 pub fn create_or_open(index_path: &Path) -> Result<Self> {
98 if index_path.exists() {
99 Self::open(index_path)
100 } else {
101 Self::create(index_path)
102 }
103 }
104
105 pub fn open(index_path: &Path) -> Result<Self> {
107 let index = Index::open_in_dir(index_path)
108 .map_err(|e| Error::Index(format!("Failed to open index: {e}")))?;
109
110 let schema = index.schema();
111
112 let content_field = schema
113 .get_field("content")
114 .map_err(|_| Error::Index("Missing content field".into()))?;
115 let path_field = schema
116 .get_field("path")
117 .map_err(|_| Error::Index("Missing path field".into()))?;
118 let heading_path_field = schema
119 .get_field("heading_path")
120 .map_err(|_| Error::Index("Missing heading_path field".into()))?;
121 let lines_field = schema
122 .get_field("lines")
123 .map_err(|_| Error::Index("Missing lines field".into()))?;
124 let alias_field = schema
125 .get_field("alias")
126 .map_err(|_| Error::Index("Missing alias field".into()))?;
127
128 let anchor_field = schema.get_field("anchor").ok();
130
131 let reader = index
132 .reader_builder()
133 .reload_policy(tantivy::ReloadPolicy::OnCommitWithDelay)
134 .try_into()
135 .map_err(|e| Error::Index(format!("Failed to create reader: {e}")))?;
136
137 Ok(Self {
138 index,
139 schema,
140 content_field,
141 path_field,
142 heading_path_field,
143 lines_field,
144 alias_field,
145 reader,
146 anchor_field,
147 metrics: None,
148 })
149 }
150
151 pub fn index_blocks(&self, alias: &str, blocks: &[HeadingBlock]) -> Result<()> {
153 let timer = self.metrics.as_ref().map_or_else(
154 || OperationTimer::new(&format!("index_{alias}")),
155 |metrics| OperationTimer::with_metrics(&format!("index_{alias}"), metrics.clone()),
156 );
157
158 let mut timings = ComponentTimings::new();
159
160 let mut writer = timings.time("writer_creation", || {
161 self.index
162 .writer(50_000_000)
163 .map_err(|e| Error::Index(format!("Failed to create writer: {e}")))
164 })?;
165
166 let _deleted = timings.time("delete_existing", || {
168 writer.delete_term(tantivy::Term::from_field_text(self.alias_field, alias))
169 });
170
171 let mut total_content_bytes = 0usize;
172
173 timings.time("document_creation", || {
174 for block in blocks {
175 total_content_bytes += block.content.len();
176 let heading_path_str = block.path.join(" > ");
177 let lines_str = format!("{}-{}", block.start_line, block.end_line);
178 let anchor = block.path.last().map(|h| Self::compute_anchor(h));
180
181 let mut doc = doc!(
182 self.content_field => block.content.as_str(), self.path_field => "llms.txt", self.heading_path_field => heading_path_str,
185 self.lines_field => lines_str,
186 self.alias_field => alias
187 );
188 if let (Some(f), Some(a)) = (self.anchor_field, anchor) {
189 doc.add_text(f, a);
190 }
191
192 writer
193 .add_document(doc)
194 .map_err(|e| Error::Index(format!("Failed to add document: {e}")))?;
195 }
196 Ok::<(), Error>(())
197 })?;
198
199 timings.time("commit", || {
200 writer
201 .commit()
202 .map_err(|e| Error::Index(format!("Failed to commit: {e}")))
203 })?;
204
205 timings.time("reader_reload", || {
206 self.reader
207 .reload()
208 .map_err(|e| Error::Index(format!("Failed to reload reader: {e}")))
209 })?;
210
211 let duration = timer.finish_index(total_content_bytes);
212
213 if tracing::enabled!(Level::DEBUG) {
215 timings.print_breakdown();
216 }
217
218 info!(
219 "Indexed {} blocks ({} bytes) for {} in {:.2}ms",
220 blocks.len(),
221 total_content_bytes,
222 alias,
223 duration.as_millis()
224 );
225
226 Ok(())
227 }
228
229 pub fn search(
231 &self,
232 query_str: &str,
233 alias: Option<&str>,
234 limit: usize,
235 ) -> Result<Vec<SearchHit>> {
236 self.search_with_snippet_limit(query_str, alias, limit, DEFAULT_SNIPPET_CHAR_LIMIT)
237 }
238
239 #[allow(clippy::too_many_lines)] pub fn search_with_snippet_limit(
242 &self,
243 query_str: &str,
244 alias: Option<&str>,
245 limit: usize,
246 snippet_max_chars: usize,
247 ) -> Result<Vec<SearchHit>> {
248 let timer = self.metrics.as_ref().map_or_else(
249 || OperationTimer::new(&format!("search_{query_str}")),
250 |metrics| OperationTimer::with_metrics(&format!("search_{query_str}"), metrics.clone()),
251 );
252
253 let mut timings = ComponentTimings::new();
254 let mut lines_searched = 0usize;
255 let snippet_limit = clamp_snippet_chars(snippet_max_chars);
256
257 let searcher = timings.time("searcher_creation", || self.reader.searcher());
258
259 let query_parser = timings.time("query_parser_creation", || {
260 QueryParser::for_index(
261 &self.index,
262 vec![self.content_field, self.heading_path_field],
263 )
264 });
265
266 let needs_escaping = query_str.chars().any(|c| {
268 matches!(
269 c,
270 '\\' | '(' | ')' | '[' | ']' | '{' | '}' | '^' | '~' | ':'
271 )
272 });
273
274 let mut filter_clauses = Vec::new();
275 if let Some(alias) = alias {
276 filter_clauses.push(format!("alias:{alias}"));
277 }
278
279 let sanitized_query = if needs_escaping {
280 let mut sanitized = String::with_capacity(query_str.len() * 2);
282
283 for ch in query_str.chars() {
284 match ch {
285 '\\' => sanitized.push_str("\\\\"),
286 '(' => sanitized.push_str("\\("),
287 ')' => sanitized.push_str("\\)"),
288 '[' => sanitized.push_str("\\["),
289 ']' => sanitized.push_str("\\]"),
290 '{' => sanitized.push_str("\\{"),
291 '}' => sanitized.push_str("\\}"),
292 '^' => sanitized.push_str("\\^"),
293 '~' => sanitized.push_str("\\~"),
294 ':' => sanitized.push_str("\\:"),
295 _ => sanitized.push(ch),
296 }
297 }
298
299 sanitized
300 } else {
301 query_str.to_string()
302 };
303
304 let full_query_str = if filter_clauses.is_empty() {
305 sanitized_query
306 } else {
307 format!("{} AND ({sanitized_query})", filter_clauses.join(" AND "))
308 };
309
310 let query = timings.time("query_parsing", || {
311 query_parser
312 .parse_query(&full_query_str)
313 .map_err(|e| Error::Index(format!("Failed to parse query: {e}")))
314 })?;
315
316 let top_docs = timings.time("tantivy_search", || {
317 searcher
318 .search(&query, &TopDocs::with_limit(limit))
319 .map_err(|e| Error::Index(format!("Search failed: {e}")))
320 })?;
321
322 let mut hits = Vec::new();
323
324 timings.time("result_processing", || {
325 for (score, doc_address) in top_docs {
326 let doc = searcher
327 .doc(doc_address)
328 .map_err(|e| Error::Index(format!("Failed to retrieve doc: {e}")))?;
329
330 let alias = Self::get_field_text(&doc, self.alias_field)?;
331 let file = Self::get_field_text(&doc, self.path_field)?;
332 let heading_path_str = Self::get_field_text(&doc, self.heading_path_field)?;
333 let lines = Self::get_field_text(&doc, self.lines_field)?;
334 let content = Self::get_field_text(&doc, self.content_field)?;
335 let anchor = self.anchor_field.and_then(|f| {
336 doc.get_first(f)
337 .and_then(|v| v.as_str())
338 .map(std::string::ToString::to_string)
339 });
340
341 lines_searched += content.lines().count();
343
344 let heading_path: Vec<String> = heading_path_str
345 .split(" > ")
346 .map(std::string::ToString::to_string)
347 .collect();
348
349 let snippet = Self::extract_snippet(&content, query_str, snippet_limit);
350
351 let exact_lines = Self::compute_match_lines(&content, query_str, &lines)
353 .unwrap_or_else(|| lines.clone());
354
355 let line_numbers = Self::parse_lines_range(&exact_lines);
357
358 hits.push(SearchHit {
359 source: alias,
360 file,
361 heading_path,
362 lines: exact_lines,
363 line_numbers,
364 snippet,
365 score,
366 source_url: None,
367 fetched_at: None,
368 is_stale: false,
369 checksum: String::new(),
370 anchor,
371 context: None,
372 });
373 }
374 Ok::<(), Error>(())
375 })?;
376
377 let duration = timer.finish_search(lines_searched);
378
379 if tracing::enabled!(Level::DEBUG) {
381 timings.print_breakdown();
382 }
383
384 debug!(
385 "Found {} hits for query '{}' in {:.2}ms (searched {} lines)",
386 hits.len(),
387 query_str,
388 duration.as_millis(),
389 lines_searched
390 );
391
392 Ok(hits)
393 }
394
395 fn compute_anchor(heading_text: &str) -> String {
396 let mut hasher = Sha256::new();
397 hasher.update(heading_text.trim().to_lowercase().as_bytes());
398 let digest = hasher.finalize();
399 let full = B64.encode(digest);
400 full[..22.min(full.len())].to_string()
401 }
402
403 fn get_field_text(doc: &tantivy::TantivyDocument, field: Field) -> Result<String> {
404 doc.get_first(field)
405 .and_then(|v| v.as_str())
406 .map(std::string::ToString::to_string)
407 .ok_or_else(|| Error::Index("Field not found in document".into()))
408 }
409
410 fn compute_match_lines(content: &str, query: &str, block_lines: &str) -> Option<String> {
413 let block_start: usize = block_lines
415 .split(['-', ':'])
416 .next()
417 .and_then(|s| s.trim().parse::<usize>().ok())?;
418
419 let mut phrases = Vec::new();
421 let mut terms = Vec::new();
422 let mut current = String::new();
423 let mut in_quotes = false;
424 for ch in query.chars() {
425 match ch {
426 '"' => {
427 if in_quotes {
428 if !current.is_empty() {
429 phrases.push(current.clone());
430 current.clear();
431 }
432 in_quotes = false;
433 } else {
434 in_quotes = true;
435 }
436 },
437 ch if ch.is_whitespace() && !in_quotes => {
438 if !current.is_empty() {
439 terms.push(current.clone());
440 current.clear();
441 }
442 },
443 _ => current.push(ch),
444 }
445 }
446 if !current.is_empty() {
447 if in_quotes {
448 phrases.push(current);
449 } else {
450 terms.push(current);
451 }
452 }
453
454 let phrases: Vec<String> = phrases
455 .into_iter()
456 .map(|token| {
457 token
458 .trim_matches('"')
459 .trim_start_matches(['+', '-'])
460 .trim()
461 .to_string()
462 })
463 .filter(|s| !s.is_empty())
464 .collect();
465 let terms: Vec<String> = terms
466 .into_iter()
467 .map(|token| {
468 token
469 .trim_matches('"')
470 .trim_start_matches(['+', '-'])
471 .trim()
472 .to_string()
473 })
474 .filter(|s| !s.is_empty())
475 .collect();
476
477 let mut best_pos: Option<usize> = None;
478 for token in phrases.iter().chain(terms.iter()) {
479 if let Some(pos) = content.find(token) {
480 best_pos = Some(best_pos.map_or(pos, |cur| pos.min(cur)));
481 }
482 }
483
484 let pos = best_pos?;
485 let local_line = content[..pos].bytes().filter(|&b| b == b'\n').count();
487 let abs_line = block_start.saturating_add(local_line);
488 Some(format!("{abs_line}-{abs_line}"))
489 }
490
491 fn parse_lines_range(range: &str) -> Option<Vec<usize>> {
494 let mut parts = range.split(['-', ':']);
495 let start = parts.next()?.trim().parse::<usize>().ok()?;
496 let end = parts.next()?.trim().parse::<usize>().ok()?;
497 Some(vec![start, end])
498 }
499
500 fn extract_snippet(content: &str, query: &str, max_len: usize) -> String {
501 let trimmed = query.trim();
503 let phrase_candidate =
504 if trimmed.len() >= 2 && trimmed.starts_with('"') && trimmed.ends_with('"') {
505 &trimmed[1..trimmed.len() - 1]
506 } else {
507 query
508 };
509 let query_lower = phrase_candidate.to_lowercase();
510
511 let mut match_char_pos = None;
513
514 let content_chars: Vec<char> = content.chars().collect();
516 let query_chars: Vec<char> = query_lower.chars().collect();
517
518 if !query_chars.is_empty() {
519 for window_start in 0..content_chars.len() {
520 let window_end = (window_start + query_chars.len()).min(content_chars.len());
521 if window_end - window_start < query_chars.len() {
522 break;
523 }
524
525 let window_matches = content_chars[window_start..window_end]
527 .iter()
528 .zip(query_chars.iter())
529 .all(|(c1, c2)| c1.to_lowercase().eq(c2.to_lowercase()));
530
531 if window_matches {
532 match_char_pos = Some(window_start);
533 break;
534 }
535 }
536 }
537
538 if let Some(char_pos) = match_char_pos {
539 let total_chars = content_chars.len();
541 let qlen = query_chars.len();
542 let ctx_each_side = max_len.saturating_sub(qlen) / 2;
543
544 let start_char = char_pos.saturating_sub(ctx_each_side);
545 let mut end_char = (char_pos + qlen + ctx_each_side).min(total_chars);
546
547 let span = end_char.saturating_sub(start_char);
549 if span > max_len {
550 end_char = start_char + max_len;
551 }
552
553 let left_trunc = start_char > 0;
554 let right_trunc = end_char < total_chars;
555
556 let mut snippet = String::with_capacity((end_char - start_char) * 4 + 6);
558 if left_trunc {
559 snippet.push_str("...");
560 }
561 for &ch in content_chars.iter().take(end_char).skip(start_char) {
562 snippet.push(ch);
563 }
564 if right_trunc {
565 snippet.push_str("...");
566 }
567 return snippet;
568 }
569
570 let content_chars: Vec<char> = content.chars().collect();
572 if content_chars.len() <= max_len {
573 content.to_string()
574 } else {
575 let mut result = String::with_capacity(max_len * 4 + 3);
577 for (i, ch) in content_chars.iter().enumerate() {
578 if i >= max_len {
579 break;
580 }
581 result.push(*ch);
582 }
583 result.push_str("...");
584 result
585 }
586 }
587}
588
#[cfg(test)]
mod tests {
    #![allow(clippy::panic)]
    #![allow(clippy::disallowed_macros)]
    #![allow(clippy::unwrap_used)]
    use super::*;
    use crate::HeadingBlock;
    use std::time::Instant;
    use tempfile::TempDir;

    /// Creates an empty index inside a fresh temporary directory.
    ///
    /// The `TempDir` is returned alongside the index so the directory stays on
    /// disk for as long as the test holds on to it.
    fn fresh_index() -> (TempDir, SearchIndex) {
        let temp_dir = TempDir::new().expect("Failed to create temp dir");
        let index_path = temp_dir.path().join("test_index");
        let index = SearchIndex::create(&index_path).expect("Should create index");
        (temp_dir, index)
    }

    /// Three small blocks about React and Next.js shared by several tests.
    fn create_test_blocks() -> Vec<HeadingBlock> {
        vec![
            HeadingBlock {
                path: vec!["React".to_string(), "Hooks".to_string()],
                content: "useState is a React hook that lets you add state to functional components. It returns an array with the current state value and a function to update it.".to_string(),
                start_line: 100,
                end_line: 120,
            },
            HeadingBlock {
                path: vec!["React".to_string(), "Components".to_string()],
                content: "Components are the building blocks of React applications. They can be function components or class components.".to_string(),
                start_line: 50,
                end_line: 75,
            },
            HeadingBlock {
                path: vec!["Next.js".to_string(), "Routing".to_string()],
                content: "App Router is the new routing system in Next.js 13+. It provides better performance and developer experience.".to_string(),
                start_line: 200,
                end_line: 250,
            },
        ]
    }

    #[test]
    fn test_index_creation() {
        let temp_dir = TempDir::new().expect("Failed to create temp dir");
        let index_path = temp_dir.path().join("test_index");

        let result = SearchIndex::create(&index_path);
        assert!(result.is_ok(), "Should create index successfully");

        assert!(index_path.exists());
    }

    #[test]
    fn test_index_open_nonexistent() {
        let temp_dir = TempDir::new().expect("Failed to create temp dir");
        let index_path = temp_dir.path().join("nonexistent");

        let result = SearchIndex::open(&index_path);
        assert!(result.is_err(), "Should fail to open non-existent index");
    }

    #[test]
    fn test_index_and_search_basic() {
        let (_temp_dir, index) = fresh_index();
        let blocks = create_test_blocks();

        index
            .index_blocks("test", &blocks)
            .expect("Should index blocks");

        let hits = index
            .search("useState", Some("test"), 10)
            .expect("Should search");

        assert!(!hits.is_empty(), "Should find results for useState");
        assert!(
            hits[0].snippet.contains("useState"),
            "Result should contain useState"
        );
        assert_eq!(hits[0].source, "test");
        assert_eq!(hits[0].file, "llms.txt");
    }

    #[test]
    fn test_search_limit() {
        let (_temp_dir, index) = fresh_index();
        let blocks = create_test_blocks();

        index
            .index_blocks("test", &blocks)
            .expect("Should index blocks");

        let hits = index
            .search("React", Some("test"), 1)
            .expect("Should search");

        assert!(!hits.is_empty(), "Should find results");
        assert!(hits.len() <= 1, "Should respect limit");
    }

    #[test]
    fn test_search_includes_anchor() {
        let (_temp_dir, index) = fresh_index();

        let blocks = vec![HeadingBlock {
            path: vec!["API".to_string(), "Reference".to_string()],
            content: "token auth key".to_string(),
            start_line: 10,
            end_line: 20,
        }];

        index
            .index_blocks("test", &blocks)
            .expect("Should index blocks");

        let hits = index
            .search("token", Some("test"), 10)
            .expect("Should search");

        assert!(!hits.is_empty());
        assert!(hits[0].anchor.is_some(), "anchor should be present in hits");
        // The anchor is derived from the innermost heading of the block.
        let expected = SearchIndex::compute_anchor("Reference");
        assert_eq!(hits[0].anchor.clone().unwrap(), expected);
    }

    #[test]
    fn test_search_no_results() {
        let (_temp_dir, index) = fresh_index();
        let blocks = create_test_blocks();

        index
            .index_blocks("test", &blocks)
            .expect("Should index blocks");

        let hits = index
            .search("nonexistentterm12345", Some("test"), 10)
            .expect("Should search");

        assert!(
            hits.is_empty(),
            "Should find no results for non-existent term"
        );
    }

    #[test]
    fn test_search_performance() {
        let (_temp_dir, index) = fresh_index();

        let mut blocks = Vec::new();
        for i in 0..100 {
            blocks.push(HeadingBlock {
                path: vec![format!("Section{}", i)],
                content: format!("This is content block {i} with various keywords like React, hooks, components, and performance testing."),
                start_line: i * 10,
                end_line: i * 10 + 5,
            });
        }

        index
            .index_blocks("perftest", &blocks)
            .expect("Should index many blocks");

        let start = Instant::now();
        let hits = index
            .search("React", Some("perftest"), 50)
            .expect("Should search");
        let duration = start.elapsed();

        assert!(!hits.is_empty(), "Should find results");
        assert!(
            duration.as_millis() < 100,
            "Search should be fast (<100ms), took {}ms",
            duration.as_millis()
        );
    }

    #[test]
    fn test_search_scoring() {
        let (_temp_dir, index) = fresh_index();

        let blocks = vec![
            HeadingBlock {
                path: vec!["Exact Match".to_string()],
                content: "React hooks".to_string(),
                start_line: 1,
                end_line: 5,
            },
            HeadingBlock {
                path: vec!["Partial Match".to_string()],
                content: "React components and hooks are useful features".to_string(),
                start_line: 10,
                end_line: 15,
            },
            HeadingBlock {
                path: vec!["Distant Match".to_string()],
                content: "In React, you can use various hooks for different purposes".to_string(),
                start_line: 20,
                end_line: 25,
            },
        ];

        index
            .index_blocks("test", &blocks)
            .expect("Should index blocks");

        let hits = index
            .search("React hooks", Some("test"), 10)
            .expect("Should search");

        assert!(!hits.is_empty(), "Should find results");

        for i in 1..hits.len() {
            assert!(
                hits[i - 1].score >= hits[i].score,
                "Results should be ordered by descending score"
            );
        }

        assert!(
            hits[0].snippet.contains("React hooks"),
            "Highest scored result should contain exact match"
        );
    }

    #[test]
    fn test_search_snippet_respects_limits() {
        let (_temp_dir, index) = fresh_index();

        let blocks = vec![HeadingBlock {
            path: vec!["Hooks".to_string()],
            content: "React provides hooks for state and effect management. Hooks enable composing complex logic from simple primitives. Extensive documentation follows here to ensure the snippet must truncate properly when limits are applied.".to_string(),
            start_line: 1,
            end_line: 20,
        }];

        index
            .index_blocks("test", &blocks)
            .expect("Should index blocks");

        // The `+ 6` allows for a leading and a trailing "..." marker.
        let default_hits = index
            .search("hooks", Some("test"), 5)
            .expect("Should search with default limit");
        assert!(!default_hits.is_empty());
        let default_len = default_hits[0].snippet.chars().count();
        assert!(
            default_len <= DEFAULT_SNIPPET_CHAR_LIMIT + 6,
            "Default snippet should clamp near default limit"
        );

        let custom_limit = 80;
        let custom_hits = index
            .search_with_snippet_limit("hooks", Some("test"), 5, custom_limit)
            .expect("Should search with custom limit");
        assert!(!custom_hits.is_empty());
        let custom_len = custom_hits[0].snippet.chars().count();
        assert!(
            custom_len <= clamp_snippet_chars(custom_limit) + 6,
            "Custom snippet should respect provided limit"
        );

        assert!(custom_len <= default_len);
    }

    #[test]
    fn test_heading_path_in_results() {
        let (_temp_dir, index) = fresh_index();

        let blocks = vec![HeadingBlock {
            path: vec![
                "API".to_string(),
                "Reference".to_string(),
                "Hooks".to_string(),
            ],
            content: "useState hook documentation".to_string(),
            start_line: 100,
            end_line: 120,
        }];

        index
            .index_blocks("test", &blocks)
            .expect("Should index blocks");

        let hits = index
            .search("useState", Some("test"), 10)
            .expect("Should search");

        assert!(!hits.is_empty(), "Should find results");
        assert_eq!(hits[0].heading_path, vec!["API", "Reference", "Hooks"]);
        assert_eq!(hits[0].file, "llms.txt");
        assert!(
            hits[0].lines.starts_with("100-"),
            "Expected match to start at line 100, got {}",
            hits[0].lines
        );
    }

    #[test]
    fn test_unicode_snippet_extraction() {
        let (_temp_dir, index) = fresh_index();

        // Non-ASCII test data is spelled with `\u{...}` escapes so the literals
        // survive any re-encoding of this source file.
        let unicode_blocks = vec![
            HeadingBlock {
                path: vec!["Unicode".to_string(), "Emoji".to_string()],
                // Waving hand, globe, rocket, party popper.
                content: "This is a test with emojis: \u{1F44B} Hello \u{1F30D} World! \u{1F680} Let's go! \u{1F389}"
                    .to_string(),
                start_line: 1,
                end_line: 10,
            },
            HeadingBlock {
                path: vec!["Unicode".to_string(), "Chinese".to_string()],
                // Chinese text mixed with ASCII words.
                content: "\u{8FD9}\u{662F}\u{4E2D}\u{6587}\u{6D4B}\u{8BD5}\u{3002}Hello \u{4E16}\u{754C}\u{FF01}Programming \u{7F16}\u{7A0B} is \u{5F88}\u{6709}\u{8DA3}\u{3002}".to_string(),
                start_line: 20,
                end_line: 30,
            },
            HeadingBlock {
                path: vec!["Unicode".to_string(), "Mixed".to_string()],
                // Japanese text, a flag built from two regional indicators, then ASCII.
                content: "\u{65E5}\u{672C}\u{8A9E} \u{30C6}\u{30B9}\u{30C8} \u{1F1EF}\u{1F1F5} with mixed content".to_string(),
                start_line: 40,
                end_line: 50,
            },
        ];

        index
            .index_blocks("unicode_test", &unicode_blocks)
            .expect("Should index blocks");

        let test_cases = vec![
            ("emoji", "\u{1F44B}"),
            ("\u{4E2D}\u{6587}", "\u{6D4B}\u{8BD5}"),
            ("programming", "\u{7F16}\u{7A0B}"),
        ];

        for (query, _expected_content) in test_cases {
            let results = index
                .search(query, Some("unicode_test"), 10)
                .unwrap_or_else(|_| panic!("Should search for '{query}'"));

            // Not every query is guaranteed to produce a hit; whatever is
            // returned must be a well-formed, non-empty string.
            if !results.is_empty() {
                let hit = &results[0];
                assert!(hit.snippet.is_char_boundary(0));
                assert!(hit.snippet.is_char_boundary(hit.snippet.len()));
                assert!(hit.snippet.chars().count() > 0);
            }
        }
    }

    #[test]
    fn test_edge_case_unicode_truncation() {
        let (_temp_dir, index) = fresh_index();

        // Long runs of multi-code-point emoji on both sides of the marker force
        // the snippet to be cut inside them.
        let mut long_content = String::new();
        for _ in 0..20 {
            // Family emoji: four people joined by zero-width joiners.
            long_content.push_str("\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}");
        }
        long_content.push_str(" MARKER ");
        for _ in 0..20 {
            // Rainbow flag: flag + variation selector + joiner + rainbow.
            long_content.push_str("\u{1F3F3}\u{FE0F}\u{200D}\u{1F308}");
        }

        let blocks = vec![HeadingBlock {
            path: vec!["Test".to_string()],
            content: long_content.clone(),
            start_line: 1,
            end_line: 10,
        }];

        index
            .index_blocks("edge_test", &blocks)
            .expect("Should index blocks");

        let results = index
            .search("MARKER", Some("edge_test"), 10)
            .expect("Should search");

        assert!(!results.is_empty());
        let snippet = &results[0].snippet;

        assert!(snippet.is_char_boundary(0));
        assert!(snippet.is_char_boundary(snippet.len()));
        assert!(snippet.contains("MARKER"));

        let char_count = snippet.chars().count();
        assert!(char_count > 0);
    }
}