1use crate::profiling::{ComponentTimings, OperationTimer, PerformanceMetrics};
2use crate::types::normalize_flavor_filters;
3use crate::{Error, HeadingBlock, Result, SearchHit};
4use base64::{Engine, engine::general_purpose::STANDARD as B64};
5use sha2::{Digest, Sha256};
6use std::path::Path;
7use tantivy::collector::TopDocs;
8use tantivy::query::QueryParser;
9use tantivy::schema::{Field, STORED, STRING, Schema, TEXT, Value};
10use tantivy::{Index, IndexReader, doc};
11use tracing::{Level, debug, info};
12
/// Full-text search index over documentation heading blocks, backed by tantivy.
///
/// Bundles the tantivy [`Index`], a reusable [`IndexReader`], and the schema
/// field handles that indexing and searching need. The `flavor`,
/// `alias_flavor` and `anchor` handles are optional because `open` tolerates
/// on-disk schemas that do not define them.
pub struct SearchIndex {
    /// Underlying tantivy index; writers and query parsers are created from it.
    index: Index,
    /// Schema the index was created or opened with. Kept on the struct but not
    /// read by the methods in this file, hence the `dead_code` allowance.
    #[allow(dead_code)]
    schema: Schema,
    /// `content` field: the block body text; one of the two default query fields.
    content_field: Field,
    /// `path` field: the source file path passed to `index_blocks`.
    path_field: Field,
    /// `heading_path` field: heading hierarchy joined with `" > "`; the other
    /// default query field.
    heading_path_field: Field,
    /// `lines` field: the block's line range, stored as `"start-end"`.
    lines_field: Field,
    /// `alias` field: the source alias each document belongs to.
    alias_field: Field,
    /// `flavor` field, if the schema has one; used for flavor filtering.
    flavor_field: Option<Field>,
    /// `alias_flavor` field, if the schema has one; stores `"{alias}::{flavor}"`
    /// and is the key used to delete stale documents before re-indexing.
    alias_flavor_field: Option<Field>,
    /// `anchor` field, if the schema has one; stores the anchor derived from the
    /// last heading of each block.
    anchor_field: Option<Field>,
    /// Reader used to obtain searchers; reloaded explicitly after each commit.
    reader: IndexReader,
    /// Optional metrics sink handed to the operation timers.
    metrics: Option<PerformanceMetrics>,
}
29
30impl SearchIndex {
31 #[must_use]
33 pub fn with_metrics(mut self, metrics: PerformanceMetrics) -> Self {
34 self.metrics = Some(metrics);
35 self
36 }
37
38 #[must_use]
40 pub const fn metrics(&self) -> Option<&PerformanceMetrics> {
41 self.metrics.as_ref()
42 }
43 pub fn create(index_path: &Path) -> Result<Self> {
45 let mut schema_builder = Schema::builder();
46
47 let content_field = schema_builder.add_text_field("content", TEXT | STORED);
48 let path_field = schema_builder.add_text_field("path", STRING | STORED);
49 let heading_path_field = schema_builder.add_text_field("heading_path", TEXT | STORED);
50 let lines_field = schema_builder.add_text_field("lines", STRING | STORED);
51 let alias_field = schema_builder.add_text_field("alias", STRING | STORED);
52 let flavor_field = schema_builder.add_text_field("flavor", STRING | STORED);
53 let alias_flavor_field = schema_builder.add_text_field("alias_flavor", STRING | STORED);
54 let anchor_field = schema_builder.add_text_field("anchor", STRING | STORED);
55
56 let schema = schema_builder.build();
57
58 std::fs::create_dir_all(index_path)
59 .map_err(|e| Error::Index(format!("Failed to create index directory: {e}")))?;
60
61 let index = Index::create_in_dir(index_path, schema.clone())
62 .map_err(|e| Error::Index(format!("Failed to create index: {e}")))?;
63
64 let reader = index
65 .reader_builder()
66 .reload_policy(tantivy::ReloadPolicy::OnCommitWithDelay)
67 .try_into()
68 .map_err(|e| Error::Index(format!("Failed to create reader: {e}")))?;
69
70 Ok(Self {
71 index,
72 schema,
73 content_field,
74 path_field,
75 heading_path_field,
76 lines_field,
77 alias_field,
78 flavor_field: Some(flavor_field),
79 alias_flavor_field: Some(alias_flavor_field),
80 reader,
81 anchor_field: Some(anchor_field),
82 metrics: None,
83 })
84 }
85
86 pub fn create_or_open(index_path: &Path) -> Result<Self> {
88 if index_path.exists() {
89 Self::open(index_path)
90 } else {
91 Self::create(index_path)
92 }
93 }
94
95 pub fn open(index_path: &Path) -> Result<Self> {
97 let index = Index::open_in_dir(index_path)
98 .map_err(|e| Error::Index(format!("Failed to open index: {e}")))?;
99
100 let schema = index.schema();
101
102 let content_field = schema
103 .get_field("content")
104 .map_err(|_| Error::Index("Missing content field".into()))?;
105 let path_field = schema
106 .get_field("path")
107 .map_err(|_| Error::Index("Missing path field".into()))?;
108 let heading_path_field = schema
109 .get_field("heading_path")
110 .map_err(|_| Error::Index("Missing heading_path field".into()))?;
111 let lines_field = schema
112 .get_field("lines")
113 .map_err(|_| Error::Index("Missing lines field".into()))?;
114 let alias_field = schema
115 .get_field("alias")
116 .map_err(|_| Error::Index("Missing alias field".into()))?;
117
118 let flavor_field = schema.get_field("flavor").ok();
119 let alias_flavor_field = schema.get_field("alias_flavor").ok();
120
121 let anchor_field = schema.get_field("anchor").ok();
123
124 let reader = index
125 .reader_builder()
126 .reload_policy(tantivy::ReloadPolicy::OnCommitWithDelay)
127 .try_into()
128 .map_err(|e| Error::Index(format!("Failed to create reader: {e}")))?;
129
130 Ok(Self {
131 index,
132 schema,
133 content_field,
134 path_field,
135 heading_path_field,
136 lines_field,
137 alias_field,
138 flavor_field,
139 alias_flavor_field,
140 reader,
141 anchor_field,
142 metrics: None,
143 })
144 }
145
146 pub fn index_blocks(
148 &self,
149 alias: &str,
150 file_path: &str,
151 blocks: &[HeadingBlock],
152 flavor: &str,
153 ) -> Result<()> {
154 let timer = self.metrics.as_ref().map_or_else(
155 || OperationTimer::new(&format!("index_{alias}")),
156 |metrics| OperationTimer::with_metrics(&format!("index_{alias}"), metrics.clone()),
157 );
158
159 let mut timings = ComponentTimings::new();
160
161 let mut writer = timings.time("writer_creation", || {
162 self.index
163 .writer(50_000_000)
164 .map_err(|e| Error::Index(format!("Failed to create writer: {e}")))
165 })?;
166
167 let alias_flavor_value = format!("{alias}::{flavor}");
168
169 let _deleted = timings.time("delete_existing", || {
170 self.alias_flavor_field.map_or_else(
171 || writer.delete_term(tantivy::Term::from_field_text(self.alias_field, alias)),
172 |field| {
173 writer.delete_term(tantivy::Term::from_field_text(field, &alias_flavor_value))
174 },
175 )
176 });
177
178 let mut total_content_bytes = 0usize;
179
180 timings.time("document_creation", || {
181 for block in blocks {
182 total_content_bytes += block.content.len();
183 let heading_path_str = block.path.join(" > ");
184 let lines_str = format!("{}-{}", block.start_line, block.end_line);
185 let anchor = block.path.last().map(|h| Self::compute_anchor(h));
187
188 let mut doc = doc!(
189 self.content_field => block.content.as_str(), self.path_field => file_path,
191 self.heading_path_field => heading_path_str,
192 self.lines_field => lines_str,
193 self.alias_field => alias
194 );
195 if let Some(field) = self.flavor_field {
196 doc.add_text(field, flavor);
197 }
198 if let Some(field) = self.alias_flavor_field {
199 doc.add_text(field, &alias_flavor_value);
200 }
201 if let (Some(f), Some(a)) = (self.anchor_field, anchor) {
202 doc.add_text(f, a);
203 }
204
205 writer
206 .add_document(doc)
207 .map_err(|e| Error::Index(format!("Failed to add document: {e}")))?;
208 }
209 Ok::<(), Error>(())
210 })?;
211
212 timings.time("commit", || {
213 writer
214 .commit()
215 .map_err(|e| Error::Index(format!("Failed to commit: {e}")))
216 })?;
217
218 timings.time("reader_reload", || {
219 self.reader
220 .reload()
221 .map_err(|e| Error::Index(format!("Failed to reload reader: {e}")))
222 })?;
223
224 let duration = timer.finish_index(total_content_bytes);
225
226 if tracing::enabled!(Level::DEBUG) {
228 timings.print_breakdown();
229 }
230
231 info!(
232 "Indexed {} blocks ({} bytes) for {} in {:.2}ms",
233 blocks.len(),
234 total_content_bytes,
235 alias,
236 duration.as_millis()
237 );
238
239 Ok(())
240 }
241
242 #[allow(clippy::too_many_lines)] pub fn search(
245 &self,
246 query_str: &str,
247 alias: Option<&str>,
248 flavor: Option<&str>,
249 limit: usize,
250 ) -> Result<Vec<SearchHit>> {
251 let timer = self.metrics.as_ref().map_or_else(
252 || OperationTimer::new(&format!("search_{query_str}")),
253 |metrics| OperationTimer::with_metrics(&format!("search_{query_str}"), metrics.clone()),
254 );
255
256 let mut timings = ComponentTimings::new();
257 let mut lines_searched = 0usize;
258
259 let searcher = timings.time("searcher_creation", || self.reader.searcher());
260
261 let query_parser = timings.time("query_parser_creation", || {
262 QueryParser::for_index(
263 &self.index,
264 vec![self.content_field, self.heading_path_field],
265 )
266 });
267
268 let needs_escaping = query_str.chars().any(|c| {
270 matches!(
271 c,
272 '\\' | '"' | '(' | ')' | '[' | ']' | '{' | '}' | '^' | '~'
273 )
274 });
275
276 let mut filter_clauses = Vec::new();
277 if let Some(alias) = alias {
278 filter_clauses.push(format!("alias:{alias}"));
279 }
280
281 let normalized_flavors = flavor.and_then(|raw| {
282 let normalized = normalize_flavor_filters(raw);
283 if normalized.is_empty() {
284 if !raw.trim().is_empty() {
285 tracing::debug!(
286 filter = raw,
287 "Ignoring flavor filter with no recognized values"
288 );
289 }
290 None
291 } else {
292 Some(normalized)
293 }
294 });
295
296 match (self.flavor_field, normalized_flavors) {
297 (Some(_), Some(values)) => {
298 if values.len() == 1 {
299 filter_clauses.push(format!("flavor:{}", values[0]));
300 } else {
301 let clause = values
302 .iter()
303 .map(|value| format!("flavor:{value}"))
304 .collect::<Vec<_>>()
305 .join(" OR ");
306 filter_clauses.push(format!("({clause})"));
307 }
308 },
309 (None, Some(values)) => {
310 tracing::warn!(
311 filters = %values.join(","),
312 "Flavor filtering requested but index schema has no flavor field; ignoring"
313 );
314 },
315 _ => {},
316 }
317
318 let sanitized_query = if needs_escaping {
319 let mut sanitized = String::with_capacity(query_str.len() * 2);
321
322 for ch in query_str.chars() {
323 match ch {
324 '\\' => sanitized.push_str("\\\\"),
325 '"' => sanitized.push_str("\\\""),
326 '(' => sanitized.push_str("\\("),
327 ')' => sanitized.push_str("\\)"),
328 '[' => sanitized.push_str("\\["),
329 ']' => sanitized.push_str("\\]"),
330 '{' => sanitized.push_str("\\{"),
331 '}' => sanitized.push_str("\\}"),
332 '^' => sanitized.push_str("\\^"),
333 '~' => sanitized.push_str("\\~"),
334 _ => sanitized.push(ch),
335 }
336 }
337
338 sanitized
339 } else {
340 query_str.to_string()
341 };
342
343 let full_query_str = if filter_clauses.is_empty() {
344 sanitized_query
345 } else {
346 format!("{} AND ({sanitized_query})", filter_clauses.join(" AND "))
347 };
348
349 let query = timings.time("query_parsing", || {
350 query_parser
351 .parse_query(&full_query_str)
352 .map_err(|e| Error::Index(format!("Failed to parse query: {e}")))
353 })?;
354
355 let top_docs = timings.time("tantivy_search", || {
356 searcher
357 .search(&query, &TopDocs::with_limit(limit))
358 .map_err(|e| Error::Index(format!("Search failed: {e}")))
359 })?;
360
361 let mut hits = Vec::new();
362
363 timings.time("result_processing", || {
364 for (score, doc_address) in top_docs {
365 let doc = searcher
366 .doc(doc_address)
367 .map_err(|e| Error::Index(format!("Failed to retrieve doc: {e}")))?;
368
369 let alias = Self::get_field_text(&doc, self.alias_field)?;
370 let file = Self::get_field_text(&doc, self.path_field)?;
371 let heading_path_str = Self::get_field_text(&doc, self.heading_path_field)?;
372 let lines = Self::get_field_text(&doc, self.lines_field)?;
373 let content = Self::get_field_text(&doc, self.content_field)?;
374 let anchor = self.anchor_field.and_then(|f| {
375 doc.get_first(f)
376 .and_then(|v| v.as_str())
377 .map(std::string::ToString::to_string)
378 });
379 let flavor_value = self.flavor_field.and_then(|f| {
380 doc.get_first(f)
381 .and_then(|v| v.as_str())
382 .map(std::string::ToString::to_string)
383 });
384
385 lines_searched += content.lines().count();
387
388 let heading_path: Vec<String> = heading_path_str
389 .split(" > ")
390 .map(std::string::ToString::to_string)
391 .collect();
392
393 let snippet = Self::extract_snippet(&content, query_str, 100);
394
395 let exact_lines = Self::compute_match_lines(&content, query_str, &lines)
397 .unwrap_or_else(|| lines.clone());
398
399 let line_numbers = Self::parse_lines_range(&exact_lines);
401
402 hits.push(SearchHit {
403 alias: alias.clone(),
404 source: alias,
405 file,
406 heading_path,
407 lines: exact_lines,
408 line_numbers,
409 snippet,
410 score,
411 source_url: None,
412 checksum: String::new(),
413 anchor,
414 flavor: flavor_value,
415 });
416 }
417 Ok::<(), Error>(())
418 })?;
419
420 let duration = timer.finish_search(lines_searched);
421
422 if tracing::enabled!(Level::DEBUG) {
424 timings.print_breakdown();
425 }
426
427 debug!(
428 "Found {} hits for query '{}' in {:.2}ms (searched {} lines)",
429 hits.len(),
430 query_str,
431 duration.as_millis(),
432 lines_searched
433 );
434
435 Ok(hits)
436 }
437
438 fn compute_anchor(heading_text: &str) -> String {
439 let mut hasher = Sha256::new();
440 hasher.update(heading_text.trim().to_lowercase().as_bytes());
441 let digest = hasher.finalize();
442 let full = B64.encode(digest);
443 full[..22.min(full.len())].to_string()
444 }
445
446 fn get_field_text(doc: &tantivy::TantivyDocument, field: Field) -> Result<String> {
447 doc.get_first(field)
448 .and_then(|v| v.as_str())
449 .map(std::string::ToString::to_string)
450 .ok_or_else(|| Error::Index("Field not found in document".into()))
451 }
452
453 fn compute_match_lines(content: &str, query: &str, block_lines: &str) -> Option<String> {
456 let block_start: usize = block_lines
458 .split(['-', ':'])
459 .next()
460 .and_then(|s| s.trim().parse::<usize>().ok())?;
461
462 let mut best_pos: Option<usize> = None;
464 for token in query.split_whitespace() {
465 if token.is_empty() {
466 continue;
467 }
468 if let Some(pos) = content.find(token) {
469 match best_pos {
470 Some(cur) if pos < cur => best_pos = Some(pos),
471 None => best_pos = Some(pos),
472 _ => {},
473 }
474 }
475 }
476
477 let pos = best_pos?;
478 let local_line = content[..pos].bytes().filter(|&b| b == b'\n').count();
480 let abs_line = block_start.saturating_add(local_line);
481 Some(format!("{abs_line}-{abs_line}"))
482 }
483
484 fn parse_lines_range(range: &str) -> Option<Vec<usize>> {
487 let mut parts = range.split(['-', ':']);
488 let start = parts.next()?.trim().parse::<usize>().ok()?;
489 let end = parts.next()?.trim().parse::<usize>().ok()?;
490 Some(vec![start, end])
491 }
492
493 fn extract_snippet(content: &str, query: &str, max_len: usize) -> String {
494 let query_lower = query.to_lowercase();
495
496 let mut match_char_pos = None;
498
499 let content_chars: Vec<char> = content.chars().collect();
501 let query_chars: Vec<char> = query_lower.chars().collect();
502
503 if !query_chars.is_empty() {
504 for window_start in 0..content_chars.len() {
505 let window_end = (window_start + query_chars.len()).min(content_chars.len());
506 if window_end - window_start < query_chars.len() {
507 break;
508 }
509
510 let window_matches = content_chars[window_start..window_end]
512 .iter()
513 .zip(query_chars.iter())
514 .all(|(c1, c2)| c1.to_lowercase().eq(c2.to_lowercase()));
515
516 if window_matches {
517 match_char_pos = Some(window_start);
518 break;
519 }
520 }
521 }
522
523 if let Some(char_pos) = match_char_pos {
524 let total_chars = content_chars.len();
526 let qlen = query_chars.len();
527 let ctx_each_side = max_len.saturating_sub(qlen) / 2;
528
529 let start_char = char_pos.saturating_sub(ctx_each_side);
530 let mut end_char = (char_pos + qlen + ctx_each_side).min(total_chars);
531
532 let span = end_char.saturating_sub(start_char);
534 if span > max_len {
535 end_char = start_char + max_len;
536 }
537
538 let left_trunc = start_char > 0;
539 let right_trunc = end_char < total_chars;
540
541 let mut snippet = String::with_capacity((end_char - start_char) * 4 + 6);
543 if left_trunc {
544 snippet.push_str("...");
545 }
546 for &ch in content_chars.iter().take(end_char).skip(start_char) {
547 snippet.push(ch);
548 }
549 if right_trunc {
550 snippet.push_str("...");
551 }
552 return snippet;
553 }
554
555 let content_chars: Vec<char> = content.chars().collect();
557 if content_chars.len() <= max_len {
558 content.to_string()
559 } else {
560 let mut result = String::with_capacity(max_len * 4 + 3);
562 for (i, ch) in content_chars.iter().enumerate() {
563 if i >= max_len {
564 break;
565 }
566 result.push(*ch);
567 }
568 result.push_str("...");
569 result
570 }
571 }
572}
573
#[cfg(test)]
mod tests {
    #![allow(clippy::panic)]
    #![allow(clippy::disallowed_macros)]
    #![allow(clippy::unwrap_used)]
    use super::*;
    use crate::HeadingBlock;
    use std::time::Instant;
    use tempfile::TempDir;

    /// Creates an empty index in `dir_name` beneath a fresh temporary directory.
    ///
    /// The `TempDir` guard is returned so the directory outlives the index.
    fn new_index(dir_name: &str) -> (TempDir, SearchIndex) {
        let temp_dir = TempDir::new().expect("Failed to create temp dir");
        let index_path = temp_dir.path().join(dir_name);
        let index = SearchIndex::create(&index_path).expect("Should create index");
        (temp_dir, index)
    }

    /// Three small blocks about React and Next.js shared by several tests.
    fn create_test_blocks() -> Vec<HeadingBlock> {
        let hooks = HeadingBlock {
            path: vec!["React".into(), "Hooks".into()],
            content: "useState is a React hook that lets you add state to functional components. It returns an array with the current state value and a function to update it.".into(),
            start_line: 100,
            end_line: 120,
        };
        let components = HeadingBlock {
            path: vec!["React".into(), "Components".into()],
            content: "Components are the building blocks of React applications. They can be function components or class components.".into(),
            start_line: 50,
            end_line: 75,
        };
        let routing = HeadingBlock {
            path: vec!["Next.js".into(), "Routing".into()],
            content: "App Router is the new routing system in Next.js 13+. It provides better performance and developer experience.".into(),
            start_line: 200,
            end_line: 250,
        };
        vec![hooks, components, routing]
    }

    /// Creating an index succeeds and leaves the directory on disk.
    #[test]
    fn test_index_creation() {
        let temp_dir = TempDir::new().expect("Failed to create temp dir");
        let index_path = temp_dir.path().join("test_index");

        let created = SearchIndex::create(&index_path);

        assert!(created.is_ok(), "Should create index successfully");
        assert!(index_path.exists());
    }

    /// Opening a path that was never created is an error.
    #[test]
    fn test_index_open_nonexistent() {
        let temp_dir = TempDir::new().expect("Failed to create temp dir");
        let missing_path = temp_dir.path().join("nonexistent");

        let opened = SearchIndex::open(&missing_path);

        assert!(opened.is_err(), "Should fail to open non-existent index");
    }

    /// Indexed content is found and carries alias, file and snippet.
    #[test]
    fn test_index_and_search_basic() {
        let (_temp_dir, index) = new_index("test_index");
        index
            .index_blocks("test", "test.md", &create_test_blocks(), "llms")
            .expect("Should index blocks");

        let hits = index
            .search("useState", Some("test"), None, 10)
            .expect("Should search");

        assert!(!hits.is_empty(), "Should find results for useState");
        let top = &hits[0];
        assert!(
            top.snippet.contains("useState"),
            "Result should contain useState"
        );
        assert_eq!(top.alias, "test");
        assert_eq!(top.file, "test.md");
    }

    /// The `limit` argument caps the number of hits.
    #[test]
    fn test_search_limit() {
        let (_temp_dir, index) = new_index("test_index");
        index
            .index_blocks("test", "test.md", &create_test_blocks(), "llms")
            .expect("Should index blocks");

        let hits = index
            .search("React", Some("test"), None, 1)
            .expect("Should search");

        assert!(!hits.is_empty(), "Should find results");
        assert!(hits.len() <= 1, "Should respect limit");
    }

    /// Hits expose the anchor computed from the last heading.
    #[test]
    fn test_search_includes_anchor() {
        let (_temp_dir, index) = new_index("test_index");
        let blocks = [HeadingBlock {
            path: vec!["API".into(), "Reference".into()],
            content: "token auth key".into(),
            start_line: 10,
            end_line: 20,
        }];
        index
            .index_blocks("test", "api.md", &blocks, "llms")
            .expect("Should index blocks");

        let hits = index
            .search("token", Some("test"), None, 10)
            .expect("Should search");

        assert!(!hits.is_empty());
        assert!(hits[0].anchor.is_some(), "anchor should be present in hits");
        let expected = SearchIndex::compute_anchor("Reference");
        assert_eq!(hits[0].anchor.as_deref(), Some(expected.as_str()));
    }

    /// A recognized flavor narrows results; an unrecognized one is ignored.
    #[test]
    fn test_search_filters_by_flavor() {
        let (_temp_dir, index) = new_index("test_index");

        let llms_blocks = [HeadingBlock {
            path: vec!["Docs".into()],
            content: "base flavor content".into(),
            start_line: 1,
            end_line: 5,
        }];
        let full_blocks = [HeadingBlock {
            path: vec!["Docs".into(), "Full".into()],
            content: "full flavor content".into(),
            start_line: 6,
            end_line: 10,
        }];

        index
            .index_blocks("alias", "llms.txt", &llms_blocks, "llms")
            .expect("Should index base flavor");
        index
            .index_blocks("alias", "llms-full.txt", &full_blocks, "llms-full")
            .expect("Should index full flavor");

        let search_with = |flavor: Option<&str>, failure: &str| {
            index
                .search("flavor", Some("alias"), flavor, 10)
                .expect(failure)
        };

        let all_hits = search_with(None, "Should search without flavor filter");
        assert_eq!(all_hits.len(), 2);

        let full_hits = search_with(Some("llms-full"), "Should filter by llms-full");
        assert_eq!(full_hits.len(), 1);
        assert_eq!(full_hits[0].flavor.as_deref(), Some("llms-full"));
        assert_eq!(full_hits[0].file, "llms-full.txt");

        let base_hits = search_with(Some("llms"), "Should filter by llms");
        assert_eq!(base_hits.len(), 1);
        assert_eq!(base_hits[0].flavor.as_deref(), Some("llms"));
        assert_eq!(base_hits[0].file, "llms.txt");

        let missing_hits = search_with(
            Some("nonexistent-flavor"),
            "Should handle non-existent flavor",
        );
        assert_eq!(missing_hits.len(), 2);
    }

    /// Unknown tokens inside a multi-value flavor filter are dropped.
    #[test]
    fn test_search_mixed_flavor_filters() {
        let (_temp_dir, index) = new_index("test_index_mixed");

        let base_blocks = [HeadingBlock {
            path: vec!["Docs".into()],
            content: "base flavor content".into(),
            start_line: 1,
            end_line: 5,
        }];
        let full_blocks = [HeadingBlock {
            path: vec!["Docs".into(), "Full".into()],
            content: "full flavor content".into(),
            start_line: 6,
            end_line: 10,
        }];

        index
            .index_blocks("alias", "llms.txt", &base_blocks, "llms")
            .expect("Should index base flavor");
        index
            .index_blocks("alias", "llms-full.txt", &full_blocks, "llms-full")
            .expect("Should index full flavor");

        let search_with = |flavor: &str, failure: &str| {
            index
                .search("flavor", Some("alias"), Some(flavor), 10)
                .expect(failure)
        };

        let mixed_full_hits =
            search_with("llms-full,unknown", "Should ignore unknown flavor token");
        assert_eq!(mixed_full_hits.len(), 1);
        assert_eq!(mixed_full_hits[0].flavor.as_deref(), Some("llms-full"));

        let mixed_base_hits = search_with(
            "unknown|llms",
            "Should ignore unknown flavor token and return base hits",
        );
        assert_eq!(mixed_base_hits.len(), 1);
        assert_eq!(mixed_base_hits[0].flavor.as_deref(), Some("llms"));

        let ignored_hits = search_with("unknown", "Should ignore unknown-only filters");
        assert_eq!(ignored_hits.len(), 2);
    }

    /// A term that occurs nowhere yields an empty result, not an error.
    #[test]
    fn test_search_no_results() {
        let (_temp_dir, index) = new_index("test_index");
        index
            .index_blocks("test", "test.md", &create_test_blocks(), "llms")
            .expect("Should index blocks");

        let hits = index
            .search("nonexistentterm12345", Some("test"), None, 10)
            .expect("Should search");

        assert!(
            hits.is_empty(),
            "Should find no results for non-existent term"
        );
    }

    /// Searching 100 indexed blocks stays under the 100ms budget.
    #[test]
    fn test_search_performance() {
        let (_temp_dir, index) = new_index("test_index");

        let blocks: Vec<HeadingBlock> = (0..100)
            .map(|i| HeadingBlock {
                path: vec![format!("Section{i}")],
                content: format!("This is content block {i} with various keywords like React, hooks, components, and performance testing."),
                start_line: i * 10,
                end_line: i * 10 + 5,
            })
            .collect();

        index
            .index_blocks("perftest", "large.md", &blocks, "llms")
            .expect("Should index many blocks");

        let started = Instant::now();
        let hits = index
            .search("React", Some("perftest"), None, 50)
            .expect("Should search");
        let duration = started.elapsed();

        assert!(!hits.is_empty(), "Should find results");
        assert!(
            duration.as_millis() < 100,
            "Search should be fast (<100ms), took {}ms",
            duration.as_millis()
        );
    }

    /// Hits are ordered by descending score, with the exact match on top.
    #[test]
    fn test_search_scoring() {
        let (_temp_dir, index) = new_index("test_index");

        let blocks = [
            HeadingBlock {
                path: vec!["Exact Match".into()],
                content: "React hooks".into(),
                start_line: 1,
                end_line: 5,
            },
            HeadingBlock {
                path: vec!["Partial Match".into()],
                content: "React components and hooks are useful features".into(),
                start_line: 10,
                end_line: 15,
            },
            HeadingBlock {
                path: vec!["Distant Match".into()],
                content: "In React, you can use various hooks for different purposes".into(),
                start_line: 20,
                end_line: 25,
            },
        ];

        index
            .index_blocks("test", "test.md", &blocks, "llms")
            .expect("Should index blocks");

        let hits = index
            .search("React hooks", Some("test"), None, 10)
            .expect("Should search");

        assert!(!hits.is_empty(), "Should find results");

        for pair in hits.windows(2) {
            assert!(
                pair[0].score >= pair[1].score,
                "Results should be ordered by descending score"
            );
        }

        assert!(
            hits[0].snippet.contains("React hooks"),
            "Highest scored result should contain exact match"
        );
    }

    /// The heading path round-trips and the line range starts at the block start.
    #[test]
    fn test_heading_path_in_results() {
        let (_temp_dir, index) = new_index("test_index");

        let blocks = [HeadingBlock {
            path: vec!["API".into(), "Reference".into(), "Hooks".into()],
            content: "useState hook documentation".into(),
            start_line: 100,
            end_line: 120,
        }];

        index
            .index_blocks("test", "api.md", &blocks, "llms")
            .expect("Should index blocks");

        let hits = index
            .search("useState", Some("test"), None, 10)
            .expect("Should search");

        assert!(!hits.is_empty(), "Should find results");
        let top = &hits[0];
        assert_eq!(top.heading_path, vec!["API", "Reference", "Hooks"]);
        assert_eq!(top.file, "api.md");
        assert!(
            top.lines.starts_with("100-"),
            "Expected match to start at line 100, got {}",
            top.lines
        );
    }

    /// Snippets built from non-ASCII content start and end on char boundaries.
    #[test]
    fn test_unicode_snippet_extraction() {
        let (_temp_dir, index) = new_index("test_index");

        let unicode_blocks = [
            HeadingBlock {
                path: vec!["Unicode".into(), "Emoji".into()],
                content: "This is a test with emojis: π Hello π World! π Let's go! π".into(),
                start_line: 1,
                end_line: 10,
            },
            HeadingBlock {
                path: vec!["Unicode".into(), "Chinese".into()],
                content: "θΏζ―δΈζζ΅θ―γHello δΈηοΌProgramming ηΌη¨ is εΎζθΆ£γ".into(),
                start_line: 20,
                end_line: 30,
            },
            HeadingBlock {
                path: vec!["Unicode".into(), "Mixed".into()],
                content: "ζ₯ζ¬θͺ γγΉγ π―π΅ with mixed content".into(),
                start_line: 40,
                end_line: 50,
            },
        ];

        index
            .index_blocks("unicode_test", "test.md", &unicode_blocks, "llms")
            .expect("Should index blocks");

        let test_cases = [("emoji", "π"), ("δΈζ", "ζ΅θ―"), ("programming", "ηΌη¨")];

        for (query, _expected_content) in test_cases {
            let results = index
                .search(query, Some("unicode_test"), None, 10)
                .unwrap_or_else(|_| panic!("Should search for '{query}'"));

            // A query may legitimately return nothing; only inspect real hits.
            if let Some(hit) = results.first() {
                assert!(hit.snippet.is_char_boundary(0));
                assert!(hit.snippet.is_char_boundary(hit.snippet.len()));

                let _char_count = hit.snippet.chars().count();
            }
        }
    }

    /// A match surrounded by long multi-codepoint sequences still yields a
    /// well-formed snippet that contains the match.
    #[test]
    fn test_edge_case_unicode_truncation() {
        let (_temp_dir, index) = new_index("test_index");

        let mut long_content = "π¨βπ©βπ§βπ¦".repeat(20);
        long_content.push_str(" MARKER ");
        long_content.push_str(&"π³οΈβπ".repeat(20));

        let blocks = [HeadingBlock {
            path: vec!["Test".into()],
            content: long_content,
            start_line: 1,
            end_line: 10,
        }];

        index
            .index_blocks("edge_test", "test.md", &blocks, "llms")
            .expect("Should index blocks");

        let results = index
            .search("MARKER", Some("edge_test"), None, 10)
            .expect("Should search");

        assert!(!results.is_empty());
        let snippet = &results[0].snippet;

        assert!(snippet.is_char_boundary(0));
        assert!(snippet.is_char_boundary(snippet.len()));
        assert!(snippet.contains("MARKER"));

        let char_count = snippet.chars().count();
        assert!(char_count > 0);
    }
}