use std::{
    cmp::Ordering,
    collections::{BTreeMap, HashMap},
};

use blake3::hash;
use serde::{Deserialize, Serialize};

use crate::{MemvidError, Result, types::FrameId};

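/// Bincode configuration used when encoding the lex index: fixed-width
/// integers, little-endian byte order. `decode` rebuilds the same layout with
/// an added size limit.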
fn lex_config() -> impl bincode::config::Config {
    bincode::config::standard()
        .with_fixed_int_encoding()
        .with_little_endian()
}

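// Decode size limit plus the section-chunking parameters used by
// `chunk_sections`: soft/hard character targets per section and a cap on the
// number of sections per document.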
const LEX_DECODE_LIMIT: usize = crate::MAX_INDEX_BYTES as usize;
const LEX_SECTION_SOFT_CHARS: usize = 900;
const LEX_SECTION_HARD_CHARS: usize = 1400;
const LEX_SECTION_MAX_COUNT: usize = 2048;

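/// Collects documents and serializes them into a [`LexIndexArtifact`].
///
/// A minimal sketch of the intended round trip, mirroring the unit tests at
/// the bottom of this module (not compiled here):
///
/// ```ignore
/// let mut builder = LexIndexBuilder::new();
/// builder.add_document(0, "mv2://docs/one", Some("Doc One"), "hello world", &HashMap::new());
/// let artifact = builder.finish()?;
/// let index = LexIndex::decode(&artifact.bytes)?;
/// let hits = index.search("hello", 10);
/// ```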
#[derive(Default)]
pub struct LexIndexBuilder {
    documents: Vec<LexDocument>,
}

impl LexIndexBuilder {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn add_document(
        &mut self,
        frame_id: FrameId,
        uri: &str,
        title: Option<&str>,
        content: &str,
        tags: &HashMap<String, String>,
    ) {
        let tokens = tokenize(content);
        let tags: BTreeMap<_, _> = tags.iter().map(|(k, v)| (k.clone(), v.clone())).collect();
        let mut sections = chunk_sections(content);

        let (content_owned, content_lower) = if content.is_empty() {
            (String::new(), String::new())
        } else if sections.is_empty() {
            let owned = content.to_string();
            let lower = owned.to_ascii_lowercase();
            sections.push(LexSection {
                offset: 0,
                content: owned.clone(),
                content_lower: lower.clone(),
            });
            (owned, lower)
        } else {
            (String::new(), String::new())
        };
        self.documents.push(LexDocument {
            frame_id,
            tokens,
            tags,
            content: content_owned,
            content_lower,
            uri: Some(uri.to_string()),
            title: title.map(ToString::to_string),
            sections,
        });
    }

    pub fn finish(mut self) -> Result<LexIndexArtifact> {
        for document in &mut self.documents {
            document.ensure_sections();
        }
        let bytes = bincode::serde::encode_to_vec(&self.documents, lex_config())?;
        let checksum = *hash(&bytes).as_bytes();
        Ok(LexIndexArtifact {
            bytes,
            doc_count: self.documents.len() as u64,
            checksum,
        })
    }
}

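/// Encoded lex index payload: the serialized document set, the document count,
/// and a BLAKE3 checksum of the bytes.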
#[derive(Debug, Clone)]
pub struct LexIndexArtifact {
    pub bytes: Vec<u8>,
    pub doc_count: u64,
    pub checksum: [u8; 32],
}

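/// In-memory lexical index decoded from a serialized [`LexIndexArtifact`]
/// payload.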
#[derive(Debug, Clone)]
pub struct LexIndex {
    documents: Vec<LexDocument>,
}

impl LexIndex {
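    /// Decodes an index payload, trying the current fixed-int document layout
    /// first and then two legacy layouts (fixed-int and variable-int documents
    /// without sections). A candidate decoding is only accepted if it consumes
    /// the entire byte slice.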
    pub fn decode(bytes: &[u8]) -> Result<Self> {
        let new_config = bincode::config::standard()
            .with_fixed_int_encoding()
            .with_little_endian()
            .with_limit::<LEX_DECODE_LIMIT>();
        if let Ok((documents, read)) =
            bincode::serde::decode_from_slice::<Vec<LexDocument>, _>(bytes, new_config)
        {
            if read == bytes.len() {
                return Ok(Self::from_documents(documents));
            }
        }

        let legacy_fixed = bincode::config::standard()
            .with_fixed_int_encoding()
            .with_little_endian()
            .with_limit::<LEX_DECODE_LIMIT>();
        if let Ok((legacy_docs, read)) =
            bincode::serde::decode_from_slice::<Vec<LegacyLexDocument>, _>(bytes, legacy_fixed)
        {
            if read == bytes.len() {
                let documents = legacy_docs.into_iter().map(legacy_to_current).collect();
                return Ok(Self::from_documents(documents));
            }
        }

        let legacy_config = bincode::config::standard()
            .with_little_endian()
            .with_limit::<LEX_DECODE_LIMIT>();
        if let Ok((legacy_docs, read)) =
            bincode::serde::decode_from_slice::<Vec<LegacyLexDocument>, _>(bytes, legacy_config)
        {
            if read == bytes.len() {
                let documents = legacy_docs.into_iter().map(legacy_to_current).collect();
                return Ok(Self::from_documents(documents));
            }
        }

        Err(MemvidError::InvalidToc {
            reason: "unsupported lex index encoding".into(),
        })
    }

    fn from_documents(mut documents: Vec<LexDocument>) -> Self {
        for document in &mut documents {
            document.ensure_sections();
        }
        Self { documents }
    }

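    /// Runs a lexical search and returns at most `limit` hits, each carrying up
    /// to three snippets built from a roughly 160-character window around the
    /// matched occurrences.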
    pub fn search(&self, query: &str, limit: usize) -> Vec<LexSearchHit> {
        let mut query_tokens = tokenize(query);
        query_tokens.retain(|token| !token.is_empty());
        if query_tokens.is_empty() {
            return Vec::new();
        }
        let mut matches = self.compute_matches(&query_tokens, None, None);
        matches.truncate(limit);
        matches
            .into_iter()
            .map(|m| {
                let snippets = build_snippets(&m.content, &m.occurrences, 160, 3);
                LexSearchHit {
                    frame_id: m.frame_id,
                    score: m.score,
                    match_count: m.occurrences.len(),
                    snippets,
                }
            })
            .collect()
    }

    pub(crate) fn documents_mut(&mut self) -> &mut [LexDocument] {
        &mut self.documents
    }

    pub(crate) fn remove_document(&mut self, frame_id: FrameId) {
        self.documents.retain(|doc| doc.frame_id != frame_id);
    }

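    /// Scores each section of each document against the query tokens, applying
    /// the optional URI or scope filters. Single-token queries count substring
    /// occurrences; multi-token queries require every token to appear in the
    /// section. An exact phrase match adds a large score bonus, and the hits
    /// are returned sorted by descending score.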
    pub(crate) fn compute_matches(
        &self,
        query_tokens: &[String],
        uri_filter: Option<&str>,
        scope_filter: Option<&str>,
    ) -> Vec<LexMatch> {
        if query_tokens.is_empty() {
            return Vec::new();
        }

        let mut hits = Vec::new();
        let phrase = query_tokens.join(" ");
        for document in &self.documents {
            if let Some(uri) = uri_filter {
                if !uri_matches(document.uri.as_deref(), uri) {
                    continue;
                }
            } else if let Some(scope) = scope_filter {
                match document.uri.as_deref() {
                    Some(candidate) if candidate.starts_with(scope) => {}
                    _ => continue,
                }
            }

            if document.sections.is_empty() {
                continue;
            }

            for section in &document.sections {
                let haystack = section.content_lower.as_str();
                if haystack.is_empty() {
                    continue;
                }

                let mut occurrences: Vec<(usize, usize)> = Vec::new();

                if query_tokens.len() == 1 {
                    let needle = &query_tokens[0];
                    if needle.is_empty() {
                        continue;
                    }
                    let mut start = 0usize;
                    while let Some(idx) = haystack[start..].find(needle) {
                        let local_start = start + idx;
                        let local_end = local_start + needle.len();
                        occurrences.push((local_start, local_end));
                        start = local_end;
                    }
                } else {
                    let mut all_occurrences = Vec::new();
                    let mut all_present = true;
                    for needle in query_tokens {
                        if needle.is_empty() {
                            all_present = false;
                            break;
                        }
                        let mut start = 0usize;
                        let mut found_for_token = false;
                        while let Some(idx) = haystack[start..].find(needle) {
                            found_for_token = true;
                            let local_start = start + idx;
                            let local_end = local_start + needle.len();
                            all_occurrences.push((local_start, local_end));
                            start = local_end;
                        }
                        if !found_for_token {
                            all_present = false;
                            break;
                        }
                    }
                    if !all_present {
                        continue;
                    }
                    occurrences = all_occurrences;
                }

                if occurrences.is_empty() {
                    continue;
                }

                occurrences.sort_by_key(|(start, _)| *start);
                let mut score = occurrences.len() as f32;
                if !phrase.is_empty() && section.content_lower.contains(&phrase) {
                    score += 1000.0;
                }
                hits.push(LexMatch {
                    frame_id: document.frame_id,
                    score,
                    occurrences,
                    content: section.content.clone(),
                    uri: document.uri.clone(),
                    title: document.title.clone(),
                    chunk_offset: section.offset,
                });
            }
        }

        hits.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(Ordering::Equal));
        hits
    }
}

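/// Matches a candidate document URI against an expected value: exact
/// (ASCII-case-insensitive) when the expectation contains a `#` fragment,
/// otherwise a case-insensitive prefix match.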
fn uri_matches(candidate: Option<&str>, expected: &str) -> bool {
    let Some(uri) = candidate else {
        return false;
    };
    if expected.contains('#') {
        uri.eq_ignore_ascii_case(expected)
    } else {
        let expected_lower = expected.to_ascii_lowercase();
        let candidate_lower = uri.to_ascii_lowercase();
        candidate_lower.starts_with(&expected_lower)
    }
}

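/// Stored form of a single indexed document; `sections` holds the chunked
/// content that `compute_matches` actually scans.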
#[derive(Debug, Clone, Serialize, Deserialize)]
pub(crate) struct LexDocument {
    pub(crate) frame_id: FrameId,
    tokens: Vec<String>,
    tags: BTreeMap<String, String>,
    #[serde(default)]
    content: String,
    #[serde(default)]
    pub(crate) content_lower: String,
    #[serde(default)]
    pub(crate) uri: Option<String>,
    #[serde(default)]
    pub(crate) title: Option<String>,
    #[serde(default)]
    sections: Vec<LexSection>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub(crate) struct LexSection {
    pub(crate) offset: usize,
    #[serde(default)]
    pub(crate) content: String,
    #[serde(default)]
    pub(crate) content_lower: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
struct LegacyLexDocument {
    frame_id: FrameId,
    tokens: Vec<String>,
    tags: BTreeMap<String, String>,
    #[serde(default)]
    content: Option<String>,
    #[serde(default)]
    uri: Option<String>,
    #[serde(default)]
    title: Option<String>,
}

impl LexDocument {
    fn ensure_sections(&mut self) {
        if !self.sections.is_empty() {
            return;
        }

        if self.content.is_empty() {
            return;
        }

        if self.content_lower.is_empty() {
            self.content_lower = self.content.to_ascii_lowercase();
        }

        self.sections.push(LexSection {
            offset: 0,
            content: self.content.clone(),
            content_lower: self.content_lower.clone(),
        });
    }
}

fn legacy_to_current(legacy: LegacyLexDocument) -> LexDocument {
    let content = legacy.content.unwrap_or_default();
    let content_lower = content.to_ascii_lowercase();
    let sections = if content.is_empty() {
        Vec::new()
    } else {
        vec![LexSection {
            offset: 0,
            content: content.clone(),
            content_lower: content_lower.clone(),
        }]
    };
    LexDocument {
        frame_id: legacy.frame_id,
        tokens: legacy.tokens,
        tags: legacy.tags,
        content,
        content_lower,
        uri: legacy.uri,
        title: legacy.title,
        sections,
    }
}

#[derive(Debug, Clone)]
pub struct LexSearchHit {
    pub frame_id: FrameId,
    pub score: f32,
    pub match_count: usize,
    pub snippets: Vec<String>,
}

#[derive(Debug, Clone)]
pub(crate) struct LexMatch {
    pub frame_id: FrameId,
    pub score: f32,
    pub occurrences: Vec<(usize, usize)>,
    pub content: String,
    pub uri: Option<String>,
    pub title: Option<String>,
    pub chunk_offset: usize,
}

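/// Splits the input on non-token characters, keeps pieces that contain at
/// least one alphanumeric character, and lowercases them. `&`, `@`, `+`, `/`,
/// and `_` count as token characters so strings like "n&m" survive intact.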
fn tokenize(input: &str) -> Vec<String> {
    input
        .split(|c: char| !is_token_char(c))
        .filter_map(|token| {
            if token.chars().any(|ch| ch.is_alphanumeric()) {
                Some(token.to_lowercase())
            } else {
                None
            }
        })
        .collect()
}

fn is_token_char(ch: char) -> bool {
    ch.is_alphanumeric() || matches!(ch, '&' | '@' | '+' | '/' | '_')
}

fn build_snippets(
    content: &str,
    occurrences: &[(usize, usize)],
    window: usize,
    max_snippets: usize,
) -> Vec<String> {
    compute_snippet_slices(content, occurrences, window, max_snippets)
        .into_iter()
        .map(|(start, end)| content[start..end].replace('\n', " "))
        .collect()
}

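/// Splits content into sections for indexing. Short content becomes a single
/// section; longer content is cut once a section reaches
/// `LEX_SECTION_HARD_CHARS`, preferring the most recent soft boundary
/// (sentence end or newline) as the split point, with at most
/// `LEX_SECTION_MAX_COUNT` sections per document (the final section absorbs
/// any remainder).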
fn chunk_sections(content: &str) -> Vec<LexSection> {
    if content.is_empty() {
        return Vec::new();
    }

    if content.len() <= LEX_SECTION_HARD_CHARS {
        return vec![LexSection {
            offset: 0,
            content: content.to_string(),
            content_lower: content.to_ascii_lowercase(),
        }];
    }

    let mut sections: Vec<LexSection> = Vec::new();
    let mut chunk_start = 0usize;
    let mut last_soft_break = None;
    let mut iter = content.char_indices().peekable();

    while let Some((idx, ch)) = iter.next() {
        let char_end = idx + ch.len_utf8();
        let current_len = char_end.saturating_sub(chunk_start);
        let next_char = iter.peek().map(|(_, next)| *next);

        if is_soft_boundary(ch, next_char) {
            last_soft_break = Some(char_end);
            if current_len < LEX_SECTION_SOFT_CHARS {
                continue;
            }
        }

        if current_len < LEX_SECTION_HARD_CHARS {
            continue;
        }

        let mut split_at = last_soft_break.unwrap_or(char_end);
        if split_at <= chunk_start {
            split_at = char_end;
        }

        push_section(&mut sections, content, chunk_start, split_at);
        chunk_start = split_at;
        last_soft_break = None;

        if sections.len() >= LEX_SECTION_MAX_COUNT {
            break;
        }
    }

    if chunk_start < content.len() {
        if sections.len() >= LEX_SECTION_MAX_COUNT {
            if let Some(last) = sections.last_mut() {
                let slice = &content[last.offset..];
                last.content = slice.to_string();
                last.content_lower = slice.to_ascii_lowercase();
            }
        } else {
            push_section(&mut sections, content, chunk_start, content.len());
        }
    }

    if sections.is_empty() {
        sections.push(LexSection {
            offset: 0,
            content: content.to_string(),
            content_lower: content.to_ascii_lowercase(),
        });
    }

    sections
}

fn push_section(sections: &mut Vec<LexSection>, content: &str, start: usize, end: usize) {
    if end <= start {
        return;
    }

    let slice = &content[start..end];
    sections.push(LexSection {
        offset: start,
        content: slice.to_string(),
        content_lower: slice.to_ascii_lowercase(),
    });
}

fn is_soft_boundary(ch: char, next: Option<char>) -> bool {
    match ch {
        '.' | '!' | '?' => next.map_or(true, |n| n.is_whitespace()),
        '\n' => true,
        _ => false,
    }
}

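/// Computes byte ranges to excerpt around match occurrences: each range is
/// centred on an occurrence, widened toward nearby sentence boundaries,
/// clamped to char boundaries, and merged with the previous range when they
/// sit within 20 bytes of each other, up to `max_snippets` ranges. With no
/// occurrences, a single leading window is returned.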
pub(crate) fn compute_snippet_slices(
    content: &str,
    occurrences: &[(usize, usize)],
    window: usize,
    max_snippets: usize,
) -> Vec<(usize, usize)> {
    if content.is_empty() {
        return Vec::new();
    }

    if occurrences.is_empty() {
        let end = advance_boundary(content, 0, window);
        return vec![(0, end)];
    }

    let mut merged: Vec<(usize, usize)> = Vec::new();
    for &(start, end) in occurrences {
        let mut snippet_start = start.saturating_sub(window / 2);
        let mut snippet_end = (end + window / 2).min(content.len());

        if let Some(adj) = sentence_start_before(content, snippet_start) {
            snippet_start = adj;
        }
        if let Some(adj) = sentence_end_after(content, snippet_end) {
            snippet_end = adj;
        }

        snippet_start = prev_char_boundary(content, snippet_start);
        snippet_end = next_char_boundary(content, snippet_end);

        if snippet_end <= snippet_start {
            continue;
        }

        if let Some(last) = merged.last_mut() {
            if snippet_start <= last.1 + 20 {
                last.1 = last.1.max(snippet_end);
                continue;
            }
        }

        merged.push((
            snippet_start.min(content.len()),
            snippet_end.min(content.len()),
        ));
        if merged.len() >= max_snippets {
            break;
        }
    }

    if merged.is_empty() {
        let end = advance_boundary(content, 0, window);
        merged.push((0, end));
    }

    merged
}

fn sentence_start_before(content: &str, idx: usize) -> Option<usize> {
    if idx == 0 {
        return Some(0);
    }
    let mut idx = idx.min(content.len());
    idx = prev_char_boundary(content, idx);
    let mut candidate = None;
    for (pos, ch) in content[..idx].char_indices() {
        if matches!(ch, '.' | '!' | '?' | '\n') {
            candidate = Some(pos + ch.len_utf8());
        }
    }
    candidate.map(|pos| {
        let mut pos = next_char_boundary(content, pos);
        while pos < content.len() && content.as_bytes()[pos].is_ascii_whitespace() {
            pos += 1;
        }
        prev_char_boundary(content, pos)
    })
}

fn sentence_end_after(content: &str, idx: usize) -> Option<usize> {
    if idx >= content.len() {
        return Some(content.len());
    }
    let mut idx = idx;
    idx = prev_char_boundary(content, idx);
    for (offset, ch) in content[idx..].char_indices() {
        let global = idx + offset;
        if matches!(ch, '.' | '!' | '?') {
            return Some(next_char_boundary(content, global + ch.len_utf8()));
        }
        if ch == '\n' {
            return Some(global);
        }
    }
    None
}

fn prev_char_boundary(content: &str, mut idx: usize) -> usize {
    if idx > content.len() {
        idx = content.len();
    }
    while idx > 0 && !content.is_char_boundary(idx) {
        idx -= 1;
    }
    idx
}

fn next_char_boundary(content: &str, mut idx: usize) -> usize {
    if idx > content.len() {
        idx = content.len();
    }
    while idx < content.len() && !content.is_char_boundary(idx) {
        idx += 1;
    }
    idx
}

fn advance_boundary(content: &str, start: usize, mut window: usize) -> usize {
    if start >= content.len() {
        return content.len();
    }
    let mut last = content.len();
    for (offset, _) in content[start..].char_indices() {
        if window == 0 {
            return start + offset;
        }
        last = start + offset;
        window -= 1;
    }
    content.len().max(last)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn builder_produces_artifact() {
        let mut builder = LexIndexBuilder::new();
        let mut tags = HashMap::new();
        tags.insert("source".into(), "test".into());
        builder.add_document(0, "mv2://docs/one", Some("Doc One"), "hello world", &tags);
        builder.add_document(
            1,
            "mv2://docs/two",
            Some("Doc Two"),
            "rust systems",
            &HashMap::new(),
        );

        let artifact = builder.finish().expect("finish");
        assert_eq!(artifact.doc_count, 2);
        assert!(!artifact.bytes.is_empty());

        let index = LexIndex::decode(&artifact.bytes).expect("decode");
        let hits = index.search("rust", 10);
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].frame_id, 1);
        assert!(hits[0].match_count >= 1);
        assert!(!hits[0].snippets.is_empty());
    }

    #[test]
    fn tokenizer_lowercases_and_filters() {
        let tokens = tokenize("Hello, Rust-lang!");
        assert_eq!(tokens, vec!["hello", "rust", "lang"]);
    }
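
    // The following tests are additions: small checks of the section chunker,
    // the URI filter, and snippet merging, written against the helpers above.
    #[test]
    fn chunk_sections_splits_long_plain_content() {
        // With no soft boundaries, splits land exactly at the hard limit.
        let content = "a".repeat(LEX_SECTION_HARD_CHARS * 2 + 200);
        let sections = chunk_sections(&content);
        assert_eq!(sections.len(), 3);
        assert_eq!(sections[0].offset, 0);
        assert_eq!(sections[1].offset, LEX_SECTION_HARD_CHARS);
        let rebuilt: String = sections.iter().map(|s| s.content.as_str()).collect();
        assert_eq!(rebuilt, content);
    }

    #[test]
    fn uri_matches_prefix_and_fragment() {
        // Without a fragment the expectation acts as a case-insensitive prefix.
        assert!(uri_matches(Some("MV2://Docs/One#s1"), "mv2://docs/one"));
        // With a fragment the match must be exact (ignoring ASCII case).
        assert!(uri_matches(Some("mv2://docs/one#s1"), "mv2://docs/one#s1"));
        assert!(!uri_matches(Some("mv2://docs/one"), "mv2://docs/one#s1"));
        assert!(!uri_matches(None, "mv2://docs/one"));
    }

    #[test]
    fn snippet_slices_merge_nearby_occurrences() {
        let content = "alpha beta gamma";
        // Two nearby occurrences collapse into a single slice.
        let slices = compute_snippet_slices(content, &[(0, 5), (6, 10)], 160, 3);
        assert_eq!(slices, vec![(0, content.len())]);
        // No occurrences falls back to a leading window.
        assert_eq!(
            compute_snippet_slices(content, &[], 160, 3),
            vec![(0, content.len())]
        );
    }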

    #[test]
    fn tokenizer_retains_connector_characters() {
        let tokens = tokenize("N&M EXPRESS LLC @ 2024");
        assert_eq!(tokens, vec!["n&m", "express", "llc", "2024"]);
    }
}
695}