// memvid_core/lex.rs

use std::{
    cmp::Ordering,
    collections::{BTreeMap, HashMap},
};

use blake3::hash;
use serde::{Deserialize, Serialize};

use crate::{MemvidError, Result, types::FrameId};

// Bincode configuration reused for deterministic layout: fixed-width integers and
// little-endian byte order keep the encoded bytes stable across platforms.
fn lex_config() -> impl bincode::config::Config {
    bincode::config::standard()
        .with_fixed_int_encoding()
        .with_little_endian()
}

/// Byte limit enforced when decoding a persisted lexical index.
const LEX_DECODE_LIMIT: usize = crate::MAX_INDEX_BYTES as usize;
/// Soft target length for a section (see `chunk_sections`).
const LEX_SECTION_SOFT_CHARS: usize = 900;
/// Maximum length a section may reach before it is split.
const LEX_SECTION_HARD_CHARS: usize = 1400;
/// Maximum number of sections retained per document; remaining text is folded into the last one.
const LEX_SECTION_MAX_COUNT: usize = 2048;

/// Intermediate builder that collects documents prior to serialisation.
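///
/// Illustrative usage, mirroring the unit tests below (a sketch, not a compiled
/// doctest; assumes the crate re-exports these types):
///
/// ```ignore
/// let mut builder = LexIndexBuilder::new();
/// builder.add_document(0, "mv2://docs/one", Some("Doc One"), "hello world", &HashMap::new());
/// let artifact = builder.finish()?;
/// assert_eq!(artifact.doc_count, 1);
/// ```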
#[derive(Default)]
pub struct LexIndexBuilder {
    documents: Vec<LexDocument>,
}

impl LexIndexBuilder {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn add_document(
        &mut self,
        frame_id: FrameId,
        uri: &str,
        title: Option<&str>,
        content: &str,
        tags: &HashMap<String, String>,
    ) {
        let tokens = tokenize(content);
        // Convert HashMap to BTreeMap for deterministic serialization.
        let tags: BTreeMap<_, _> = tags.iter().map(|(k, v)| (k.clone(), v.clone())).collect();
        let mut sections = chunk_sections(content);

        // When chunking produced sections, the text lives only in the sections and the
        // document-level `content` fields stay empty to avoid duplicating it. The middle
        // branch is a defensive fallback for non-empty content with no sections.
        let (content_owned, content_lower) = if content.is_empty() {
            (String::new(), String::new())
        } else if sections.is_empty() {
            let owned = content.to_string();
            let lower = owned.to_ascii_lowercase();
            sections.push(LexSection {
                offset: 0,
                content: owned.clone(),
                content_lower: lower.clone(),
            });
            (owned, lower)
        } else {
            (String::new(), String::new())
        };
        self.documents.push(LexDocument {
            frame_id,
            tokens,
            tags,
            content: content_owned,
            content_lower,
            uri: Some(uri.to_string()),
            title: title.map(ToString::to_string),
            sections,
        });
    }

    pub fn finish(mut self) -> Result<LexIndexArtifact> {
        for document in &mut self.documents {
            document.ensure_sections();
        }
        let bytes = bincode::serde::encode_to_vec(&self.documents, lex_config())?;
        let checksum = *hash(&bytes).as_bytes();
        Ok(LexIndexArtifact {
            bytes,
            doc_count: self.documents.len() as u64,
            checksum,
        })
    }
}

/// Serialized lexical index artifact ready to be embedded in the `.mv2` file.
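///
/// `checksum` is the BLAKE3 hash of `bytes`, so a consumer can re-verify the payload
/// before embedding it (illustrative sketch, not a compiled doctest):
///
/// ```ignore
/// assert_eq!(artifact.checksum, *blake3::hash(&artifact.bytes).as_bytes());
/// ```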
#[derive(Debug, Clone)]
pub struct LexIndexArtifact {
    pub bytes: Vec<u8>,
    pub doc_count: u64,
    pub checksum: [u8; 32],
}

/// Read-only lexical index decoded from persisted bytes.
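///
/// Illustrative round trip (a sketch, not a compiled doctest; `artifact` is assumed to
/// come from `LexIndexBuilder::finish`):
///
/// ```ignore
/// let index = LexIndex::decode(&artifact.bytes)?;
/// let hits = index.search("rust", 10);
/// for hit in &hits {
///     println!("frame {} scored {}", hit.frame_id, hit.score);
/// }
/// ```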
#[derive(Debug, Clone)]
pub struct LexIndex {
    documents: Vec<LexDocument>,
}

impl LexIndex {
    pub fn decode(bytes: &[u8]) -> Result<Self> {
        // A decode attempt only counts if it consumed the entire payload.
        // Current format: sectioned documents, fixed-int little-endian encoding.
        let new_config = bincode::config::standard()
            .with_fixed_int_encoding()
            .with_little_endian()
            .with_limit::<LEX_DECODE_LIMIT>();
        if let Ok((documents, read)) =
            bincode::serde::decode_from_slice::<Vec<LexDocument>, _>(bytes, new_config)
        {
            if read == bytes.len() {
                return Ok(Self::from_documents(documents));
            }
        }

        // Legacy documents (no sections), fixed-int encoding.
        let legacy_fixed = bincode::config::standard()
            .with_fixed_int_encoding()
            .with_little_endian()
            .with_limit::<LEX_DECODE_LIMIT>();
        if let Ok((legacy_docs, read)) =
            bincode::serde::decode_from_slice::<Vec<LegacyLexDocument>, _>(bytes, legacy_fixed)
        {
            if read == bytes.len() {
                let documents = legacy_docs.into_iter().map(legacy_to_current).collect();
                return Ok(Self::from_documents(documents));
            }
        }

        // Oldest layout: legacy documents with variable-int encoding.
        let legacy_config = bincode::config::standard()
            .with_little_endian()
            .with_limit::<LEX_DECODE_LIMIT>();
        if let Ok((legacy_docs, read)) =
            bincode::serde::decode_from_slice::<Vec<LegacyLexDocument>, _>(bytes, legacy_config)
        {
            if read == bytes.len() {
                let documents = legacy_docs.into_iter().map(legacy_to_current).collect();
                return Ok(Self::from_documents(documents));
            }
        }

        Err(MemvidError::InvalidToc {
            reason: "unsupported lex index encoding".into(),
        })
    }

    fn from_documents(mut documents: Vec<LexDocument>) -> Self {
        for document in &mut documents {
            document.ensure_sections();
        }
        Self { documents }
    }

    pub fn search(&self, query: &str, limit: usize) -> Vec<LexSearchHit> {
        let mut query_tokens = tokenize(query);
        query_tokens.retain(|token| !token.is_empty());
        if query_tokens.is_empty() {
            return Vec::new();
        }
        let mut matches = self.compute_matches(&query_tokens, None, None);
        matches.truncate(limit);
        matches
            .into_iter()
            .map(|m| {
                // Up to three snippets of roughly 160 characters per hit.
                let snippets = build_snippets(&m.content, &m.occurrences, 160, 3);
                LexSearchHit {
                    frame_id: m.frame_id,
                    score: m.score,
                    match_count: m.occurrences.len(),
                    snippets,
                }
            })
            .collect()
    }

    pub(crate) fn documents_mut(&mut self) -> &mut [LexDocument] {
        &mut self.documents
    }

    pub(crate) fn remove_document(&mut self, frame_id: FrameId) {
        self.documents.retain(|doc| doc.frame_id != frame_id);
    }

    pub(crate) fn compute_matches(
        &self,
        query_tokens: &[String],
        uri_filter: Option<&str>,
        scope_filter: Option<&str>,
    ) -> Vec<LexMatch> {
        if query_tokens.is_empty() {
            return Vec::new();
        }

        let mut hits = Vec::new();
        let phrase = query_tokens.join(" ");
        for document in &self.documents {
            if let Some(uri) = uri_filter {
                if !uri_matches(document.uri.as_deref(), uri) {
                    continue;
                }
            } else if let Some(scope) = scope_filter {
                match document.uri.as_deref() {
                    Some(candidate) if candidate.starts_with(scope) => {}
                    _ => continue,
                }
            }

            if document.sections.is_empty() {
                continue;
            }

            for section in &document.sections {
                let haystack = section.content_lower.as_str();
                if haystack.is_empty() {
                    continue;
                }

                let mut occurrences: Vec<(usize, usize)> = Vec::new();

                if query_tokens.len() == 1 {
                    // Single-token query: record every occurrence of the token.
                    let needle = &query_tokens[0];
                    if needle.is_empty() {
                        continue;
                    }
                    let mut start = 0usize;
                    while let Some(idx) = haystack[start..].find(needle) {
                        let local_start = start + idx;
                        let local_end = local_start + needle.len();
                        occurrences.push((local_start, local_end));
                        start = local_end;
                    }
                } else {
                    // Multi-token query: every token must appear in the section (AND
                    // semantics); occurrences of all tokens are pooled for scoring.
                    let mut all_occurrences = Vec::new();
                    let mut all_present = true;
                    for needle in query_tokens {
                        if needle.is_empty() {
                            all_present = false;
                            break;
                        }
                        let mut start = 0usize;
                        let mut found_for_token = false;
                        while let Some(idx) = haystack[start..].find(needle) {
                            found_for_token = true;
                            let local_start = start + idx;
                            let local_end = local_start + needle.len();
                            all_occurrences.push((local_start, local_end));
                            start = local_end;
                        }
                        if !found_for_token {
                            all_present = false;
                            break;
                        }
                    }
                    if !all_present {
                        continue;
                    }
                    occurrences = all_occurrences;
                }

                if occurrences.is_empty() {
                    continue;
                }

                occurrences.sort_by_key(|(start, _)| *start);
                // One point per occurrence, plus a large bonus when the section contains
                // the query as an exact phrase so phrase matches outrank scattered hits.
                let mut score = occurrences.len() as f32;
                if !phrase.is_empty() && section.content_lower.contains(&phrase) {
                    score += 1000.0;
                }
                hits.push(LexMatch {
                    frame_id: document.frame_id,
                    score,
                    occurrences,
                    content: section.content.clone(),
                    uri: document.uri.clone(),
                    title: document.title.clone(),
                    chunk_offset: section.offset,
                });
            }
        }

        hits.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(Ordering::Equal));

        // Deduplicate by frame_id, keeping the highest-scoring match for each frame.
        // This prevents the same document from appearing multiple times when it has
        // multiple sections that match the query.
        let mut seen_frames: std::collections::HashSet<FrameId> = std::collections::HashSet::new();
        let mut deduped = Vec::with_capacity(hits.len());
        for hit in hits {
            if seen_frames.insert(hit.frame_id) {
                deduped.push(hit);
            }
        }
        deduped
    }
}

// Fragment-qualified URIs (containing '#') require an exact case-insensitive match;
// otherwise a case-insensitive prefix match is used.
fn uri_matches(candidate: Option<&str>, expected: &str) -> bool {
    let Some(uri) = candidate else {
        return false;
    };
    if expected.contains('#') {
        uri.eq_ignore_ascii_case(expected)
    } else {
        let expected_lower = expected.to_ascii_lowercase();
        let candidate_lower = uri.to_ascii_lowercase();
        candidate_lower.starts_with(&expected_lower)
    }
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub(crate) struct LexDocument {
    pub(crate) frame_id: FrameId,
    tokens: Vec<String>,
    tags: BTreeMap<String, String>,
    #[serde(default)]
    content: String,
    #[serde(default)]
    pub(crate) content_lower: String,
    #[serde(default)]
    pub(crate) uri: Option<String>,
    #[serde(default)]
    pub(crate) title: Option<String>,
    #[serde(default)]
    sections: Vec<LexSection>,
}

/// A contiguous slice of a document's content, indexed for substring search.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub(crate) struct LexSection {
    pub(crate) offset: usize,
    #[serde(default)]
    pub(crate) content: String,
    #[serde(default)]
    pub(crate) content_lower: String,
}

/// Document layout used by older indexes, retained so existing files remain decodable.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct LegacyLexDocument {
    frame_id: FrameId,
    tokens: Vec<String>,
    tags: BTreeMap<String, String>,
    #[serde(default)]
    content: Option<String>,
    #[serde(default)]
    uri: Option<String>,
    #[serde(default)]
    title: Option<String>,
}

impl LexDocument {
    /// Guarantee at least one section when the document carries whole-document
    /// content (e.g. records decoded from a legacy layout).
    fn ensure_sections(&mut self) {
        if !self.sections.is_empty() {
            return;
        }

        if self.content.is_empty() {
            return;
        }

        if self.content_lower.is_empty() {
            self.content_lower = self.content.to_ascii_lowercase();
        }

        self.sections.push(LexSection {
            offset: 0,
            content: self.content.clone(),
            content_lower: self.content_lower.clone(),
        });
    }
}

fn legacy_to_current(legacy: LegacyLexDocument) -> LexDocument {
    let content = legacy.content.unwrap_or_default();
    let content_lower = content.to_ascii_lowercase();
    let sections = if content.is_empty() {
        Vec::new()
    } else {
        vec![LexSection {
            offset: 0,
            content: content.clone(),
            content_lower: content_lower.clone(),
        }]
    };
    LexDocument {
        frame_id: legacy.frame_id,
        tokens: legacy.tokens,
        tags: legacy.tags,
        content,
        content_lower,
        uri: legacy.uri,
        title: legacy.title,
        sections,
    }
}

#[derive(Debug, Clone)]
pub struct LexSearchHit {
    pub frame_id: FrameId,
    pub score: f32,
    pub match_count: usize,
    pub snippets: Vec<String>,
}

#[derive(Debug, Clone)]
pub(crate) struct LexMatch {
    pub frame_id: FrameId,
    pub score: f32,
    pub occurrences: Vec<(usize, usize)>,
    pub content: String,
    pub uri: Option<String>,
    pub title: Option<String>,
    pub chunk_offset: usize,
}

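/// Split `input` on any character that is not alphanumeric or one of `& @ + / _`,
/// lowercase the pieces, and keep only those containing at least one alphanumeric
/// character (so bare connectors such as a lone `@` are dropped).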
fn tokenize(input: &str) -> Vec<String> {
    input
        .split(|c: char| !is_token_char(c))
        .filter_map(|token| {
            if token.chars().any(|ch| ch.is_alphanumeric()) {
                Some(token.to_lowercase())
            } else {
                None
            }
        })
        .collect()
}

fn is_token_char(ch: char) -> bool {
    ch.is_alphanumeric() || matches!(ch, '&' | '@' | '+' | '/' | '_')
}

fn build_snippets(
    content: &str,
    occurrences: &[(usize, usize)],
    window: usize,
    max_snippets: usize,
) -> Vec<String> {
    compute_snippet_slices(content, occurrences, window, max_snippets)
        .into_iter()
        .map(|(start, end)| content[start..end].replace('\n', " "))
        .collect()
}

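/// Split `content` into sections for indexing. A section is split once it reaches
/// `LEX_SECTION_HARD_CHARS`, preferring the most recent soft boundary (sentence-ending
/// punctuation followed by whitespace, or a newline) as the split point. At most
/// `LEX_SECTION_MAX_COUNT` sections are kept; any remaining text is folded into the
/// last section.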
fn chunk_sections(content: &str) -> Vec<LexSection> {
    if content.is_empty() {
        return Vec::new();
    }

    if content.len() <= LEX_SECTION_HARD_CHARS {
        return vec![LexSection {
            offset: 0,
            content: content.to_string(),
            content_lower: content.to_ascii_lowercase(),
        }];
    }

    let mut sections: Vec<LexSection> = Vec::new();
    let mut chunk_start = 0usize;
    let mut last_soft_break = None;
    let mut iter = content.char_indices().peekable();

    while let Some((idx, ch)) = iter.next() {
        let char_end = idx + ch.len_utf8();
        let current_len = char_end.saturating_sub(chunk_start);
        let next_char = iter.peek().map(|(_, next)| *next);

        if is_soft_boundary(ch, next_char) {
            last_soft_break = Some(char_end);
            if current_len < LEX_SECTION_SOFT_CHARS {
                continue;
            }
        }

        if current_len < LEX_SECTION_HARD_CHARS {
            continue;
        }

        // Past the hard limit: split at the last soft boundary, or right here if none.
        let mut split_at = last_soft_break.unwrap_or(char_end);
        if split_at <= chunk_start {
            split_at = char_end;
        }

        push_section(&mut sections, content, chunk_start, split_at);
        chunk_start = split_at;
        last_soft_break = None;

        if sections.len() >= LEX_SECTION_MAX_COUNT {
            break;
        }
    }

    if chunk_start < content.len() {
        if sections.len() >= LEX_SECTION_MAX_COUNT {
            // Section budget exhausted: fold the remainder into the last section.
            if let Some(last) = sections.last_mut() {
                let slice = &content[last.offset..];
                last.content = slice.to_string();
                last.content_lower = slice.to_ascii_lowercase();
            }
        } else {
            push_section(&mut sections, content, chunk_start, content.len());
        }
    }

    if sections.is_empty() {
        sections.push(LexSection {
            offset: 0,
            content: content.to_string(),
            content_lower: content.to_ascii_lowercase(),
        });
    }

    sections
}

fn push_section(sections: &mut Vec<LexSection>, content: &str, start: usize, end: usize) {
    if end <= start {
        return;
    }

    let slice = &content[start..end];
    sections.push(LexSection {
        offset: start,
        content: slice.to_string(),
        content_lower: slice.to_ascii_lowercase(),
    });
}

fn is_soft_boundary(ch: char, next: Option<char>) -> bool {
    match ch {
        '.' | '!' | '?' => next.map_or(true, |n| n.is_whitespace()),
        '\n' => true,
        _ => false,
    }
}

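/// Compute byte ranges of `content` to show as snippets. Each occurrence is widened by
/// roughly `window / 2` on either side, snapped to sentence and UTF-8 character
/// boundaries, and ranges starting within 20 bytes of the previous range's end are
/// merged. At most `max_snippets` ranges are returned; with no occurrences, the leading
/// `window` characters are used.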
pub(crate) fn compute_snippet_slices(
    content: &str,
    occurrences: &[(usize, usize)],
    window: usize,
    max_snippets: usize,
) -> Vec<(usize, usize)> {
    if content.is_empty() {
        return Vec::new();
    }

    if occurrences.is_empty() {
        let end = advance_boundary(content, 0, window);
        return vec![(0, end)];
    }

    let mut merged: Vec<(usize, usize)> = Vec::new();
    for &(start, end) in occurrences {
        let mut snippet_start = start.saturating_sub(window / 2);
        let mut snippet_end = (end + window / 2).min(content.len());

        if let Some(adj) = sentence_start_before(content, snippet_start) {
            snippet_start = adj;
        }
        if let Some(adj) = sentence_end_after(content, snippet_end) {
            snippet_end = adj;
        }

        snippet_start = prev_char_boundary(content, snippet_start);
        snippet_end = next_char_boundary(content, snippet_end);

        if snippet_end <= snippet_start {
            continue;
        }

        if let Some(last) = merged.last_mut() {
            if snippet_start <= last.1 + 20 {
                last.1 = last.1.max(snippet_end);
                continue;
            }
        }

        merged.push((
            snippet_start.min(content.len()),
            snippet_end.min(content.len()),
        ));
        if merged.len() >= max_snippets {
            break;
        }
    }

    if merged.is_empty() {
        let end = advance_boundary(content, 0, window);
        merged.push((0, end));
    }

    merged
}

fn sentence_start_before(content: &str, idx: usize) -> Option<usize> {
    if idx == 0 {
        return Some(0);
    }
    let mut idx = idx.min(content.len());
    idx = prev_char_boundary(content, idx);
    let mut candidate = None;
    for (pos, ch) in content[..idx].char_indices() {
        if matches!(ch, '.' | '!' | '?' | '\n') {
            candidate = Some(pos + ch.len_utf8());
        }
    }
    candidate.map(|pos| {
        let mut pos = next_char_boundary(content, pos);
        while pos < content.len() && content.as_bytes()[pos].is_ascii_whitespace() {
            pos += 1;
        }
        prev_char_boundary(content, pos)
    })
}

fn sentence_end_after(content: &str, idx: usize) -> Option<usize> {
    if idx >= content.len() {
        return Some(content.len());
    }
    let mut idx = idx;
    idx = prev_char_boundary(content, idx);
    for (offset, ch) in content[idx..].char_indices() {
        let global = idx + offset;
        if matches!(ch, '.' | '!' | '?') {
            return Some(next_char_boundary(content, global + ch.len_utf8()));
        }
        if ch == '\n' {
            return Some(global);
        }
    }
    None
}

fn prev_char_boundary(content: &str, mut idx: usize) -> usize {
    if idx > content.len() {
        idx = content.len();
    }
    while idx > 0 && !content.is_char_boundary(idx) {
        idx -= 1;
    }
    idx
}

fn next_char_boundary(content: &str, mut idx: usize) -> usize {
    if idx > content.len() {
        idx = content.len();
    }
    while idx < content.len() && !content.is_char_boundary(idx) {
        idx += 1;
    }
    idx
}

fn advance_boundary(content: &str, start: usize, mut window: usize) -> usize {
    if start >= content.len() {
        return content.len();
    }
    let mut last = content.len();
    for (offset, _) in content[start..].char_indices() {
        if window == 0 {
            return start + offset;
        }
        last = start + offset;
        window -= 1;
    }
    content.len().max(last)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn builder_produces_artifact() {
        let mut builder = LexIndexBuilder::new();
        let mut tags = HashMap::new();
        tags.insert("source".into(), "test".into());
        builder.add_document(0, "mv2://docs/one", Some("Doc One"), "hello world", &tags);
        builder.add_document(
            1,
            "mv2://docs/two",
            Some("Doc Two"),
            "rust systems",
            &HashMap::new(),
        );

        let artifact = builder.finish().expect("finish");
        assert_eq!(artifact.doc_count, 2);
        assert!(!artifact.bytes.is_empty());

        let index = LexIndex::decode(&artifact.bytes).expect("decode");
        let hits = index.search("rust", 10);
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].frame_id, 1);
        assert!(hits[0].match_count >= 1);
        assert!(!hits[0].snippets.is_empty());
    }

    #[test]
    fn tokenizer_lowercases_and_filters() {
        let tokens = tokenize("Hello, Rust-lang!");
        assert_eq!(tokens, vec!["hello", "rust", "lang"]);
    }

    #[test]
    fn tokenizer_retains_connector_characters() {
        let tokens = tokenize("N&M EXPRESS LLC @ 2024");
        assert_eq!(tokens, vec!["n&m", "express", "llc", "2024"]);
    }

    #[test]
    fn compute_matches_deduplicates_by_frame_id() {
        // Create a document with content long enough to be split into multiple sections.
        // The section soft limit is 900 chars, hard limit is 1400 chars.
        // We'll create long content with the search term appearing in each section.
        let mut builder = LexIndexBuilder::new();

        // Build content with "quantum" appearing in multiple sections.
        let section1 = "Quantum computing represents a revolutionary approach to computation. \
            The fundamental principles of quantum mechanics enable quantum computers to process \
            information in ways classical computers cannot. Quantum bits or qubits can exist in \
            superposition states, allowing quantum algorithms to explore multiple solutions \
            simultaneously. This quantum parallelism offers exponential speedups for certain \
            computational problems. Researchers continue to advance quantum hardware and software. \
            The field of quantum computing is rapidly evolving with new breakthroughs. \
            Major tech companies invest heavily in quantum research and development. \
            Quantum error correction remains a significant challenge for practical quantum computers.";

        let section2 = "Applications of quantum computing span many domains including cryptography, \
            drug discovery, and optimization problems. Quantum cryptography promises unbreakable \
            encryption through quantum key distribution protocols. In the pharmaceutical industry, \
            quantum simulations could revolutionize how we discover new medicines. Quantum \
            algorithms like Shor's algorithm threaten current encryption standards. Financial \
            institutions explore quantum computing for portfolio optimization. The quantum \
            advantage may soon be demonstrated for practical real-world applications. Quantum \
            machine learning combines quantum computing with artificial intelligence techniques. \
            The future of quantum computing holds immense promise for scientific discovery.";

        let full_content = format!("{} {}", section1, section2);
        assert!(
            full_content.len() > 1400,
            "Content should be long enough to create multiple sections"
        );

        builder.add_document(
            42, // frame_id
            "mv2://docs/quantum",
            Some("Quantum Computing Overview"),
            &full_content,
            &HashMap::new(),
        );

        let artifact = builder.finish().expect("finish should succeed");
        let index = LexIndex::decode(&artifact.bytes).expect("decode should succeed");

        // Search for "quantum", which appears many times across both sections.
        let query_tokens = tokenize("quantum");
        let matches = index.compute_matches(&query_tokens, None, None);

        // Verify: no duplicate frame_ids in the results.
        let frame_ids: Vec<_> = matches.iter().map(|m| m.frame_id).collect();
        let unique_frame_ids: std::collections::HashSet<_> = frame_ids.iter().copied().collect();

        assert_eq!(
            frame_ids.len(),
            unique_frame_ids.len(),
            "Results should not contain duplicate frame_ids. Found: {:?}",
            frame_ids
        );

        // Should have exactly one result for frame_id 42.
        assert_eq!(matches.len(), 1, "Should have exactly one match");
        assert_eq!(matches[0].frame_id, 42, "Match should be for frame_id 42");
        assert!(
            matches[0].score > 0.0,
            "Match should have a positive score"
        );
    }

    #[test]
    fn compute_matches_keeps_highest_score_per_frame() {
        // Test that when multiple sections match, we keep the highest-scoring one.
        let mut builder = LexIndexBuilder::new();

        // Create content where "target" appears more times in the second section.
        let section1 = "This is the first section with one target mention. \
            It contains various other words to pad the content and make it long enough \
            to be split into multiple sections by the chunking algorithm. We need quite \
            a bit of text here to ensure the sections are created properly. The content \
            continues with more filler text about various topics. Keep writing to reach \
            the section boundary. More text follows to ensure we cross the soft limit. \
            This should be enough to trigger section creation at the boundary point.";

        let section2 = "The second section has target target target multiple times. \
            Target appears here repeatedly: target target target target. This section \
            should score higher because it has more occurrences of the search term target. \
            We mention target again to boost the score further. Target target target. \
            The abundance of target keywords makes this section rank higher in relevance.";

        let full_content = format!("{} {}", section1, section2);

        builder.add_document(
            99,
            "mv2://docs/multi-section",
            Some("Multi-Section Document"),
            &full_content,
            &HashMap::new(),
        );

        let artifact = builder.finish().expect("finish");
        let index = LexIndex::decode(&artifact.bytes).expect("decode");

        let query_tokens = tokenize("target");
        let matches = index.compute_matches(&query_tokens, None, None);

        // Should have exactly one result (deduplicated).
        assert_eq!(matches.len(), 1, "Should have exactly one deduplicated match");

        // The match should carry the higher score, from section2 with its many "target"
        // occurrences rather than section1's single mention.
        assert!(
            matches[0].score >= 5.0,
            "Should keep the highest-scoring match, score was: {}",
            matches[0].score
        );
    }
}