// memvid_core/lex.rs

use std::{
    cmp::Ordering,
    collections::{BTreeMap, HashMap},
};

use blake3::hash;
use serde::{Deserialize, Serialize};

use crate::{MemvidError, Result, types::FrameId};

// Bincode configuration reused for deterministic layout.
fn lex_config() -> impl bincode::config::Config {
    bincode::config::standard()
        .with_fixed_int_encoding()
        .with_little_endian()
}

#[allow(clippy::cast_possible_truncation)]
const LEX_DECODE_LIMIT: usize = crate::MAX_INDEX_BYTES as usize;
const LEX_SECTION_SOFT_CHARS: usize = 900;
const LEX_SECTION_HARD_CHARS: usize = 1400;
const LEX_SECTION_MAX_COUNT: usize = 2048;

/// Intermediate builder that collects documents prior to serialisation.
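///
/// Minimal usage sketch (mirrors the `builder_produces_artifact` test below;
/// fenced as `ignore` so it is not compiled as a doctest, since the public
/// module path is not assumed here):
///
/// ```ignore
/// let mut builder = LexIndexBuilder::new();
/// builder.add_document(0, "mv2://docs/one", Some("Doc One"), "hello world", &HashMap::new());
/// let artifact = builder.finish()?;
/// let index = LexIndex::decode(&artifact.bytes)?;
/// let hits = index.search("hello", 10);
/// assert_eq!(hits[0].frame_id, 0);
/// ```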
#[derive(Default)]
pub struct LexIndexBuilder {
    documents: Vec<LexDocument>,
}

impl LexIndexBuilder {
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    pub fn add_document(
        &mut self,
        frame_id: FrameId,
        uri: &str,
        title: Option<&str>,
        content: &str,
        tags: &HashMap<String, String>,
    ) {
        let tokens = tokenize(content);
        // Convert HashMap to BTreeMap for deterministic serialization
        let tags: BTreeMap<_, _> = tags.iter().map(|(k, v)| (k.clone(), v.clone())).collect();
        let mut sections = chunk_sections(content);

        let (content_owned, content_lower) = if content.is_empty() {
            (String::new(), String::new())
        } else if sections.is_empty() {
            let owned = content.to_string();
            let lower = owned.to_ascii_lowercase();
            sections.push(LexSection {
                offset: 0,
                content: owned.clone(),
                content_lower: lower.clone(),
            });
            (owned, lower)
        } else {
            (String::new(), String::new())
        };
        self.documents.push(LexDocument {
            frame_id,
            tokens,
            tags,
            content: content_owned,
            content_lower,
            uri: Some(uri.to_string()),
            title: title.map(ToString::to_string),
            sections,
        });
    }

    pub fn finish(mut self) -> Result<LexIndexArtifact> {
        for document in &mut self.documents {
            document.ensure_sections();
        }
        let bytes = bincode::serde::encode_to_vec(&self.documents, lex_config())?;
        let checksum = *hash(&bytes).as_bytes();
        Ok(LexIndexArtifact {
            bytes,
            doc_count: self.documents.len() as u64,
            checksum,
        })
    }
}

/// Serialized lexical index artifact ready to be embedded in the `.mv2` file.
#[derive(Debug, Clone)]
pub struct LexIndexArtifact {
    pub bytes: Vec<u8>,
    pub doc_count: u64,
    pub checksum: [u8; 32],
}

/// Read-only lexical index decoded from persisted bytes.
#[derive(Debug, Clone)]
pub struct LexIndex {
    documents: Vec<LexDocument>,
}

impl LexIndex {
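    /// Decodes a persisted index, trying the current document layout first and
    /// then two legacy layouts (fixed-int and varint encodings) before failing
    /// with `MemvidError::InvalidToc`.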
    pub fn decode(bytes: &[u8]) -> Result<Self> {
        let new_config = bincode::config::standard()
            .with_fixed_int_encoding()
            .with_little_endian()
            .with_limit::<LEX_DECODE_LIMIT>();
        if let Ok((documents, read)) =
            bincode::serde::decode_from_slice::<Vec<LexDocument>, _>(bytes, new_config)
        {
            if read == bytes.len() {
                return Ok(Self::from_documents(documents));
            }
        }

        let legacy_fixed = bincode::config::standard()
            .with_fixed_int_encoding()
            .with_little_endian()
            .with_limit::<LEX_DECODE_LIMIT>();
        if let Ok((legacy_docs, read)) =
            bincode::serde::decode_from_slice::<Vec<LegacyLexDocument>, _>(bytes, legacy_fixed)
        {
            if read == bytes.len() {
                let documents = legacy_docs.into_iter().map(legacy_to_current).collect();
                return Ok(Self::from_documents(documents));
            }
        }

        let legacy_config = bincode::config::standard()
            .with_little_endian()
            .with_limit::<LEX_DECODE_LIMIT>();
        if let Ok((legacy_docs, read)) =
            bincode::serde::decode_from_slice::<Vec<LegacyLexDocument>, _>(bytes, legacy_config)
        {
            if read == bytes.len() {
                let documents = legacy_docs.into_iter().map(legacy_to_current).collect();
                return Ok(Self::from_documents(documents));
            }
        }

        Err(MemvidError::InvalidToc {
            reason: "unsupported lex index encoding".into(),
        })
    }

    fn from_documents(mut documents: Vec<LexDocument>) -> Self {
        for document in &mut documents {
            document.ensure_sections();
        }
        Self { documents }
    }

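    /// Tokenizes `query`, scores every document, and returns at most `limit`
    /// hits, each carrying snippets built around the matched spans.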
    #[must_use]
    pub fn search(&self, query: &str, limit: usize) -> Vec<LexSearchHit> {
        let mut query_tokens = tokenize(query);
        query_tokens.retain(|token| !token.is_empty());
        if query_tokens.is_empty() {
            return Vec::new();
        }
        let mut matches = self.compute_matches(&query_tokens, None, None);
        matches.truncate(limit);
        matches
            .into_iter()
            .map(|m| {
                let snippets = build_snippets(&m.content, &m.occurrences, 160, 3);
                LexSearchHit {
                    frame_id: m.frame_id,
                    score: m.score,
                    match_count: m.occurrences.len(),
                    snippets,
                }
            })
            .collect()
    }

    pub(crate) fn documents_mut(&mut self) -> &mut [LexDocument] {
        &mut self.documents
    }

    pub(crate) fn remove_document(&mut self, frame_id: FrameId) {
        self.documents.retain(|doc| doc.frame_id != frame_id);
    }

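    /// Scans every section of every document for the query tokens, optionally
    /// restricted to an exact URI (`uri_filter`) or a URI prefix (`scope_filter`).
    ///
    /// Single-token queries collect every substring occurrence; multi-token
    /// queries require all tokens to be present in a section (AND semantics) and
    /// add a large bonus when the exact phrase also appears. Results are sorted
    /// by score and deduplicated so each `frame_id` keeps only its best section.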
    pub(crate) fn compute_matches(
        &self,
        query_tokens: &[String],
        uri_filter: Option<&str>,
        scope_filter: Option<&str>,
    ) -> Vec<LexMatch> {
        if query_tokens.is_empty() {
            return Vec::new();
        }

        let mut hits = Vec::new();
        let phrase = query_tokens.join(" ");
        for document in &self.documents {
            if let Some(uri) = uri_filter {
                if !uri_matches(document.uri.as_deref(), uri) {
                    continue;
                }
            } else if let Some(scope) = scope_filter {
                match document.uri.as_deref() {
                    Some(candidate) if candidate.starts_with(scope) => {}
                    _ => continue,
                }
            }

            if document.sections.is_empty() {
                continue;
            }

            for section in &document.sections {
                let haystack = section.content_lower.as_str();
                if haystack.is_empty() {
                    continue;
                }

                let mut occurrences: Vec<(usize, usize)> = Vec::new();

                if query_tokens.len() == 1 {
                    let needle = &query_tokens[0];
                    if needle.is_empty() {
                        continue;
                    }
                    let mut start = 0usize;
                    while let Some(idx) = haystack[start..].find(needle) {
                        let local_start = start + idx;
                        let local_end = local_start + needle.len();
                        occurrences.push((local_start, local_end));
                        start = local_end;
                    }
                } else {
                    let mut all_occurrences = Vec::new();
                    let mut all_present = true;
                    for needle in query_tokens {
                        if needle.is_empty() {
                            all_present = false;
                            break;
                        }
                        let mut start = 0usize;
                        let mut found_for_token = false;
                        while let Some(idx) = haystack[start..].find(needle) {
                            found_for_token = true;
                            let local_start = start + idx;
                            let local_end = local_start + needle.len();
                            all_occurrences.push((local_start, local_end));
                            start = local_end;
                        }
                        if !found_for_token {
                            all_present = false;
                            break;
                        }
                    }
                    if !all_present {
                        continue;
                    }
                    occurrences = all_occurrences;
                }

                if occurrences.is_empty() {
                    continue;
                }

                occurrences.sort_by_key(|(start, _)| *start);
                #[allow(clippy::cast_precision_loss)]
                let mut score = occurrences.len() as f32;
                if !phrase.is_empty() && section.content_lower.contains(&phrase) {
                    score += 1000.0;
                }
                hits.push(LexMatch {
                    frame_id: document.frame_id,
                    score,
                    occurrences,
                    content: section.content.clone(),
                    uri: document.uri.clone(),
                    title: document.title.clone(),
                    chunk_offset: section.offset,
                });
            }
        }

        hits.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(Ordering::Equal));

        // Deduplicate by frame_id, keeping the highest-scoring match for each frame.
        // This prevents the same document from appearing multiple times when it has
        // multiple sections that match the query.
        let mut seen_frames: std::collections::HashSet<FrameId> = std::collections::HashSet::new();
        let mut deduped = Vec::with_capacity(hits.len());
        for hit in hits {
            if seen_frames.insert(hit.frame_id) {
                deduped.push(hit);
            }
        }
        deduped
    }
}

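/// Matches a document URI against a filter: filters containing `#` require an
/// exact (case-insensitive) match, anything else is treated as a case-insensitive
/// prefix.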
fn uri_matches(candidate: Option<&str>, expected: &str) -> bool {
    let Some(uri) = candidate else {
        return false;
    };
    if expected.contains('#') {
        uri.eq_ignore_ascii_case(expected)
    } else {
        let expected_lower = expected.to_ascii_lowercase();
        let candidate_lower = uri.to_ascii_lowercase();
        candidate_lower.starts_with(&expected_lower)
    }
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub(crate) struct LexDocument {
    pub(crate) frame_id: FrameId,
    tokens: Vec<String>,
    tags: BTreeMap<String, String>,
    #[serde(default)]
    content: String,
    #[serde(default)]
    pub(crate) content_lower: String,
    #[serde(default)]
    pub(crate) uri: Option<String>,
    #[serde(default)]
    pub(crate) title: Option<String>,
    #[serde(default)]
    sections: Vec<LexSection>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub(crate) struct LexSection {
    pub(crate) offset: usize,
    #[serde(default)]
    pub(crate) content: String,
    #[serde(default)]
    pub(crate) content_lower: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
struct LegacyLexDocument {
    frame_id: FrameId,
    tokens: Vec<String>,
    tags: BTreeMap<String, String>,
    #[serde(default)]
    content: Option<String>,
    #[serde(default)]
    uri: Option<String>,
    #[serde(default)]
    title: Option<String>,
}

impl LexDocument {
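    /// Backfills `content_lower` and a single whole-document section for
    /// documents persisted before sectioned storage was introduced.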
    fn ensure_sections(&mut self) {
        if !self.sections.is_empty() {
            return;
        }

        if self.content.is_empty() {
            return;
        }

        if self.content_lower.is_empty() {
            self.content_lower = self.content.to_ascii_lowercase();
        }

        self.sections.push(LexSection {
            offset: 0,
            content: self.content.clone(),
            content_lower: self.content_lower.clone(),
        });
    }
}

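/// Upgrades a legacy document to the current layout, materialising the lowercase
/// content and a single section spanning the whole document.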
fn legacy_to_current(legacy: LegacyLexDocument) -> LexDocument {
    let content = legacy.content.unwrap_or_default();
    let content_lower = content.to_ascii_lowercase();
    let sections = if content.is_empty() {
        Vec::new()
    } else {
        vec![LexSection {
            offset: 0,
            content: content.clone(),
            content_lower: content_lower.clone(),
        }]
    };
    LexDocument {
        frame_id: legacy.frame_id,
        tokens: legacy.tokens,
        tags: legacy.tags,
        content,
        content_lower,
        uri: legacy.uri,
        title: legacy.title,
        sections,
    }
}

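/// A single result from `LexIndex::search`: the owning frame, its score, the
/// number of matched spans, and snippet strings cut from the matching section.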
#[derive(Debug, Clone)]
pub struct LexSearchHit {
    pub frame_id: FrameId,
    pub score: f32,
    pub match_count: usize,
    pub snippets: Vec<String>,
}

#[derive(Debug, Clone)]
pub(crate) struct LexMatch {
    pub frame_id: FrameId,
    pub score: f32,
    pub occurrences: Vec<(usize, usize)>,
    pub content: String,
    pub uri: Option<String>,
    pub title: Option<String>,
    pub chunk_offset: usize,
}

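/// Splits input on non-token characters, keeps pieces that contain at least one
/// alphanumeric character, and lowercases them.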
fn tokenize(input: &str) -> Vec<String> {
    input
        .split(|c: char| !is_token_char(c))
        .filter_map(|token| {
            if token.chars().any(char::is_alphanumeric) {
                Some(token.to_lowercase())
            } else {
                None
            }
        })
        .collect()
}

fn is_token_char(ch: char) -> bool {
    ch.is_alphanumeric() || matches!(ch, '&' | '@' | '+' | '/' | '_')
}

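/// Renders snippet slices into display strings, flattening newlines to spaces.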
fn build_snippets(
    content: &str,
    occurrences: &[(usize, usize)],
    window: usize,
    max_snippets: usize,
) -> Vec<String> {
    compute_snippet_slices(content, occurrences, window, max_snippets)
        .into_iter()
        .map(|(start, end)| content[start..end].replace('\n', " "))
        .collect()
}

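/// Splits `content` into contiguous sections for indexing. Inputs no longer than
/// `LEX_SECTION_HARD_CHARS` become a single section; longer inputs are cut once a
/// chunk reaches the hard limit, preferring the most recently seen soft boundary
/// as the split point. At most `LEX_SECTION_MAX_COUNT` sections are produced, with
/// any remaining text folded into the last one.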
fn chunk_sections(content: &str) -> Vec<LexSection> {
    if content.is_empty() {
        return Vec::new();
    }

    if content.len() <= LEX_SECTION_HARD_CHARS {
        return vec![LexSection {
            offset: 0,
            content: content.to_string(),
            content_lower: content.to_ascii_lowercase(),
        }];
    }

    let mut sections: Vec<LexSection> = Vec::new();
    let mut chunk_start = 0usize;
    let mut last_soft_break = None;
    let mut iter = content.char_indices().peekable();

    while let Some((idx, ch)) = iter.next() {
        let char_end = idx + ch.len_utf8();
        let current_len = char_end.saturating_sub(chunk_start);
        let next_char = iter.peek().map(|(_, next)| *next);

        if is_soft_boundary(ch, next_char) {
            last_soft_break = Some(char_end);
            if current_len < LEX_SECTION_SOFT_CHARS {
                continue;
            }
        }

        if current_len < LEX_SECTION_HARD_CHARS {
            continue;
        }

        let mut split_at = last_soft_break.unwrap_or(char_end);
        if split_at <= chunk_start {
            split_at = char_end;
        }

        push_section(&mut sections, content, chunk_start, split_at);
        chunk_start = split_at;
        last_soft_break = None;

        if sections.len() >= LEX_SECTION_MAX_COUNT {
            break;
        }
    }

    if chunk_start < content.len() {
        if sections.len() >= LEX_SECTION_MAX_COUNT {
            if let Some(last) = sections.last_mut() {
                let slice = &content[last.offset..];
                last.content = slice.to_string();
                last.content_lower = slice.to_ascii_lowercase();
            }
        } else {
            push_section(&mut sections, content, chunk_start, content.len());
        }
    }

    if sections.is_empty() {
        sections.push(LexSection {
            offset: 0,
            content: content.to_string(),
            content_lower: content.to_ascii_lowercase(),
        });
    }

    sections
}

fn push_section(sections: &mut Vec<LexSection>, content: &str, start: usize, end: usize) {
    if end <= start {
        return;
    }

    let slice = &content[start..end];
    sections.push(LexSection {
        offset: start,
        content: slice.to_string(),
        content_lower: slice.to_ascii_lowercase(),
    });
}

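/// A soft boundary is a newline, or a sentence terminator followed by whitespace
/// (or the end of input).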
fn is_soft_boundary(ch: char, next: Option<char>) -> bool {
    match ch {
        '.' | '!' | '?' => next.is_none_or(char::is_whitespace),
        '\n' => true,
        _ => false,
    }
}

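/// Computes byte ranges for snippets: each occurrence is padded by half the
/// `window` on both sides, widened to nearby sentence boundaries, clamped to
/// char boundaries, and merged with the previous range when they nearly overlap.
/// Falls back to a single leading slice when there are no usable occurrences.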
pub(crate) fn compute_snippet_slices(
    content: &str,
    occurrences: &[(usize, usize)],
    window: usize,
    max_snippets: usize,
) -> Vec<(usize, usize)> {
    if content.is_empty() {
        return Vec::new();
    }

    if occurrences.is_empty() {
        let end = advance_boundary(content, 0, window);
        return vec![(0, end)];
    }

    let mut merged: Vec<(usize, usize)> = Vec::new();
    for &(start, end) in occurrences {
        let mut snippet_start = start.saturating_sub(window / 2);
        let mut snippet_end = (end + window / 2).min(content.len());

        if let Some(adj) = sentence_start_before(content, snippet_start) {
            snippet_start = adj;
        }
        if let Some(adj) = sentence_end_after(content, snippet_end) {
            snippet_end = adj;
        }

        snippet_start = prev_char_boundary(content, snippet_start);
        snippet_end = next_char_boundary(content, snippet_end);

        if snippet_end <= snippet_start {
            continue;
        }

        if let Some(last) = merged.last_mut() {
            if snippet_start <= last.1 + 20 {
                last.1 = last.1.max(snippet_end);
                continue;
            }
        }

        merged.push((
            snippet_start.min(content.len()),
            snippet_end.min(content.len()),
        ));
        if merged.len() >= max_snippets {
            break;
        }
    }

    if merged.is_empty() {
        let end = advance_boundary(content, 0, window);
        merged.push((0, end));
    }

    merged
}

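/// Returns the start of the sentence containing `idx`: the position just after
/// the last terminator (or newline) before it, with following whitespace skipped.
/// Returns `None` when no terminator precedes `idx`.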
fn sentence_start_before(content: &str, idx: usize) -> Option<usize> {
    if idx == 0 {
        return Some(0);
    }
    let mut idx = idx.min(content.len());
    idx = prev_char_boundary(content, idx);
    let mut candidate = None;
    for (pos, ch) in content[..idx].char_indices() {
        if matches!(ch, '.' | '!' | '?' | '\n') {
            candidate = Some(pos + ch.len_utf8());
        }
    }
    candidate.map(|pos| {
        let mut pos = next_char_boundary(content, pos);
        while pos < content.len() && content.as_bytes()[pos].is_ascii_whitespace() {
            pos += 1;
        }
        prev_char_boundary(content, pos)
    })
}

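/// Returns the end of the sentence containing `idx`: the position just past the
/// next sentence terminator, or the position of the next newline. Returns `None`
/// when neither is found.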
fn sentence_end_after(content: &str, idx: usize) -> Option<usize> {
    if idx >= content.len() {
        return Some(content.len());
    }
    let mut idx = idx;
    idx = prev_char_boundary(content, idx);
    for (offset, ch) in content[idx..].char_indices() {
        let global = idx + offset;
        if matches!(ch, '.' | '!' | '?') {
            return Some(next_char_boundary(content, global + ch.len_utf8()));
        }
        if ch == '\n' {
            return Some(global);
        }
    }
    None
}

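/// Clamps `idx` to the nearest char boundary at or before it.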
fn prev_char_boundary(content: &str, mut idx: usize) -> usize {
    if idx > content.len() {
        idx = content.len();
    }
    while idx > 0 && !content.is_char_boundary(idx) {
        idx -= 1;
    }
    idx
}

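/// Clamps `idx` to the nearest char boundary at or after it.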
fn next_char_boundary(content: &str, mut idx: usize) -> usize {
    if idx > content.len() {
        idx = content.len();
    }
    while idx < content.len() && !content.is_char_boundary(idx) {
        idx += 1;
    }
    idx
}

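/// Returns the byte offset `window` characters past `start`, or the end of
/// `content` if it is shorter.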
fn advance_boundary(content: &str, start: usize, mut window: usize) -> usize {
    if start >= content.len() {
        return content.len();
    }
    let mut last = content.len();
    for (offset, _) in content[start..].char_indices() {
        if window == 0 {
            return start + offset;
        }
        last = start + offset;
        window -= 1;
    }
    content.len().max(last)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn builder_produces_artifact() {
        let mut builder = LexIndexBuilder::new();
        let mut tags = HashMap::new();
        tags.insert("source".into(), "test".into());
        builder.add_document(0, "mv2://docs/one", Some("Doc One"), "hello world", &tags);
        builder.add_document(
            1,
            "mv2://docs/two",
            Some("Doc Two"),
            "rust systems",
            &HashMap::new(),
        );

        let artifact = builder.finish().expect("finish");
        assert_eq!(artifact.doc_count, 2);
        assert!(!artifact.bytes.is_empty());

        let index = LexIndex::decode(&artifact.bytes).expect("decode");
        let hits = index.search("rust", 10);
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].frame_id, 1);
        assert!(hits[0].match_count >= 1);
        assert!(!hits[0].snippets.is_empty());
    }

    #[test]
    fn tokenizer_lowercases_and_filters() {
        let tokens = tokenize("Hello, Rust-lang!");
        assert_eq!(tokens, vec!["hello", "rust", "lang"]);
    }

    #[test]
    fn tokenizer_retains_connector_characters() {
        let tokens = tokenize("N&M EXPRESS LLC @ 2024");
        assert_eq!(tokens, vec!["n&m", "express", "llc", "2024"]);
    }

    #[test]
    fn compute_matches_deduplicates_by_frame_id() {
        // Create a document with content long enough to be split into multiple sections.
        // The section soft limit is 900 chars, hard limit is 1400 chars.
        // We'll create content > 2000 chars with the search term appearing in each section.
        let mut builder = LexIndexBuilder::new();

        // Build content with "quantum" appearing in multiple sections
        let section1 = "Quantum computing represents a revolutionary approach to computation. \
            The fundamental principles of quantum mechanics enable quantum computers to process \
            information in ways classical computers cannot. Quantum bits or qubits can exist in \
            superposition states, allowing quantum algorithms to explore multiple solutions \
            simultaneously. This quantum parallelism offers exponential speedups for certain \
            computational problems. Researchers continue to advance quantum hardware and software. \
            The field of quantum computing is rapidly evolving with new breakthroughs. \
            Major tech companies invest heavily in quantum research and development. \
            Quantum error correction remains a significant challenge for practical quantum computers.";

        let section2 = "Applications of quantum computing span many domains including cryptography, \
            drug discovery, and optimization problems. Quantum cryptography promises unbreakable \
            encryption through quantum key distribution protocols. In the pharmaceutical industry, \
            quantum simulations could revolutionize how we discover new medicines. Quantum \
            algorithms like Shor's algorithm threaten current encryption standards. Financial \
            institutions explore quantum computing for portfolio optimization. The quantum \
            advantage may soon be demonstrated for practical real-world applications. Quantum \
            machine learning combines quantum computing with artificial intelligence techniques. \
            The future of quantum computing holds immense promise for scientific discovery.";

        let full_content = format!("{} {}", section1, section2);
        assert!(
            full_content.len() > 1400,
            "Content should be long enough to create multiple sections"
        );

        builder.add_document(
            42, // frame_id
            "mv2://docs/quantum",
            Some("Quantum Computing Overview"),
            &full_content,
            &HashMap::new(),
        );

        let artifact = builder.finish().expect("finish should succeed");
        let index = LexIndex::decode(&artifact.bytes).expect("decode should succeed");

        // Search for "quantum" which appears many times across both sections
        let query_tokens = tokenize("quantum");
        let matches = index.compute_matches(&query_tokens, None, None);

        // Verify: no duplicate frame_ids in results
        let frame_ids: Vec<_> = matches.iter().map(|m| m.frame_id).collect();
        let unique_frame_ids: std::collections::HashSet<_> = frame_ids.iter().copied().collect();

        assert_eq!(
            frame_ids.len(),
            unique_frame_ids.len(),
            "Results should not contain duplicate frame_ids. Found: {:?}",
            frame_ids
        );

        // Should have exactly one result for frame_id 42
        assert_eq!(matches.len(), 1, "Should have exactly one match");
        assert_eq!(matches[0].frame_id, 42, "Match should be for frame_id 42");
        assert!(matches[0].score > 0.0, "Match should have a positive score");
    }

    #[test]
    fn compute_matches_keeps_highest_score_per_frame() {
        // Test that when multiple sections match, we keep the highest-scoring one
        let mut builder = LexIndexBuilder::new();

        // Create content where "target" appears more times in the second section
        let section1 = "This is the first section with one target mention. \
            It contains various other words to pad the content and make it long enough \
            to be split into multiple sections by the chunking algorithm. We need quite \
            a bit of text here to ensure the sections are created properly. The content \
            continues with more filler text about various topics. Keep writing to reach \
            the section boundary. More text follows to ensure we cross the soft limit. \
            This should be enough to trigger section creation at the boundary point.";

        let section2 = "The second section has target target target multiple times. \
            Target appears here repeatedly: target target target target. This section \
            should score higher because it has more occurrences of the search term target. \
            We mention target again to boost the score further. Target target target. \
            The abundance of target keywords makes this section rank higher in relevance.";

        let full_content = format!("{} {}", section1, section2);

        builder.add_document(
            99,
            "mv2://docs/multi-section",
            Some("Multi-Section Document"),
            &full_content,
            &HashMap::new(),
        );

        let artifact = builder.finish().expect("finish");
        let index = LexIndex::decode(&artifact.bytes).expect("decode");

        let query_tokens = tokenize("target");
        let matches = index.compute_matches(&query_tokens, None, None);

        // Should have exactly one result (deduplicated)
        assert_eq!(
            matches.len(),
            1,
            "Should have exactly one deduplicated match"
        );

        // The match should have the higher score (from section2 with more "target" occurrences)
        // Section1 has 1 occurrence, Section2 has ~10+ occurrences
        assert!(
            matches[0].score >= 5.0,
            "Should keep the highest-scoring match, score was: {}",
            matches[0].score
        );
    }
}