use std::{
cmp::Ordering,
collections::{BTreeMap, HashMap},
};
use blake3::hash;
use serde::{Deserialize, Serialize};
use crate::{MemvidError, Result, types::FrameId};
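/// Bincode configuration used for encoding: fixed-width little-endian
/// integers keep the on-disk layout stable. `decode` rebuilds this
/// configuration locally because attaching a byte limit changes its type.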
fn lex_config() -> impl bincode::config::Config {
bincode::config::standard()
.with_fixed_int_encoding()
.with_little_endian()
}
/// Upper bound on bytes `LexIndex::decode` will accept, mirroring the
/// crate-wide index size cap.
#[allow(clippy::cast_possible_truncation)]
const LEX_DECODE_LIMIT: usize = crate::MAX_INDEX_BYTES as usize;
/// Preferred section size in bytes: past this, a section closes at the next
/// sentence or line boundary.
const LEX_SECTION_SOFT_CHARS: usize = 900;
/// Maximum section size in bytes: a section is force-split here even without
/// a natural boundary.
const LEX_SECTION_HARD_CHARS: usize = 1400;
/// Cap on sections per document; any remainder is folded into the last one.
const LEX_SECTION_MAX_COUNT: usize = 2048;
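/// Accumulates documents and serializes them into a [`LexIndexArtifact`].
///
/// A minimal usage sketch (the `mv2://` URIs mirror the ones used in this
/// module's tests):
///
/// ```ignore
/// let mut builder = LexIndexBuilder::new();
/// builder.add_document(0, "mv2://docs/one", Some("Doc One"), "hello world", &HashMap::new());
/// let artifact = builder.finish()?;
/// let index = LexIndex::decode(&artifact.bytes)?;
/// ```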
#[derive(Default)]
pub struct LexIndexBuilder {
documents: Vec<LexDocument>,
}
impl LexIndexBuilder {
#[must_use]
pub fn new() -> Self {
Self::default()
}
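/// Registers one document: `content` is tokenized for matching and chunked
/// into sections for scoring and snippet extraction.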
pub fn add_document(
&mut self,
frame_id: FrameId,
uri: &str,
title: Option<&str>,
content: &str,
tags: &HashMap<String, String>,
) {
let tokens = tokenize(content);
// `BTreeMap` keeps tag order deterministic, so the encoded bytes (and
// therefore the artifact checksum) are stable for equal input.
let tags: BTreeMap<_, _> = tags.iter().map(|(k, v)| (k.clone(), v.clone())).collect();
let mut sections = chunk_sections(content);
// When chunking yields sections, the top-level `content` stays empty so the
// text is not stored twice. The middle branch is defensive: `chunk_sections`
// never returns an empty list for non-empty input, but if it ever did, the
// document would still be searchable as a single whole-document section.
let (content_owned, content_lower) = if content.is_empty() {
(String::new(), String::new())
} else if sections.is_empty() {
let owned = content.to_string();
let lower = owned.to_ascii_lowercase();
sections.push(LexSection {
offset: 0,
content: owned.clone(),
content_lower: lower.clone(),
});
(owned, lower)
} else {
(String::new(), String::new())
};
self.documents.push(LexDocument {
frame_id,
tokens,
tags,
content: content_owned,
content_lower,
uri: Some(uri.to_string()),
title: title.map(ToString::to_string),
sections,
});
}
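/// Normalizes sections and serializes all documents, returning the encoded
/// artifact together with its BLAKE3 checksum.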
pub fn finish(mut self) -> Result<LexIndexArtifact> {
for document in &mut self.documents {
document.ensure_sections();
}
let bytes = bincode::serde::encode_to_vec(&self.documents, lex_config())?;
let checksum = *hash(&bytes).as_bytes();
Ok(LexIndexArtifact {
bytes,
doc_count: self.documents.len() as u64,
checksum,
})
}
}
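/// Encoded index bytes plus integrity metadata: the document count and a
/// BLAKE3 checksum of the payload.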
#[derive(Debug, Clone)]
pub struct LexIndexArtifact {
pub bytes: Vec<u8>,
pub doc_count: u64,
pub checksum: [u8; 32],
}
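/// Decoded lexical index supporting substring search over pre-chunked
/// document sections.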
#[derive(Debug, Clone)]
pub struct LexIndex {
documents: Vec<LexDocument>,
}
impl LexIndex {
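/// Decodes index bytes, trying the current layout first and falling back to
/// two legacy layouts; every attempt must consume the full input to count as
/// a match.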
pub fn decode(bytes: &[u8]) -> Result<Self> {
// Attempt 1: current document layout with fixed-width little-endian
// integers. The byte limit guards against corrupt or adversarial length
// prefixes; it is applied here rather than via `lex_config` because
// `with_limit` changes the concrete configuration type.
let current_config = bincode::config::standard()
.with_fixed_int_encoding()
.with_little_endian()
.with_limit::<LEX_DECODE_LIMIT>();
if let Ok((documents, read)) =
bincode::serde::decode_from_slice::<Vec<LexDocument>, _>(bytes, current_config)
{
// Require the decoder to consume every byte; a partial read means the
// bytes were produced by a different layout.
if read == bytes.len() {
return Ok(Self::from_documents(documents));
}
}
// Attempt 2: legacy documents (no stored sections) encoded with the same
// fixed-width integer layout.
let legacy_fixed = bincode::config::standard()
.with_fixed_int_encoding()
.with_little_endian()
.with_limit::<LEX_DECODE_LIMIT>();
if let Ok((legacy_docs, read)) =
bincode::serde::decode_from_slice::<Vec<LegacyLexDocument>, _>(bytes, legacy_fixed)
{
if read == bytes.len() {
let documents = legacy_docs.into_iter().map(legacy_to_current).collect();
return Ok(Self::from_documents(documents));
}
}
// Attempt 3: the oldest layout, legacy documents with bincode's default
// variable-width integer encoding.
let legacy_config = bincode::config::standard()
.with_little_endian()
.with_limit::<LEX_DECODE_LIMIT>();
if let Ok((legacy_docs, read)) =
bincode::serde::decode_from_slice::<Vec<LegacyLexDocument>, _>(bytes, legacy_config)
{
if read == bytes.len() {
let documents = legacy_docs.into_iter().map(legacy_to_current).collect();
return Ok(Self::from_documents(documents));
}
}
Err(MemvidError::InvalidToc {
reason: "unsupported lex index encoding".into(),
})
}
fn from_documents(mut documents: Vec<LexDocument>) -> Self {
for document in &mut documents {
document.ensure_sections();
}
Self { documents }
}
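/// Tokenizes `query`, scores all sections, and returns up to `limit` hits,
/// each carrying display snippets built from its best-matching section.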
#[must_use]
pub fn search(&self, query: &str, limit: usize) -> Vec<LexSearchHit> {
// `tokenize` never yields empty tokens, so a single emptiness check suffices.
let query_tokens = tokenize(query);
if query_tokens.is_empty() {
return Vec::new();
}
let mut matches = self.compute_matches(&query_tokens, None, None);
matches.truncate(limit);
matches
.into_iter()
.map(|m| {
let snippets = build_snippets(&m.content, &m.occurrences, 160, 3);
LexSearchHit {
frame_id: m.frame_id,
score: m.score,
match_count: m.occurrences.len(),
snippets,
}
})
.collect()
}
pub(crate) fn documents_mut(&mut self) -> &mut [LexDocument] {
&mut self.documents
}
pub(crate) fn remove_document(&mut self, frame_id: FrameId) {
self.documents.retain(|doc| doc.frame_id != frame_id);
}
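/// Scores every section of every (optionally URI- or scope-filtered)
/// document against the query with AND semantics across tokens, then returns
/// at most one match per frame: its best-scoring section, in descending
/// score order.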
pub(crate) fn compute_matches(
&self,
query_tokens: &[String],
uri_filter: Option<&str>,
scope_filter: Option<&str>,
) -> Vec<LexMatch> {
if query_tokens.is_empty() {
return Vec::new();
}
let mut hits = Vec::new();
let phrase = query_tokens.join(" ");
for document in &self.documents {
if let Some(uri) = uri_filter {
if !uri_matches(document.uri.as_deref(), uri) {
continue;
}
} else if let Some(scope) = scope_filter {
match document.uri.as_deref() {
Some(candidate) if candidate.starts_with(scope) => {}
_ => continue,
}
}
if document.sections.is_empty() {
continue;
}
for section in &document.sections {
let haystack = section.content_lower.as_str();
if haystack.is_empty() {
continue;
}
let mut occurrences: Vec<(usize, usize)> = Vec::new();
if query_tokens.len() == 1 {
// Single-token query: collect every non-overlapping occurrence.
let needle = &query_tokens[0];
if needle.is_empty() {
continue;
}
let mut start = 0usize;
while let Some(idx) = haystack[start..].find(needle) {
let local_start = start + idx;
let local_end = local_start + needle.len();
occurrences.push((local_start, local_end));
start = local_end;
}
} else {
// Multi-token query with AND semantics: every token must occur in the
// section, and the occurrences of all tokens are pooled for scoring
// and snippet extraction.
let mut all_occurrences = Vec::new();
let mut all_present = true;
for needle in query_tokens {
if needle.is_empty() {
all_present = false;
break;
}
let mut start = 0usize;
let mut found_for_token = false;
while let Some(idx) = haystack[start..].find(needle) {
found_for_token = true;
let local_start = start + idx;
let local_end = local_start + needle.len();
all_occurrences.push((local_start, local_end));
start = local_end;
}
if !found_for_token {
all_present = false;
break;
}
}
if !all_present {
continue;
}
occurrences = all_occurrences;
}
if occurrences.is_empty() {
continue;
}
occurrences.sort_by_key(|(start, _)| *start);
// Base score is the raw occurrence count; an exact phrase match adds a
// large constant so phrase-bearing sections always outrank count-only hits.
#[allow(clippy::cast_precision_loss)]
let mut score = occurrences.len() as f32;
if !phrase.is_empty() && section.content_lower.contains(&phrase) {
score += 1000.0;
}
hits.push(LexMatch {
frame_id: document.frame_id,
score,
occurrences,
content: section.content.clone(),
uri: document.uri.clone(),
title: document.title.clone(),
chunk_offset: section.offset,
});
}
}
// `total_cmp` gives a total order on f32 (scores here are never NaN),
// highest score first.
hits.sort_by(|a, b| b.score.total_cmp(&a.score));
// Keep only the best-scoring section per frame: the list is sorted, so the
// first hit seen for a frame is its highest-scoring one.
let mut seen_frames: std::collections::HashSet<FrameId> = std::collections::HashSet::new();
let mut deduped = Vec::with_capacity(hits.len());
for hit in hits {
if seen_frames.insert(hit.frame_id) {
deduped.push(hit);
}
}
deduped
}
}
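/// Fragment queries (containing `#`) must match the candidate URI exactly
/// (case-insensitively); plain queries match as a case-insensitive prefix,
/// which allows folder-style scoping.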
fn uri_matches(candidate: Option<&str>, expected: &str) -> bool {
let Some(uri) = candidate else {
return false;
};
if expected.contains('#') {
uri.eq_ignore_ascii_case(expected)
} else {
let expected_lower = expected.to_ascii_lowercase();
let candidate_lower = uri.to_ascii_lowercase();
candidate_lower.starts_with(&expected_lower)
}
}
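/// Serialized per-document record. `content`/`content_lower` are only
/// populated for documents without sections; newer documents store their
/// text in `sections` alone. The `#[serde(default)]` fields keep older
/// encodings decodable.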
#[derive(Debug, Clone, Serialize, Deserialize)]
pub(crate) struct LexDocument {
pub(crate) frame_id: FrameId,
tokens: Vec<String>,
tags: BTreeMap<String, String>,
#[serde(default)]
content: String,
#[serde(default)]
pub(crate) content_lower: String,
#[serde(default)]
pub(crate) uri: Option<String>,
#[serde(default)]
pub(crate) title: Option<String>,
#[serde(default)]
sections: Vec<LexSection>,
}
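/// One chunk of a document: its byte offset into the original content plus
/// the chunk text and a cached lowercase copy for matching.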
#[derive(Debug, Clone, Serialize, Deserialize)]
pub(crate) struct LexSection {
pub(crate) offset: usize,
#[serde(default)]
pub(crate) content: String,
#[serde(default)]
pub(crate) content_lower: String,
}
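/// Pre-sections record layout, retained so older artifacts keep decoding.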
#[derive(Debug, Clone, Serialize, Deserialize)]
struct LegacyLexDocument {
frame_id: FrameId,
tokens: Vec<String>,
tags: BTreeMap<String, String>,
#[serde(default)]
content: Option<String>,
#[serde(default)]
uri: Option<String>,
#[serde(default)]
title: Option<String>,
}
impl LexDocument {
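/// Backfills a single whole-document section (and the cached lowercase
/// copy) for documents decoded from layouts that predate sections.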
fn ensure_sections(&mut self) {
if !self.sections.is_empty() {
return;
}
if self.content.is_empty() {
return;
}
if self.content_lower.is_empty() {
self.content_lower = self.content.to_ascii_lowercase();
}
self.sections.push(LexSection {
offset: 0,
content: self.content.clone(),
content_lower: self.content_lower.clone(),
});
}
}
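/// Upgrades a legacy record to the current layout by materializing the
/// lowercase content and a single whole-document section.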
fn legacy_to_current(legacy: LegacyLexDocument) -> LexDocument {
let content = legacy.content.unwrap_or_default();
let content_lower = content.to_ascii_lowercase();
let sections = if content.is_empty() {
Vec::new()
} else {
vec![LexSection {
offset: 0,
content: content.clone(),
content_lower: content_lower.clone(),
}]
};
LexDocument {
frame_id: legacy.frame_id,
tokens: legacy.tokens,
tags: legacy.tags,
content,
content_lower,
uri: legacy.uri,
title: legacy.title,
sections,
}
}
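/// Public search result: frame, score, occurrence count, and snippets.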
#[derive(Debug, Clone)]
pub struct LexSearchHit {
pub frame_id: FrameId,
pub score: f32,
pub match_count: usize,
pub snippets: Vec<String>,
}
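/// Internal per-frame best-section match, including the section text and its
/// byte offset (`chunk_offset`) within the full document.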
#[derive(Debug, Clone)]
pub(crate) struct LexMatch {
pub frame_id: FrameId,
pub score: f32,
pub occurrences: Vec<(usize, usize)>,
pub content: String,
pub uri: Option<String>,
pub title: Option<String>,
pub chunk_offset: usize,
}
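/// Splits `input` on non-token characters and lowercases the pieces, keeping
/// only tokens that contain at least one alphanumeric character.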
fn tokenize(input: &str) -> Vec<String> {
input
.split(|c: char| !is_token_char(c))
.filter_map(|token| {
if token.chars().any(char::is_alphanumeric) {
Some(token.to_lowercase())
} else {
None
}
})
.collect()
}
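/// Alphanumerics plus a few connector characters, so terms such as `n&m` or
/// `foo/bar` survive tokenization intact.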
fn is_token_char(ch: char) -> bool {
ch.is_alphanumeric() || matches!(ch, '&' | '@' | '+' | '/' | '_')
}
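/// Renders the computed snippet slices as single-line display strings.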
fn build_snippets(
content: &str,
occurrences: &[(usize, usize)],
window: usize,
max_snippets: usize,
) -> Vec<String> {
compute_snippet_slices(content, occurrences, window, max_snippets)
.into_iter()
.map(|(start, end)| content[start..end].replace('\n', " "))
.collect()
}
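/// Splits `content` into sections of roughly `LEX_SECTION_SOFT_CHARS` to
/// `LEX_SECTION_HARD_CHARS` bytes, preferring sentence and line breaks, and
/// caps the count at `LEX_SECTION_MAX_COUNT` (folding any tail into the
/// final section).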
fn chunk_sections(content: &str) -> Vec<LexSection> {
if content.is_empty() {
return Vec::new();
}
if content.len() <= LEX_SECTION_HARD_CHARS {
return vec![LexSection {
offset: 0,
content: content.to_string(),
content_lower: content.to_ascii_lowercase(),
}];
}
let mut sections: Vec<LexSection> = Vec::new();
let mut chunk_start = 0usize;
let mut last_soft_break = None;
let mut iter = content.char_indices().peekable();
while let Some((idx, ch)) = iter.next() {
let char_end = idx + ch.len_utf8();
let current_len = char_end.saturating_sub(chunk_start);
let next_char = iter.peek().map(|(_, next)| *next);
if is_soft_boundary(ch, next_char) {
last_soft_break = Some(char_end);
if current_len < LEX_SECTION_SOFT_CHARS {
continue;
}
// Past the soft limit at a natural boundary: close the section here.
// (Without this split, the soft limit had no effect and every section
// grew to the hard limit.)
push_section(&mut sections, content, chunk_start, char_end);
chunk_start = char_end;
last_soft_break = None;
if sections.len() >= LEX_SECTION_MAX_COUNT {
break;
}
continue;
}
if current_len < LEX_SECTION_HARD_CHARS {
continue;
}
// Past the hard limit with no soft split: fall back to the most recent
// soft boundary inside the section, or split mid-text as a last resort.
let mut split_at = last_soft_break.unwrap_or(char_end);
if split_at <= chunk_start {
split_at = char_end;
}
push_section(&mut sections, content, chunk_start, split_at);
chunk_start = split_at;
last_soft_break = None;
if sections.len() >= LEX_SECTION_MAX_COUNT {
break;
}
}
if chunk_start < content.len() {
if sections.len() >= LEX_SECTION_MAX_COUNT {
if let Some(last) = sections.last_mut() {
let slice = &content[last.offset..];
last.content = slice.to_string();
last.content_lower = slice.to_ascii_lowercase();
}
} else {
push_section(&mut sections, content, chunk_start, content.len());
}
}
if sections.is_empty() {
sections.push(LexSection {
offset: 0,
content: content.to_string(),
content_lower: content.to_ascii_lowercase(),
});
}
sections
}
fn push_section(sections: &mut Vec<LexSection>, content: &str, start: usize, end: usize) {
if end <= start {
return;
}
let slice = &content[start..end];
sections.push(LexSection {
offset: start,
content: slice.to_string(),
content_lower: slice.to_ascii_lowercase(),
});
}
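/// Sentence punctuation followed by whitespace (or end of input), and bare
/// newlines, count as soft split points.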
fn is_soft_boundary(ch: char, next: Option<char>) -> bool {
match ch {
'.' | '!' | '?' => next.is_none_or(char::is_whitespace),
'\n' => true,
_ => false,
}
}
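/// Computes byte ranges for display snippets: each occurrence is widened by
/// half a `window` on both sides, aligned to sentence and UTF-8 boundaries,
/// and merged with its neighbor when the gap between them is small.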
pub(crate) fn compute_snippet_slices(
content: &str,
occurrences: &[(usize, usize)],
window: usize,
max_snippets: usize,
) -> Vec<(usize, usize)> {
if content.is_empty() {
return Vec::new();
}
if occurrences.is_empty() {
let end = advance_boundary(content, 0, window);
return vec![(0, end)];
}
let mut merged: Vec<(usize, usize)> = Vec::new();
for &(start, end) in occurrences {
let mut snippet_start = start.saturating_sub(window / 2);
let mut snippet_end = (end + window / 2).min(content.len());
if let Some(adj) = sentence_start_before(content, snippet_start) {
snippet_start = adj;
}
if let Some(adj) = sentence_end_after(content, snippet_end) {
snippet_end = adj;
}
snippet_start = prev_char_boundary(content, snippet_start);
snippet_end = next_char_boundary(content, snippet_end);
if snippet_end <= snippet_start {
continue;
}
if let Some(last) = merged.last_mut() {
// Merge snippets that start within 20 bytes of the previous snippet's
// end so near-adjacent matches render as a single window.
if snippet_start <= last.1 + 20 {
last.1 = last.1.max(snippet_end);
continue;
}
}
merged.push((
snippet_start.min(content.len()),
snippet_end.min(content.len()),
));
if merged.len() >= max_snippets {
break;
}
}
if merged.is_empty() {
let end = advance_boundary(content, 0, window);
merged.push((0, end));
}
merged
}
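/// Returns the position just past the last sentence terminator before `idx`,
/// with trailing whitespace skipped; `Some(0)` when `idx` is already at the
/// start, `None` when no terminator precedes `idx`.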
fn sentence_start_before(content: &str, idx: usize) -> Option<usize> {
if idx == 0 {
return Some(0);
}
let mut idx = idx.min(content.len());
idx = prev_char_boundary(content, idx);
let mut candidate = None;
for (pos, ch) in content[..idx].char_indices() {
if matches!(ch, '.' | '!' | '?' | '\n') {
candidate = Some(pos + ch.len_utf8());
}
}
candidate.map(|pos| {
// Skip whitespace after the terminator so the snippet starts at the
// first visible character of the next sentence.
let mut pos = next_char_boundary(content, pos);
while pos < content.len() && content.as_bytes()[pos].is_ascii_whitespace() {
pos += 1;
}
prev_char_boundary(content, pos)
})
}
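/// Returns the position just past the next sentence terminator at or after
/// `idx`, stopping at a newline without including it; `None` when the text
/// runs out first.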
fn sentence_end_after(content: &str, idx: usize) -> Option<usize> {
if idx >= content.len() {
return Some(content.len());
}
let mut idx = idx;
idx = prev_char_boundary(content, idx);
for (offset, ch) in content[idx..].char_indices() {
let global = idx + offset;
if matches!(ch, '.' | '!' | '?') {
return Some(next_char_boundary(content, global + ch.len_utf8()));
}
if ch == '\n' {
return Some(global);
}
}
None
}
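/// Clamps `idx` to the content length and walks backward to the nearest
/// UTF-8 character boundary.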
fn prev_char_boundary(content: &str, mut idx: usize) -> usize {
if idx > content.len() {
idx = content.len();
}
while idx > 0 && !content.is_char_boundary(idx) {
idx -= 1;
}
idx
}
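/// Clamps `idx` to the content length and walks forward to the nearest
/// UTF-8 character boundary.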
fn next_char_boundary(content: &str, mut idx: usize) -> usize {
if idx > content.len() {
idx = content.len();
}
while idx < content.len() && !content.is_char_boundary(idx) {
idx += 1;
}
idx
}
/// Returns the byte index `window` characters past `start`, clamped to the
/// end of `content`.
fn advance_boundary(content: &str, start: usize, mut window: usize) -> usize {
if start >= content.len() {
return content.len();
}
for (offset, _) in content[start..].char_indices() {
if window == 0 {
return start + offset;
}
window -= 1;
}
content.len()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn builder_produces_artifact() {
let mut builder = LexIndexBuilder::new();
let mut tags = HashMap::new();
tags.insert("source".into(), "test".into());
builder.add_document(0, "mv2://docs/one", Some("Doc One"), "hello world", &tags);
builder.add_document(
1,
"mv2://docs/two",
Some("Doc Two"),
"rust systems",
&HashMap::new(),
);
let artifact = builder.finish().expect("finish");
assert_eq!(artifact.doc_count, 2);
assert!(!artifact.bytes.is_empty());
let index = LexIndex::decode(&artifact.bytes).expect("decode");
let hits = index.search("rust", 10);
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].frame_id, 1);
assert!(hits[0].match_count >= 1);
assert!(!hits[0].snippets.is_empty());
}
#[test]
fn tokenizer_lowercases_and_filters() {
let tokens = tokenize("Hello, Rust-lang!");
assert_eq!(tokens, vec!["hello", "rust", "lang"]);
}
#[test]
fn tokenizer_retains_connector_characters() {
let tokens = tokenize("N&M EXPRESS LLC @ 2024");
assert_eq!(tokens, vec!["n&m", "express", "llc", "2024"]);
}
#[test]
fn compute_matches_deduplicates_by_frame_id() {
let mut builder = LexIndexBuilder::new();
let section1 = "Quantum computing represents a revolutionary approach to computation. \
The fundamental principles of quantum mechanics enable quantum computers to process \
information in ways classical computers cannot. Quantum bits or qubits can exist in \
superposition states, allowing quantum algorithms to explore multiple solutions \
simultaneously. This quantum parallelism offers exponential speedups for certain \
computational problems. Researchers continue to advance quantum hardware and software. \
The field of quantum computing is rapidly evolving with new breakthroughs. \
Major tech companies invest heavily in quantum research and development. \
Quantum error correction remains a significant challenge for practical quantum computers.";
let section2 = "Applications of quantum computing span many domains including cryptography, \
drug discovery, and optimization problems. Quantum cryptography promises unbreakable \
encryption through quantum key distribution protocols. In the pharmaceutical industry, \
quantum simulations could revolutionize how we discover new medicines. Quantum \
algorithms like Shor's algorithm threaten current encryption standards. Financial \
institutions explore quantum computing for portfolio optimization. The quantum \
advantage may soon be demonstrated for practical real-world applications. Quantum \
machine learning combines quantum computing with artificial intelligence techniques. \
The future of quantum computing holds immense promise for scientific discovery.";
let full_content = format!("{section1} {section2}");
assert!(
full_content.len() > 1400,
"Content should be long enough to create multiple sections"
);
builder.add_document(
42,
"mv2://docs/quantum",
Some("Quantum Computing Overview"),
&full_content,
&HashMap::new(),
);
let artifact = builder.finish().expect("finish should succeed");
let index = LexIndex::decode(&artifact.bytes).expect("decode should succeed");
let query_tokens = tokenize("quantum");
let matches = index.compute_matches(&query_tokens, None, None);
let frame_ids: Vec<_> = matches.iter().map(|m| m.frame_id).collect();
let unique_frame_ids: std::collections::HashSet<_> = frame_ids.iter().copied().collect();
assert_eq!(
frame_ids.len(),
unique_frame_ids.len(),
"Results should not contain duplicate frame_ids. Found: {:?}",
frame_ids
);
assert_eq!(matches.len(), 1, "Should have exactly one match");
assert_eq!(matches[0].frame_id, 42, "Match should be for frame_id 42");
assert!(matches[0].score > 0.0, "Match should have a positive score");
}
#[test]
fn compute_matches_keeps_highest_score_per_frame() {
let mut builder = LexIndexBuilder::new();
let section1 = "This is the first section with one target mention. \
It contains various other words to pad the content and make it long enough \
to be split into multiple sections by the chunking algorithm. We need quite \
a bit of text here to ensure the sections are created properly. The content \
continues with more filler text about various topics. Keep writing to reach \
the section boundary. More text follows to ensure we cross the soft limit. \
This should be enough to trigger section creation at the boundary point.";
let section2 = "The second section has target target target multiple times. \
Target appears here repeatedly: target target target target. This section \
should score higher because it has more occurrences of the search term target. \
We mention target again to boost the score further. Target target target. \
The abundance of target keywords makes this section rank higher in relevance.";
let full_content = format!("{section1} {section2}");
builder.add_document(
99,
"mv2://docs/multi-section",
Some("Multi-Section Document"),
&full_content,
&HashMap::new(),
);
let artifact = builder.finish().expect("finish");
let index = LexIndex::decode(&artifact.bytes).expect("decode");
let query_tokens = tokenize("target");
let matches = index.compute_matches(&query_tokens, None, None);
assert_eq!(
matches.len(),
1,
"Should have exactly one deduplicated match"
);
assert!(
matches[0].score >= 5.0,
"Should keep the highest-scoring match, score was: {}",
matches[0].score
);
}
}